// Word-count exercise (package main). The patch adds it as
// exercise-001-corpus/word_count/main.go plus main_test.go.
//
// The program counts the words in a text file and prints the counts in
// decreasing order, one "<count> <word>" pair per line.
//
// Command line: word_count <file>
//
// The package clause and import block are not repeated here: main.go imports
// bufio, fmt, os, sort and strings, and main_test.go imports testing.

// main reads the file named by the first command-line argument, counts its
// words with ScanWords and prints every pair, most frequent first. It exits
// with status 1 when the argument is missing, the file cannot be read, or the
// output cannot be written.
func main() {
	if len(os.Args) < 2 {
		// Without this guard os.Args[1] panics when no file name is supplied.
		fmt.Fprintln(os.Stderr, "usage: word_count <file>")
		os.Exit(1)
	}

	counts, err := ScanWords(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, "word_count:", err)
		os.Exit(1)
	}

	// Buffer stdout so a large vocabulary does not cost one write per line.
	out := bufio.NewWriter(os.Stdout)
	for _, pair := range rankByWordCount(counts) {
		fmt.Fprintln(out, pair.Value, pair.Key)
	}
	if err := out.Flush(); err != nil {
		fmt.Fprintln(os.Stderr, "word_count:", err)
		os.Exit(1)
	}
}

// punctuationStripper deletes, in a single pass, the punctuation marks that
// ScanWords ignores wherever they occur inside a token.
var punctuationStripper = strings.NewReplacer(".", "", ",", "", "?", "", ":", "")

// normalizeWord lower-cases a raw token, removes every '.', ',', '?' and ':'
// from it, and then strips at most one leading and one trailing double quote.
// The result is empty when the token held nothing but those characters.
func normalizeWord(token string) string {
	word := punctuationStripper.Replace(strings.ToLower(token))
	// TrimPrefix/TrimSuffix are safe on empty strings, unlike indexing the
	// first or last byte.
	word = strings.TrimPrefix(word, `"`)
	return strings.TrimSuffix(word, `"`)
}

// ScanWords reads the file at path and returns a dictionary that maps every
// normalized word (see normalizeWord) to the number of times it occurs.
// Tokens are the whitespace-separated words produced by bufio.ScanWords;
// tokens that are empty after normalization are not counted.
//
// It returns the error from os.Open unchanged when the file cannot be opened,
// and a wrapped scanner error when reading fails part-way through. The map is
// nil whenever the error is non-nil.
func ScanWords(path string) (map[string]int, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	// The file is only read, so a Close error carries no information.
	defer file.Close()

	scanner := bufio.NewScanner(file)
	scanner.Split(bufio.ScanWords)

	counts := make(map[string]int)
	for scanner.Scan() {
		word := normalizeWord(scanner.Text())
		if word == "" {
			// Pure punctuation such as "..." or a lone quote is not a word.
			continue
		}
		counts[word]++
	}
	// Scan also returns false on a read error or an over-long token; report
	// that instead of handing back partial counts.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("scanning %q: %w", path, err)
	}

	return counts, nil
}

// rankByWordCount converts a word-frequency map into a PairList ordered by
// descending count. Words with equal counts are ordered alphabetically so the
// result does not depend on map iteration order.
func rankByWordCount(wordFrequencies map[string]int) PairList {
	ranked := make(PairList, 0, len(wordFrequencies))
	for word, count := range wordFrequencies {
		ranked = append(ranked, Pair{Key: word, Value: count})
	}
	sort.Slice(ranked, func(i, j int) bool {
		if ranked[i].Value != ranked[j].Value {
			return ranked[i].Value > ranked[j].Value
		}
		return ranked[i].Key < ranked[j].Key
	})
	return ranked
}

// Pair is a single word (Key) together with its occurrence count (Value).
type Pair struct {
	Key   string
	Value int
}

// PairList is a slice of Pair that implements sort.Interface, ordering its
// elements by ascending Value.
type PairList []Pair

// Len returns the number of pairs in the list.
func (p PairList) Len() int { return len(p) }

// Less reports whether the pair at index i has a smaller count than the pair
// at index j.
func (p PairList) Less(i, j int) bool { return p[i].Value < p[j].Value }

// Swap exchanges the pairs at indices i and j.
func (p PairList) Swap(i, j int) { p[i], p[j] = p[j], p[i] }

// TestWordCount comes from the patch's second file, word_count/main_test.go.
// It runs ScanWords on the sample corpus ../7oldsamr.txt and compares the
// counts of three frequent words with their expected values.
func TestWordCount(t *testing.T) {
	dict, err := ScanWords("../7oldsamr.txt")
	if err != nil {
		t.Fatalf("ScanWords: %v", err)
	}

	expected := []struct {
		word  string
		count int
	}{
		{"the", 36},
		{"and", 18},
		{"to", 14},
	}
	for _, tc := range expected {
		if got := dict[tc.word]; got != tc.count {
			t.Errorf("For '%s' expected %d, got %d", tc.word, tc.count, got)
		}
	}
}