// dawg_utils.go
/*
* Copyright (c) 2024 Johan Stenstam, johan.stenstam@internetstiftelsen.se
*/
package tapir
import (
"bufio"
"encoding/csv"
"fmt"
"io"
"log"
"os"
"slices"
"strings"
"github.com/miekg/dns"
"github.com/smhanov/dawg"
)
// ParseCSV reads domain names from the second column of the CSV file srcfile.
// The first record is treated as a header and skipped. Every name is passed
// through dns.Fqdn so that it is fully qualified (ends in the root dot), as
// expected by miekg/dns when comparing against a DNS question section name.
//
// Two modes of operation, selected by dontsort:
//   - dontsort == true: each name is stored in dstmap as a TapirName keyed by
//     the name, and an empty slice is returned. dstmap must be non-nil in this
//     mode because it is written to.
//   - dontsort == false: dstmap is not touched and the names are returned as
//     a sorted []string.
//
// An error is returned if the file cannot be opened, if the CSV reader fails
// (including on the header record), or if a record has no second column.
func ParseCSV(srcfile string, dstmap map[string]TapirName, dontsort bool) ([]string, error) {
	ifd, err := os.Open(srcfile)
	if err != nil {
		return nil, err
	}
	defer func() {
		// The file is only read from, so a failing Close cannot lose data.
		// Report it rather than terminating the whole process.
		if cerr := ifd.Close(); cerr != nil {
			log.Printf("ParseCSV: error closing %s: %v", srcfile, cerr)
		}
	}()

	csvReader := csv.NewReader(ifd)

	// Skip the first line containing the header
	if _, err = csvReader.Read(); err != nil {
		return nil, err
	}

	sortedDomains := []string{}
	for {
		record, err := csvReader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
		// The domain name lives in the second column; a single-column file
		// would otherwise cause an index-out-of-range panic below.
		if len(record) < 2 {
			return nil, fmt.Errorf("parsing %s: record %q has no second column for the domain name", srcfile, record)
		}

		name := dns.Fqdn(record[1])
		if dontsort {
			dstmap[name] = TapirName{Name: name}
		} else {
			sortedDomains = append(sortedDomains, name)
		}
	}

	if dontsort {
		return []string{}, nil
	}

	fmt.Println("Creating sorted domain list from CSV")
	// The names need to be sorted when adding them to the dawg
	// datastructure otherwise the operation can fail:
	// panic: d.AddWord(): Words not in alphabetical order
	slices.Sort(sortedDomains)
	return sortedDomains, nil
}
// ParseText reads one domain name per line from the text file srcfile. Every
// line is passed through dns.Fqdn before use.
//
// Two modes of operation: either return a (potentially large) []string with
// sorted output (dontsort == false) *or* update the dstmap of TapirNames
// directly and return an empty slice (dontsort == true). dstmap must be
// non-nil when dontsort is true because it is written to.
//
// An error is returned if the file cannot be opened or if scanning stops
// before the end of the file (read error, or a line too long for the
// scanner's buffer). In the dontsort case dstmap may already hold the names
// read before the failure.
//
// NOTE(review): blank lines are not filtered. dns.Fqdn presumably turns an
// empty line into the root name "." — confirm whether input files can contain
// blank lines and whether that is acceptable.
func ParseText(srcfile string, dstmap map[string]TapirName, dontsort bool) ([]string, error) {
	ifd, err := os.Open(srcfile)
	if err != nil {
		return nil, err
	}
	defer func() {
		// The file is only read from, so a failing Close cannot lose data.
		// Report it rather than terminating the whole process.
		if cerr := ifd.Close(); cerr != nil {
			log.Printf("ParseText: error closing %s: %v", srcfile, cerr)
		}
	}()

	sortedDomains := []string{}
	scanner := bufio.NewScanner(ifd)
	scanner.Split(bufio.ScanLines)

	for scanner.Scan() {
		name := dns.Fqdn(scanner.Text())
		if dontsort {
			dstmap[name] = TapirName{Name: name}
		} else {
			sortedDomains = append(sortedDomains, name)
		}
	}
	// Scan returning false means either EOF or a failure; without this check
	// a failure would silently produce a truncated list.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading %s: %w", srcfile, err)
	}

	if dontsort {
		return sortedDomains, nil
	}

	// The names need to be sorted before they are added to the dawg.
	slices.Sort(sortedDomains)
	return sortedDomains, nil
}
// CreateDawg builds a DAWG from the given domain names and saves it to the
// file outfile. It returns the error from saving, if any.
//
// Names are stored lowercased. The DAWG builder requires words to be added in
// alphabetical order, and an order established on mixed-case names does not
// survive lowercasing ("B.example." sorts before "a.example.", but
// "b.example." sorts after it). The lowercased names are therefore sorted
// again here, on a copy, so the caller's slice is left unmodified.
func CreateDawg(sortedDomains []string, outfile string) error {
	fmt.Printf("Creating DAWG data structure\n")

	lowered := make([]string, len(sortedDomains))
	for i, domain := range sortedDomains {
		lowered[i] = strings.ToLower(domain)
	}
	slices.Sort(lowered)
	// Names that differed only in case are now identical and adjacent; a
	// word set gains nothing from repeated entries, so keep one of each.
	lowered = slices.Compact(lowered)

	// Named "builder" rather than "dawg" to avoid shadowing the package.
	builder := dawg.New()
	for _, domain := range lowered {
		builder.Add(domain)
		if GlobalCF.Debug {
			fmt.Printf("Added \"%s\" to DAWG\n", domain)
		}
	}

	finder := builder.Finish()
	fmt.Printf("Saving DAWG to file %s\n", outfile)
	if _, err := finder.Save(outfile); err != nil {
		return err
	}
	return nil
}
// ListDawg enumerates the contents of the DAWG behind df.
//
// It returns the number of times the enumeration callback was invoked and the
// list of entries that the enumeration flagged as final.
//
// XXX: This walks the entire structure, which is slow and costly. Do not use
// unnecessarily.
func ListDawg(df dawg.Finder) (int, []string) {
	var (
		visited int
		words   []string
	)

	df.Enumerate(func(_ int, prefix []rune, final bool) int {
		visited++
		if !final {
			return dawg.Continue
		}
		words = append(words, string(prefix))
		return dawg.Continue
	})

	return visited, words
}