package main

import (
	"encoding/json"
	"net/http"

	"github.com/patrickmn/go-cache"
	"github.com/pkg/errors"

	"github.com/thecsw/katya/log"
	"github.com/thecsw/katya/storage"
)

// TextPayload is what we get from our crawlers on every text submission
type TextPayload struct {
	// Name is the name of our crawler
	Name string `json:"name"`
	// StartURL is the URL the crawler started from
	StartURL string `json:"start"`
	// URL is the URL of this text submission
	URL string `json:"url"`
	// IP is the IP address associated with the URL
	IP string `json:"ip"`
	// Status is the HTTP response code we received
	Status int `json:"status"`
	// Title is the title of the source webpage
	Title string `json:"title"`
	// NumWords is the number of words in this source (punctuation excluded)
	NumWords int `json:"num_words"`
	// NumSentences is the number of sentences in this source
	NumSentences int `json:"num_sentences"`
	// Original is the cleaned text the crawler extracted
	Original string `json:"original"`
	// Text is the tokenized cleaned text from spaCy
	Text string `json:"text"`
	// Shapes is the tokenized shapes data from spaCy
	Shapes string `json:"shapes"`
	// Tags is the tokenized tags data from spaCy
	Tags string `json:"tags"`
	// Lemmas is the tokenized lemmas data from spaCy
	Lemmas string `json:"lemmas"`
}
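
// For illustration, a minimal payload a crawler might POST to this endpoint.
// Every value below is made up for the example (the tokenized fields only
// mimic what spaCy-style output could look like), not taken from a real crawl:
//
//	{
//	    "name": "example_crawler",
//	    "start": "https://example.com",
//	    "url": "https://example.com/page",
//	    "ip": "93.184.216.34",
//	    "status": 200,
//	    "title": "Example Page",
//	    "num_words": 2,
//	    "num_sentences": 1,
//	    "original": "Example text.",
//	    "text": "Example text .",
//	    "shapes": "Xxxxx xxxx .",
//	    "tags": "NN NN .",
//	    "lemmas": "example text ."
//	}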

// textReceiver is used by crawlers to submit a new tagged and analyzed text
func textReceiver(w http.ResponseWriter, r *http.Request) {
	scrapyLocalKey := r.Header.Get("Authorization")
	if scrapyLocalKey != "cool_local_key" {
		log.Error("Bad Authorization header", errors.New("bad key"), nil)
		httpJSON(w, nil, http.StatusUnauthorized, errors.New("bad authorization key"))
		return
	}
	payload := &TextPayload{}
	decoder := json.NewDecoder(r.Body)
	err := decoder.Decode(payload)
	if err != nil {
		log.Error("Failed decoding a text payload", err, nil)
		httpJSON(w, nil, http.StatusBadRequest, errors.Wrap(err, "failed decoding the text payload"))
		return
	}
	thisParams := log.Params{
		"crawler": payload.Name,
		"url":     payload.URL,
		"source":  payload.StartURL,
	}
	// If it is a local upload, there is no crawler to verify
	if payload.Name != "LOCAL_UPLOAD" {
		// Check that such a crawler exists
		crawlerExists, err := storage.IsCrawler(payload.Name)
		if err != nil {
			log.Error("Failed checking crawler's existence", err, log.Params{
				"crawler": payload.Name,
			})
			httpJSON(w, nil, http.StatusInternalServerError,
				errors.Wrap(err, "failed checking crawler's existence"))
			return
		}
		if !crawlerExists {
			httpJSON(
				w,
				nil,
				http.StatusForbidden,
				errors.New("this crawler doesn't exist"),
			)
			return
		}
	}
	// Try to add the text to the database
	err = storage.CreateText(
		payload.StartURL,
		payload.URL,
		payload.IP,
		uint(payload.Status),
		payload.Original,
		payload.Text,
		payload.Shapes,
		payload.Tags,
		payload.Lemmas,
		payload.Title,
		uint(payload.NumWords),
		uint(payload.NumSentences),
	)
	if err != nil {
		if err.Error() == "already exists" {
			// If it already exists, still link the found text to the
			// source if it isn't linked already, and report a success
			httpJSON(w, httpMessageReturn{Message: "already exists"}, http.StatusOK, nil)
			return
		}
		log.Error("Failed adding a new text", err, thisParams)
		httpJSON(
			w,
			nil,
			http.StatusInternalServerError,
			errors.Wrap(err, "failed storing text in the database"),
		)
		return
	}
	// Update the per-source and global word/sentence delta caches: Add seeds
	// a missing key with zero (it is a no-op if the key already exists),
	// then IncrementUint adds this text's counts on top
	_ = sourcesNumWordsDelta.Add(payload.StartURL, uint(0), cache.NoExpiration)
	_ = sourcesNumSentencesDelta.Add(payload.StartURL, uint(0), cache.NoExpiration)
	_, _ = sourcesNumWordsDelta.IncrementUint(payload.StartURL, uint(payload.NumWords))
	_, _ = sourcesNumSentencesDelta.IncrementUint(payload.StartURL, uint(payload.NumSentences))
	_, _ = globalNumWordsDelta.IncrementUint(globalDeltaCacheKey, uint(payload.NumWords))
	_, _ = globalNumSentencesDelta.IncrementUint(globalDeltaCacheKey, uint(payload.NumSentences))
	httpJSON(w, httpMessageReturn{
		Message: "success",
	}, http.StatusOK, nil)
}
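
// A usage sketch for the handler above. The route path, host, and port are
// assumptions (routing is wired up elsewhere in this repo), and payload.json
// would hold a TextPayload like the example shown earlier:
//
//	curl -X POST http://localhost:8080/text \
//	    -H 'Authorization: cool_local_key' \
//	    -H 'Content-Type: application/json' \
//	    -d @payload.json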

// StatusPayload is used by crawlers to report their status
type StatusPayload struct {
	// Name is our crawler's name
	Name string `json:"name"`
	// Status is the crawler's most recent status
	Status string `json:"status"`
}
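
// The handler below only acts on two status values, so valid payloads look
// like these (the crawler name is hypothetical):
//
//	{"name": "example_crawler", "status": "started"}
//	{"name": "example_crawler", "status": "finished"}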

// statusReceiver receives the status updates that crawlers report
func statusReceiver(w http.ResponseWriter, r *http.Request) {
	scrapyLocalKey := r.Header.Get("Authorization")
	if scrapyLocalKey != "cool_local_key" {
		log.Error("Bad Authorization header", errors.New("bad key"), nil)
		httpJSON(w, nil, http.StatusUnauthorized, errors.New("bad authorization key"))
		return
	}
	payload := &StatusPayload{}
	decoder := json.NewDecoder(r.Body)
	err := decoder.Decode(payload)
	if err != nil {
		log.Error("Failed decoding a received status", err, nil)
		httpJSON(w, nil, http.StatusBadRequest, errors.Wrap(err, "failed decoding the status payload"))
		return
	}
	switch payload.Status {
	case "started":
		if err := storage.CreateScrape(payload.Name); err != nil {
			log.Error("failed to create a scrape", err, log.Params{"name": payload.Name})
		}
	case "finished":
		if err := storage.FinishScrape(payload.Name); err != nil {
			log.Error("failed to finish a scrape", err, log.Params{"name": payload.Name})
		}
	default:
		httpJSON(w, nil, http.StatusBadRequest, errors.New("unknown status received"))
		return
	}
	httpJSON(w, httpMessageReturn{"scrape status received"}, http.StatusOK, nil)
}
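
// A matching invocation sketch, with the same caveats as the textReceiver
// example (route path and port are assumptions):
//
//	curl -X POST http://localhost:8080/status \
//	    -H 'Authorization: cool_local_key' \
//	    -d '{"name": "example_crawler", "status": "started"}'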

// helloReceiver just sends a hello through the API
func helloReceiver(w http.ResponseWriter, _ *http.Request) {
	httpJSON(w, httpMessageReturn{"hello, world"}, http.StatusOK, nil)
}