-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpolish_data.py
367 lines (337 loc) · 15.8 KB
/
polish_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
#!/usr/bin/env python
"""This module accesses several airtable 'views' that contain records that need some additional processing."""
import re
import time
import requests
from bs4 import BeautifulSoup
from cloudinary import uploader
from common import airtab_intakes as airtab
from common import cloudinary, dc, wrap_from_module
# Wrap-up callable bound to this module's path. Most functions below call it
# once when they finish, passing their start time, a `new` count, the `total`
# number of records fetched and their own name.
# NOTE(review): presumably it logs/reports the run -- confirm against
# common.wrap_from_module.
wrap_it_up = wrap_from_module('jail_scrapers/polish_data.py')
def polish_data():
    """Run every clean-up task defined in this module, one after another."""
    # Order matters only in that it mirrors the original call sequence.
    tasks = (
        get_pixelated_mug,
        update_summary,
        get_charges_from_recent_text,
        retry_getting_mugshot,
        remove_weird_character,
        parse_charge_1,
        fix_charges_to_by_lines,
        get_full_text,
        get_all_intake_deets,
        update_dc_fields,
    )
    for task in tasks:
        task()
def get_pixelated_mug():
    """Attach a blurred copy of the mugshot to records that lack one.

    Records are selected by formula: they have a PHOTO, no PIXELATED_IMG,
    were verified less than 24 hours ago and are not from the 'jcdc' jail.
    For each one whose photo is served as 'image/jpeg', the photo URL is
    uploaded through cloudinary's uploader with opacity=40 and
    effect="blur:400", and the resulting secure URL is written to the
    record's PIXELATED_IMG attachment field.

    Upload failures are printed and the record is skipped. The number of
    records actually updated is reported through wrap_it_up.
    """
    t0, i = time.time(), 0
    needs_pix_img_formula = "AND(PHOTO != '', PIXELATED_IMG = '', hours_since_verification < 24, jail != 'jcdc')"
    records = airtab.get_all(formula=needs_pix_img_formula)
    for record in records:
        url = record["fields"]["PHOTO"][0]["url"]
        r = requests.get(url)
        # .get() so a response without the header is reported below instead of
        # raising KeyError and aborting the whole run.
        content_type = r.headers.get('Content-Type')
        if content_type != 'image/jpeg':
            print('this shit was some really weird content type:', content_type)
            continue
        try:
            upload_response = uploader.upload(url, opacity=40, effect="blur:400")
            # Pause after each upload (presumably to stay under a rate limit).
            time.sleep(1)
            this_dict = {"PIXELATED_IMG": [{"url": upload_response['secure_url']}]}
            airtab.update(record['id'], this_dict)
        except cloudinary.exceptions.Error as err1:
            print("cloudinary can't accept that shit: ", err1)
        except AttributeError as err2:
            print('Attribute Error for cloudinary upload: ', err2)
        else:
            # Count only records that were really updated; the counter was
            # previously never incremented, so `new` was always reported as 0.
            i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_pixelated_mug')
def update_summary(this_many=150):
    """Copy each record's `blurb` into its `summary` field.

    The `summary` text field exists alongside the `blurb` formula field
    because the gallery view works better with a text field than with a
    formula field. The 'needs updated summary' view is regularly packed full
    of records, so each run handles at most `this_many` of them
    (default 150).
    """
    t0, i = time.time(), 0
    # NOTE(review): the view presumably corresponds to the formula
    # "AND(blurb != '#ERROR!', blurb != summary)" -- confirm in Airtable.
    records = airtab.get_all(view='needs updated summary', fields="blurb", max_records=this_many)
    for record in records:
        airtab.update(record["id"], {"summary": record["fields"]["blurb"]})
        # Previously never incremented, so `new` was always reported as 0.
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='update_summary')
def _quote_if_comma(charge):
    """Return `charge` wrapped in double quotes when it contains a comma."""
    return f'"{charge}"' if "," in charge else charge


def _cell_text(cell):
    """Return the stripped text of a table cell, or '' when `.string` is None."""
    return (cell.string or "").strip()


def _lcdc_fields(html):
    """Build the update dict for an lcdc record from its charges-table HTML."""
    tbody = BeautifulSoup(html, "html.parser").tbody
    if tbody is None:
        # No table body to parse; skip rather than abort the whole run.
        return None
    fields = {}
    rows = tbody.find_all("tr")
    if tbody.tfoot:
        # When a footer is present the last row is dropped and the footer's
        # 3rd and 4th cells supply the bond / fine values for the intake.
        rows = rows[:-1]
        totals = tbody.tfoot.find_all("td")
        fields["intake_bond_cash"] = totals[2].b.string.strip()
        fields["intake_fine_ammount"] = totals[3].b.string.strip()
    charges, bond_ammounts, fine_ammounts = [], [], []
    for row in rows:
        cells = row.find_all("td")
        charge = _cell_text(cells[0])
        bond = _cell_text(cells[2])
        fine = _cell_text(cells[3])
        if charge:
            charges.append(_quote_if_comma(charge))
        if bond:
            bond_ammounts.append(bond.replace(",", ""))
        if fine:
            fine_ammounts.append(fine.replace(",", ""))
    if charges:
        fields["charges"] = ", ".join(charges)
    if bond_ammounts:
        fields["bond_ammounts"] = "\n".join(bond_ammounts)
    if fine_ammounts:
        fields["fine_ammounts"] = "\n".join(fine_ammounts)
    return fields


def _kcdc_fields(text):
    """Build the update dict for a kcdc record, or None if no charges are listed."""
    section = text[text.find("Charges:"): text.find("Note:")].splitlines()
    # section[0] is the "Charges:" header line itself, so it is skipped.
    charges = [_quote_if_comma(line.strip()) for line in section[1:] if line.strip()]
    if not charges:
        return None
    return {"charges": ", ".join(charges)}


def _between_markers_fields(text):
    """Build the update dict from the lines between '\\nCharges:' and '\\nBond:'."""
    # NOTE(review): str.find returns -1 when a marker is missing, which makes
    # this slice meaningless; the original behaviour is kept unchanged here.
    start = text.find("\nCharges:") + 9
    end = text.find("\nBond:")
    charges = [_quote_if_comma(line) for line in text[start:end].strip().splitlines()]
    return {"charges": ", ".join(charges)}


def _hcdc_fields(text):
    """Build the update dict for an hcdc record from its 'Charge N' labels."""
    lines = text.splitlines()
    charges = []
    for label in ("Charge 1", "Charge 2", "Charge 3", "Charge 4"):
        try:
            # The charge text is on the line right after its label.
            value = lines[lines.index(label) + 1].strip()
        except (ValueError, IndexError):
            # Label missing (or last line): skip it instead of crashing.
            continue
        if not value.startswith("Felony / Misd"):
            charges.append(_quote_if_comma(value))
    return {"charges": ", ".join(charges)}


def get_charges_from_recent_text():
    """Parse the listed charges out of each record's scraped text or HTML.

    Records are selected with the same formula as the `needs charges` view.
    The parsing strategy depends on the record's `jail`:

    * lcdc -- reads the charges table in the `html` field (charges, bond and
      fine amounts, plus footer totals); updated with typecast=True.
    * kcdc -- lines between "Charges:" and "Note:" in `recent_text`.
    * ccdc / tcdc / jcdc -- lines between "\\nCharges:" and "\\nBond:".
    * hcdc -- the line following each of "Charge 1" .. "Charge 4".

    Charges containing a comma are wrapped in double quotes before being
    joined with ", ". Records from any other jail are left untouched.
    """
    t0, i = time.time(), 0
    # this formula is the same as one used for `needs charges` view
    needs_charges_formula = "AND(charges_updated = '', html != '', recent_text != '', hours_since_verification < 72, DONT_DELETE != 'no charges')"
    records = airtab.get_all(formula=needs_charges_formula)
    for record in records:
        fields = record["fields"]
        jail = fields["jail"]
        typecast = False
        if jail == "lcdc":
            this_dict = _lcdc_fields(fields["html"])
            typecast = True
        elif jail == "kcdc":
            this_dict = _kcdc_fields(fields["recent_text"])
        elif jail in {"ccdc", "tcdc", "jcdc"}:
            this_dict = _between_markers_fields(fields["recent_text"])
        elif jail == "hcdc":
            this_dict = _hcdc_fields(fields["recent_text"])
        else:
            continue
        if this_dict is None:
            continue
        if typecast:
            airtab.update(record["id"], this_dict, typecast=True)
        else:
            airtab.update(record["id"], this_dict)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_charges_from_recent_text')
def retry_getting_mugshot():
    """Re-fetch intake pages for records without a PHOTO and look for a mugshot.

    Records are selected by formula: no PHOTO, verified less than 12 hours
    ago, and not from the 'lcdc' jail. The record's `link` is downloaded and
    parsed according to its `jail` (jcdc, hcdc, kcdc or acdc); when an image
    source is found, `img_src` and the `PHOTO` attachment are written back.
    Records where nothing was found are not updated. The number of records
    updated is reported through wrap_it_up.
    """
    t0, i = time.time(), 0
    needs_pic_formula = "AND(PHOTO = '', hours_since_verification < 12, jail != 'lcdc')"
    records = airtab.get_all(formula=needs_pic_formula)
    print("we're gonna retry getting mugs for", len(records), "records...")
    for record in records:
        this_dict = {}
        jail = record['fields']['jail']
        r = requests.get(record['fields']['link'])
        if jail == 'jcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            try:
                img_tag = soup.find('div', class_='inmate_profile_image').img
                # The site marks placeholder images with this (misspelled) alt text.
                if img_tag['alt'] != 'Image Not Availble':
                    this_dict['img_src'] = f"https://www.jonesso.com/{img_tag['src']}"
                    this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
            except (AttributeError, TypeError):
                # Expected container or <img> missing: skip, like kcdc/acdc do.
                continue
        elif jail == 'hcdc':
            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                img_src = 'http://www.co.hinds.ms.us' + soup.find('img', {'align': 'middle'})['src']
                # Only accept the URL if it is actually served as a JPEG.
                if requests.get(img_src).headers.get('Content-Type') == 'image/jpeg':
                    this_dict['img_src'] = img_src
                    this_dict['PHOTO'] = [{'url': img_src}]
                else:
                    print('image source isn\'t actually an image')
            except TypeError:
                # soup.find returned None, so subscripting it failed.
                print('no img tag in intake html')
        elif jail == 'kcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            if soup is None:
                continue
            img_tag = soup.find('img')
            if img_tag is not None:
                img_src_raw = img_tag['src']
                # Only paths under the inmates image folder are taken as mugshots.
                if img_src_raw.startswith('templates/kempercountysheriff.com/images/inmates'):
                    this_dict['img_src'] = f"https://www.kempercountysheriff.com/{img_src_raw}"
                    this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif jail == 'acdc':
            soup = BeautifulSoup(r.text, 'html.parser').find('div', class_='blog-content-container')
            try:
                img_src = soup.find('img').get('src')
            except AttributeError:
                # Container or <img> missing.
                continue
            if img_src:
                this_dict['img_src'] = img_src
                this_dict['PHOTO'] = [{'url': img_src}]
        else:
            print(f"awww hell... this one is from the {jail} docket/scraper...")
        if this_dict:
            # Skip the API call when nothing was found, and count real updates
            # (the counter was previously never incremented).
            airtab.update(record['id'], this_dict)
            i += 1
    wrap_it_up(t0, new=i, total=len(records), function='retry_getting_mugshot')
def parse_charge_1():
    """Split `charge_1` into `charge_1_statute` and `charge_1_title`.

    Applies to mcdc / prcdf records scraped within the last 48 hours whose
    statute has not been parsed yet (a few known placeholder values are
    excluded by the formula). The statute and title are run together in
    `charge_1`; the split point is the first place where a ")" -- or,
    failing that, a digit -- is immediately followed by an uppercase letter.
    Records with no such boundary are left untouched. HTTP errors from the
    update are printed and the record is skipped.
    """
    t0, i = time.time(), 0
    needs_charge_1_parsed_formula = "AND(OR(jail = 'mcdc', jail = 'prcdf'), charge_1_statute = '', hours_since_initial_scrape < 48, charge_1 != '', charge_1 != 'HOLDHOLD', charge_1 != 'DRUGDRUG COURT', charge_1 != 'HLD Other AgencyHold for other Agency')"
    records = airtab.get_all(formula=needs_charge_1_parsed_formula)
    for record in records:
        charge = record["fields"]["charge_1"]
        # The ")" pattern takes priority over the digit pattern, so the two
        # must stay separate searches rather than one merged character class.
        boundary = re.search(r"[)][A-Z]", charge) or re.search(r"[0-9][A-Z]", charge)
        if not boundary:
            continue
        # The match is two characters: the first ends the statute, the second
        # starts the title.
        this_dict = {
            "charge_1_statute": charge[: boundary.start() + 1],
            "charge_1_title": charge[boundary.end() - 1:],
        }
        try:
            airtab.update(record["id"], this_dict)
        except requests.exceptions.HTTPError as err:
            print(err)
        else:
            i += 1
    wrap_it_up(t0, new=i, total=len(records), function='parse_charge_1')
def fix_charges_to_by_lines():
    """Rewrite the comma-separated `charges` text as one charge per line.

    Works on records with a TEST_FORMULA value and an empty TEST_RESULT.
    Quoted charges (which may contain commas) are kept whole with their
    quotes removed; unquoted runs are split on ", ". The result is stored in
    TEST_RESULT.
    """
    t0, i = time.time(), 0
    records = airtab.get_all(formula="AND(TEST_FORMULA != '', TEST_RESULT = '')", fields='charges')
    for record in records:
        raw = record['fields']['charges']
        # Put every quoted charge on a line of its own before splitting.
        separated = raw.replace('", ', '"\n').replace(', "', '\n"')
        cleaner = []
        for chunk in separated.splitlines():
            if chunk.startswith('"'):
                cleaner.append(chunk.replace('"', ''))
            else:
                cleaner.extend(chunk.split(', '))
        airtab.update(record['id'], {'TEST_RESULT': '\n'.join(cleaner)})
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='fix_charges_to_by_lines')
def remove_weird_character():
    """Strip the junk segment starting at 'ã' from `recent_text`.

    Works on records verified more than 12 hours ago whose `recent_text`
    contains 'ã' (per the Airtable formula). The segment from the first 'ã'
    up to, but not including, the next newline is removed wherever it occurs
    in the text. If no newline follows, the segment runs to the end of the
    text.
    """
    t0, i = time.time(), 0
    remove_weird_character_formula = "AND(hours_since_verification > 12, FIND('ã', recent_text) > 1)"
    records = airtab.get_all(formula=remove_weird_character_formula, fields='recent_text')
    for record in records:
        text = record['fields']['recent_text']
        start = text.find('ã')
        if start == -1:
            # Nothing to remove; avoids slicing from index -1.
            continue
        end = text.find('\n', start)
        if end == -1:
            # No newline after the marker: previously the -1 was used as the
            # slice end, which left the final character behind.
            end = len(text)
        this_dict = {'recent_text': text.replace(text[start:end], '')}
        airtab.update(record['id'], this_dict)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='remove_weird_character')
def get_full_text(this_many=150):
    """Copy DocumentCloud metadata and full text onto Airtable records.

    Takes up to `this_many` records from the "needs full text" view, looks up
    each record's `dc_id` through the `dc` client and stores the document's
    title, access level, page count and full text on the record.
    """
    t0, i = time.time(), 0
    records = airtab.get_all(view="needs full text", fields=['dc_id'], max_records=this_many)
    for record in records:
        doc = dc.documents.get(record['fields']['dc_id'])
        airtab.update(record["id"], {
            "dc_title": doc.title,
            "dc_access": doc.access,
            "dc_pages": doc.pages,
            "dc_full_text": doc.full_text,
        })
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_full_text')
def get_all_intake_deets():
    """Extract demographics and charge details from jcadc `recent_text`.

    For jcadc records with text but no charges, the text is split on the
    "Request Victim Notification" line. The part before it is searched for
    race/sex, weight, height, eye colour and age; the part after it (when
    present) is read line by line for bond/classification lines and charges.
    A message is printed for every detail that cannot be found, and whatever
    was found is written back with typecast=True.
    """
    t0, i = time.time(), 0
    jcadc_deets_formula = "AND(jail = 'jcadc', charges = '', recent_text != '')"
    records = airtab.get_all(formula=jcadc_deets_formula, fields='recent_text')
    # (field to fill, pattern whose group 1 is the value, message when absent)
    single_value_fields = (
        ('intake_weight', r"(\d+) Pounds", 'there isnt weight info'),
        ('intake_height', r"(\d Ft. \d+ In.)", 'idk how tall this person is'),
        ('intake_eye', r"(\w+)\s+Eyes", 'eye color is a mystery'),
        ('intake_age', r"(\d\d) Years Old", 'intake age is a mystery'),
    )
    for record in records:
        this_dict = {}
        chunks = record['fields']['recent_text'].split('\nRequest Victim Notification\n')
        bio = chunks[0]

        race_sex = re.search(r"(\w+)\s+(Male|Female)", bio)
        if race_sex is None:
            print('there isnt race/sex info')
        else:
            raw_race = race_sex.group(1)
            # 'AVAILABLE' is stored as 'U'; otherwise keep only the initial.
            this_dict['race'] = 'U' if raw_race == 'AVAILABLE' else raw_race[0]
            this_dict['sex'] = race_sex.group(2)[0]

        for field, pattern, not_found_msg in single_value_fields:
            found = re.search(pattern, bio)
            if found is None:
                print(not_found_msg)
            else:
                this_dict[field] = found.group(1)

        if len(chunks) < 2:
            print('no crim details')
        else:
            charges, bond_ammts, classifications = [], [], []
            for ln in chunks[1].splitlines():
                results = re.search(r"([MF]\w+) - Bond: (\$.*)", ln)
                if results:
                    bond_ammts.append(results.group(2))
                    classifications.append(results.group(1))
                elif ', ' in ln:
                    # Quote charges containing ", " so they survive the join.
                    charges.append(f'"{ln}"')
                else:
                    charges.append(ln)
            this_dict['charges'] = ', '.join(charges)
            this_dict['bond_ammounts'] = '\n'.join(bond_ammts)
            this_dict['charge_classifications'] = ', '.join(classifications)

        airtab.update(record['id'], this_dict, typecast=True)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_all_intake_deets')
def update_dc_fields():
    """Refresh DocumentCloud URLs on records in the 'need dc urls updated' view.

    Handles at most 100 records per run. For each one the document is looked
    up by `dc_id` and its PDF URL, canonical URL and normal-size page image
    URL are written to the record, with a 0.3 second pause after every
    update.
    """
    records = airtab.get_all(view='need dc urls updated', fields='dc_id', max_records=100)
    print(len(records), ' records need updated documentcloud URLs.')
    for record in records:
        doc = dc.documents.get(record['fields'].get('dc_id'))
        airtab.update(record['id'], {
            "PDF": doc.pdf_url,
            "dc_canonical_url": doc.canonical_url,
            "dc_resources_page_image": doc.normal_image_url,
        })
        time.sleep(.3)
def main():
    """Script entry point: run the full polishing pipeline."""
    polish_data()


if __name__ == "__main__":
    main()