"""The main command line module that defines the "gv_dashboard_data" tool."""
import datetime
import click
import numpy as np
import simplejson as json
from loguru import logger
from . import DATA_DIR
from .courts import CourtInfoByIncident
from .geo import *
from .homicides import PPDHomicideTotal
from .shootings import ShootingVictimsData, load_existing_shootings_data
from .streets import StreetHotSpots


@click.group()
@click.version_option()
def cli():
    """Process data for the Controller's Office gun violence dashboard.

    https://controller.phila.gov/philadelphia-audits/mapping-gun-violence/#/
    """
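

# Example invocations, for reference. These are hedged sketches: they assume
# the package is importable as "gv_dashboard_data" and is run as a module
# (matching the __main__ guard at the bottom of this file), and that the
# installed version of click converts underscores in function names to dashes
# in command names (e.g., save_geojson_layers -> save-geojson-layers):
#
#   python -m gv_dashboard_data --help
#   python -m gv_dashboard_data save-geojson-layers --debug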


@cli.command()
@click.option("--debug", is_flag=True, help="Whether to log debug statements.")
def save_geojson_layers(debug=False):
    """Save the various GeoJSON layers needed in the dashboard."""
    # -------------------------------------------------
    # Part 1: Hot spot streets
    # -------------------------------------------------
    hotspots = StreetHotSpots(debug=debug)
    hotspots.save()

    # -------------------------------------------------
    # Part 2: Geographic boundary layers
    # -------------------------------------------------
    geo_funcs = [
        get_zip_codes,
        get_police_districts,
        get_council_districts,
        get_neighborhoods,
        get_school_catchments,
        get_pa_house_districts,
        get_pa_senate_districts,
    ]
    for func in geo_funcs:
        # The output file name is the function name without the "get_" prefix
        tag = func.__name__.split("get_")[-1]
        filename = f"{tag}.geojson"
        path = DATA_DIR / "processed" / "geo" / filename
        if debug:
            logger.debug(f"Saving {filename}")

        # Convert to lat/lng coordinates (EPSG:4326) and save as GeoJSON
        func().to_crs(epsg=4326).to_file(path, driver="GeoJSON")
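

# For reference, the loop above writes one file per layer to the
# DATA_DIR / "processed" / "geo" folder: zip_codes.geojson,
# police_districts.geojson, council_districts.geojson, neighborhoods.geojson,
# school_catchments.geojson, pa_house_districts.geojson, and
# pa_senate_districts.geojson.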


@cli.command()
@click.option("--debug", is_flag=True, help="Whether to log debug statements.")
@click.option(
    "--ignore-checks", is_flag=True, help="Whether to ignore any validation checks."
)
@click.option(
    "--homicides-only", is_flag=True, help="Only process the homicide data."
)
@click.option(
    "--shootings-only", is_flag=True, help="Only process the shooting data."
)
@click.option(
    "--force-homicide-update",
    is_flag=True,
    help="Whether to force the homicide update.",
)
def daily_update(
    debug=False,
    ignore_checks=False,
    homicides_only=False,
    shootings_only=False,
    force_homicide_update=False,
):
    """Run the daily pre-processing update.

    This runs the following steps:

    1. Download a fresh copy of the shooting victims database.
    2. Merge data for hot spot blocks.
    3. Merge data for court information.
    4. Save the processed shooting victims database.
    5. Save the cumulative daily shooting victims total.
    6. Scrape and save the homicide count from the PPD's website.
    """
    # Do all parts unless one of the "--*-only" flags was passed
    process_all = not (homicides_only or shootings_only)

    # Initialize the metadata with the current timestamp
    meta = {}
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # ------------------------------------------------------
    # Part 1: Homicide count scraped from PPD
    # ------------------------------------------------------
    if process_all or homicides_only:
        # Run the update
        homicide_count = PPDHomicideTotal(debug=debug)
        homicide_count.update(force=force_homicide_update)

        # Update the meta
        meta["last_updated_homicides"] = now

    # ------------------------------------------------------
    # Part 2: Main shooting victims data file
    # ------------------------------------------------------
    if process_all or shootings_only:
        victims = ShootingVictimsData(debug=debug, ignore_checks=ignore_checks)
        data = victims.get()

        # Save victims data to annual files
        victims.save(data)

        # Update the meta
        meta["last_updated_shootings"] = now

    # Update the metadata file, which tracks when each part last ran
    meta_path = DATA_DIR / "meta.json"
    with meta_path.open(mode="r") as f:
        existing_meta = json.load(f)

    # Remove the old key, if present
    existing_meta.pop("last_updated", None)

    # Add the new timestamps and save
    existing_meta.update(meta)
    with meta_path.open(mode="w") as f:
        json.dump(existing_meta, f)
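

# Example invocations, for reference (hedged, assuming the dashed command
# names described above):
#
#   python -m gv_dashboard_data daily-update
#   python -m gv_dashboard_data daily-update --homicides-only --force-homicide-update
#   python -m gv_dashboard_data daily-update --shootings-only --ignore-checks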


@cli.command()
@click.option(
    "--nprocs",
    type=int,
    default=1,
    help="If running in parallel, the total number of processes that will run.",
)
@click.option(
    "--pid",
    type=int,
    default=0,
    help=(
        "If running in parallel, the local process id. "
        "This should be between 0 and nprocs - 1."
    ),
)
@click.option(
    "--sleep",
    default=7,
    type=int,
    help="Total waiting time between scraping calls (in seconds).",
)
@click.option("--debug", is_flag=True, help="Whether to log debug statements.")
@click.option("--dry-run", is_flag=True, help="Do not save the results; dry run only.")
@click.option(
    "--sample",
    type=int,
    default=None,
    help="Only run a random sample of incident numbers.",
)
def scrape_courts_portal(nprocs, pid, sleep, debug, sample, dry_run):
    """Scrape court information from Pennsylvania's Unified Judicial System portal.

    This can be run in parallel by specifying the total
    number of processes and a specific chunk to run.
    """
    # Load the existing data and drop duplicate incidents
    shootings = load_existing_shootings_data()
    shootings = shootings.drop_duplicates(subset=["dc_key"])

    # Optionally, run on a random sample of incidents only
    if sample is not None:
        shootings = shootings.sample(sample)

    # Split the incidents into nprocs chunks and take the pid-th one
    assert 0 <= pid < nprocs
    if nprocs > 1:
        shootings_chunk = np.array_split(shootings, nprocs)[pid]
        chunk = pid
    else:
        shootings_chunk = shootings
        chunk = None

    # Scrape the courts info for this chunk
    courts_data = CourtInfoByIncident(debug=debug, sleep=sleep)
    courts_data.update(shootings_chunk, chunk=chunk, dry_run=dry_run)
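

# A sketch of running the scraper in parallel across four processes, each
# handling one chunk of the incident list (assuming a POSIX shell):
#
#   for pid in 0 1 2 3; do
#       python -m gv_dashboard_data scrape-courts-portal --nprocs 4 --pid $pid &
#   done
#   wait
#
# Each process saves its results separately; combine the per-chunk files
# afterwards with the finalize-courts-scraping command below.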


@cli.command()
@click.option("--debug", is_flag=True, help="Whether to log debug statements.")
@click.option("--dry-run", is_flag=True, help="Do not save the results; dry run only.")
def finalize_courts_scraping(debug, dry_run):
    """Finalize the courts scraping by combining scraping
    results computed in parallel.

    This updates the "scraped_courts_data.json" data file.
    """
    # Combine the per-chunk scraping results
    data_path = DATA_DIR / "raw"
    files = data_path.glob("scraped_courts_data_*.json")
    combined = []
    for f in sorted(files):
        if debug:
            logger.debug(f"Combining file: '{f}'")
        with f.open("r") as fp:
            combined += json.load(fp)

    # Save the combined results
    if not dry_run:
        with (DATA_DIR / "raw" / "scraped_courts_data.json").open("w") as fp:
            json.dump(combined, fp)
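

# Example invocation, for reference (use --dry-run to preview the merge
# without writing the combined file):
#
#   python -m gv_dashboard_data finalize-courts-scraping --debug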


if __name__ == "__main__":
    cli(prog_name="gv_dashboard_data")