Merge pull request #134 from transitmatters/bus_staging
INTRODUCING BUS MODE!
Showing 58 changed files with 13,481 additions and 7,513 deletions.
@@ -0,0 +1,144 @@
import argparse
import pathlib
import pandas as pd
from datetime import datetime


def load_data(input_csv, routes):
    """
    Loads in the below format and makes some adjustments for processing.
    - Filter only points with actual trip data
    - Trim leading 0s from route_id
    - Select only route_ids in `routes`
    - Set scheduled/actual times to be on service_date, not 1900-01-01
    - Map direction_id (Outbound -> 0, Inbound -> 1)
    """
    """
    "service_date", "route_id", "direction", "half_trip_id", "stop_id", "time_point_id", "time_point_order", "point_type", "standard_type", "scheduled", "actual", "scheduled_headway", "headway"
    2020-01-15, "01", "Inbound", 46374001, 67, "maput", 2, "Midpoint", "Schedule", 1900-01-01 05:08:00, 1900-01-01 05:09:07, -5, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 110, "hhgat", 1, "Startpoint", "Schedule", 1900-01-01 05:05:00, 1900-01-01 05:04:34, 26, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 72, "cntsq", 3, "Midpoint", "Schedule", 1900-01-01 05:11:00, 1900-01-01 05:12:01, -22, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 75, "mit", 4, "Midpoint", "Schedule", 1900-01-01 05:14:00, 1900-01-01 05:14:58, -25, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 79, "hynes", 5, "Midpoint", "Schedule", 1900-01-01 05:18:00, 1900-01-01 05:18:45, 32, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 187, "masta", 6, "Midpoint", "Schedule", 1900-01-01 05:20:00, 1900-01-01 05:21:04, -33, NA,NA
    2020-01-15, "01", "Inbound", 46374045, 110, "hhgat", 1, "Startpoint", "Headway", 1900-01-01 05:20:00, 1900-01-01 05:20:45, NA, 900,971
    """

    # thinking about doing this in pandas to have all the info at once
    df = pd.read_csv(input_csv)
    df.rename(columns={
        # This set of transformations covers prior-year bus data.
        'ServiceDate': 'service_date',
        'Route': 'route_id',
        'Direction': 'direction_id',
        'HalfTripId': 'half_trip_id',
        'Stop': 'stop_id',
        'stop_name': 'time_point_id',
        'stop_sequence': 'time_point_order',
        'Timepoint': 'time_point_id',
        'TimepointOrder': 'time_point_order',
        'PointType': 'point_type',
        'StandardType': 'standard_type',
        'Scheduled': 'scheduled',
        'Actual': 'actual',
        'ScheduledHeadway': 'scheduled_headway',
        'Headway': 'headway',
        'direction': 'direction_id'
    }, inplace=True)

    # We need to keep both "Headway" AND "Schedule": both can have timepoint data.
    df = df.loc[df.actual.notnull()]

    df.route_id = df.route_id.str.lstrip("0")
    if routes:
        df = df.loc[df.route_id.isin(routes)]

    # Convert dates
    df.scheduled = pd.to_datetime(df.scheduled)
    df.service_date = pd.to_datetime(df.service_date)
    df.actual = pd.to_datetime(df.actual)

    OFFSET = datetime(1900, 1, 1, 0, 0, 0)
    df.scheduled = df.service_date + (df.scheduled - OFFSET)
    df.actual = df.service_date + (df.actual - OFFSET)
    df.service_date = df.service_date.dt.date

    df.direction_id = df.direction_id.map({"Outbound": 0, "Inbound": 1})

    return df


def process_events(df):
    """
    Take the tidied input data and rearrange the columns to match rapidtransit format.
    - Rename columns (trip_id, stop_sequence, event_time)
    - Remove extra columns
    - Add empty vehicle columns
    - Calculate event_type column with ARR and DEP entries
    """
    CSV_HEADER = ["service_date", "route_id", "trip_id", "direction_id", "stop_id",
                  "stop_sequence", "vehicle_id", "vehicle_label", "event_type", "event_time"]

    df = df.rename(columns={"half_trip_id": "trip_id",
                            "time_point_order": "stop_sequence",
                            "actual": "event_time"})
    df = df.drop(columns=["time_point_id", "standard_type", "scheduled", "scheduled_headway", "headway"])
    df["vehicle_id"] = ""
    df["vehicle_label"] = ""

    df["event_type"] = df.point_type.map({"Startpoint": ["DEP"],
                                          "Midpoint": ["ARR", "DEP"],
                                          "Endpoint": ["ARR"]})
    df = df.explode("event_type")
    df = df[CSV_HEADER]  # reorder

    return df


def to_disk(df, outdir, nozip=False):
    """
    For each service_date/stop_id/direction/route group, we write the events to disk.
    """
    grouped = df.groupby(["service_date", "stop_id", "direction_id", "route_id"])

    for name, events in grouped:
        service_date, stop_id, direction_id, route_id = name

        fname = pathlib.Path(outdir,
                             "Events",
                             "daily-bus-data",
                             f"{route_id}-{direction_id}-{stop_id}",
                             f"Year={service_date.year}",
                             f"Month={service_date.month}",
                             f"Day={service_date.day}",
                             "events.csv.gz")
        fname.parent.mkdir(parents=True, exist_ok=True)
        # set mtime to 0 in gzip header for determinism (so we can re-gen old routes, and rsync to s3 will ignore)
        events.to_csv(fname, index=False, compression={"method": "gzip", "mtime": 0} if not nozip else None)


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("input", metavar="INPUT_CSV")
    parser.add_argument("output", metavar="OUTPUT_DIR")
    parser.add_argument("--routes", "-r", nargs="*", type=str,
                        help="One note here: we should always be additive with our route set \
                        in case 2 lines share the same stop id: we need both in the result file.")
    parser.add_argument("--nozip", "-nz", action="store_true", help="debug feature to skip gzipping")

    args = parser.parse_args()
    input_csv = args.input
    output_dir = args.output
    routes = args.routes
    no_zip = args.nozip

    pathlib.Path(output_dir).mkdir(exist_ok=True)

    data = load_data(input_csv, routes)
    events = process_events(data)
    to_disk(events, output_dir, nozip=no_zip)


if __name__ == "__main__":
    main()
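Reviewer note: the sketch below is not part of the diff. It walks through two of the transformations above with made-up values: the 1900-01-01 re-basing described in the load_data docstring, and the point_type -> event_type expansion in process_events. Column names mirror the script; the sample data is illustrative only.

import pandas as pd
from datetime import datetime

# 1) Re-basing the 1900-01-01 timestamps onto the service date: keep the time
#    of day, move it to the actual day of service.
OFFSET = datetime(1900, 1, 1, 0, 0, 0)
service_date = pd.Timestamp("2020-01-15")
scheduled = pd.Timestamp("1900-01-01 05:08:00")
print(service_date + (scheduled - OFFSET))  # 2020-01-15 05:08:00

# 2) point_type -> event_type expansion: a Midpoint becomes both an ARR and a
#    DEP row once the list-valued column is exploded.
sample = pd.DataFrame({"stop_id": [110, 67, 187],
                       "point_type": ["Startpoint", "Midpoint", "Endpoint"]})
sample["event_type"] = sample.point_type.map({"Startpoint": ["DEP"],
                                              "Midpoint": ["ARR", "DEP"],
                                              "Endpoint": ["ARR"]})
print(sample.explode("event_type"))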
@@ -0,0 +1,10 @@
#!/bin/sh

newfile=$1

for i in 1 15 22 23 28 32 39 57 66 71 73 77 111; do
    mkdir -p data/output/manifests/
    pipenv run python manifest.py $newfile data/output/manifests/$i.json --checkpoints data/input/MBTA_GTFS/checkpoints.txt -r $i
    echo "Comparing old and new manifests for route $i"
    pipenv run python compare_manifest.py ../../src/bus_constants/$i.json data/output/manifests/$i.json
done
@@ -0,0 +1,55 @@
import json
import sys

station_stops = {}


def runone(path, first=False):
    unchanged = True
    current = json.load(open(path))
    # Print any removed stops first
    if not first:
        my_stations = list(current.values())[0]['stations']
        stat_map = dict(map(lambda x: (x['station'], x), my_stations))
        for s in station_stops:
            if s not in stat_map:
                print(" - Station %s removed in file %s. (Stops: %s)" % (s, path, station_stops[s]))
                continue
            for d in station_stops[s]:
                for stop in station_stops[s][d]:
                    if stop not in stat_map[s]['stops'][d]:
                        print(" - Stop %s removed from %s in file %s" % (stop, s, path))

    for i in list(current.values())[0]['stations']:
        s = i['station']

        if s not in station_stops:
            station_stops[s] = {}
            if not first:
                print(" + Found new station %s" % s)
                unchanged = False
        for direction in i['stops']:
            if direction not in station_stops[s]:
                station_stops[s][direction] = []
            for stop in i['stops'][direction]:
                if stop not in station_stops[s][direction]:
                    station_stops[s][direction].append(stop)
                    if not first:
                        print(" + Found additional stop %s at station %s in %s" % (stop, s, path))
                        unchanged = False
    return unchanged


def run(paths):
    unchanged = True
    runone(paths[0], first=True)
    for path in reversed(paths[1:]):
        unchanged = runone(path) and unchanged
    if unchanged:
        print("No new stations/stops on route.")
    else:
        print("Changed?")


if __name__ == "__main__":
    run(sys.argv[1:])
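Reviewer note: a minimal usage sketch, not part of the diff. It assumes this file is the compare_manifest.py invoked by the script earlier in the diff, and the manifest shape (a top-level route key holding a 'stations' list, each station carrying a 'stops' dict keyed by direction) is inferred from the code above; the file names, direction key, and station/stop IDs are made up.

import json
import pathlib
import tempfile

from compare_manifest import run  # assumes the file is saved as compare_manifest.py

# Two tiny, hypothetical manifests: the "new" one adds a stop and a station.
old = {"1": {"stations": [{"station": "hhgat", "stops": {"1": ["110"]}}]}}
new = {"1": {"stations": [{"station": "hhgat", "stops": {"1": ["110", "111"]}},
                          {"station": "maput", "stops": {"1": ["67"]}}]}}

tmp = pathlib.Path(tempfile.mkdtemp())
for name, manifest in [("old.json", old), ("new.json", new)]:
    (tmp / name).write_text(json.dumps(manifest))

# Expect it to report the new stop "111", the new station "maput" and its
# stop "67", and then print "Changed?".
run([str(tmp / "old.json"), str(tmp / "new.json")])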
@@ -0,0 +1,13 @@
#!/bin/bash -x

routes="$@"
if [ -z "$routes" ]; then
    routes="1"
fi

for y in 2018 2019 2020 2021; do
    for f in $(find data/input/$y/ -name '*.csv'); do
        echo "Generating stop data from $f"
        pipenv run python bus2train.py $f data/output -r $routes
    done
done
@@ -0,0 +1,10 @@
#!/bin/bash

for route in 1 111 15 22 23 28 32 39 57 66 71 73 77 114 116 117; do
    mkdir -p data/output/manifests/$route

    for f in $(find data/input/ -name '*.csv'); do
        month=$(echo $f | cut -d/ -f4 | cut -d. -f1)
        pipenv run python manifest.py $f "data/output/manifests/$route/${route}_${month}.json" --checkpoints "data/input/MBTA_GTFS/checkpoints.txt" -r $route
    done
done