Merge pull request #134 from transitmatters/bus_staging
INTRODUCING BUS MODE!
Showing 58 changed files with 13,481 additions and 7,513 deletions.
@@ -0,0 +1,144 @@
import argparse
import pathlib
import pandas as pd
from datetime import datetime


def load_data(input_csv, routes):
    """
    Loads in the below format and makes some adjustments for processing.
    - Filter only points with actual trip data
    - Trim leading 0s from route_id
    - Select only route_ids in `routes`
    - Set scheduled/actual times to be on service_date, not 1900-01-01
    - Map direction_id (Outbound -> 0, Inbound -> 1)
    """
    """
    "service_date", "route_id", "direction", "half_trip_id", "stop_id", "time_point_id", "time_point_order", "point_type", "standard_type", "scheduled", "actual", "scheduled_headway", "headway"
    2020-01-15, "01", "Inbound", 46374001, 67, "maput", 2, "Midpoint", "Schedule", 1900-01-01 05:08:00, 1900-01-01 05:09:07, -5, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 110, "hhgat", 1, "Startpoint", "Schedule", 1900-01-01 05:05:00, 1900-01-01 05:04:34, 26, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 72, "cntsq", 3, "Midpoint", "Schedule", 1900-01-01 05:11:00, 1900-01-01 05:12:01, -22, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 75, "mit", 4, "Midpoint", "Schedule", 1900-01-01 05:14:00, 1900-01-01 05:14:58, -25, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 79, "hynes", 5, "Midpoint", "Schedule", 1900-01-01 05:18:00, 1900-01-01 05:18:45, 32, NA,NA
    2020-01-15, "01", "Inbound", 46374001, 187, "masta", 6, "Midpoint", "Schedule", 1900-01-01 05:20:00, 1900-01-01 05:21:04, -33, NA,NA
    2020-01-15, "01", "Inbound", 46374045, 110, "hhgat", 1, "Startpoint", "Headway", 1900-01-01 05:20:00, 1900-01-01 05:20:45, NA, 900,971
    """

    # thinking about doing this in pandas to have all the info at once
    df = pd.read_csv(input_csv)
    df.rename(columns={
        # This set of transformations covers prior-year bus data.
        'ServiceDate': 'service_date',
        'Route': 'route_id',
        'Direction': 'direction_id',
        'HalfTripId': 'half_trip_id',
        'Stop': 'stop_id',
        'stop_name': 'time_point_id',
        'stop_sequence': 'time_point_order',
        'Timepoint': 'time_point_id',
        'TimepointOrder': 'time_point_order',
        'PointType': 'point_type',
        'StandardType': 'standard_type',
        'Scheduled': 'scheduled',
        'Actual': 'actual',
        'ScheduledHeadway': 'scheduled_headway',
        'Headway': 'headway',
        'direction': 'direction_id'
    }, inplace=True)

    # We need to keep both "Headway" AND "Schedule": both can have timepoint data.
    df = df.loc[df.actual.notnull()]

    df.route_id = df.route_id.str.lstrip("0")
    if routes:
        df = df.loc[df.route_id.isin(routes)]

    # Convert dates
    df.scheduled = pd.to_datetime(df.scheduled)
    df.service_date = pd.to_datetime(df.service_date)
    df.actual = pd.to_datetime(df.actual)

    OFFSET = datetime(1900, 1, 1, 0, 0, 0)
    df.scheduled = df.service_date + (df.scheduled - OFFSET)
    df.actual = df.service_date + (df.actual - OFFSET)
    df.service_date = df.service_date.dt.date

    df.direction_id = df.direction_id.map({"Outbound": 0, "Inbound": 1})

    return df


def process_events(df):
    """
    Take the tidied input data and rearrange the columns to match rapidtransit format.
    - Rename columns (trip_id, stop_sequence, event_time)
    - Remove extra columns
    - Add empty vehicle columns
    - Calculate event_type column with ARR and DEP entries
    """
    CSV_HEADER = ["service_date", "route_id", "trip_id", "direction_id", "stop_id",
                  "stop_sequence", "vehicle_id", "vehicle_label", "event_type", "event_time"]

    df = df.rename(columns={"half_trip_id": "trip_id",
                            "time_point_order": "stop_sequence",
                            "actual": "event_time"})
    df = df.drop(columns=["time_point_id", "standard_type", "scheduled", "scheduled_headway", "headway"])
    df["vehicle_id"] = ""
    df["vehicle_label"] = ""

    df["event_type"] = df.point_type.map({"Startpoint": ["DEP"],
                                          "Midpoint": ["ARR", "DEP"],
                                          "Endpoint": ["ARR"]})
    df = df.explode("event_type")
    df = df[CSV_HEADER]  # reorder

    return df


def to_disk(df, outdir, nozip=False):
    """
    For each service_date/stop_id/direction/route group, we write the events to disk.
    """
    grouped = df.groupby(["service_date", "stop_id", "direction_id", "route_id"])

    for name, events in grouped:
        service_date, stop_id, direction_id, route_id = name

        fname = pathlib.Path(outdir,
                             "Events",
                             "daily-bus-data",
                             f"{route_id}-{direction_id}-{stop_id}",
                             f"Year={service_date.year}",
                             f"Month={service_date.month}",
                             f"Day={service_date.day}",
                             "events.csv.gz")
        fname.parent.mkdir(parents=True, exist_ok=True)
        # set mtime to 0 in gzip header for determinism (so we can re-gen old routes, and rsync to s3 will ignore)
        events.to_csv(fname, index=False, compression={"method": "gzip", "mtime": 0} if not nozip else None)


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("input", metavar="INPUT_CSV")
    parser.add_argument("output", metavar="OUTPUT_DIR")
    parser.add_argument("--routes", "-r", nargs="*", type=str,
                        help="One note here: we should always be additive with our route set \
                        in case 2 lines share the same stop id: we need both in the result file.")
    parser.add_argument("--nozip", "-nz", action="store_true", help="debug feature to skip gzipping")

    args = parser.parse_args()
    input_csv = args.input
    output_dir = args.output
    routes = args.routes
    no_zip = args.nozip

    pathlib.Path(output_dir).mkdir(exist_ok=True)

    data = load_data(input_csv, routes)
    events = process_events(data)
    to_disk(events, output_dir, nozip=no_zip)


if __name__ == "__main__":
    main()
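Reviewer note: the sketch below is not part of the diff. It walks through two of the transformations above with made-up values: the 1900-01-01 re-basing described in the load_data docstring, and the point_type -> event_type expansion in process_events. Column names mirror the script; the sample data is illustrative only.

import pandas as pd
from datetime import datetime

# 1) Re-basing the 1900-01-01 timestamps onto the service date: keep the time
#    of day, move it to the actual day of service.
OFFSET = datetime(1900, 1, 1, 0, 0, 0)
service_date = pd.Timestamp("2020-01-15")
scheduled = pd.Timestamp("1900-01-01 05:08:00")
print(service_date + (scheduled - OFFSET))  # 2020-01-15 05:08:00

# 2) point_type -> event_type expansion: a Midpoint becomes both an ARR and a
#    DEP row once the list-valued column is exploded.
sample = pd.DataFrame({"stop_id": [110, 67, 187],
                       "point_type": ["Startpoint", "Midpoint", "Endpoint"]})
sample["event_type"] = sample.point_type.map({"Startpoint": ["DEP"],
                                              "Midpoint": ["ARR", "DEP"],
                                              "Endpoint": ["ARR"]})
print(sample.explode("event_type"))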
@@ -0,0 +1,10 @@
#!/bin/sh

newfile=$1

for i in 1 15 22 23 28 32 39 57 66 71 73 77 111; do
    mkdir -p data/output/manifests/
    pipenv run python manifest.py $newfile data/output/manifests/$i.json --checkpoints data/input/MBTA_GTFS/checkpoints.txt -r $i
    echo "Comparing old and new manifests for route $i"
    pipenv run python compare_manifest.py ../../src/bus_constants/$i.json data/output/manifests/$i.json
done
@@ -0,0 +1,55 @@
import json
import sys

station_stops = {}


def runone(path, first=False):
    unchanged = True
    current = json.load(open(path))
    # Print any removed stops first
    if not first:
        my_stations = list(current.values())[0]['stations']
        stat_map = dict(map(lambda x: (x['station'], x), my_stations))
        for s in station_stops:
            if s not in stat_map:
                print(" - Station %s removed in file %s. (Stops: %s)" % (s, path, station_stops[s]))
                continue
            for d in station_stops[s]:
                for stop in station_stops[s][d]:
                    if stop not in stat_map[s]['stops'][d]:
                        print(" - Stop %s removed from %s in file %s" % (stop, s, path))

    for i in list(current.values())[0]['stations']:
        s = i['station']

        if s not in station_stops:
            station_stops[s] = {}
            if not first:
                print(" + Found new station %s" % s)
                unchanged = False
        for direction in i['stops']:
            if direction not in station_stops[s]:
                station_stops[s][direction] = []
            for stop in i['stops'][direction]:
                if stop not in station_stops[s][direction]:
                    station_stops[s][direction].append(stop)
                    if not first:
                        print(" + Found additional stop %s at station %s in %s" % (stop, s, path))
                        unchanged = False
    return unchanged


def run(paths):
    unchanged = True
    runone(paths[0], first=True)
    for path in reversed(paths[1:]):
        unchanged = runone(path) and unchanged
    if unchanged:
        print("No new stations/stops on route.")
    else:
        print("Changed?")


if __name__ == "__main__":
    run(sys.argv[1:])
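Reviewer note: a minimal usage sketch, not part of the diff. It assumes this file is the compare_manifest.py invoked by the script earlier in the diff, and the manifest shape (a top-level route key holding a 'stations' list, each station carrying a 'stops' dict keyed by direction) is inferred from the code above; the file names, direction key, and station/stop IDs are made up.

import json
import pathlib
import tempfile

from compare_manifest import run  # assumes the file is saved as compare_manifest.py

# Two tiny, hypothetical manifests: the "new" one adds a stop and a station.
old = {"1": {"stations": [{"station": "hhgat", "stops": {"1": ["110"]}}]}}
new = {"1": {"stations": [{"station": "hhgat", "stops": {"1": ["110", "111"]}},
                          {"station": "maput", "stops": {"1": ["67"]}}]}}

tmp = pathlib.Path(tempfile.mkdtemp())
for name, manifest in [("old.json", old), ("new.json", new)]:
    (tmp / name).write_text(json.dumps(manifest))

# Expect it to report the new stop "111", the new station "maput" and its
# stop "67", and then print "Changed?".
run([str(tmp / "old.json"), str(tmp / "new.json")])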
@@ -0,0 +1,13 @@
#!/bin/bash -x

routes="$@"
if [ -z "$routes" ]; then
    routes="1"
fi

for y in 2018 2019 2020 2021; do
    for f in $(find data/input/$y/ -name '*.csv'); do
        echo "Generating stop data from $f"
        pipenv run python bus2train.py $f data/output -r $routes
    done
done
@@ -0,0 +1,10 @@
#!/bin/bash

for route in 1 111 15 22 23 28 32 39 57 66 71 73 77 114 116 117; do
    mkdir -p data/output/manifests/$route

    for f in $(find data/input/ -name '*.csv'); do
        month=$(echo $f | cut -d/ -f4 | cut -d. -f1)
        pipenv run python manifest.py $f "data/output/manifests/$route/${route}_${month}.json" --checkpoints "data/input/MBTA_GTFS/checkpoints.txt" -r $route
    done
done