Skip to content

Commit

Permalink
Made location preprocessing independent of the hard-coded zipcode-to-city mapping by passing a city_zipcode_map parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
Kshubham20 committed Dec 10, 2024
1 parent 5a723d1 commit b8bbad6
Show file tree
Hide file tree
Showing 11 changed files with 404 additions and 155 deletions.
Binary file modified src/DP_epidemiology/__pycache__/contact_matrix.cpython-310.pyc
Binary file not shown.
Binary file modified src/DP_epidemiology/__pycache__/viz.cpython-310.pyc
Binary file not shown.
14 changes: 7 additions & 7 deletions src/DP_epidemiology/contact_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
UPPER_BOUND = 600


def validate_input_data(df, age_groups, consumption_distribution, start_date: datetime, end_date: datetime, city: str):
def validate_input_data(df, city_zipcode_map, age_groups, consumption_distribution, start_date: datetime, end_date: datetime, city: str, default_city:str):
# check city exists in the data
df = make_preprocess_location()(df)
df = make_preprocess_location(city_zipcode_map,default_city)(df)
df[time_col] = pd.to_datetime(df[time_col])
if city not in df[city_col].unique():
raise ValueError("City does not exist in the data")
Expand All @@ -41,9 +41,9 @@ def validate_input_data(df, age_groups, consumption_distribution, start_date: da
raise ValueError(f"Category {category} does not exist in the data")


def get_private_counts(df, categories, start_date: datetime, end_date: datetime, city: str, epsilon: float = 1.0):
def get_private_counts(df, city_zipcode_map:pd.DataFrame, categories, start_date: datetime, end_date: datetime, city: str, default_city:str, epsilon: float = 1.0):
t_pre = (
make_preprocess_location()
make_preprocess_location(city_zipcode_map, default_city)
>> make_truncate_time(start_date, end_date, time_col=time_col)
>> make_filter_rows(txn_channel_col, "OFFLINE")
>> make_filter_rows(city_col, city)
Expand Down Expand Up @@ -75,13 +75,13 @@ def get_private_counts(df, categories, start_date: datetime, end_date: datetime,
return nb_transactions_avg_count_map


def get_age_group_count_map(df, age_groups, consumption_distribution, start_date: datetime, end_date: datetime, city: str, epsilon: float = 1.0):
def get_age_group_count_map(df,city_zipcode_map, age_groups, consumption_distribution, start_date: datetime, end_date: datetime, city: str, default_city:str, epsilon: float = 1.0):

validate_input_data(
df, age_groups, consumption_distribution, start_date, end_date, city)
df, city_zipcode_map, age_groups, consumption_distribution, start_date, end_date, city, default_city)

nb_transactions_avg_count_map = get_private_counts(
df, consumption_distribution.keys(), start_date, end_date, city, epsilon)
df, city_zipcode_map, consumption_distribution.keys(), start_date, end_date, city, default_city, epsilon)

# calculate age group to avg count of members from that age group
age_group_count_map = {}
Expand Down
17 changes: 10 additions & 7 deletions src/DP_epidemiology/hotspot_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from DP_epidemiology.utilities import *

def hotspot_analyzer(df:pd.DataFrame, start_date:datetime,end_date:datetime,city:str,epsilon:float):
def hotspot_analyzer(df:pd.DataFrame,city_zipcode_map:pd.DataFrame, start_date:datetime,end_date:datetime,city:str, default_city:str, epsilon:float):
"""final function to predict hotspots"""
bounds = (0, 600)
upper_bound=600
Expand All @@ -31,7 +31,7 @@ def hotspot_analyzer(df:pd.DataFrame, start_date:datetime,end_date:datetime,city


hotspot_predictor=(
make_preprocess_location()
make_preprocess_location(city_zipcode_map, default_city)
>>make_filter(transaction_type_col,transaction_type_filter)
>>make_filter(city_col,city)
>>make_truncate_time(start_date, end_date, time_col)
Expand All @@ -43,9 +43,12 @@ def hotspot_analyzer(df:pd.DataFrame, start_date:datetime,end_date:datetime,city
if __name__ == "__main__":
    import sys

    # CLI entry point:
    #   python hotspot_analyzer.py <transactions_csv> <city_zipcode_map_csv>
    #                              <start_date> <end_date> <city> <default_city> <epsilon>
    path = sys.argv[1]
    path_city_zipcode_map = sys.argv[2]
    # Parse ISO-formatted date strings explicitly; calling datetime(<str>) on a raw
    # argv string raises TypeError (datetime's constructor takes integer fields).
    start_date = datetime.strptime(sys.argv[3], "%Y-%m-%d")
    end_date = datetime.strptime(sys.argv[4], "%Y-%m-%d")
    city = sys.argv[5]
    default_city = sys.argv[6]
    # epsilon is the differential-privacy budget and must be numeric, not the raw
    # argv string, or the downstream noise-addition arithmetic fails.
    epsilon = float(sys.argv[7])
    df = pd.read_csv(path)
    city_zipcode_map = pd.read_csv(path_city_zipcode_map)
    print(hotspot_analyzer(df, city_zipcode_map, start_date, end_date, city, default_city, epsilon))
12 changes: 6 additions & 6 deletions src/DP_epidemiology/mobility_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from DP_epidemiology.utilities import *

def mobility_analyzer_airline(df:pd.DataFrame,start_date:datetime,end_date:datetime,city: str, epsilon:float):
def mobility_analyzer_airline(df:pd.DataFrame,city_zipcode_map:pd.DataFrame, start_date:datetime,end_date:datetime,city: str,default_city:str, epsilon:float):
"""final function to predict hotspots"""
bounds = (0, 600)
upper_bound=600
Expand All @@ -36,7 +36,7 @@ def mobility_analyzer_airline(df:pd.DataFrame,start_date:datetime,end_date:datet


analyzer=(
make_preprocess_location()
make_preprocess_location(city_zipcode_map,default_city)
>>make_filter(city_col,city)
>>make_filter(merch_category_col,merch_filter)
>>make_truncate_time(start_date, end_date, time_col)
Expand All @@ -45,7 +45,7 @@ def mobility_analyzer_airline(df:pd.DataFrame,start_date:datetime,end_date:datet

return analyzer(new_df)

def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city: str,category:str, epsilon:float):
def mobility_analyzer(df:pd.DataFrame, city_zipcode_map:pd.DataFrame, start_date:datetime,end_date:datetime,city: str, default_city:str, category:str, epsilon:float):
"""final function to predict hotspots"""
bounds = (0, 600)
upper_bound=600
Expand All @@ -68,7 +68,7 @@ def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city


analyzer=(
make_preprocess_location()
make_preprocess_location(city_zipcode_map,default_city)
>>make_preprocess_merchant_mobility()
>>make_filter(city_col,city)
>>make_filter(merch_category_col, category)
Expand All @@ -78,8 +78,8 @@ def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city

return analyzer(new_df)

def mobility_validation_with_google_mobility(df_transactional_data:pd.DataFrame, df_google_mobility_data:pd.DataFrame, start_date:datetime, end_date:datetime, city:str, category:str, epsilon:float):
df_transactional_mobility= mobility_analyzer(df_transactional_data,start_date,end_date,city,category,epsilon)
def mobility_validation_with_google_mobility(df_transactional_data:pd.DataFrame, df_google_mobility_data:pd.DataFrame,city_zipcode_map:pd.DataFrame, start_date:datetime, end_date:datetime, city:str, default_city:str, category:str, epsilon:float):
df_transactional_mobility= mobility_analyzer(df_transactional_data,city_zipcode_map, start_date,end_date,city,default_city, category,epsilon)
offset=df_transactional_mobility["date"][0]
df_google_mobility = preprocess_google_mobility(df_google_mobility_data,start_date,end_date,city,category,offset)

Expand Down
4 changes: 2 additions & 2 deletions src/DP_epidemiology/pandemic_adherence_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from DP_epidemiology.utilities import *

def pandemic_adherence_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city: str,essential_or_luxury:str, epsilon:float):
def pandemic_adherence_analyzer(df:pd.DataFrame,city_zipcode_map:pd.DataFrame, start_date:datetime,end_date:datetime,city: str,default_city:str, essential_or_luxury:str, epsilon:float):
"""final function to predict hotspots"""
bounds = (0, 600)
upper_bound=600
Expand All @@ -32,7 +32,7 @@ def pandemic_adherence_analyzer(df:pd.DataFrame,start_date:datetime,end_date:dat


analyzer=(
make_preprocess_location()
make_preprocess_location(city_zipcode_map, default_city)
>>make_preprocess_merchant()
>>make_filter(city_col,city)
>>make_filter(merch_category_col,essential_or_luxury)
Expand Down
22 changes: 6 additions & 16 deletions src/DP_epidemiology/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,17 @@ def approx_concentrated_divergence():
"""symmetric distance between the id sets"""
return dp.user_distance("ApproxConcentratedDivergence()")

def make_preprocess_location():
def make_preprocess_location(city_zipcode_map:pd.DataFrame, default_city:str="Unknown"):
"""Create a 1-stable transformation to bin `merch_postal_code` by city"""

def categorize_city(code):
if code.startswith("5"):
return "Medellin"
elif code.startswith("11"):
return "Bogota"
elif code.startswith("70"):
return "Brasilia"
else:
return "Santiago"


def location_preprocess(df):
loc_df = df.copy()
# Convert merchant_postal_code into str type
loc_df["merch_postal_code"] = loc_df["merch_postal_code"].astype(str)
# Apply the function to create a new column
loc_df["city"] = loc_df["merch_postal_code"].apply(
categorize_city
)
# Create a dictionary for quick lookup
zipcode_to_city = dict(zip(city_zipcode_map['merch_postal_code'].astype(str), city_zipcode_map['city']))
# Map the city based on the zipcode
loc_df["city"] = loc_df["merch_postal_code"].map(zipcode_to_city).fillna(default_city)
return loc_df

return dp.t.make_user_transformation(
Expand Down
20 changes: 10 additions & 10 deletions src/DP_epidemiology/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from DP_epidemiology.contact_matrix import get_age_group_count_map, get_contact_matrix_country


def create_hotspot_dash_app(df:pd.DataFrame):
def create_hotspot_dash_app(df:pd.DataFrame,city_zipcode_map:pd.DataFrame,default_city:str):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -69,7 +69,7 @@ def update_graph(start_date, end_date, epsilon, city):
end_date = datetime.strptime(end_date, '%Y-%m-%d')

# Filter data using hotspot_analyser
output = hotspot_analyzer(df, start_date, end_date, city, epsilon)
output = hotspot_analyzer(df, city_zipcode_map, start_date, end_date, city, default_city, epsilon)
filtered_df = get_coordinates(output)

# Plot using Plotly Express
Expand All @@ -96,7 +96,7 @@ def update_graph(start_date, end_date, epsilon, city):

return app

def create_mobility_dash_app(df: pd.DataFrame):
def create_mobility_dash_app(df: pd.DataFrame,city_zipcode_map:pd.DataFrame,default_city:str):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -152,7 +152,7 @@ def update_graph(start_date, end_date, city_filter, category, epsilon):
end_date = datetime.strptime(end_date, '%Y-%m-%d')

# Call the mobility_analyzer function
filtered_df = mobility_analyzer(df, start_date, end_date, city_filter, category, epsilon)
filtered_df = mobility_analyzer(df, city_zipcode_map, start_date, end_date, city_filter, default_city, category, epsilon)

# Plot using Plotly Express
fig = px.line(
Expand Down Expand Up @@ -221,7 +221,7 @@ def update_graph(start_date, end_date, city_filter, category, epsilon):

return app

def create_pandemic_adherence_dash_app(df: pd.DataFrame):
def create_pandemic_adherence_dash_app(df: pd.DataFrame,city_zipcode_map:pd.DataFrame,default_city:str):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -276,7 +276,7 @@ def update_graph(start_date, end_date, city_filter, essential_or_luxury, epsilon
end_date = datetime.strptime(end_date, '%Y-%m-%d')

# Call the pandemic_adherence_analyzer function
filtered_df = pandemic_adherence_analyzer(df, start_date, end_date, city_filter, essential_or_luxury, epsilon)
filtered_df = pandemic_adherence_analyzer(df, city_zipcode_map, start_date, end_date, city_filter, default_city, essential_or_luxury, epsilon)

# Plot using Plotly Express
fig = px.line(
Expand Down Expand Up @@ -345,7 +345,7 @@ def update_graph(start_date, end_date, city_filter, essential_or_luxury, epsilon

return app

def create_contact_matrix_dash_app(df:pd.DataFrame, age_groups:list=None, consumption_distribution : pd.DataFrame = None, P = None, scaling_factor = None):
def create_contact_matrix_dash_app(df:pd.DataFrame, city_zipcode_map:pd.DataFrame,default_city:str, age_groups:list=None, consumption_distribution : pd.DataFrame = None, P = None, scaling_factor = None):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -421,7 +421,7 @@ def update_contact_matrix(start_date, end_date, city, epsilon):
# Get age group count map
counts_per_city = []
for city in cities:
counts = get_age_group_count_map(df, age_groups, consumption_distribution, start_date, end_date, city)
counts = get_age_group_count_map(df, city_zipcode_map, age_groups, consumption_distribution, start_date, end_date, city, default_city)
counts_per_city.append(list(counts.values()))

# Hardcoded population distribution for the example
Expand All @@ -448,7 +448,7 @@ def update_contact_matrix(start_date, end_date, city, epsilon):



def create_mobility_validation_dash_app(df_transactional_data: pd.DataFrame, df_google_mobility_data: pd.DataFrame):
def create_mobility_validation_dash_app(df_transactional_data: pd.DataFrame, df_google_mobility_data: pd.DataFrame,city_zipcode_map:pd.DataFrame,default_city:str):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -504,7 +504,7 @@ def update_graph(start_date, end_date, city_filter, category, epsilon):
end_date = datetime.strptime(end_date, '%Y-%m-%d')

# Call the mobility_analyzer function
filtered_df_transactional = mobility_analyzer(df_transactional_data, start_date, end_date, city_filter, category, epsilon)
filtered_df_transactional = mobility_analyzer(df_transactional_data, city_zipcode_map, start_date, end_date, city_filter, default_city, category, epsilon)

# Call the preprocess_google_mobility function
offset = filtered_df_transactional["date"].iloc[0]
Expand Down
Loading

0 comments on commit b8bbad6

Please sign in to comment.