Skip to content

Commit

Permalink
Made location preprocessing independent of the hard-coded zipcode-to-city mapping by passing a city_zipcode_map parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
Kshubham20 committed Dec 10, 2024
1 parent 5a723d1 commit b8bbad6
Show file tree
Hide file tree
Showing 11 changed files with 404 additions and 155 deletions.
Binary file modified src/DP_epidemiology/__pycache__/contact_matrix.cpython-310.pyc
Binary file not shown.
Binary file modified src/DP_epidemiology/__pycache__/viz.cpython-310.pyc
Binary file not shown.
14 changes: 7 additions & 7 deletions src/DP_epidemiology/contact_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
UPPER_BOUND = 600


def validate_input_data(df, age_groups, consumption_distribution, start_date: datetime, end_date: datetime, city: str):
def validate_input_data(df, city_zipcode_map, age_groups, consumption_distribution, start_date: datetime, end_date: datetime, city: str, default_city:str):
# check city exists in the data
df = make_preprocess_location()(df)
df = make_preprocess_location(city_zipcode_map,default_city)(df)
df[time_col] = pd.to_datetime(df[time_col])
if city not in df[city_col].unique():
raise ValueError("City does not exist in the data")
Expand All @@ -41,9 +41,9 @@ def validate_input_data(df, age_groups, consumption_distribution, start_date: da
raise ValueError(f"Category {category} does not exist in the data")


def get_private_counts(df, categories, start_date: datetime, end_date: datetime, city: str, epsilon: float = 1.0):
def get_private_counts(df, city_zipcode_map:pd.DataFrame, categories, start_date: datetime, end_date: datetime, city: str, default_city:str, epsilon: float = 1.0):
t_pre = (
make_preprocess_location()
make_preprocess_location(city_zipcode_map, default_city)
>> make_truncate_time(start_date, end_date, time_col=time_col)
>> make_filter_rows(txn_channel_col, "OFFLINE")
>> make_filter_rows(city_col, city)
Expand Down Expand Up @@ -75,13 +75,13 @@ def get_private_counts(df, categories, start_date: datetime, end_date: datetime,
return nb_transactions_avg_count_map


def get_age_group_count_map(df, age_groups, consumption_distribution, start_date: datetime, end_date: datetime, city: str, epsilon: float = 1.0):
def get_age_group_count_map(df,city_zipcode_map, age_groups, consumption_distribution, start_date: datetime, end_date: datetime, city: str, default_city:str, epsilon: float = 1.0):

validate_input_data(
df, age_groups, consumption_distribution, start_date, end_date, city)
df, city_zipcode_map, age_groups, consumption_distribution, start_date, end_date, city, default_city)

nb_transactions_avg_count_map = get_private_counts(
df, consumption_distribution.keys(), start_date, end_date, city, epsilon)
df, city_zipcode_map, consumption_distribution.keys(), start_date, end_date, city, default_city, epsilon)

# calculate age group to avg count of members from that age group
age_group_count_map = {}
Expand Down
17 changes: 10 additions & 7 deletions src/DP_epidemiology/hotspot_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from DP_epidemiology.utilities import *

def hotspot_analyzer(df:pd.DataFrame, start_date:datetime,end_date:datetime,city:str,epsilon:float):
def hotspot_analyzer(df:pd.DataFrame,city_zipcode_map:pd.DataFrame, start_date:datetime,end_date:datetime,city:str, default_city:str, epsilon:float):
"""final function to predict hotspots"""
bounds = (0, 600)
upper_bound=600
Expand All @@ -31,7 +31,7 @@ def hotspot_analyzer(df:pd.DataFrame, start_date:datetime,end_date:datetime,city


hotspot_predictor=(
make_preprocess_location()
make_preprocess_location(city_zipcode_map, default_city)
>>make_filter(transaction_type_col,transaction_type_filter)
>>make_filter(city_col,city)
>>make_truncate_time(start_date, end_date, time_col)
Expand All @@ -43,9 +43,12 @@ def hotspot_analyzer(df:pd.DataFrame, start_date:datetime,end_date:datetime,city
if __name__ == "__main__":
    import sys

    # CLI entry point:
    #   python hotspot_analyzer.py <transactions_csv> <city_zipcode_map_csv>
    #                              <start_date> <end_date> <city> <default_city> <epsilon>
    path = sys.argv[1]
    path_city_zipcode_map = sys.argv[2]
    # Parse ISO-formatted date strings explicitly; calling datetime(<str>) on a raw
    # argv string raises TypeError (datetime's constructor takes integer fields).
    start_date = datetime.strptime(sys.argv[3], "%Y-%m-%d")
    end_date = datetime.strptime(sys.argv[4], "%Y-%m-%d")
    city = sys.argv[5]
    default_city = sys.argv[6]
    # epsilon is the differential-privacy budget and must be numeric, not the raw
    # argv string, or the downstream noise-addition arithmetic fails.
    epsilon = float(sys.argv[7])
    df = pd.read_csv(path)
    city_zipcode_map = pd.read_csv(path_city_zipcode_map)
    print(hotspot_analyzer(df, city_zipcode_map, start_date, end_date, city, default_city, epsilon))
12 changes: 6 additions & 6 deletions src/DP_epidemiology/mobility_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from DP_epidemiology.utilities import *

def mobility_analyzer_airline(df:pd.DataFrame,start_date:datetime,end_date:datetime,city: str, epsilon:float):
def mobility_analyzer_airline(df:pd.DataFrame,city_zipcode_map:pd.DataFrame, start_date:datetime,end_date:datetime,city: str,default_city:str, epsilon:float):
"""final function to predict hotspots"""
bounds = (0, 600)
upper_bound=600
Expand All @@ -36,7 +36,7 @@ def mobility_analyzer_airline(df:pd.DataFrame,start_date:datetime,end_date:datet


analyzer=(
make_preprocess_location()
make_preprocess_location(city_zipcode_map,default_city)
>>make_filter(city_col,city)
>>make_filter(merch_category_col,merch_filter)
>>make_truncate_time(start_date, end_date, time_col)
Expand All @@ -45,7 +45,7 @@ def mobility_analyzer_airline(df:pd.DataFrame,start_date:datetime,end_date:datet

return analyzer(new_df)

def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city: str,category:str, epsilon:float):
def mobility_analyzer(df:pd.DataFrame, city_zipcode_map:pd.DataFrame, start_date:datetime,end_date:datetime,city: str, default_city:str, category:str, epsilon:float):
"""final function to predict hotspots"""
bounds = (0, 600)
upper_bound=600
Expand All @@ -68,7 +68,7 @@ def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city


analyzer=(
make_preprocess_location()
make_preprocess_location(city_zipcode_map,default_city)
>>make_preprocess_merchant_mobility()
>>make_filter(city_col,city)
>>make_filter(merch_category_col, category)
Expand All @@ -78,8 +78,8 @@ def mobility_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city

return analyzer(new_df)

def mobility_validation_with_google_mobility(df_transactional_data:pd.DataFrame, df_google_mobility_data:pd.DataFrame, start_date:datetime, end_date:datetime, city:str, category:str, epsilon:float):
df_transactional_mobility= mobility_analyzer(df_transactional_data,start_date,end_date,city,category,epsilon)
def mobility_validation_with_google_mobility(df_transactional_data:pd.DataFrame, df_google_mobility_data:pd.DataFrame,city_zipcode_map:pd.DataFrame, start_date:datetime, end_date:datetime, city:str, default_city:str, category:str, epsilon:float):
df_transactional_mobility= mobility_analyzer(df_transactional_data,city_zipcode_map, start_date,end_date,city,default_city, category,epsilon)
offset=df_transactional_mobility["date"][0]
df_google_mobility = preprocess_google_mobility(df_google_mobility_data,start_date,end_date,city,category,offset)

Expand Down
4 changes: 2 additions & 2 deletions src/DP_epidemiology/pandemic_adherence_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from DP_epidemiology.utilities import *

def pandemic_adherence_analyzer(df:pd.DataFrame,start_date:datetime,end_date:datetime,city: str,essential_or_luxury:str, epsilon:float):
def pandemic_adherence_analyzer(df:pd.DataFrame,city_zipcode_map:pd.DataFrame, start_date:datetime,end_date:datetime,city: str,default_city:str, essential_or_luxury:str, epsilon:float):
"""final function to predict hotspots"""
bounds = (0, 600)
upper_bound=600
Expand All @@ -32,7 +32,7 @@ def pandemic_adherence_analyzer(df:pd.DataFrame,start_date:datetime,end_date:dat


analyzer=(
make_preprocess_location()
make_preprocess_location(city_zipcode_map, default_city)
>>make_preprocess_merchant()
>>make_filter(city_col,city)
>>make_filter(merch_category_col,essential_or_luxury)
Expand Down
22 changes: 6 additions & 16 deletions src/DP_epidemiology/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,17 @@ def approx_concentrated_divergence():
"""symmetric distance between the id sets"""
return dp.user_distance("ApproxConcentratedDivergence()")

def make_preprocess_location():
def make_preprocess_location(city_zipcode_map:pd.DataFrame, default_city:str="Unknown"):
"""Create a 1-stable transformation to bin `merch_postal_code` by city"""

def categorize_city(code):
if code.startswith("5"):
return "Medellin"
elif code.startswith("11"):
return "Bogota"
elif code.startswith("70"):
return "Brasilia"
else:
return "Santiago"


def location_preprocess(df):
loc_df = df.copy()
# Convert merchant_postal_code into str type
loc_df["merch_postal_code"] = loc_df["merch_postal_code"].astype(str)
# Apply the function to create a new column
loc_df["city"] = loc_df["merch_postal_code"].apply(
categorize_city
)
# Create a dictionary for quick lookup
zipcode_to_city = dict(zip(city_zipcode_map['merch_postal_code'].astype(str), city_zipcode_map['city']))
# Map the city based on the zipcode
loc_df["city"] = loc_df["merch_postal_code"].map(zipcode_to_city).fillna(default_city)
return loc_df

return dp.t.make_user_transformation(
Expand Down
20 changes: 10 additions & 10 deletions src/DP_epidemiology/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from DP_epidemiology.contact_matrix import get_age_group_count_map, get_contact_matrix_country


def create_hotspot_dash_app(df:pd.DataFrame):
def create_hotspot_dash_app(df:pd.DataFrame,city_zipcode_map:pd.DataFrame,default_city:str):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -69,7 +69,7 @@ def update_graph(start_date, end_date, epsilon, city):
end_date = datetime.strptime(end_date, '%Y-%m-%d')

# Filter data using hotspot_analyser
output = hotspot_analyzer(df, start_date, end_date, city, epsilon)
output = hotspot_analyzer(df, city_zipcode_map, start_date, end_date, city, default_city, epsilon)
filtered_df = get_coordinates(output)

# Plot using Plotly Express
Expand All @@ -96,7 +96,7 @@ def update_graph(start_date, end_date, epsilon, city):

return app

def create_mobility_dash_app(df: pd.DataFrame):
def create_mobility_dash_app(df: pd.DataFrame,city_zipcode_map:pd.DataFrame,default_city:str):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -152,7 +152,7 @@ def update_graph(start_date, end_date, city_filter, category, epsilon):
end_date = datetime.strptime(end_date, '%Y-%m-%d')

# Call the mobility_analyzer function
filtered_df = mobility_analyzer(df, start_date, end_date, city_filter, category, epsilon)
filtered_df = mobility_analyzer(df, city_zipcode_map, start_date, end_date, city_filter, default_city, category, epsilon)

# Plot using Plotly Express
fig = px.line(
Expand Down Expand Up @@ -221,7 +221,7 @@ def update_graph(start_date, end_date, city_filter, category, epsilon):

return app

def create_pandemic_adherence_dash_app(df: pd.DataFrame):
def create_pandemic_adherence_dash_app(df: pd.DataFrame,city_zipcode_map:pd.DataFrame,default_city:str):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -276,7 +276,7 @@ def update_graph(start_date, end_date, city_filter, essential_or_luxury, epsilon
end_date = datetime.strptime(end_date, '%Y-%m-%d')

# Call the pandemic_adherence_analyzer function
filtered_df = pandemic_adherence_analyzer(df, start_date, end_date, city_filter, essential_or_luxury, epsilon)
filtered_df = pandemic_adherence_analyzer(df, city_zipcode_map, start_date, end_date, city_filter, default_city, essential_or_luxury, epsilon)

# Plot using Plotly Express
fig = px.line(
Expand Down Expand Up @@ -345,7 +345,7 @@ def update_graph(start_date, end_date, city_filter, essential_or_luxury, epsilon

return app

def create_contact_matrix_dash_app(df:pd.DataFrame, age_groups:list=None, consumption_distribution : pd.DataFrame = None, P = None, scaling_factor = None):
def create_contact_matrix_dash_app(df:pd.DataFrame, city_zipcode_map:pd.DataFrame,default_city:str, age_groups:list=None, consumption_distribution : pd.DataFrame = None, P = None, scaling_factor = None):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -421,7 +421,7 @@ def update_contact_matrix(start_date, end_date, city, epsilon):
# Get age group count map
counts_per_city = []
for city in cities:
counts = get_age_group_count_map(df, age_groups, consumption_distribution, start_date, end_date, city)
counts = get_age_group_count_map(df, city_zipcode_map, age_groups, consumption_distribution, start_date, end_date, city, default_city)
counts_per_city.append(list(counts.values()))

# Hardcoded population distribution for the example
Expand All @@ -448,7 +448,7 @@ def update_contact_matrix(start_date, end_date, city, epsilon):



def create_mobility_validation_dash_app(df_transactional_data: pd.DataFrame, df_google_mobility_data: pd.DataFrame):
def create_mobility_validation_dash_app(df_transactional_data: pd.DataFrame, df_google_mobility_data: pd.DataFrame,city_zipcode_map:pd.DataFrame,default_city:str):
cities = {
"Medellin": (6.2476, -75.5658),
"Bogota": (4.7110, -74.0721),
Expand Down Expand Up @@ -504,7 +504,7 @@ def update_graph(start_date, end_date, city_filter, category, epsilon):
end_date = datetime.strptime(end_date, '%Y-%m-%d')

# Call the mobility_analyzer function
filtered_df_transactional = mobility_analyzer(df_transactional_data, start_date, end_date, city_filter, category, epsilon)
filtered_df_transactional = mobility_analyzer(df_transactional_data, city_zipcode_map, start_date, end_date, city_filter, default_city, category, epsilon)

# Call the preprocess_google_mobility function
offset = filtered_df_transactional["date"].iloc[0]
Expand Down
Loading

0 comments on commit b8bbad6

Please sign in to comment.