From bc59a8e352af2e022e0bb78fa8757c55afcecf66 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 00:32:26 +0000 Subject: [PATCH 01/35] feat: add if_load_docDB flag --- code/Home.py | 44 +++++++++++++++-------------- code/pages/1_Learning trajectory.py | 18 ++++++++++++ 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/code/Home.py b/code/Home.py index 71a809a..f821c40 100644 --- a/code/Home.py +++ b/code/Home.py @@ -280,7 +280,7 @@ def show_curriculums(): pass # ------- Layout starts here -------- # -def init(): +def init(if_load_docDB=True): # Clear specific session state and all filters for key in st.session_state: @@ -449,21 +449,22 @@ def _get_data_source(rig): # --- Load data from docDB --- - _df = merge_in_df_docDB(_df) - - # add docDB_status column - _df["docDB_status"] = _df.apply( - lambda row: ( - "0_not uploaded" - if pd.isnull(row["session_loc"]) - else ( - "1_uploaded but not processed" - if pd.isnull(row["processed_session_loc"]) - else "2_uploaded and processed" - ) - ), - axis=1, - ) + if if_load_docDB: + _df = merge_in_df_docDB(_df) + + # add docDB_status column + _df["docDB_status"] = _df.apply( + lambda row: ( + "0_not uploaded" + if pd.isnull(row["session_loc"]) + else ( + "1_uploaded but not processed" + if pd.isnull(row["processed_session_loc"]) + else "2_uploaded and processed" + ) + ), + axis=1, + ) st.session_state.df['sessions_bonsai'] = _df # Somehow _df loses the reference to the original dataframe st.session_state.session_stats_names = [keys for keys in _df.keys()] @@ -753,9 +754,10 @@ def app(): # st.dataframe(st.session_state.df_session_filtered, use_container_width=True, height=1000) -ok = True -if 'df' not in st.session_state or 'sessions_bonsai' not in st.session_state.df.keys(): - ok = init() +if __name__ == "__main__": + ok = True + if 'df' not in st.session_state or 'sessions_bonsai' not in st.session_state.df.keys(): + ok = init() -if ok: - app() + if ok: + app() diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index d8fa143..5bbea8e 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -10,11 +10,25 @@ from util.aws_s3 import load_data from util.streamlit import add_session_filter, data_selector +from Home import init + ss = st.session_state fs = s3fs.S3FileSystem(anon=False) cache_folder = 'aind-behavior-data/foraging_nwb_bonsai_processed/' +try: + st.set_page_config(layout="wide", + page_title='Foraging behavior browser', + page_icon=':mouse2:', + menu_items={ + 'Report a bug': "https://github.com/AllenNeuralDynamics/foraging-behavior-browser/issues", + 'About': "Github repo: https://github.com/AllenNeuralDynamics/foraging-behavior-browser" + } + ) +except: + pass + def app(): @@ -134,4 +148,8 @@ def do_pca(df, name): ) st.plotly_chart(fig) + +if 'df' not in st.session_state or 'sessions_bonsai' not in st.session_state.df.keys(): + init(if_load_docDB=False) + app() \ No newline at end of file From 7b2962e0c274d8c100bc65a06b21baef45edd07c Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 01:05:24 +0000 Subject: [PATCH 02/35] feat: add a new tab to learning_trajectory --- code/pages/1_Learning trajectory.py | 59 ++++++++++++++++++++++------- code/util/url_query_helper.py | 2 + 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 5bbea8e..f01301c 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -10,6 +10,16 @@ from util.aws_s3 import load_data from util.streamlit import add_session_filter, data_selector +import extra_streamlit_components as stx + +from util.url_query_helper import (checkbox_wrapper_for_url_query, + multiselect_wrapper_for_url_query, + number_input_wrapper_for_url_query, + slider_wrapper_for_url_query, + sync_session_state_to_URL, + sync_URL_to_session_state) + + from Home import init ss = st.session_state @@ -29,18 +39,8 @@ except: pass - -def app(): - - with st.sidebar: - add_session_filter(if_bonsai=True) - data_selector() - - if not hasattr(ss, 'df'): - st.write('##### Data not loaded yet, start from Home:') - st.page_link('Home.py', label='Home', icon="🏠") - return - +@st.cache_data() +def _get_metadata_col(): df = load_data()['sessions_bonsai'] # -- get cols -- @@ -54,9 +54,40 @@ def app(): if not any(ss in s for ss in ['performance'] ) ] + return col_perf, col_task + + +def app(): + with st.sidebar: + add_session_filter(if_bonsai=True) + data_selector() + + if not hasattr(ss, 'df'): + st.write('##### Data not loaded yet, start from Home:') + st.page_link('Home.py', label='Home', icon="🏠") + return - do_pca(ss.df_session_filtered.loc[:, ['subject_id', 'session'] + col_perf], 'performance') - do_pca(ss.df_session_filtered.loc[:, ['subject_id', 'session'] + col_task], 'task') + # === Main tabs === + chosen_id = stx.tab_bar(data=[ + stx.TabBarItemData(id="tab_PCA", title="PCA", description="PCA on performance and task parameters"), + stx.TabBarItemData(id="tab_stage", title="Training stages", description="Compare across training stages"), + ], default=st.query_params['tab_id_learning_trajectory'] if 'tab_id_learning_trajectory' in st.query_params + else st.session_state.tab_id_learning_trajectory) + + placeholder = st.container() + st.session_state.tab_id_learning_trajectory = chosen_id + st.markdown('---') + + if chosen_id == "tab_PCA": + col_perf, col_task = _get_metadata_col() + do_pca(ss.df_session_filtered.loc[:, ['subject_id', 'session'] + col_perf], 'performance') + do_pca(ss.df_session_filtered.loc[:, ['subject_id', 'session'] + col_task], 'task') + elif chosen_id == "tab_stage": + pass + + + # Update back to URL + sync_session_state_to_URL() def do_pca(df, name): diff --git a/code/util/url_query_helper.py b/code/util/url_query_helper.py index e1fa419..bf29a73 100644 --- a/code/util/url_query_helper.py +++ b/code/util/url_query_helper.py @@ -58,6 +58,8 @@ 'auto_training_curriculum_version': '1.0', 'auto_training_curriculum_schema_version': '1.0', 'auto_training_history_recent_weeks': 8, + + 'tab_id_learning_trajectory': 'tab_PCA', } def checkbox_wrapper_for_url_query(st_prefix, label, key, default, **kwargs): From e27c84e0394f87f3b5808e14723ffb0487f0867f Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 20:39:56 +0000 Subject: [PATCH 03/35] feat: add stage-histogram --- code/pages/1_Learning trajectory.py | 40 +++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index f01301c..c69c5ca 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -3,6 +3,7 @@ import plotly.graph_objects as go import s3fs import streamlit as st +import matplotlib.pyplot as plt from plotly.subplots import make_subplots from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler @@ -83,6 +84,45 @@ def app(): do_pca(ss.df_session_filtered.loc[:, ['subject_id', 'session'] + col_perf], 'performance') do_pca(ss.df_session_filtered.loc[:, ['subject_id', 'session'] + col_task], 'task') elif chosen_id == "tab_stage": + + df = ss.df_session_filtered + + # Sort stages in the desired order + stage_order = [ + "STAGE_1_WARMUP", "STAGE_1", "STAGE_2", "STAGE_3", "STAGE_4", "STAGE_5", + "STAGE_6", "STAGE_7", "STAGE_8", "STAGE_9", "STAGE_10", "STAGE_FINAL", "GRADUATED" + ] + df['current_stage_actual'] = pd.Categorical(df['current_stage_actual'], categories=stage_order, ordered=True) + df = df.sort_values('current_stage_actual') + + # Start of Streamlit app + st.title("Histogram Visualization by Current Stage") + + # Checkbox to use density or not + use_density = st.checkbox("Use Density", value=False) + + # Multiselect for choosing numeric columns + numeric_columns = df.select_dtypes(include='number').columns + selected_columns = st.multiselect("Select columns to plot histograms", numeric_columns) + + # Create a density plot for each selected column grouped by 'current_stage_actual' + for column in selected_columns: + st.subheader(f"{'Density' if use_density else 'Histogram'} Plot of {column} grouped by 'current_stage_actual'") + fig = go.Figure() + for stage in df['current_stage_actual'].cat.categories: + if stage not in df['current_stage_actual'].unique(): + continue + stage_data = df[df['current_stage_actual'] == stage][column] + y_vals, x_vals = np.histogram(stage_data, bins=20, density=use_density) + fig.add_trace(go.Scatter(x=x_vals[:-1], y=y_vals, mode='lines', name=stage)) + + fig.update_layout( + title=f"{'Density' if use_density else 'Histogram'} Plot of {column} by Current Stage", + xaxis_title=column, + yaxis_title='Density' if use_density else 'Count' + ) + st.plotly_chart(fig) + pass From cfb1c47ed27d208cc2bd04f811f93f4061723451 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 20:40:52 +0000 Subject: [PATCH 04/35] linting --- code/pages/1_Learning trajectory.py | 297 ++++++++++++++++------------ 1 file changed, 176 insertions(+), 121 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index c69c5ca..7f2bdcb 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -13,12 +13,14 @@ import extra_streamlit_components as stx -from util.url_query_helper import (checkbox_wrapper_for_url_query, - multiselect_wrapper_for_url_query, - number_input_wrapper_for_url_query, - slider_wrapper_for_url_query, - sync_session_state_to_URL, - sync_URL_to_session_state) +from util.url_query_helper import ( + checkbox_wrapper_for_url_query, + multiselect_wrapper_for_url_query, + number_input_wrapper_for_url_query, + slider_wrapper_for_url_query, + sync_session_state_to_URL, + sync_URL_to_session_state, +) from Home import init @@ -26,34 +28,51 @@ ss = st.session_state fs = s3fs.S3FileSystem(anon=False) -cache_folder = 'aind-behavior-data/foraging_nwb_bonsai_processed/' +cache_folder = "aind-behavior-data/foraging_nwb_bonsai_processed/" try: - st.set_page_config(layout="wide", - page_title='Foraging behavior browser', - page_icon=':mouse2:', - menu_items={ - 'Report a bug': "https://github.com/AllenNeuralDynamics/foraging-behavior-browser/issues", - 'About': "Github repo: https://github.com/AllenNeuralDynamics/foraging-behavior-browser" - } - ) + st.set_page_config( + layout="wide", + page_title="Foraging behavior browser", + page_icon=":mouse2:", + menu_items={ + "Report a bug": "https://github.com/AllenNeuralDynamics/foraging-behavior-browser/issues", + "About": "Github repo: https://github.com/AllenNeuralDynamics/foraging-behavior-browser", + }, + ) except: pass + @st.cache_data() def _get_metadata_col(): - df = load_data()['sessions_bonsai'] - + df = load_data()["sessions_bonsai"] + # -- get cols -- - col_task = [s for s in df.metadata.columns - if not any(ss in s for ss in ['lickspout', 'weight', 'water', 'time', 'rig', - 'user_name', 'experiment', 'task', 'notes', 'laser'] - ) + col_task = [ + s + for s in df.metadata.columns + if not any( + ss in s + for ss in [ + "lickspout", + "weight", + "water", + "time", + "rig", + "user_name", + "experiment", + "task", + "notes", + "laser", + ] + ) ] - - col_perf = [s for s in df.session_stats.columns - if not any(ss in s for ss in ['performance'] - ) + + col_perf = [ + s + for s in df.session_stats.columns + if not any(ss in s for ss in ["performance"]) ] return col_perf, col_task @@ -62,38 +81,65 @@ def app(): with st.sidebar: add_session_filter(if_bonsai=True) data_selector() - - if not hasattr(ss, 'df'): - st.write('##### Data not loaded yet, start from Home:') - st.page_link('Home.py', label='Home', icon="🏠") + + if not hasattr(ss, "df"): + st.write("##### Data not loaded yet, start from Home:") + st.page_link("Home.py", label="Home", icon="🏠") return - + # === Main tabs === - chosen_id = stx.tab_bar(data=[ - stx.TabBarItemData(id="tab_PCA", title="PCA", description="PCA on performance and task parameters"), - stx.TabBarItemData(id="tab_stage", title="Training stages", description="Compare across training stages"), - ], default=st.query_params['tab_id_learning_trajectory'] if 'tab_id_learning_trajectory' in st.query_params - else st.session_state.tab_id_learning_trajectory) + chosen_id = stx.tab_bar( + data=[ + stx.TabBarItemData( + id="tab_PCA", + title="PCA", + description="PCA on performance and task parameters", + ), + stx.TabBarItemData( + id="tab_stage", + title="Training stages", + description="Compare across training stages", + ), + ], + default=( + st.query_params["tab_id_learning_trajectory"] + if "tab_id_learning_trajectory" in st.query_params + else st.session_state.tab_id_learning_trajectory + ), + ) placeholder = st.container() st.session_state.tab_id_learning_trajectory = chosen_id - st.markdown('---') + st.markdown("---") if chosen_id == "tab_PCA": col_perf, col_task = _get_metadata_col() - do_pca(ss.df_session_filtered.loc[:, ['subject_id', 'session'] + col_perf], 'performance') - do_pca(ss.df_session_filtered.loc[:, ['subject_id', 'session'] + col_task], 'task') + do_pca( + ss.df_session_filtered.loc[:, ["subject_id", "session"] + col_perf], + "performance", + ) + do_pca( + ss.df_session_filtered.loc[:, ["subject_id", "session"] + col_task], "task" + ) elif chosen_id == "tab_stage": - + df = ss.df_session_filtered - + # Sort stages in the desired order stage_order = [ - "STAGE_1_WARMUP", "STAGE_1", "STAGE_2", "STAGE_3", "STAGE_4", "STAGE_5", - "STAGE_6", "STAGE_7", "STAGE_8", "STAGE_9", "STAGE_10", "STAGE_FINAL", "GRADUATED" + "STAGE_1_WARMUP", + "STAGE_1", + "STAGE_2", + "STAGE_3", + "STAGE_4", + "STAGE_5", + "STAGE_FINAL", + "GRADUATED", ] - df['current_stage_actual'] = pd.Categorical(df['current_stage_actual'], categories=stage_order, ordered=True) - df = df.sort_values('current_stage_actual') + df["current_stage_actual"] = pd.Categorical( + df["current_stage_actual"], categories=stage_order, ordered=True + ) + df = df.sort_values("current_stage_actual") # Start of Streamlit app st.title("Histogram Visualization by Current Stage") @@ -102,125 +148,134 @@ def app(): use_density = st.checkbox("Use Density", value=False) # Multiselect for choosing numeric columns - numeric_columns = df.select_dtypes(include='number').columns - selected_columns = st.multiselect("Select columns to plot histograms", numeric_columns) + numeric_columns = df.select_dtypes(include="number").columns + selected_columns = st.multiselect( + "Select columns to plot histograms", numeric_columns + ) # Create a density plot for each selected column grouped by 'current_stage_actual' for column in selected_columns: - st.subheader(f"{'Density' if use_density else 'Histogram'} Plot of {column} grouped by 'current_stage_actual'") + st.subheader( + f"{'Density' if use_density else 'Histogram'} Plot of {column} grouped by 'current_stage_actual'" + ) fig = go.Figure() - for stage in df['current_stage_actual'].cat.categories: - if stage not in df['current_stage_actual'].unique(): + for stage in df["current_stage_actual"].cat.categories: + if stage not in df["current_stage_actual"].unique(): continue - stage_data = df[df['current_stage_actual'] == stage][column] + stage_data = df[df["current_stage_actual"] == stage][column] y_vals, x_vals = np.histogram(stage_data, bins=20, density=use_density) - fig.add_trace(go.Scatter(x=x_vals[:-1], y=y_vals, mode='lines', name=stage)) - + fig.add_trace( + go.Scatter(x=x_vals[:-1], y=y_vals, mode="lines", name=stage) + ) + fig.update_layout( title=f"{'Density' if use_density else 'Histogram'} Plot of {column} by Current Stage", xaxis_title=column, - yaxis_title='Density' if use_density else 'Count' + yaxis_title="Density" if use_density else "Count", ) st.plotly_chart(fig) - - pass + pass # Update back to URL sync_session_state_to_URL() - + def do_pca(df, name): - df = df.dropna(axis=0, how='any') + df = df.dropna(axis=0, how="any") df = df[~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)] - - df_to_pca = df.drop(columns=['subject_id', 'session']) + + df_to_pca = df.drop(columns=["subject_id", "session"]) df_to_pca = df_to_pca.select_dtypes(include=[np.number, float, int]) - + # Standardize the features x = StandardScaler().fit_transform(df_to_pca) - + # Apply PCA pca = PCA(n_components=10) # Reduce to 2 dimensions for visualization principalComponents = pca.fit_transform(x) - + # Create a new DataFrame with the principal components principalDf = pd.DataFrame(data=principalComponents) - principalDf.index = df.set_index(['subject_id', 'session']).index - + principalDf.index = df.set_index(["subject_id", "session"]).index + principalDf.reset_index(inplace=True) - + # -- trajectory -- - st.markdown(f'### PCA on {name} metrics') + st.markdown(f"### PCA on {name} metrics") fig = go.Figure() - for mouse_id in principalDf['subject_id'].unique(): - subset = principalDf[principalDf['subject_id'] == mouse_id] - + for mouse_id in principalDf["subject_id"].unique(): + subset = principalDf[principalDf["subject_id"] == mouse_id] + # Add a 3D scatter plot for the current group - fig.add_trace(go.Scatter3d( - x=subset[0], - y=subset[1], - z=subset[2], - mode='lines+markers', - marker=dict(size=subset['session'].apply( - lambda x: 5 + 15*(x/20))), - name=f'{mouse_id}', # Name the trace for the legend - )) - - fig.update_layout(title=name, - scene=dict( - xaxis_title='Dim1', - yaxis_title='Dim2', - zaxis_title='Dim3' - ), - width=1300, - height=1000, - font_size=15, - ) + fig.add_trace( + go.Scatter3d( + x=subset[0], + y=subset[1], + z=subset[2], + mode="lines+markers", + marker=dict(size=subset["session"].apply(lambda x: 5 + 15 * (x / 20))), + name=f"{mouse_id}", # Name the trace for the legend + ) + ) + + fig.update_layout( + title=name, + scene=dict(xaxis_title="Dim1", yaxis_title="Dim2", zaxis_title="Dim3"), + width=1300, + height=1000, + font_size=15, + ) st.plotly_chart(fig) - + # -- variance explained -- var_explained = pca.explained_variance_ratio_ fig = go.Figure() - fig.add_trace(go.Scatter( - x=np.arange(1, len(var_explained)+1), - y=np.cumsum(var_explained), + fig.add_trace( + go.Scatter( + x=np.arange(1, len(var_explained) + 1), + y=np.cumsum(var_explained), ) - ) - fig.update_layout(title='Variance Explained', - yaxis=dict(range=[0, 1]), - width=300, - height=400, - font_size=15, - ) + ) + fig.update_layout( + title="Variance Explained", + yaxis=dict(range=[0, 1]), + width=300, + height=400, + font_size=15, + ) st.plotly_chart(fig) - + # -- pca components -- - pca_components = pd.DataFrame(pca.components_, - columns=df_to_pca.columns) + pca_components = pd.DataFrame(pca.components_, columns=df_to_pca.columns) pca_components fig = make_subplots(rows=3, cols=1) - + # In vertical subplots, each subplot show the components of a principal component for i in range(3): - fig.add_trace(go.Bar( - x=pca_components.columns, - y=pca_components.loc[i], - name=f'PC{i+1}', - ), row=i+1, col=1) - - fig.update_xaxes(showticklabels=i==2, row=i+1, col=1) - - fig.update_layout(title='PCA weights', - width=1000, - height=800, - font_size=20, - ) + fig.add_trace( + go.Bar( + x=pca_components.columns, + y=pca_components.loc[i], + name=f"PC{i+1}", + ), + row=i + 1, + col=1, + ) + + fig.update_xaxes(showticklabels=i == 2, row=i + 1, col=1) + + fig.update_layout( + title="PCA weights", + width=1000, + height=800, + font_size=20, + ) st.plotly_chart(fig) - - -if 'df' not in st.session_state or 'sessions_bonsai' not in st.session_state.df.keys(): + + +if "df" not in st.session_state or "sessions_bonsai" not in st.session_state.df.keys(): init(if_load_docDB=False) -app() \ No newline at end of file +app() From 0ab8e1781d5f0417e2f0798cf0fdad06323be1e0 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 20:46:45 +0000 Subject: [PATCH 05/35] feat: add stage color mapper --- code/pages/1_Learning trajectory.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 7f2bdcb..3706b49 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -4,6 +4,7 @@ import s3fs import streamlit as st import matplotlib.pyplot as plt +import matplotlib from plotly.subplots import make_subplots from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler @@ -132,18 +133,19 @@ def app(): "STAGE_2", "STAGE_3", "STAGE_4", - "STAGE_5", "STAGE_FINAL", "GRADUATED", ] + + stage_color_mapper = get_stage_color_mapper(stage_order) + + stage_color_mapper + df["current_stage_actual"] = pd.Categorical( df["current_stage_actual"], categories=stage_order, ordered=True ) df = df.sort_values("current_stage_actual") - # Start of Streamlit app - st.title("Histogram Visualization by Current Stage") - # Checkbox to use density or not use_density = st.checkbox("Use Density", value=False) @@ -162,7 +164,7 @@ def app(): for stage in df["current_stage_actual"].cat.categories: if stage not in df["current_stage_actual"].unique(): continue - stage_data = df[df["current_stage_actual"] == stage][column] + stage_data = df[df["current_stage_actual"] == stage][column].dropna() y_vals, x_vals = np.histogram(stage_data, bins=20, density=use_density) fig.add_trace( go.Scatter(x=x_vals[:-1], y=y_vals, mode="lines", name=stage) @@ -181,6 +183,17 @@ def app(): sync_session_state_to_URL() +def get_stage_color_mapper(stage_list): + # Mapping stages to colors from red to green, return rgb values + # Interpolate between red and green using the number of stages + cmap = plt.cm.get_cmap('RdYlGn', 100) + stage_color_mapper = { + stage: matplotlib.colors.rgb2hex( + cmap(i / (len(stage_list) - 1))) + for i, stage in enumerate(stage_list) + } + return stage_color_mapper + def do_pca(df, name): df = df.dropna(axis=0, how="any") df = df[~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)] From c22f0e12f39cd423425584f75444e6a9d5fb2f60 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 20:48:37 +0000 Subject: [PATCH 06/35] use matched color map --- code/pages/1_Learning trajectory.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 3706b49..03364c3 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -167,7 +167,13 @@ def app(): stage_data = df[df["current_stage_actual"] == stage][column].dropna() y_vals, x_vals = np.histogram(stage_data, bins=20, density=use_density) fig.add_trace( - go.Scatter(x=x_vals[:-1], y=y_vals, mode="lines", name=stage) + go.Scatter( + x=x_vals[:-1], + y=y_vals, + mode="lines", + line=dict(color=stage_color_mapper[stage]), + name=stage, + ) ) fig.update_layout( From a5d38f1f94b9eb026a9ba19fc3d86280bc3e6bb9 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:10:36 +0000 Subject: [PATCH 07/35] feat: add percentile --- code/pages/1_Learning trajectory.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 03364c3..dfbeec1 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -136,17 +136,15 @@ def app(): "STAGE_FINAL", "GRADUATED", ] - stage_color_mapper = get_stage_color_mapper(stage_order) - - stage_color_mapper - + df["current_stage_actual"] = pd.Categorical( df["current_stage_actual"], categories=stage_order, ordered=True ) df = df.sort_values("current_stage_actual") # Checkbox to use density or not + bins = st.slider("Number of bins", 10, 100, 20, 5) use_density = st.checkbox("Use Density", value=False) # Multiselect for choosing numeric columns @@ -165,25 +163,30 @@ def app(): if stage not in df["current_stage_actual"].unique(): continue stage_data = df[df["current_stage_actual"] == stage][column].dropna() - y_vals, x_vals = np.histogram(stage_data, bins=20, density=use_density) + count = len(stage_data) + y_vals, x_vals = np.histogram(stage_data, bins=bins, density=use_density) + percentiles = [np.percentile(stage_data, (np.sum(stage_data <= x) / len(stage_data)) * 100) for x in x_vals[:-1]] + customdata = np.array([percentiles]).T + fig.add_trace( go.Scatter( x=x_vals[:-1], y=y_vals, mode="lines", line=dict(color=stage_color_mapper[stage]), - name=stage, - ) + name=f"{stage} (n={count})", + customdata=customdata, + hovertemplate=f"Percentile: %{{customdata[0]:.2f}}%
" + ) ) - fig.update_layout( title=f"{'Density' if use_density else 'Histogram'} Plot of {column} by Current Stage", xaxis_title=column, yaxis_title="Density" if use_density else "Count", + hovermode="x unified", ) st.plotly_chart(fig) - pass # Update back to URL sync_session_state_to_URL() From 433c42117f6eec55c37b380726c4b04e0c597a06 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:22:55 +0000 Subject: [PATCH 08/35] fix: percentile --- code/pages/1_Learning trajectory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index dfbeec1..2cfcb01 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -165,12 +165,12 @@ def app(): stage_data = df[df["current_stage_actual"] == stage][column].dropna() count = len(stage_data) y_vals, x_vals = np.histogram(stage_data, bins=bins, density=use_density) - percentiles = [np.percentile(stage_data, (np.sum(stage_data <= x) / len(stage_data)) * 100) for x in x_vals[:-1]] + percentiles = [(np.sum(stage_data <= x) / len(stage_data)) * 100 for x in x_vals[1:]] customdata = np.array([percentiles]).T fig.add_trace( go.Scatter( - x=x_vals[:-1], + x=(x_vals[1:] + x_vals[:-1]) / 2, y=y_vals, mode="lines", line=dict(color=stage_color_mapper[stage]), From 1cde5d292b6758c661b3b69cfa2c98aa91733e9b Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:34:44 +0000 Subject: [PATCH 09/35] feat: add kernel smoothing --- code/pages/1_Learning trajectory.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 2cfcb01..763d485 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -11,6 +11,7 @@ from streamlit_plotly_events import plotly_events from util.aws_s3 import load_data from util.streamlit import add_session_filter, data_selector +from scipy.stats import gaussian_kde import extra_streamlit_components as stx @@ -146,6 +147,7 @@ def app(): # Checkbox to use density or not bins = st.slider("Number of bins", 10, 100, 20, 5) use_density = st.checkbox("Use Density", value=False) + use_kernel_smooth = st.checkbox("Use Kernel Smoothing", value=False) # Multiselect for choosing numeric columns numeric_columns = df.select_dtypes(include="number").columns @@ -159,18 +161,27 @@ def app(): f"{'Density' if use_density else 'Histogram'} Plot of {column} grouped by 'current_stage_actual'" ) fig = go.Figure() + + stage_data_all = df[column].dropna() + stage_data_all = stage_data_all[~stage_data_all.isin([np.inf, -np.inf])] + bin_edges = np.linspace(stage_data_all.min(), stage_data_all.max(), bins) + for stage in df["current_stage_actual"].cat.categories: if stage not in df["current_stage_actual"].unique(): continue stage_data = df[df["current_stage_actual"] == stage][column].dropna() count = len(stage_data) - y_vals, x_vals = np.histogram(stage_data, bins=bins, density=use_density) - percentiles = [(np.sum(stage_data <= x) / len(stage_data)) * 100 for x in x_vals[1:]] + if use_kernel_smooth: + kde = gaussian_kde(stage_data) + y_vals = kde(bin_edges) + else: + y_vals, _ = np.histogram(stage_data, bins=bin_edges, density=use_density) + percentiles = [(np.sum(stage_data <= x) / len(stage_data)) * 100 for x in bin_edges[1:]] customdata = np.array([percentiles]).T fig.add_trace( go.Scatter( - x=(x_vals[1:] + x_vals[:-1]) / 2, + x=(bin_edges[1:] + bin_edges[:-1]) / 2, y=y_vals, mode="lines", line=dict(color=stage_color_mapper[stage]), From e9d354592616fdaff35f62cd2229a9751ee41e75 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:39:58 +0000 Subject: [PATCH 10/35] feat: add kernel smoothing --- code/pages/1_Learning trajectory.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 763d485..ae986af 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -146,8 +146,11 @@ def app(): # Checkbox to use density or not bins = st.slider("Number of bins", 10, 100, 20, 5) - use_density = st.checkbox("Use Density", value=False) use_kernel_smooth = st.checkbox("Use Kernel Smoothing", value=False) + if not use_kernel_smooth: + use_density = st.checkbox("Use Density", value=False) + else: + use_density = None # Multiselect for choosing numeric columns numeric_columns = df.select_dtypes(include="number").columns @@ -157,9 +160,6 @@ def app(): # Create a density plot for each selected column grouped by 'current_stage_actual' for column in selected_columns: - st.subheader( - f"{'Density' if use_density else 'Histogram'} Plot of {column} grouped by 'current_stage_actual'" - ) fig = go.Figure() stage_data_all = df[column].dropna() @@ -191,9 +191,9 @@ def app(): ) ) fig.update_layout( - title=f"{'Density' if use_density else 'Histogram'} Plot of {column} by Current Stage", + title=f"{column}", xaxis_title=column, - yaxis_title="Density" if use_density else "Count", + yaxis_title="Kernel density" if use_kernel_smooth else "Density" if use_density else "Count", hovermode="x unified", ) st.plotly_chart(fig) From d14fe7385271b404bca09bb8b8903b0916e32668 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:41:18 +0000 Subject: [PATCH 11/35] minor --- code/pages/1_Learning trajectory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index ae986af..4fadd2d 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -112,7 +112,6 @@ def app(): placeholder = st.container() st.session_state.tab_id_learning_trajectory = chosen_id - st.markdown("---") if chosen_id == "tab_PCA": col_perf, col_task = _get_metadata_col() From bdcbe2ea2aedebced8b2058dfe4e51af350ccec6 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:42:24 +0000 Subject: [PATCH 12/35] refactor --- code/pages/1_Learning trajectory.py | 148 ++++++++++++++-------------- 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 4fadd2d..6644947 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -123,80 +123,7 @@ def app(): ss.df_session_filtered.loc[:, ["subject_id", "session"] + col_task], "task" ) elif chosen_id == "tab_stage": - - df = ss.df_session_filtered - - # Sort stages in the desired order - stage_order = [ - "STAGE_1_WARMUP", - "STAGE_1", - "STAGE_2", - "STAGE_3", - "STAGE_4", - "STAGE_FINAL", - "GRADUATED", - ] - stage_color_mapper = get_stage_color_mapper(stage_order) - - df["current_stage_actual"] = pd.Categorical( - df["current_stage_actual"], categories=stage_order, ordered=True - ) - df = df.sort_values("current_stage_actual") - - # Checkbox to use density or not - bins = st.slider("Number of bins", 10, 100, 20, 5) - use_kernel_smooth = st.checkbox("Use Kernel Smoothing", value=False) - if not use_kernel_smooth: - use_density = st.checkbox("Use Density", value=False) - else: - use_density = None - - # Multiselect for choosing numeric columns - numeric_columns = df.select_dtypes(include="number").columns - selected_columns = st.multiselect( - "Select columns to plot histograms", numeric_columns - ) - - # Create a density plot for each selected column grouped by 'current_stage_actual' - for column in selected_columns: - fig = go.Figure() - - stage_data_all = df[column].dropna() - stage_data_all = stage_data_all[~stage_data_all.isin([np.inf, -np.inf])] - bin_edges = np.linspace(stage_data_all.min(), stage_data_all.max(), bins) - - for stage in df["current_stage_actual"].cat.categories: - if stage not in df["current_stage_actual"].unique(): - continue - stage_data = df[df["current_stage_actual"] == stage][column].dropna() - count = len(stage_data) - if use_kernel_smooth: - kde = gaussian_kde(stage_data) - y_vals = kde(bin_edges) - else: - y_vals, _ = np.histogram(stage_data, bins=bin_edges, density=use_density) - percentiles = [(np.sum(stage_data <= x) / len(stage_data)) * 100 for x in bin_edges[1:]] - customdata = np.array([percentiles]).T - - fig.add_trace( - go.Scatter( - x=(bin_edges[1:] + bin_edges[:-1]) / 2, - y=y_vals, - mode="lines", - line=dict(color=stage_color_mapper[stage]), - name=f"{stage} (n={count})", - customdata=customdata, - hovertemplate=f"Percentile: %{{customdata[0]:.2f}}%
" - ) - ) - fig.update_layout( - title=f"{column}", - xaxis_title=column, - yaxis_title="Kernel density" if use_kernel_smooth else "Density" if use_density else "Count", - hovermode="x unified", - ) - st.plotly_chart(fig) - + metrics_grouped_by_stages(df=ss.df_session_filtered) # Update back to URL sync_session_state_to_URL() @@ -307,6 +234,79 @@ def do_pca(df, name): st.plotly_chart(fig) +def metrics_grouped_by_stages(df): + + # Sort stages in the desired order + stage_order = [ + "STAGE_1_WARMUP", + "STAGE_1", + "STAGE_2", + "STAGE_3", + "STAGE_4", + "STAGE_FINAL", + "GRADUATED", + ] + stage_color_mapper = get_stage_color_mapper(stage_order) + + df["current_stage_actual"] = pd.Categorical( + df["current_stage_actual"], categories=stage_order, ordered=True + ) + df = df.sort_values("current_stage_actual") + + # Checkbox to use density or not + bins = st.slider("Number of bins", 10, 100, 20, 5) + use_kernel_smooth = st.checkbox("Use Kernel Smoothing", value=False) + if not use_kernel_smooth: + use_density = st.checkbox("Use Density", value=False) + else: + use_density = None + + # Multiselect for choosing numeric columns + numeric_columns = df.select_dtypes(include="number").columns + selected_columns = st.multiselect( + "Select columns to plot histograms", numeric_columns + ) + + # Create a density plot for each selected column grouped by 'current_stage_actual' + for column in selected_columns: + fig = go.Figure() + + stage_data_all = df[column].dropna() + stage_data_all = stage_data_all[~stage_data_all.isin([np.inf, -np.inf])] + bin_edges = np.linspace(stage_data_all.min(), stage_data_all.max(), bins) + + for stage in df["current_stage_actual"].cat.categories: + if stage not in df["current_stage_actual"].unique(): + continue + stage_data = df[df["current_stage_actual"] == stage][column].dropna() + count = len(stage_data) + if use_kernel_smooth: + kde = gaussian_kde(stage_data) + y_vals = kde(bin_edges) + else: + y_vals, _ = np.histogram(stage_data, bins=bin_edges, density=use_density) + percentiles = [(np.sum(stage_data <= x) / len(stage_data)) * 100 for x in bin_edges[1:]] + customdata = np.array([percentiles]).T + + fig.add_trace( + go.Scatter( + x=(bin_edges[1:] + bin_edges[:-1]) / 2, + y=y_vals, + mode="lines", + line=dict(color=stage_color_mapper[stage]), + name=f"{stage} (n={count})", + customdata=customdata, + hovertemplate=f"Percentile: %{{customdata[0]:.2f}}%
" + ) + ) + fig.update_layout( + title=f"{column}", + xaxis_title=column, + yaxis_title="Kernel density" if use_kernel_smooth else "Density" if use_density else "Count", + hovermode="x unified", + ) + st.plotly_chart(fig) + if "df" not in st.session_state or "sessions_bonsai" not in st.session_state.df.keys(): init(if_load_docDB=False) From e44e0f37626e947d67f5ce69bad4f02eeecca485 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:48:13 +0000 Subject: [PATCH 13/35] minor --- code/pages/1_Learning trajectory.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 6644947..0154db6 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -254,12 +254,12 @@ def metrics_grouped_by_stages(df): df = df.sort_values("current_stage_actual") # Checkbox to use density or not - bins = st.slider("Number of bins", 10, 100, 20, 5) use_kernel_smooth = st.checkbox("Use Kernel Smoothing", value=False) - if not use_kernel_smooth: - use_density = st.checkbox("Use Density", value=False) + if use_kernel_smooth: + bins = 100 else: - use_density = None + use_density = st.checkbox("Use Density", value=False) + bins = st.slider("Number of bins", 10, 100, 20, 5) # Multiselect for choosing numeric columns numeric_columns = df.select_dtypes(include="number").columns From 58f03cdd6678b3237700b202764f11675f1db7cd Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:55:17 +0000 Subject: [PATCH 14/35] feat: use task and perf columns --- code/pages/1_Learning trajectory.py | 136 +++++++++++++++------------- 1 file changed, 75 insertions(+), 61 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 0154db6..7f14af9 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -45,6 +45,29 @@ except: pass +# Sort stages in the desired order +STAGE_ORDER = [ + "STAGE_1_WARMUP", + "STAGE_1", + "STAGE_2", + "STAGE_3", + "STAGE_4", + "STAGE_FINAL", + "GRADUATED", +] + +def get_stage_color_mapper(stage_list): + # Mapping stages to colors from red to green, return rgb values + # Interpolate between red and green using the number of stages + cmap = plt.cm.get_cmap('RdYlGn', 100) + stage_color_mapper = { + stage: matplotlib.colors.rgb2hex( + cmap(i / (len(stage_list) - 1))) + for i, stage in enumerate(stage_list) + } + return stage_color_mapper + +STAGE_COLOR_MAPPER = get_stage_color_mapper(STAGE_ORDER) @st.cache_data() def _get_metadata_col(): @@ -67,7 +90,10 @@ def _get_metadata_col(): "task", "notes", "laser", - ] + "commit", + "repo", + "branch", + ] # exclude some columns ) ] @@ -129,17 +155,6 @@ def app(): sync_session_state_to_URL() -def get_stage_color_mapper(stage_list): - # Mapping stages to colors from red to green, return rgb values - # Interpolate between red and green using the number of stages - cmap = plt.cm.get_cmap('RdYlGn', 100) - stage_color_mapper = { - stage: matplotlib.colors.rgb2hex( - cmap(i / (len(stage_list) - 1))) - for i, stage in enumerate(stage_list) - } - return stage_color_mapper - def do_pca(df, name): df = df.dropna(axis=0, how="any") df = df[~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)] @@ -235,77 +250,76 @@ def do_pca(df, name): def metrics_grouped_by_stages(df): - - # Sort stages in the desired order - stage_order = [ - "STAGE_1_WARMUP", - "STAGE_1", - "STAGE_2", - "STAGE_3", - "STAGE_4", - "STAGE_FINAL", - "GRADUATED", - ] - stage_color_mapper = get_stage_color_mapper(stage_order) + + col_perf, col_task = _get_metadata_col() df["current_stage_actual"] = pd.Categorical( - df["current_stage_actual"], categories=stage_order, ordered=True + df["current_stage_actual"], categories=STAGE_ORDER, ordered=True ) df = df.sort_values("current_stage_actual") # Checkbox to use density or not use_kernel_smooth = st.checkbox("Use Kernel Smoothing", value=False) if use_kernel_smooth: + use_density = False bins = 100 else: use_density = st.checkbox("Use Density", value=False) bins = st.slider("Number of bins", 10, 100, 20, 5) # Multiselect for choosing numeric columns - numeric_columns = df.select_dtypes(include="number").columns - selected_columns = st.multiselect( - "Select columns to plot histograms", numeric_columns + selected_perf_columns = st.multiselect( + "Performance metrics", col_perf + ) + selected_task_columns = st.multiselect( + "Task parameters", col_task ) + selected_columns = selected_perf_columns + selected_task_columns # Create a density plot for each selected column grouped by 'current_stage_actual' for column in selected_columns: - fig = go.Figure() + fig = _plot_histograms(df, column, bins, use_kernel_smooth, use_density) + st.plotly_chart(fig) - stage_data_all = df[column].dropna() - stage_data_all = stage_data_all[~stage_data_all.isin([np.inf, -np.inf])] - bin_edges = np.linspace(stage_data_all.min(), stage_data_all.max(), bins) +@st.cache_data() +def _plot_histograms(df, column, bins, use_kernel_smooth, use_density): + fig = go.Figure() + + stage_data_all = df[column].dropna() + stage_data_all = stage_data_all[~stage_data_all.isin([np.inf, -np.inf])] + bin_edges = np.linspace(stage_data_all.min(), stage_data_all.max(), bins) + + for stage in df["current_stage_actual"].cat.categories: + if stage not in df["current_stage_actual"].unique(): + continue + stage_data = df[df["current_stage_actual"] == stage][column].dropna() + count = len(stage_data) + if use_kernel_smooth: + kde = gaussian_kde(stage_data) + y_vals = kde(bin_edges) + else: + y_vals, _ = np.histogram(stage_data, bins=bin_edges, density=use_density) + percentiles = [(np.sum(stage_data <= x) / len(stage_data)) * 100 for x in bin_edges[1:]] + customdata = np.array([percentiles]).T - for stage in df["current_stage_actual"].cat.categories: - if stage not in df["current_stage_actual"].unique(): - continue - stage_data = df[df["current_stage_actual"] == stage][column].dropna() - count = len(stage_data) - if use_kernel_smooth: - kde = gaussian_kde(stage_data) - y_vals = kde(bin_edges) - else: - y_vals, _ = np.histogram(stage_data, bins=bin_edges, density=use_density) - percentiles = [(np.sum(stage_data <= x) / len(stage_data)) * 100 for x in bin_edges[1:]] - customdata = np.array([percentiles]).T - - fig.add_trace( - go.Scatter( - x=(bin_edges[1:] + bin_edges[:-1]) / 2, - y=y_vals, - mode="lines", - line=dict(color=stage_color_mapper[stage]), - name=f"{stage} (n={count})", - customdata=customdata, - hovertemplate=f"Percentile: %{{customdata[0]:.2f}}%
" - ) + fig.add_trace( + go.Scatter( + x=(bin_edges[1:] + bin_edges[:-1]) / 2, + y=y_vals, + mode="lines", + line=dict(color=STAGE_COLOR_MAPPER[stage]), + name=f"{stage} (n={count})", + customdata=customdata, + hovertemplate=f"Percentile: %{{customdata[0]:.2f}}%
" ) - fig.update_layout( - title=f"{column}", - xaxis_title=column, - yaxis_title="Kernel density" if use_kernel_smooth else "Density" if use_density else "Count", - hovermode="x unified", ) - st.plotly_chart(fig) + fig.update_layout( + title=f"{column}", + xaxis_title=column, + yaxis_title="Kernel density" if use_kernel_smooth else "Density" if use_density else "Count", + hovermode="x unified", + ) + return fig if "df" not in st.session_state or "sessions_bonsai" not in st.session_state.df.keys(): init(if_load_docDB=False) From 4f6442de538ba704c49d33d1f7d642b381a787a3 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 21:59:28 +0000 Subject: [PATCH 15/35] minor --- code/pages/1_Learning trajectory.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 7f14af9..4761b7d 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -12,6 +12,7 @@ from util.aws_s3 import load_data from util.streamlit import add_session_filter, data_selector from scipy.stats import gaussian_kde +import streamlit_nested_layout import extra_streamlit_components as stx @@ -149,6 +150,7 @@ def app(): ss.df_session_filtered.loc[:, ["subject_id", "session"] + col_task], "task" ) elif chosen_id == "tab_stage": + st.markdown("### Distributions of metrics and/or parameters grouped by training stages") metrics_grouped_by_stages(df=ss.df_session_filtered) # Update back to URL @@ -258,8 +260,18 @@ def metrics_grouped_by_stages(df): ) df = df.sort_values("current_stage_actual") + # Multiselect for choosing numeric columns + cols = st.columns([1, 1, 1]) + selected_perf_columns = cols[0].multiselect( + "Performance metrics to plot", col_perf + ) + selected_task_columns = cols[1].multiselect( + "Task parameters to plot", col_task + ) + selected_columns = selected_perf_columns + selected_task_columns + # Checkbox to use density or not - use_kernel_smooth = st.checkbox("Use Kernel Smoothing", value=False) + use_kernel_smooth = st.checkbox("Use Kernel Smoothing", value=True) if use_kernel_smooth: use_density = False bins = 100 @@ -267,14 +279,6 @@ def metrics_grouped_by_stages(df): use_density = st.checkbox("Use Density", value=False) bins = st.slider("Number of bins", 10, 100, 20, 5) - # Multiselect for choosing numeric columns - selected_perf_columns = st.multiselect( - "Performance metrics", col_perf - ) - selected_task_columns = st.multiselect( - "Task parameters", col_task - ) - selected_columns = selected_perf_columns + selected_task_columns # Create a density plot for each selected column grouped by 'current_stage_actual' for column in selected_columns: From a49be7cd17dabfc653ecb47a291e7a92324727d4 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 22:15:39 +0000 Subject: [PATCH 16/35] fix: remove negative ITI --- code/Home.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/code/Home.py b/code/Home.py index f821c40..18e1085 100644 --- a/code/Home.py +++ b/code/Home.py @@ -385,6 +385,8 @@ def _get_data_source(rig): _df.loc[_df['water_in_session_manual'] > 100, ['water_in_session_manual', 'water_in_session_total', 'water_after_session']] = np.nan + _df.loc[_df['duration_iti_median'] < 0, + ['duration_iti_median', 'duration_iti_mean', 'duration_iti_std', 'duration_iti_min', 'duration_iti_max']] = np.nan # # add something else # add abs(bais) to all terms that have 'bias' in name From 78e5caacc8c79faff66f2131946e7437a6252a65 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 23:24:34 +0000 Subject: [PATCH 17/35] feat: save to url --- code/pages/1_Learning trajectory.py | 31 ++++++++++++++++++----------- code/util/url_query_helper.py | 2 ++ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 4761b7d..82b942c 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -57,6 +57,7 @@ "GRADUATED", ] +@st.cache_data() def get_stage_color_mapper(stage_list): # Mapping stages to colors from red to green, return rgb values # Interpolate between red and green using the number of stages @@ -105,6 +106,7 @@ def _get_metadata_col(): ] return col_perf, col_task +COL_PERF, COL_TASK = _get_metadata_col() def app(): with st.sidebar: @@ -141,13 +143,12 @@ def app(): st.session_state.tab_id_learning_trajectory = chosen_id if chosen_id == "tab_PCA": - col_perf, col_task = _get_metadata_col() do_pca( - ss.df_session_filtered.loc[:, ["subject_id", "session"] + col_perf], + ss.df_session_filtered.loc[:, ["subject_id", "session"] + COL_PERF], "performance", ) do_pca( - ss.df_session_filtered.loc[:, ["subject_id", "session"] + col_task], "task" + ss.df_session_filtered.loc[:, ["subject_id", "session"] + COL_TASK], "task" ) elif chosen_id == "tab_stage": st.markdown("### Distributions of metrics and/or parameters grouped by training stages") @@ -252,9 +253,7 @@ def do_pca(df, name): def metrics_grouped_by_stages(df): - - col_perf, col_task = _get_metadata_col() - + df["current_stage_actual"] = pd.Categorical( df["current_stage_actual"], categories=STAGE_ORDER, ordered=True ) @@ -262,12 +261,21 @@ def metrics_grouped_by_stages(df): # Multiselect for choosing numeric columns cols = st.columns([1, 1, 1]) - selected_perf_columns = cols[0].multiselect( - "Performance metrics to plot", col_perf - ) - selected_task_columns = cols[1].multiselect( - "Task parameters to plot", col_task + + selected_perf_columns = multiselect_wrapper_for_url_query( + cols[0], + label= "Performance metrics to plot", + options=COL_PERF, + default=["finished_trials", "finished_rage", "foraging_eff_random_seed"], + key='stage_distribution_selected_perf_columns', ) + selected_task_columns = multiselect_wrapper_for_url_query( + cols[1], + label= "Task parameters to plot", + options=COL_TASK, + default=["effective_block_length_median", "duration_iti_median", "p_reward_contrast_mean"], + key='stage_distribution_selected_task_columns', + ) selected_columns = selected_perf_columns + selected_task_columns # Checkbox to use density or not @@ -279,7 +287,6 @@ def metrics_grouped_by_stages(df): use_density = st.checkbox("Use Density", value=False) bins = st.slider("Number of bins", 10, 100, 20, 5) - # Create a density plot for each selected column grouped by 'current_stage_actual' for column in selected_columns: fig = _plot_histograms(df, column, bins, use_kernel_smooth, use_density) diff --git a/code/util/url_query_helper.py b/code/util/url_query_helper.py index bf29a73..ad57b3d 100644 --- a/code/util/url_query_helper.py +++ b/code/util/url_query_helper.py @@ -60,6 +60,8 @@ 'auto_training_history_recent_weeks': 8, 'tab_id_learning_trajectory': 'tab_PCA', + 'stage_distribution_selected_perf_columns': ["finished_trials", "finished_rage", "foraging_eff_random_seed"], + 'stage_distribution_selected_perf_columns': ["effective_block_length_median", "duration_iti_median", "p_reward_contrast_mean"], } def checkbox_wrapper_for_url_query(st_prefix, label, key, default, **kwargs): From 4993a2932d48b2d4a2d7d8af76a15a418ebdeab8 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 23:27:27 +0000 Subject: [PATCH 18/35] fix --- code/pages/1_Learning trajectory.py | 6 +++--- code/util/url_query_helper.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 82b942c..a55b9db 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -263,17 +263,17 @@ def metrics_grouped_by_stages(df): cols = st.columns([1, 1, 1]) selected_perf_columns = multiselect_wrapper_for_url_query( - cols[0], + st, label= "Performance metrics to plot", options=COL_PERF, default=["finished_trials", "finished_rage", "foraging_eff_random_seed"], key='stage_distribution_selected_perf_columns', ) selected_task_columns = multiselect_wrapper_for_url_query( - cols[1], + st, label= "Task parameters to plot", options=COL_TASK, - default=["effective_block_length_median", "duration_iti_median", "p_reward_contrast_mean"], + default=["effective_block_length_median", "duration_iti_mean", "p_reward_contrast_mean"], key='stage_distribution_selected_task_columns', ) selected_columns = selected_perf_columns + selected_task_columns diff --git a/code/util/url_query_helper.py b/code/util/url_query_helper.py index ad57b3d..acfad72 100644 --- a/code/util/url_query_helper.py +++ b/code/util/url_query_helper.py @@ -61,7 +61,7 @@ 'tab_id_learning_trajectory': 'tab_PCA', 'stage_distribution_selected_perf_columns': ["finished_trials", "finished_rage", "foraging_eff_random_seed"], - 'stage_distribution_selected_perf_columns': ["effective_block_length_median", "duration_iti_median", "p_reward_contrast_mean"], + 'stage_distribution_selected_task_columns': ["effective_block_length_median", "duration_iti_mean", "p_reward_contrast_mean"], } def checkbox_wrapper_for_url_query(st_prefix, label, key, default, **kwargs): From 315fe7d270ab66c0250f030a8d2b885c81bb983d Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 23:30:14 +0000 Subject: [PATCH 19/35] improve title --- code/pages/1_Learning trajectory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index a55b9db..daee9f8 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -264,7 +264,7 @@ def metrics_grouped_by_stages(df): selected_perf_columns = multiselect_wrapper_for_url_query( st, - label= "Performance metrics to plot", + label= "Animal performance to plot", options=COL_PERF, default=["finished_trials", "finished_rage", "foraging_eff_random_seed"], key='stage_distribution_selected_perf_columns', @@ -325,7 +325,7 @@ def _plot_histograms(df, column, bins, use_kernel_smooth, use_density): ) ) fig.update_layout( - title=f"{column}", + title=f'{"Animal performance: " if column in COL_PERF else "Task parameters: "}{column}', xaxis_title=column, yaxis_title="Kernel density" if use_kernel_smooth else "Density" if use_density else "Count", hovermode="x unified", From 2ec2dbaa7b32860c48d5b0afd5b44142180b8642 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 23:34:11 +0000 Subject: [PATCH 20/35] minor --- code/pages/1_Learning trajectory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index daee9f8..2b369a8 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -284,8 +284,8 @@ def metrics_grouped_by_stages(df): use_density = False bins = 100 else: + bins = st.columns([1, 5])[0].slider("Number of bins", 10, 100, 20, 5) use_density = st.checkbox("Use Density", value=False) - bins = st.slider("Number of bins", 10, 100, 20, 5) # Create a density plot for each selected column grouped by 'current_stage_actual' for column in selected_columns: From b4eda9abdb4545c503042f13c30cba47b66a9d00 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Wed, 4 Dec 2024 23:48:57 +0000 Subject: [PATCH 21/35] further remove abnormal iti_mean --- code/Home.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/Home.py b/code/Home.py index 18e1085..52178c6 100644 --- a/code/Home.py +++ b/code/Home.py @@ -385,7 +385,7 @@ def _get_data_source(rig): _df.loc[_df['water_in_session_manual'] > 100, ['water_in_session_manual', 'water_in_session_total', 'water_after_session']] = np.nan - _df.loc[_df['duration_iti_median'] < 0, + _df.loc[(_df['duration_iti_median'] < 0) | (_df['duration_iti_mean'] < 0), ['duration_iti_median', 'duration_iti_mean', 'duration_iti_std', 'duration_iti_min', 'duration_iti_max']] = np.nan # # add something else From b3c7dcc5ab72f028e2a620f90c7a071b761e4ed4 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 06:53:48 +0000 Subject: [PATCH 22/35] use columns --- code/pages/1_Learning trajectory.py | 17 ++++++++++------- code/util/url_query_helper.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 2b369a8..53b79ba 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -259,14 +259,12 @@ def metrics_grouped_by_stages(df): ) df = df.sort_values("current_stage_actual") - # Multiselect for choosing numeric columns - cols = st.columns([1, 1, 1]) - + # Multiselect for choosing numeric columns selected_perf_columns = multiselect_wrapper_for_url_query( st, label= "Animal performance to plot", options=COL_PERF, - default=["finished_trials", "finished_rage", "foraging_eff_random_seed"], + default=["finished_trials", "finished_rate", "foraging_eff_random_seed"], key='stage_distribution_selected_perf_columns', ) selected_task_columns = multiselect_wrapper_for_url_query( @@ -286,11 +284,16 @@ def metrics_grouped_by_stages(df): else: bins = st.columns([1, 5])[0].slider("Number of bins", 10, 100, 20, 5) use_density = st.checkbox("Use Density", value=False) + + # Columns to plot + num_plot_cols = st.columns([1, 7])[0].slider("Number of plotting columns", 1, 5, 4) + cols = st.columns([1] * num_plot_cols) # Create a density plot for each selected column grouped by 'current_stage_actual' - for column in selected_columns: - fig = _plot_histograms(df, column, bins, use_kernel_smooth, use_density) - st.plotly_chart(fig) + for n, column in enumerate(selected_columns): + with cols[n % num_plot_cols]: + fig = _plot_histograms(df, column, bins, use_kernel_smooth, use_density) + st.plotly_chart(fig, use_container_width=True) @st.cache_data() def _plot_histograms(df, column, bins, use_kernel_smooth, use_density): diff --git a/code/util/url_query_helper.py b/code/util/url_query_helper.py index acfad72..df3b3df 100644 --- a/code/util/url_query_helper.py +++ b/code/util/url_query_helper.py @@ -60,7 +60,7 @@ 'auto_training_history_recent_weeks': 8, 'tab_id_learning_trajectory': 'tab_PCA', - 'stage_distribution_selected_perf_columns': ["finished_trials", "finished_rage", "foraging_eff_random_seed"], + 'stage_distribution_selected_perf_columns': ["finished_trials", "finished_rate", "foraging_eff_random_seed"], 'stage_distribution_selected_task_columns': ["effective_block_length_median", "duration_iti_mean", "p_reward_contrast_mean"], } From 15a9e7ca043381d6f28438d9432f09da6761e3b6 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 07:27:47 +0000 Subject: [PATCH 23/35] minor --- code/pages/1_Learning trajectory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 53b79ba..2480bd8 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -262,7 +262,7 @@ def metrics_grouped_by_stages(df): # Multiselect for choosing numeric columns selected_perf_columns = multiselect_wrapper_for_url_query( st, - label= "Animal performance to plot", + label= "Animal performance metrics to plot", options=COL_PERF, default=["finished_trials", "finished_rate", "foraging_eff_random_seed"], key='stage_distribution_selected_perf_columns', From 7e00d3c57cb50050ed925a8ffe91bb6e78996e2b Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 07:38:32 +0000 Subject: [PATCH 24/35] legend --- code/pages/1_Learning trajectory.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 2480bd8..10a2aeb 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -306,8 +306,11 @@ def _plot_histograms(df, column, bins, use_kernel_smooth, use_density): for stage in df["current_stage_actual"].cat.categories: if stage not in df["current_stage_actual"].unique(): continue - stage_data = df[df["current_stage_actual"] == stage][column].dropna() - count = len(stage_data) + stage_data = df[df["current_stage_actual"] == stage][[column, "subject_id"]].dropna() + n_sessions = len(stage_data) + n_mice = len(stage_data["subject_id"].unique()) + + stage_data = stage_data[column] if use_kernel_smooth: kde = gaussian_kde(stage_data) y_vals = kde(bin_edges) @@ -322,7 +325,7 @@ def _plot_histograms(df, column, bins, use_kernel_smooth, use_density): y=y_vals, mode="lines", line=dict(color=STAGE_COLOR_MAPPER[stage]), - name=f"{stage} (n={count})", + name=f"{stage}
({n_mice} mice, {n_sessions} sessions)", customdata=customdata, hovertemplate=f"Percentile: %{{customdata[0]:.2f}}%
" ) From 65c71a2b7cbd851122c0623c851b7249d6d0f2c3 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 07:52:54 +0000 Subject: [PATCH 25/35] feat: split according to curriculum --- code/pages/1_Learning trajectory.py | 41 +++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 10a2aeb..bc08101 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -253,13 +253,13 @@ def do_pca(df, name): def metrics_grouped_by_stages(df): - + df["current_stage_actual"] = pd.Categorical( df["current_stage_actual"], categories=STAGE_ORDER, ordered=True ) df = df.sort_values("current_stage_actual") - # Multiselect for choosing numeric columns + # Multiselect for choosing numeric columns selected_perf_columns = multiselect_wrapper_for_url_query( st, label= "Animal performance metrics to plot", @@ -284,17 +284,30 @@ def metrics_grouped_by_stages(df): else: bins = st.columns([1, 5])[0].slider("Number of bins", 10, 100, 20, 5) use_density = st.checkbox("Use Density", value=False) - - # Columns to plot + num_plot_cols = st.columns([1, 7])[0].slider("Number of plotting columns", 1, 5, 4) - cols = st.columns([1] * num_plot_cols) # Create a density plot for each selected column grouped by 'current_stage_actual' - for n, column in enumerate(selected_columns): - with cols[n % num_plot_cols]: - fig = _plot_histograms(df, column, bins, use_kernel_smooth, use_density) - st.plotly_chart(fig, use_container_width=True) - + unique_curriculum_name = df["curriculum_name"].unique() + for curriculum_name in [name for name in unique_curriculum_name if name != "None"]: + st.markdown(f"### Curriculum name: {curriculum_name}") + + # Columns to plot + cols = st.columns([1] * num_plot_cols) + for n, column in enumerate(selected_columns): + with cols[n % num_plot_cols]: + fig = _plot_histograms( + df[df["curriculum_name"] == curriculum_name], + column, + bins, + use_kernel_smooth, + use_density, + ) + st.plotly_chart(fig, use_container_width=True) + + st.markdown("---") + + @st.cache_data() def _plot_histograms(df, column, bins, use_kernel_smooth, use_density): fig = go.Figure() @@ -312,7 +325,13 @@ def _plot_histograms(df, column, bins, use_kernel_smooth, use_density): stage_data = stage_data[column] if use_kernel_smooth: - kde = gaussian_kde(stage_data) + if len(stage_data.unique()) == 1: + # Handle case with only one unique value + unique_value = stage_data.iloc[0] + # Create a small range around the unique value for KDE + kde = lambda x: np.exp(-((x - unique_value) ** 2) / (unique_value/100)) # Fallback + else: + kde = gaussian_kde(stage_data) y_vals = kde(bin_edges) else: y_vals, _ = np.histogram(stage_data, bins=bin_edges, density=use_density) From b91f672416ad6b74dce24e2b26f25cd39197e20d Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 08:00:00 +0000 Subject: [PATCH 26/35] feat: add more columns (hardcode) --- code/pages/1_Learning trajectory.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index bc08101..b649b0b 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -97,12 +97,23 @@ def _get_metadata_col(): "branch", ] # exclude some columns ) + ] + [ + 'avg_trial_length_in_seconds', ] col_perf = [ s for s in df.session_stats.columns if not any(ss in s for ss in ["performance"]) + ] + [ + #TODO: build column groups in Home.py. Now I'm hardcoding. + 'abs(bias_naive)', + 'abs(logistic_Su2022_bias)', + 'logistic_Su2022_RewC_tau', + 'logistic_Su2022_UnrC_amp', + 'logistic_Su2022_UnrC_tau', + 'logistic_Su2022_bias', + 'logistic_Su2022_score_mean', ] return col_perf, col_task @@ -286,6 +297,7 @@ def metrics_grouped_by_stages(df): use_density = st.checkbox("Use Density", value=False) num_plot_cols = st.columns([1, 7])[0].slider("Number of plotting columns", 1, 5, 4) + st.markdown("---") # Create a density plot for each selected column grouped by 'current_stage_actual' unique_curriculum_name = df["curriculum_name"].unique() From 942dca91794200c9ab81edbd2b7d509db1f7015d Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 08:11:54 +0000 Subject: [PATCH 27/35] improve title --- code/pages/1_Learning trajectory.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index b649b0b..ea89882 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -308,6 +308,10 @@ def metrics_grouped_by_stages(df): cols = st.columns([1] * num_plot_cols) for n, column in enumerate(selected_columns): with cols[n % num_plot_cols]: + st.markdown(f'''
Animal performance: {column}''' + if column in COL_PERF + else f"
Task parameter: {column}", + unsafe_allow_html=True) fig = _plot_histograms( df[df["curriculum_name"] == curriculum_name], column, @@ -362,7 +366,6 @@ def _plot_histograms(df, column, bins, use_kernel_smooth, use_density): ) ) fig.update_layout( - title=f'{"Animal performance: " if column in COL_PERF else "Task parameters: "}{column}', xaxis_title=column, yaxis_title="Kernel density" if use_kernel_smooth else "Density" if use_density else "Count", hovermode="x unified", From 1eec52e987cbdf48b9c28ab3dc70f2497a687489 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 08:15:56 +0000 Subject: [PATCH 28/35] minor --- code/pages/1_Learning trajectory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index ea89882..98b7892 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -300,7 +300,7 @@ def metrics_grouped_by_stages(df): st.markdown("---") # Create a density plot for each selected column grouped by 'current_stage_actual' - unique_curriculum_name = df["curriculum_name"].unique() + unique_curriculum_name = ['Uncoupled Without Baiting', 'Uncoupled Baiting', 'Coupled Baiting'] for curriculum_name in [name for name in unique_curriculum_name if name != "None"]: st.markdown(f"### Curriculum name: {curriculum_name}") @@ -308,7 +308,7 @@ def metrics_grouped_by_stages(df): cols = st.columns([1] * num_plot_cols) for n, column in enumerate(selected_columns): with cols[n % num_plot_cols]: - st.markdown(f'''
Animal performance: {column}''' + st.write(f'''
Animal performance: {column}''' if column in COL_PERF else f"
Task parameter: {column}", unsafe_allow_html=True) From 336131994510faec88fe204aad13cfcfef06652b Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 08:24:43 +0000 Subject: [PATCH 29/35] modify default list --- code/Home.py | 3 + code/util/url_query_helper.py | 135 +++++++++++++++++++--------------- 2 files changed, 77 insertions(+), 61 deletions(-) diff --git a/code/Home.py b/code/Home.py index 52178c6..14f2433 100644 --- a/code/Home.py +++ b/code/Home.py @@ -388,6 +388,9 @@ def _get_data_source(rig): _df.loc[(_df['duration_iti_median'] < 0) | (_df['duration_iti_mean'] < 0), ['duration_iti_median', 'duration_iti_mean', 'duration_iti_std', 'duration_iti_min', 'duration_iti_max']] = np.nan + _df.loc[_df['invalid_lick_ratio'] < 0, + ['invalid_lick_ratio']]= np.nan + # # add something else # add abs(bais) to all terms that have 'bias' in name for col in _df.columns: diff --git a/code/util/url_query_helper.py b/code/util/url_query_helper.py index df3b3df..1199a4a 100644 --- a/code/util/url_query_helper.py +++ b/code/util/url_query_helper.py @@ -9,60 +9,73 @@ # dict of "key": default pairs # Note: When creating the widget, add argument "value"/"index" as well as "key" for all widgets you want to sync with URL to_sync_with_url_query_default = { - 'if_load_bpod_sessions': False, + "if_load_bpod_sessions": False, + "to_filter_columns": [ + "subject_id", + "task", + "session", + "finished_trials", + "foraging_eff", + ], + "filter_subject_id": "", + "filter_session": [0.0, None], + "filter_finished_trials": [0.0, None], + "filter_foraging_eff": [0.0, None], + "filter_task": ["all"], + "table_height": 300, + "tab_id": "tab_auto_train_history", + "x_y_plot_xname": "session", + "x_y_plot_yname": "foraging_performance_random_seed", + "x_y_plot_group_by": "h2o", + "x_y_plot_if_show_dots": True, + "x_y_plot_if_aggr_each_group": True, + "x_y_plot_aggr_method_group": "lowess", + "x_y_plot_if_aggr_all": True, + "x_y_plot_aggr_method_all": "mean +/- sem", + "x_y_plot_smooth_factor": 5, + "x_y_plot_if_use_x_quantile_group": False, + "x_y_plot_q_quantiles_group": 20, + "x_y_plot_if_use_x_quantile_all": False, + "x_y_plot_q_quantiles_all": 20, + "x_y_plot_if_show_diagonal": False, + "x_y_plot_dot_size": 10, + "x_y_plot_dot_opacity": 0.3, + "x_y_plot_line_width": 2.0, + "x_y_plot_figure_width": 1300, + "x_y_plot_figure_height": 900, + "x_y_plot_font_size_scale": 1.0, + "x_y_plot_selected_color_map": "Plotly", + "x_y_plot_size_mapper": "finished_trials", + "x_y_plot_size_mapper_gamma": 1.0, + "x_y_plot_size_mapper_range": [3, 20], + "session_plot_mode": "sessions selected from table or plot", + "session_plot_selected_draw_types": list(draw_type_mapper_session_level.keys()), + "session_plot_number_cols": 3, + "auto_training_history_x_axis": "date", + "auto_training_history_sort_by": "first_date", + "auto_training_history_sort_order": "descending", + "auto_training_curriculum_name": "Uncoupled Baiting", + "auto_training_curriculum_version": "1.0", + "auto_training_curriculum_schema_version": "1.0", + "auto_training_history_recent_weeks": 8, - 'to_filter_columns': ['subject_id', 'task', 'session', 'finished_trials', 'foraging_eff'], - 'filter_subject_id': '', - 'filter_session': [0.0, None], - 'filter_finished_trials': [0.0, None], - 'filter_foraging_eff': [0.0, None], - 'filter_task': ['all'], - - 'table_height': 300, - - 'tab_id': 'tab_auto_train_history', - 'x_y_plot_xname': 'session', - 'x_y_plot_yname': 'foraging_performance_random_seed', - 'x_y_plot_group_by': 'h2o', - 'x_y_plot_if_show_dots': True, - 'x_y_plot_if_aggr_each_group': True, - 'x_y_plot_aggr_method_group': 'lowess', - 'x_y_plot_if_aggr_all': True, - 'x_y_plot_aggr_method_all': 'mean +/- sem', - 'x_y_plot_smooth_factor': 5, - 'x_y_plot_if_use_x_quantile_group': False, - 'x_y_plot_q_quantiles_group': 20, - 'x_y_plot_if_use_x_quantile_all': False, - 'x_y_plot_q_quantiles_all': 20, - 'x_y_plot_if_show_diagonal': False, - 'x_y_plot_dot_size': 10, - 'x_y_plot_dot_opacity': 0.3, - 'x_y_plot_line_width': 2.0, - 'x_y_plot_figure_width': 1300, - 'x_y_plot_figure_height': 900, - 'x_y_plot_font_size_scale': 1.0, - 'x_y_plot_selected_color_map': 'Plotly', - - 'x_y_plot_size_mapper': 'finished_trials', - 'x_y_plot_size_mapper_gamma': 1.0, - 'x_y_plot_size_mapper_range': [3, 20], - - 'session_plot_mode': 'sessions selected from table or plot', - 'session_plot_selected_draw_types': list(draw_type_mapper_session_level.keys()), - 'session_plot_number_cols': 3, - - 'auto_training_history_x_axis': 'date', - 'auto_training_history_sort_by': 'first_date', - 'auto_training_history_sort_order': 'descending', - 'auto_training_curriculum_name': 'Uncoupled Baiting', - 'auto_training_curriculum_version': '1.0', - 'auto_training_curriculum_schema_version': '1.0', - 'auto_training_history_recent_weeks': 8, - - 'tab_id_learning_trajectory': 'tab_PCA', - 'stage_distribution_selected_perf_columns': ["finished_trials", "finished_rate", "foraging_eff_random_seed"], - 'stage_distribution_selected_task_columns': ["effective_block_length_median", "duration_iti_mean", "p_reward_contrast_mean"], - } + "tab_id_learning_trajectory": "tab_PCA", + "stage_distribution_selected_perf_columns": [ + "finished_trials", + "finished_rate", + "foraging_eff_random_seed", + "abs(logistic_Su2022_bias)", + "early_lick_rate", + "invalid_lick_ratio", + "double_dipping_rate_finished_trials", + "lick_consistency_mean_finished_trials", + ], + "stage_distribution_selected_task_columns": [ + "effective_block_length_median", + "duration_iti_mean", + "p_reward_contrast_mean", + ], +} def checkbox_wrapper_for_url_query(st_prefix, label, key, default, **kwargs): return st_prefix.checkbox( @@ -90,7 +103,7 @@ def selectbox_wrapper_for_url_query(st_prefix, label, options, key, default, **k key=key, **kwargs, ) - + def multiselect_wrapper_for_url_query(st_prefix, label, options, key, default, **kwargs): return st_prefix.multiselect( label, @@ -128,8 +141,8 @@ def slider_wrapper_for_url_query(st_prefix, label, min_value, max_value, key, de key=key, **kwargs, ) - - + + def number_input_wrapper_for_url_query(st_prefix, label, min_value, max_value, key, default, **kwargs): return st_prefix.number_input( label=label, @@ -145,8 +158,8 @@ def number_input_wrapper_for_url_query(st_prefix, label, min_value, max_value, k key=key, **kwargs, ) - - + + def sync_URL_to_session_state(): """Assign session_state to sync with URL""" @@ -213,7 +226,7 @@ def sync_URL_to_session_state(): st.session_state[key] = default except: print(f'Failed to set {key} to {default}') - + def sync_session_state_to_URL(): # Add all 'filter_' fields to the default list @@ -235,8 +248,8 @@ def sync_session_state_to_URL(): st.query_params.update({key: st.session_state[key]}) except: print(f'Failed to update {key} to URL query') - - + + def get_filter_type(df, column): if is_numeric_dtype(df[column]): return 'slider_range_float' From 9d32c7d8672602a4403e4b014abf483e01402e4d Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 18:23:59 +0000 Subject: [PATCH 30/35] change default --- code/pages/1_Learning trajectory.py | 10 +++++----- code/util/url_query_helper.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index 98b7892..b4f3ee3 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -132,16 +132,16 @@ def app(): # === Main tabs === chosen_id = stx.tab_bar( data=[ - stx.TabBarItemData( - id="tab_PCA", - title="PCA", - description="PCA on performance and task parameters", - ), stx.TabBarItemData( id="tab_stage", title="Training stages", description="Compare across training stages", ), + stx.TabBarItemData( + id="tab_PCA", + title="PCA", + description="PCA on performance and task parameters", + ), ], default=( st.query_params["tab_id_learning_trajectory"] diff --git a/code/util/url_query_helper.py b/code/util/url_query_helper.py index 1199a4a..0ba47f2 100644 --- a/code/util/url_query_helper.py +++ b/code/util/url_query_helper.py @@ -59,7 +59,7 @@ "auto_training_curriculum_schema_version": "1.0", "auto_training_history_recent_weeks": 8, - "tab_id_learning_trajectory": "tab_PCA", + "tab_id_learning_trajectory": "tab_stage", "stage_distribution_selected_perf_columns": [ "finished_trials", "finished_rate", From 18f808388b6628c6d63cac55f7932b984e0011b1 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 18:43:26 +0000 Subject: [PATCH 31/35] feat: add rewC amp --- code/pages/1_Learning trajectory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index b4f3ee3..c0e5f4c 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -109,6 +109,7 @@ def _get_metadata_col(): #TODO: build column groups in Home.py. Now I'm hardcoding. 'abs(bias_naive)', 'abs(logistic_Su2022_bias)', + 'logistic_Su2022_RewC_amp', 'logistic_Su2022_RewC_tau', 'logistic_Su2022_UnrC_amp', 'logistic_Su2022_UnrC_tau', From 25c75a6979f4bafb1bf0637af12aa53704f9d756 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 18:57:43 +0000 Subject: [PATCH 32/35] add weight --- code/pages/1_Learning trajectory.py | 6 ++++-- code/util/url_query_helper.py | 7 ++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Learning trajectory.py index c0e5f4c..85baea1 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Learning trajectory.py @@ -23,6 +23,7 @@ slider_wrapper_for_url_query, sync_session_state_to_URL, sync_URL_to_session_state, + to_sync_with_url_query_default, ) @@ -99,6 +100,7 @@ def _get_metadata_col(): ) ] + [ 'avg_trial_length_in_seconds', + 'weight_after_ratio', ] col_perf = [ @@ -276,14 +278,14 @@ def metrics_grouped_by_stages(df): st, label= "Animal performance metrics to plot", options=COL_PERF, - default=["finished_trials", "finished_rate", "foraging_eff_random_seed"], + default=to_sync_with_url_query_default["stage_distribution_selected_perf_columns"], key='stage_distribution_selected_perf_columns', ) selected_task_columns = multiselect_wrapper_for_url_query( st, label= "Task parameters to plot", options=COL_TASK, - default=["effective_block_length_median", "duration_iti_mean", "p_reward_contrast_mean"], + default=to_sync_with_url_query_default["stage_distribution_selected_task_columns"], key='stage_distribution_selected_task_columns', ) selected_columns = selected_perf_columns + selected_task_columns diff --git a/code/util/url_query_helper.py b/code/util/url_query_helper.py index 0ba47f2..f9cc99f 100644 --- a/code/util/url_query_helper.py +++ b/code/util/url_query_helper.py @@ -65,15 +65,20 @@ "finished_rate", "foraging_eff_random_seed", "abs(logistic_Su2022_bias)", + 'logistic_Su2022_RewC_amp', + 'logistic_Su2022_RewC_tau', + 'logistic_Su2022_UnrC_amp', + 'logistic_Su2022_UnrC_tau', + 'logistic_Su2022_score_mean', "early_lick_rate", "invalid_lick_ratio", "double_dipping_rate_finished_trials", - "lick_consistency_mean_finished_trials", ], "stage_distribution_selected_task_columns": [ "effective_block_length_median", "duration_iti_mean", "p_reward_contrast_mean", + "weight_after_ratio", ], } From 73e75fe14db465b9c22b9d93f10361e7f6a7526e Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 19:10:19 +0000 Subject: [PATCH 33/35] rename page --- .../{1_Learning trajectory.py => 1_Basic behavior analysis.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename code/pages/{1_Learning trajectory.py => 1_Basic behavior analysis.py} (99%) diff --git a/code/pages/1_Learning trajectory.py b/code/pages/1_Basic behavior analysis.py similarity index 99% rename from code/pages/1_Learning trajectory.py rename to code/pages/1_Basic behavior analysis.py index 85baea1..2cc5495 100644 --- a/code/pages/1_Learning trajectory.py +++ b/code/pages/1_Basic behavior analysis.py @@ -142,7 +142,7 @@ def app(): ), stx.TabBarItemData( id="tab_PCA", - title="PCA", + title="Learning trajectory", description="PCA on performance and task parameters", ), ], From b6a6a69eb67a76ee786d50ec96c44f9411939c5a Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 19:20:19 +0000 Subject: [PATCH 34/35] bump version --- code/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/__init__.py b/code/__init__.py index 0f9ce39..005fd5f 100644 --- a/code/__init__.py +++ b/code/__init__.py @@ -1 +1 @@ -__ver__ = 'v2.5.5' +__ver__ = 'v2.5.6' From 784f06beb6c1b67bff10b6978d1544cc0c9e2f16 Mon Sep 17 00:00:00 2001 From: "houhan@gmail.com" Date: Thu, 5 Dec 2024 19:23:02 +0000 Subject: [PATCH 35/35] minor tweak of sidebar --- code/pages/1_Basic behavior analysis.py | 3 ++- code/util/streamlit.py | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/code/pages/1_Basic behavior analysis.py b/code/pages/1_Basic behavior analysis.py index 2cc5495..9c3b752 100644 --- a/code/pages/1_Basic behavior analysis.py +++ b/code/pages/1_Basic behavior analysis.py @@ -10,7 +10,7 @@ from sklearn.preprocessing import StandardScaler from streamlit_plotly_events import plotly_events from util.aws_s3 import load_data -from util.streamlit import add_session_filter, data_selector +from util.streamlit import add_session_filter, data_selector, add_footnote from scipy.stats import gaussian_kde import streamlit_nested_layout @@ -126,6 +126,7 @@ def app(): with st.sidebar: add_session_filter(if_bonsai=True) data_selector() + add_footnote() if not hasattr(ss, "df"): st.write("##### Data not loaded yet, start from Home:") diff --git a/code/util/streamlit.py b/code/util/streamlit.py index 0849ff3..5ef2b06 100644 --- a/code/util/streamlit.py +++ b/code/util/streamlit.py @@ -691,13 +691,14 @@ def add_dot_property_mapper(): def data_selector(): - with st.expander(f'Session selector', expanded=True): - # --- add a download button --- - _add_download_filtered_session() - + with st.expander(f'Session selector', expanded=True): with st.expander(f"Filtered: {len(st.session_state.df_session_filtered)} sessions, " f"{len(st.session_state.df_session_filtered.h2o.unique())} mice", expanded=False): st.dataframe(st.session_state.df_session_filtered) + + # --- add a download button --- + with st.columns([1, 10])[1]: + _add_download_filtered_session() # cols = st.columns([4, 1]) # with cols[0].expander(f"From dataframe: {len(st.session_state.df_selected_from_dataframe)} sessions", expanded=False):