From 5df2850ab8adb8f77bdb7d9b2110c03650976873 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Thu, 15 Aug 2024 20:18:18 +0000
Subject: [PATCH 01/10] feat: catch up with David's code

---
 code/util/fetch_data_docDB.py | 46 +++++++++++++++++++++++++++++------
 environment/Dockerfile       |  3 ++-
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py
index d456db5..1a3c487 100644
--- a/code/util/fetch_data_docDB.py
+++ b/code/util/fetch_data_docDB.py
@@ -1,22 +1,47 @@
+"""Code to fetch data from docDB by David Feng
+"""
+
 import pandas as pd
 import logging
 import time
-
+import semver
 logger = logging.getLogger(__name__)
 
+def find_probes(r):
+    version = semver.Version.parse((r.get('procedures') or {}).get('schema_version', '0.0.0'))
+
+    probes = []
+    if version >= "0.8.1": # post introduction of Surgery concept
+        sub_procs = (r.get('procedures') or {}).get('subject_procedures') or {}
+        for sub_proc in sub_procs:
+            if sub_proc.get('procedure_type') == 'Surgery':
+                for sp in sub_proc['procedures']:
+                    if sp['procedure_type'] == 'Fiber implant':
+                        probes += sp['probes']
+    else: # pre Surgery
+        sub_procs = (r.get('procedures') or {}).get('subject_procedures') or {}
+        for sp in sub_procs:
+            if sp['procedure_type'] == 'Fiber implant':
+                probes += sp['probes']
+
+    return probes
+
+
 def fetch_fip_data(client):
     # search for records that have the "fib" (for fiber photometry) modality in data_description
     logger.warning("fetching 'fib' records...")
-    modality_results = client.retrieve_docdb_records(filter_query={
-        "data_description.modality.abbreviation": "fib"
-    })
+    modality_results = client.retrieve_docdb_records(
+        filter_query={"data_description.modality.abbreviation": "fib"},
+        paginate_batch_size=500
+    )
 
     # there are more from the past that didn't specify modality correctly.
     # until this is fixed, need to guess by asset name
     logger.warning("fetching FIP records by name...")
-    name_results = client.retrieve_docdb_records(filter_query={
-        "name": {"$regex": "^FIP.*"}
-    })
+    name_results = client.retrieve_docdb_records(
+        filter_query={"name": {"$regex": "^FIP.*"}},
+        paginate_batch_size=500
+    )
 
     # make some dataframes from these two queries
     records_by_modality_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in modality_results ])
     records_by_name_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in name_results ])
@@ -29,7 +54,7 @@ def fetch_fip_data(client):
     records_by_name_df = records_by_name_df.drop(dup_df.index.values)
 
     # now we have a master data frame
-    combined_df = pd.concat([records_by_modality_df, records_by_name_df], axis=0).drop_duplicates()
+    combined_df = pd.concat([records_by_modality_df, records_by_name_df], axis=0)#.drop_duplicates()
 
     # let's get processed results too
     logger.warning("fetching processed results...")
@@ -54,6 +79,8 @@ def map_record_to_dict(record):
     subject = record.get('subject', {}) or {}
     subject_id = subject.get('subject_id') or ''
     subject_genotype = subject.get('genotype') or ''
+    session = record.get('session') or {}
+    task_type = session.get('session_type') or ''
 
     return {
         'location': record['location'],
@@ -61,6 +88,9 @@
         'creation_time': creation_time,
         'subject_id': subject_id,
         'subject_genotype': subject_genotype,
+        'probes': str(find_probes(record)),
+        'task_type': task_type
+
     }

diff --git a/environment/Dockerfile b/environment/Dockerfile
index 1443947..2158d94 100644
--- a/environment/Dockerfile
+++ b/environment/Dockerfile
@@ -107,7 +107,8 @@ RUN pip install -U --no-cache-dir \
     git+https://github.com/AllenNeuralDynamics/aind-foraging-behavior-bonsai-automatic-training.git@main \
     pygwalker==0.4.7 \
     aind-data-access-api[docdb]==0.13.0 \
-    streamlit-dynamic-filters==0.1.9
+    streamlit-dynamic-filters==0.1.9 \
+    semver==3.0.2
 
 ADD "https://github.com/coder/code-server/releases/download/v4.21.1/code-server-4.21.1-linux-amd64.tar.gz" /.code-server/code-server.tar.gz

From ce5d5d94594e47a01c9c0aafe66d61f6c848536e Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Thu, 15 Aug 2024 20:19:07 +0000
Subject: [PATCH 02/10] feat: add cache to docDB

---
 code/pages/3_AIND data access playground.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/code/pages/3_AIND data access playground.py b/code/pages/3_AIND data access playground.py
index b9dfcae..60e57ac 100644
--- a/code/pages/3_AIND data access playground.py
+++ b/code/pages/3_AIND data access playground.py
@@ -2,12 +2,12 @@
 '''
 
 import logging
-from aind_data_access_api.document_db import MetadataDbClient
-from util.fetch_data_docDB import fetch_fip_data
-
 import streamlit as st
 from streamlit_dynamic_filters import DynamicFilters
 
+from aind_data_access_api.document_db import MetadataDbClient
+from util.fetch_data_docDB import fetch_fip_data
+
 try:
     st.set_page_config(layout="wide", 
                        page_title='Foraging behavior browser',
@@ -20,8 +20,9 @@
 except:
     pass
 
-@st.cache_data
-def load_data():
+@st.cache_data(ttl=3600*12) # Cache the df_docDB up to 12 hours
+def load_data_from_docDB():
+    client = load_client()
     df = fetch_fip_data(client)
     return df
 
@@ -33,9 +34,10 @@ def load_client():
         collection="data_assets"
     )
 
-client = load_client()
-df = load_data()
+df = load_data_from_docDB()
 
-dynamic_filters = DynamicFilters(df=df, filters=['subject_id', 'subject_genotype'])
+dynamic_filters = DynamicFilters(
+    df=df,
+    filters=['subject_id', 'subject_genotype'])
 dynamic_filters.display_filters()
 dynamic_filters.display_df()

From f9409525ec8f765aec447581179609e1ea2a0ac4 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Thu, 15 Aug 2024 20:19:25 +0000
Subject: [PATCH 03/10] build: lock semver version

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 5ca0c0d..51eb58f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -96,4 +96,5 @@ zipp==3.10.0
 git+https://github.com/AllenNeuralDynamics/aind-foraging-behavior-bonsai-automatic-training.git@main
 pygwalker==0.4.7
 aind-data-access-api[docdb]==0.13.0
-streamlit-dynamic-filters==0.1.9
\ No newline at end of file
+streamlit-dynamic-filters==0.1.9
+semver==3.0.2
\ No newline at end of file

From 49d52428896a2a0d6ab135ca2bfda310e7064d14 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Thu, 15 Aug 2024 20:25:15 +0000
Subject: [PATCH 04/10] WIP: add backbone for Rachel

---
 code/Home.py                                | 17 +++++++++++++++--
 code/pages/3_AIND data access playground.py | 17 +----------------
 code/util/fetch_data_docDB.py               | 18 ++++++++++++++++++
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/code/Home.py b/code/Home.py
index 4fc3b6f..3251aa0 100644
--- a/code/Home.py
+++ b/code/Home.py
@@ -42,6 +42,8 @@
     multiselect_wrapper_for_url_query,
     number_input_wrapper_for_url_query,
 )
+from util.fetch_data_docDB import load_data_from_docDB
+
 from aind_auto_train.curriculum_manager import CurriculumManager
 from aind_auto_train.auto_train_manager import DynamicForagingAutoTrainManager
 from aind_auto_train import __version__ as auto_train_version
@@ -309,7 +311,7 @@ def init():
         # For historical reasons, the suffix of df['sessions_bonsai'] just means the data of the Home.py page
         df['sessions_bonsai'] = pd.concat([df['sessions_bonsai'], df_bpod['sessions_bonsai']], axis=0)
-    
+
     st.session_state.df = df
     st.session_state.df_selected_from_plotly = pd.DataFrame(columns=['h2o', 'session'])
     st.session_state.df_selected_from_dataframe = pd.DataFrame(columns=['h2o', 'session'])
@@ -450,9 +452,12 @@ def _get_data_source(rig):
                          'task', 'notes']
     new_order = first_several_cols + [col for col in _df.columns if col not in first_several_cols]
     _df = _df[new_order]
+
+
+    # --- Load data from docDB ---
+    merge_in_df_docDB(_df)
 
     st.session_state.df['sessions_bonsai'] = _df  # Somehow _df loses the reference to the original dataframe
-
     st.session_state.session_stats_names = [keys for keys in _df.keys()]
 
     # Set session state from URL
@@ -463,6 +468,14 @@ def _get_data_source(rig):
 
     return True
 
+def merge_in_df_docDB(_df):
+    # Fetch df_docDB
+
+    # Parse session and subject_id from session_name
+
+    # Merge with _df
+
+    return _df
 
 def app():

diff --git a/code/pages/3_AIND data access playground.py b/code/pages/3_AIND data access playground.py
index 60e57ac..ea36fd9 100644
--- a/code/pages/3_AIND data access playground.py
+++ b/code/pages/3_AIND data access playground.py
@@ -5,8 +5,7 @@
 import streamlit as st
 from streamlit_dynamic_filters import DynamicFilters
 
-from aind_data_access_api.document_db import MetadataDbClient
-from util.fetch_data_docDB import fetch_fip_data
+from util.fetch_data_docDB import load_data_from_docDB
 
 try:
     st.set_page_config(layout="wide", 
                        page_title='Foraging behavior browser',
@@ -20,8 +19,6 @@
 except:
     pass
 
-@st.cache_data(ttl=3600*12) # Cache the df_docDB up to 12 hours
-def load_data_from_docDB():
-    client = load_client()
-    df = fetch_fip_data(client)
-    return df
-
-@st.cache_resource
-def load_client():
-    return MetadataDbClient(
-        host="api.allenneuraldynamics.org",
-        database="metadata_index",
-        collection="data_assets"
-    )
-
 df = load_data_from_docDB()

diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py
index 1a3c487..4cdd7af 100644
--- a/code/util/fetch_data_docDB.py
+++ b/code/util/fetch_data_docDB.py
@@ -5,8 +5,26 @@
 import logging
 import time
 import semver
+import streamlit as st
 
 logger = logging.getLogger(__name__)
 
+from aind_data_access_api.document_db import MetadataDbClient
+
+@st.cache_data(ttl=3600*12) # Cache the df_docDB up to 12 hours
+def load_data_from_docDB():
+    client = load_client()
+    df = fetch_fip_data(client)
+    return df
+
+@st.cache_resource
+def load_client():
+    return MetadataDbClient(
+        host="api.allenneuraldynamics.org",
+        database="metadata_index",
+        collection="data_assets"
+    )
+
+
 def find_probes(r):
     version = semver.Version.parse((r.get('procedures') or {}).get('schema_version', '0.0.0'))

From 36c4148075f3cf25eea8beb91ba80b51cde6afd6 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Thu, 15 Aug 2024 20:43:05 +0000
Subject: [PATCH 05/10] build: fix semver

---
 environment/Dockerfile | 1 -
 requirements.txt       | 1 -
 2 files changed, 2 deletions(-)

diff --git a/environment/Dockerfile b/environment/Dockerfile
index 2158d94..4834dd3 100644
--- a/environment/Dockerfile
+++ b/environment/Dockerfile
@@ -79,7 +79,6 @@ RUN pip install -U --no-cache-dir \
     scipy==1.10.0 \
     scikit-learn==1.3.2 \
     seaborn==0.11.2 \
-    semver==2.13.0 \
     six==1.16.0 \
     smmap==5.0.0 \
     statannotations==0.5.0 \

diff --git a/requirements.txt b/requirements.txt
index 51eb58f..2c37526 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -68,7 +68,6 @@ s3fs==2022.11.0
 scipy==1.10.0
 scikit-learn==1.3.2
 seaborn==0.11.2
-semver==2.13.0
 six==1.16.0
 smmap==5.0.0
 statannotations==0.5.0

From 3b880de3c9f195dd85862aa4f40768aa0f87f606 Mon Sep 17 00:00:00 2001
From: rachelstephlee
Date: Wed, 21 Aug 2024 22:37:27 +0000
Subject: [PATCH 06/10] merged mongoDB dataframe with Han's

---
 code/Home.py                  | 17 ++++++++++++-----
 code/util/fetch_data_docDB.py | 37 +++++++++++++++++++++----------------
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/code/Home.py b/code/Home.py
index ef1d9d4..bb1c9f9 100644
--- a/code/Home.py
+++ b/code/Home.py
@@ -455,7 +455,7 @@ def _get_data_source(rig):
 
     # --- Load data from docDB ---
-    merge_in_df_docDB(_df)
+    _df = merge_in_df_docDB(_df)
 
     st.session_state.df['sessions_bonsai'] = _df  # Somehow _df loses the reference to the original dataframe
     st.session_state.session_stats_names = [keys for keys in _df.keys()]
@@ -470,12 +470,19 @@ def merge_in_df_docDB(_df):
     # Fetch df_docDB
-
+    df = load_data_from_docDB()
+
     # Parse session and subject_id from session_name
+    df['session_date'] = pd.to_datetime(df['session_name'].str.split('_').str[2])
+    # Extract the session_time. Remove the '-' and the leading zeros.
+    df['session_time'] = df['session_name'].str.split('_').str[-1]
+    df['nwb_suffix'] = df['session_time'].str.replace('-', '').str.lstrip('0').astype('int64')
 
-    # Merge with _df
-
-    return _df
+    # Merge with _df. Left merge to keep everything on Han's side
+
+    left_merged = pd.merge(_df, df, how='left', on=['subject_id', 'session_date', 'nwb_suffix'])
+
+    return left_merged
 
 def app():

diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py
index 4cdd7af..0136684 100644
--- a/code/util/fetch_data_docDB.py
+++ b/code/util/fetch_data_docDB.py
@@ -52,6 +52,7 @@ def fetch_fip_data(client):
         filter_query={"data_description.modality.abbreviation": "fib"},
         paginate_batch_size=500
     )
+    logger.warning(f"found {len(modality_results)} results")
 
     # there are more from the past that didn't specify modality correctly.
     # until this is fixed, need to guess by asset name
@@ -60,19 +61,24 @@ def fetch_fip_data(client):
         filter_query={"name": {"$regex": "^FIP.*"}},
         paginate_batch_size=500
     )
-
-    # make some dataframes from these two queries
-    records_by_modality_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in modality_results ])
-    records_by_name_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in name_results ])
+    logger.warning(f"found {len(name_results)} results")
 
+    # in case there is overlap between these two queries, filter down to a single list with unique IDs
+    unique_results_by_id = { r['_id']: r for r in modality_results } | { r['_id']: r for r in name_results }
+    results = list(unique_results_by_id.values())
+    logger.warning(f"found {len(results)} unique results")
+
+    # filter out results with 'processed' in the name because I can't rely on data_description.data_level :(
+    results = [ r for r in results if not 'processed' in r['name'] ]
+
+    # make a dataframe
+    records_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in results ])
+
     # currently there are some sessions uploaded twice in two different locations.
     # let's filter out the ones in aind-ophys-data, a deprecated location
-    dup_df = records_by_name_df[records_by_name_df.duplicated('session_name',keep=False)]
+    dup_df = records_df[records_df.duplicated('session_name',keep=False)]
     dup_df = dup_df[dup_df.location.str.contains("aind-ophys-data")]
-    records_by_name_df = records_by_name_df.drop(dup_df.index.values)
-
-    # now we have a master data frame
-    combined_df = pd.concat([records_by_modality_df, records_by_name_df], axis=0)#.drop_duplicates()
+    records_df = records_df.drop(dup_df.index.values)
 
     # let's get processed results too
     logger.warning("fetching processed results...")
@@ -84,10 +90,11 @@ def fetch_fip_data(client):
     processed_results_by_name = { r['name']: r for r in processed_results }
 
     # adding two columns to our master dataframe - result name and result s3 location
-    combined_df['results'] = combined_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('name'))
-    combined_df['results_location'] = combined_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('location'))
+    records_df['results'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('name'))
+    records_df['results_location'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('location'))
 
-    return combined_df
+    return records_df
+
 
 def map_record_to_dict(record):
     subject = record.get('subject', {}) or {}
     subject_id = subject.get('subject_id') or ''
     subject_genotype = subject.get('genotype') or ''
-    session = record.get('session') or {}
-    task_type = session.get('session_type') or ''
 
     return {
         'location': record['location'],
@@ -106,8 +112,7 @@ def map_record_to_dict(record):
         'creation_time': creation_time,
         'subject_id': subject_id,
         'subject_genotype': subject_genotype,
-        'probes': str(find_probes(record)),
-        'task_type': task_type
+        'probes': str(find_probes(record))
 
     }

From 1851470d3c53ffc936aaee2a2218bf4ee11c966e Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Wed, 21 Aug 2024 23:03:27 +0000
Subject: [PATCH 07/10] fix: always turn group_by column in xy_plot to str

---
 code/util/streamlit.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/code/util/streamlit.py b/code/util/streamlit.py
index 97afa29..7988287 100644
--- a/code/util/streamlit.py
+++ b/code/util/streamlit.py
@@ -1004,9 +1004,8 @@ def _add_agg(df_this, x_name, y_name, group, aggr_method, if_use_x_quantile, q_q
     else:
         df['dot_size'] = dot_size_base
 
-    # Turn column of group_by to string if it's not
-    if not is_string_dtype(df[group_by]):
-        df[group_by] = df[group_by].astype(str)
+    # Always turn group_by column to str
+    df[group_by] = df[group_by].astype(str)
 
     # Add a diagonal line first
     if if_show_diagonal:

From 7d548eeedb34e223a8aa15e38cb353e0f65e6b9c Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Wed, 21 Aug 2024 23:08:43 +0000
Subject: [PATCH 08/10] build: update config.toml

---
 .streamlit/config.toml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.streamlit/config.toml b/.streamlit/config.toml
index c9b8c45..3b0d455 100644
--- a/.streamlit/config.toml
+++ b/.streamlit/config.toml
@@ -1,2 +1,8 @@
 [theme]
-base="dark"
\ No newline at end of file
+base="dark"
+
+[server]
+fileWatcherType = "poll"
+
+[global]
+disableWidgetStateDuplicationWarning = true
\ No newline at end of file

From 9bbe44eaf8e3517c3b4bc17a6cd5fb52f37ae46a Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Wed, 21 Aug 2024 23:08:50 +0000
Subject: [PATCH 09/10] ci: bump version

---
 code/Home.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/Home.py b/code/Home.py
index bb1c9f9..cd7bbc2 100644
--- a/code/Home.py
+++ b/code/Home.py
@@ -12,7 +12,7 @@
 
 """
 
-__ver__ = 'v2.4.1'
+__ver__ = 'v2.5.0'
 
 import pandas as pd
 import streamlit as st

From 4bf1feef33088d942d09451de2bd3c6a3c366741 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Wed, 21 Aug 2024 23:12:51 +0000
Subject: [PATCH 10/10] add notes

---
 code/pages/3_AIND data access playground.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/code/pages/3_AIND data access playground.py b/code/pages/3_AIND data access playground.py
index ea36fd9..1ea1089 100644
--- a/code/pages/3_AIND data access playground.py
+++ b/code/pages/3_AIND data access playground.py
@@ -21,6 +21,8 @@
 
 df = load_data_from_docDB()
 
+st.markdown(f'### Note: the dataframe shown here has been merged into the master table on the Home page!')
+
 dynamic_filters = DynamicFilters(
     df=df,
     filters=['subject_id', 'subject_genotype'])
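
Usage sketch for the docDB helpers introduced in this series (an illustrative addition, not part of the patches): it assumes aind-data-access-api[docdb]==0.13.0 and streamlit are installed, and that the script is run from the code/ directory so that util/ is importable.

    from aind_data_access_api.document_db import MetadataDbClient
    from util.fetch_data_docDB import fetch_fip_data

    # Same endpoint and collection as load_client() in code/util/fetch_data_docDB.py
    client = MetadataDbClient(
        host="api.allenneuraldynamics.org",
        database="metadata_index",
        collection="data_assets",
    )

    # One row per unique, non-'processed' FIP asset (PATCH 06 behavior),
    # with 'probes', 'results', and 'results_location' columns attached
    df = fetch_fip_data(client)
    print(df[['session_name', 'subject_id', 'subject_genotype', 'probes']].head())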