diff --git a/.streamlit/config.toml b/.streamlit/config.toml index c9b8c45..3b0d455 100644 --- a/.streamlit/config.toml +++ b/.streamlit/config.toml @@ -1,2 +1,8 @@ [theme] -base="dark" \ No newline at end of file +base="dark" + +[server] +fileWatcherType = "poll" + +[global] +disableWidgetStateDuplicationWarning = true \ No newline at end of file diff --git a/code/Home.py b/code/Home.py index 20d8513..cd7bbc2 100644 --- a/code/Home.py +++ b/code/Home.py @@ -12,7 +12,7 @@ """ -__ver__ = 'v2.4.1' +__ver__ = 'v2.5.0' import pandas as pd import streamlit as st @@ -42,6 +42,8 @@ multiselect_wrapper_for_url_query, number_input_wrapper_for_url_query, ) +from util.fetch_data_docDB import load_data_from_docDB + from aind_auto_train.curriculum_manager import CurriculumManager from aind_auto_train.auto_train_manager import DynamicForagingAutoTrainManager from aind_auto_train import __version__ as auto_train_version @@ -309,7 +311,7 @@ def init(): # For historial reason, the suffix of df['sessions_bonsai'] just mean the data of the Home.py page df['sessions_bonsai'] = pd.concat([df['sessions_bonsai'], df_bpod['sessions_bonsai']], axis=0) - + st.session_state.df = df st.session_state.df_selected_from_plotly = pd.DataFrame(columns=['h2o', 'session']) st.session_state.df_selected_from_dataframe = pd.DataFrame(columns=['h2o', 'session']) @@ -450,9 +452,12 @@ def _get_data_source(rig): 'task', 'notes'] new_order = first_several_cols + [col for col in _df.columns if col not in first_several_cols] _df = _df[new_order] + + + # --- Load data from docDB --- + _df = merge_in_df_docDB(_df) st.session_state.df['sessions_bonsai'] = _df # Somehow _df loses the reference to the original dataframe - st.session_state.session_stats_names = [keys for keys in _df.keys()] # Set session state from URL @@ -463,6 +468,21 @@ def _get_data_source(rig): return True +def merge_in_df_docDB(_df): + # Fetch df_docDB + df = load_data_from_docDB() + + # Parse session and subject_id from session_name + df['session_date'] = pd.to_datetime(df['session_name'].str.split('_').str[2]) + # Extract the session_time. remove the '-' and remove the leading zero. + df['session_time'] = df['session_name'].str.split('_').str[-1] + df['nwb_suffix'] = df['session_time'].str.replace('-', '').str.lstrip('0').astype('int64') + + # Merge with _df. left merged to keep everything on han's side + + left_merged = pd.merge(_df, df, how='left', on=['subject_id', 'session_date', 'nwb_suffix']) + + return left_merged def app(): diff --git a/code/pages/3_AIND data access playground.py b/code/pages/3_AIND data access playground.py index b9dfcae..1ea1089 100644 --- a/code/pages/3_AIND data access playground.py +++ b/code/pages/3_AIND data access playground.py @@ -2,12 +2,11 @@ ''' import logging -from aind_data_access_api.document_db import MetadataDbClient -from util.fetch_data_docDB import fetch_fip_data - import streamlit as st from streamlit_dynamic_filters import DynamicFilters +from util.fetch_data_docDB import load_data_from_docDB + try: st.set_page_config(layout="wide", page_title='Foraging behavior browser', @@ -20,22 +19,12 @@ except: pass -@st.cache_data -def load_data(): - df = fetch_fip_data(client) - return df - -@st.cache_resource -def load_client(): - return MetadataDbClient( - host="api.allenneuraldynamics.org", - database="metadata_index", - collection="data_assets" - ) +df = load_data_from_docDB() -client = load_client() -df = load_data() +st.markdown(f'### Note: the dataframe showing here has been merged in to the master table on the Home page!') -dynamic_filters = DynamicFilters(df=df, filters=['subject_id', 'subject_genotype']) +dynamic_filters = DynamicFilters( + df=df, + filters=['subject_id', 'subject_genotype']) dynamic_filters.display_filters() dynamic_filters.display_df() diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py index d456db5..0136684 100644 --- a/code/util/fetch_data_docDB.py +++ b/code/util/fetch_data_docDB.py @@ -1,35 +1,84 @@ +"""Code to fetch data from docDB by David Feng +""" + import pandas as pd import logging import time - +import semver +import streamlit as st logger = logging.getLogger(__name__) +from aind_data_access_api.document_db import MetadataDbClient + +@st.cache_data(ttl=3600*12) # Cache the df_docDB up to 12 hours +def load_data_from_docDB(): + client = load_client() + df = fetch_fip_data(client) + return df + +@st.cache_resource +def load_client(): + return MetadataDbClient( + host="api.allenneuraldynamics.org", + database="metadata_index", + collection="data_assets" + ) + + +def find_probes(r): + version = semver.Version.parse((r.get('procedures') or {}).get('schema_version', '0.0.0')) + + probes = [] + if version >= "0.8.1": # post introduction of Surgery concept + sub_procs = (r.get('procedures') or {}).get('subject_procedures') or {} + for sub_proc in sub_procs: + if sub_proc.get('procedure_type') == 'Surgery': + for sp in sub_proc['procedures']: + if sp['procedure_type'] == 'Fiber implant': + probes += sp['probes'] + else: # pre Surgery + sub_procs = (r.get('procedures') or {}).get('subject_procedures') or {} + for sp in sub_procs: + if sp['procedure_type'] == 'Fiber implant': + probes += sp['probes'] + + return probes + + def fetch_fip_data(client): # search for records that have the "fib" (for fiber photometry) modality in data_description logger.warning("fetching 'fib' records...") - modality_results = client.retrieve_docdb_records(filter_query={ - "data_description.modality.abbreviation": "fib" - }) + modality_results = client.retrieve_docdb_records( + filter_query={"data_description.modality.abbreviation": "fib"}, + paginate_batch_size=500 + ) + logger.warning(f"found {len(modality_results)} results") # there are more from the past that didn't specify modality correctly. # until this is fixed, need to guess by asset name logger.warning("fetching FIP records by name...") - name_results = client.retrieve_docdb_records(filter_query={ - "name": {"$regex": "^FIP.*"} - }) - - # make some dataframes from these two queries - records_by_modality_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in modality_results ]) - records_by_name_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in name_results ]) + name_results = client.retrieve_docdb_records( + filter_query={"name": {"$regex": "^FIP.*"}}, + paginate_batch_size=500 + ) + logger.warning(f"found {len(name_results)} results") + # in case there is overlap between these two queries, filter down to a single list with unique IDs + unique_results_by_id = { r['_id']: r for r in modality_results } | { r['_id']: r for r in name_results } + results = list(unique_results_by_id.values()) + logger.warning(f"found {len(results)} unique results") + + # filter out results with 'processed' in the name because I can't rely on data_description.data_level :( + results = [ r for r in results if not 'processed' in r['name'] ] + + # make a dataframe + records_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in results ]) + # currently there are some sessions uploaded twice in two different locations. # let's filter out the ones in aind-ophys-data, a deprecated location - dup_df = records_by_name_df[records_by_name_df.duplicated('session_name',keep=False)] + dup_df = records_df[records_df.duplicated('session_name',keep=False)] dup_df = dup_df[dup_df.location.str.contains("aind-ophys-data")] - records_by_name_df = records_by_name_df.drop(dup_df.index.values) - - # now we have a master data frame - combined_df = pd.concat([records_by_modality_df, records_by_name_df], axis=0).drop_duplicates() + records_df = records_df.drop(dup_df.index.values) # let's get processed results too logger.warning("fetching processed results...") @@ -41,10 +90,11 @@ def fetch_fip_data(client): processed_results_by_name = { r['name']: r for r in processed_results } # adding two columns to our master dataframe - result name and result s3 location - combined_df['results'] = combined_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('name')) - combined_df['results_location'] = combined_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('location')) + records_df['results'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('name')) + records_df['results_location'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('location')) - return combined_df + return records_df + def map_record_to_dict(record): @@ -55,12 +105,15 @@ def map_record_to_dict(record): subject_id = subject.get('subject_id') or '' subject_genotype = subject.get('genotype') or '' + return { 'location': record['location'], 'session_name': record['name'], 'creation_time': creation_time, 'subject_id': subject_id, 'subject_genotype': subject_genotype, + 'probes': str(find_probes(record)) + } diff --git a/code/util/streamlit.py b/code/util/streamlit.py index 97afa29..7988287 100644 --- a/code/util/streamlit.py +++ b/code/util/streamlit.py @@ -1004,9 +1004,8 @@ def _add_agg(df_this, x_name, y_name, group, aggr_method, if_use_x_quantile, q_q else: df['dot_size'] = dot_size_base - # Turn column of group_by to string if it's not - if not is_string_dtype(df[group_by]): - df[group_by] = df[group_by].astype(str) + # Always turn group_by column to str + df[group_by] = df[group_by].astype(str) # Add a diagonal line first if if_show_diagonal: diff --git a/environment/Dockerfile b/environment/Dockerfile index 1443947..4834dd3 100644 --- a/environment/Dockerfile +++ b/environment/Dockerfile @@ -79,7 +79,6 @@ RUN pip install -U --no-cache-dir \ scipy==1.10.0 \ scikit-learn==1.3.2 \ seaborn==0.11.2 \ - semver==2.13.0 \ six==1.16.0 \ smmap==5.0.0 \ statannotations==0.5.0 \ @@ -107,7 +106,8 @@ RUN pip install -U --no-cache-dir \ git+https://github.com/AllenNeuralDynamics/aind-foraging-behavior-bonsai-automatic-training.git@main \ pygwalker==0.4.7 \ aind-data-access-api[docdb]==0.13.0 \ - streamlit-dynamic-filters==0.1.9 + streamlit-dynamic-filters==0.1.9 \ + semver==3.0.2 ADD "https://github.com/coder/code-server/releases/download/v4.21.1/code-server-4.21.1-linux-amd64.tar.gz" /.code-server/code-server.tar.gz diff --git a/requirements.txt b/requirements.txt index 5ca0c0d..2c37526 100644 --- a/requirements.txt +++ b/requirements.txt @@ -68,7 +68,6 @@ s3fs==2022.11.0 scipy==1.10.0 scikit-learn==1.3.2 seaborn==0.11.2 -semver==2.13.0 six==1.16.0 smmap==5.0.0 statannotations==0.5.0 @@ -96,4 +95,5 @@ zipp==3.10.0 git+https://github.com/AllenNeuralDynamics/aind-foraging-behavior-bonsai-automatic-training.git@main pygwalker==0.4.7 aind-data-access-api[docdb]==0.13.0 -streamlit-dynamic-filters==0.1.9 \ No newline at end of file +streamlit-dynamic-filters==0.1.9 +semver==3.0.2 \ No newline at end of file