diff --git a/DataAnalysis/Visualization/Intro/CT.html b/DataAnalysis/Visualization/Intro/CT.html new file mode 100644 index 0000000..bec05df --- /dev/null +++ b/DataAnalysis/Visualization/Intro/CT.html @@ -0,0 +1,1311 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + \ No newline at end of file diff --git a/DataAnalysis/Visualization/Intro/foliumExample.py b/DataAnalysis/Visualization/Intro/foliumExample.py new file mode 100644 index 0000000..e4eaa8a --- /dev/null +++ b/DataAnalysis/Visualization/Intro/foliumExample.py @@ -0,0 +1,49 @@ + + +# https://www.youtube.com/watch?v=xPk7S-Eb4J4 + +# imports +import pandas as pd +import folium + +# this makes it so that you see all the columns in a pd.show() +pd.set_option('display.max_columns', None) + +# load the data +# https://github.com/practicalaifab/folium/blob/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv +# https://raw.githubusercontent.com/practicalaifab/folium/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv +df = pd.read_csv("https://raw.githubusercontent.com/practicalaifab/folium/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv") + +# get list of hoospitals + +# filter for only MA hospitals +ma = df[df['STATE'] == 'MA'] +ma = ma[['NAME', 'LATITUDE', 'LONGITUDE']] + +# display +# map.head() + + +# get the mean lat/lon for the map crteation +lat_mean = ma['LATITUDE'].mean() +lon_mean = ma['LONGITUDE'].mean() + +# create folium map +map = folium.Map(location=[lat_mean, lon_mean], zoom_start=15) + +# need to creat list of hospitals to put them on the map +list_hosp = ma.values.tolist() + +# loop over list +for index in list_hosp: + # add to map + map.add_child(folium.Marker(location=[index[1], index[2]], popup=index[0], icon=folium.Icon(color='green'))) + + +# save map as html file +map.save("ma.html") + + +# df.show() + + diff --git a/DataAnalysis/Visualization/Intro/foliumWithFunction.py b/DataAnalysis/Visualization/Intro/foliumWithFunction.py new file mode 100644 index 0000000..6415929 --- /dev/null +++ b/DataAnalysis/Visualization/Intro/foliumWithFunction.py @@ -0,0 +1,90 @@ + + +# https://www.youtube.com/watch?v=xPk7S-Eb4J4 + +# imports +import pandas as pd +import folium + +# this makes it so that you see all the columns in a pd.show() +pd.set_option('display.max_columns', None) + +# load the data +# https://github.com/practicalaifab/folium/blob/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv +# https://raw.githubusercontent.com/practicalaifab/folium/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv +df = pd.read_csv("https://raw.githubusercontent.com/practicalaifab/folium/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv") + + +# function +def choose_state(data, state_option): + state = data[data['STATE'] == state_option] + state = state[['NAME', 'LATITUDE', 'LONGITUDE']] + + # return + return state + +def plot_state(data): + # get lon/lat + lat_mean = data['LATITUDE'].mean() + lon_mean = data['LONGITUDE'].mean() + + # get the map + map = folium.Map(location=[lat_mean, lon_mean], zoom_start=15) + + # populate the map + list_hosp = data.values.tolist() + for index in list_hosp: + # add to map + map.add_child(folium.Marker(location=[index[1], index[2]], popup=index[0], icon=folium.Icon(color='green'))) + + # return + return map + +if __name__ == "__main__": + state = 'CT' + # get the state data + df_state = choose_state(df, state) + + print(df_state.info()) + + # get the map + map = plot_state(df_state) + + # save map as html file + map.save("{}.html".format(state)) + + + +# # get list of hoospitals + +# # filter for only MA hospitals +# ma = df[df['STATE'] == 'MA'] +# ma = 
ma[['NAME', 'LATITUDE', 'LONGITUDE']] + +# # display +# # map.head() + + +# # get the mean lat/lon for the map crteation +# lat_mean = ma['LATITUDE'].mean() +# lon_mean = ma['LONGITUDE'].mean() + +# # create folium map +# map = folium.Map(location=[lat_mean, lon_mean], zoom_start=15) + +# # need to creat list of hospitals to put them on the map +# list_hosp = ma.values.tolist() + +# # loop over list +# for index in list_hosp: +# # add to map +# map.add_child(folium.Marker(location=[index[1], index[2]], popup=index[0], icon=folium.Icon(color='green'))) + + +# # save map as html file +# map.save("ma.html") + + +# df.show() + + diff --git a/DataAnalysis/Visualization/Intro/ma.html b/DataAnalysis/Visualization/Intro/ma.html new file mode 100644 index 0000000..833bbcb --- /dev/null +++ b/DataAnalysis/Visualization/Intro/ma.html @@ -0,0 +1,3795 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + \ No newline at end of file diff --git a/DccKP/Burden/Json/burdenQuery.json b/DccKP/Burden/Json/burdenQuery.json new file mode 100644 index 0000000..f86004b --- /dev/null +++ b/DccKP/Burden/Json/burdenQuery.json @@ -0,0 +1,21 @@ +{ + "covariates": [ + "C1", + "C2", + "C3", + "SEX" + ], + "mdv": "mdv27", + "dataset_id": "samples_55k_multi", + "variants": [ + "20_16223957_C_T", + "8_118184783_C_T", + "8_118165282_C_T" + ], + "ci_level": 0.95, + "phenotype": "t2d", + "calc_ci": true, + "filters": [], + "allele_type": "bi", + "samples": [] +} \ No newline at end of file diff --git a/DccKP/Burden/Json/federatedBurdenQuery.json b/DccKP/Burden/Json/federatedBurdenQuery.json new file mode 100644 index 0000000..551cdd3 --- /dev/null +++ b/DccKP/Burden/Json/federatedBurdenQuery.json @@ -0,0 +1,10 @@ +{ + "mdv": "mdv25", + "variants": [ + "8_118184783_C_T" + ], + "ci_level": 0.95, + "phenotype": "WAISTC_CM", + "calc_ci": true, + "samples": [] +} \ No newline at end of file diff --git a/DccKP/Burden/Json/federatedBurdenQuery2.json b/DccKP/Burden/Json/federatedBurdenQuery2.json new file mode 100644 index 0000000..ca45ffb --- /dev/null +++ b/DccKP/Burden/Json/federatedBurdenQuery2.json @@ -0,0 +1,21 @@ +{ + "covariates": [ + "C1", + "C2", + "C3", + "SEX" + ], + "mdv": "mdv27", + "dataset_id": "samples_55k_multi", + "variants": [ + "20_16223957_C_T", + "8_118184783_C_T", + "8_118165282_C_T" + ], + "ci_level": 0.95, + "phenotype": "t2d", + "calc_ci": true, + "filters": [], + "allele_type": "multi", + "samples": [] +} \ No newline at end of file diff --git a/DccKP/Burden/Json/federatedKpQuery.json b/DccKP/Burden/Json/federatedKpQuery.json new file mode 100644 index 0000000..9b634b3 --- /dev/null +++ b/DccKP/Burden/Json/federatedKpQuery.json @@ -0,0 +1,47 @@ +{ + "passback": "123abc", + "entity": "variant", + "limit": 50, + "count": false, + "properties": { + "cproperty": [ + "MOST_DEL_SCORE", + "VAR_ID", + "DBSNP_ID" + ], + "orderBy": [], + "dproperty": { + "MAF": [ + "GWAS_OxBB_mdv1" + ], + "MAC": [ + "GWAS_OxBB_mdv1" + ] + }, + "pproperty": { + "BETA": { + "GWAS_OxBB_mdv1": [ + "FG" + ] + } + } + }, + "filters": [ + { + "dataset_id": "blah", + "phenotype": "blah", + "operand": "CHROM", + "operator": "EQ", + "value": "20", + "operand_type": "STRING" + }, + { + "dataset_id": "GWAS_OxBB_mdv1", + "phenotype": "blah", + "operand": "MAF", + "operator": "GT", + "value": 0, + "operand_type": "FLOAT" + } + ] +} \ No newline at end of file diff --git a/DccKP/Hail/hailRegionCall.json b/DccKP/Hail/hailRegionCall.json new file mode 100644 index 0000000..43cfe52 --- /dev/null +++ b/DccKP/Hail/hailRegionCall.json @@ -0,0 +1,26 @@ +{ + "passback": "example", + "api_version": 1, + "variant_filters": [ + { + "operand": "chrom", + "operator": "eq", + "value": "10", + "operand_type": "string" + }, + { + "operand": "pos", + "operator": "gte", + "value": 114550452, + "operand_type": "integer" + }, + { + "operand": "pos", + "operator": "lte", + "value": 115067678, + "operand_type": "integer" + } + ], + "limit": 5000, + "count": false +} \ No newline at end of file diff --git a/DccKP/Translator/Aggregator/PathwayGenes/createPathwayInformation.sql b/DccKP/Translator/Aggregator/PathwayGenes/createPathwayInformation.sql index c0e5c2b..4e932d1 100644 --- a/DccKP/Translator/Aggregator/PathwayGenes/createPathwayInformation.sql +++ b/DccKP/Translator/Aggregator/PathwayGenes/createPathwayInformation.sql @@ -10,7 +10,7 @@ create table tran_upkeep.data_pathway ( pathway_updated_name varchar(2000), systematic_name varchar(200), 
pmid varchar(200), - exact_souurce varchar(200), + exact_source varchar(200), msig_url varchar(2000), ontology_id varchar(200), gene_count int(9) not null, diff --git a/DccKP/Translator/Client/SenmedDB/biothingsSenmedDbRetriver.py b/DccKP/Translator/Client/SenmedDB/biothingsSenmedDbRetriver.py new file mode 100644 index 0000000..a271534 --- /dev/null +++ b/DccKP/Translator/Client/SenmedDB/biothingsSenmedDbRetriver.py @@ -0,0 +1,146 @@ + +# imports +import json +import sys +import logging +import datetime +import os +import requests +from pathlib import Path +import re +import csv +import pandas as pd + +# constants +handler = logging.StreamHandler(sys.stdout) +logger = logging.getLogger(__name__) +dir_code = "/Users/mduby/Code/WorkspacePython/" +dir_code = "/home/javaprog/Code/PythonWorkspace/" +dir_data = "/Users/mduby//Data/Broad/" +dir_data = "/home/javaprog/Data/Broad/" +sys.path.insert(0, dir_code + 'MachineLearningPython/DccKP/Translator/TranslatorLibraries') +import translator_libs as tl +location_servers = dir_code + "MachineLearningPython/DccKP/Translator/Misc/Json/trapiListServices.json" +date_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S") +location_results = dir_data + "Translator/Workflows/PathwayPpargT2d/SenmedDb/" +file_result = location_results + "sandrinePapersSenmedDbUmls.csv" +url_biothings_senmeddb = "https://biothings.ncats.io/semmeddb/query?q=pmid:{}&size=100" +max_count = 200 + +# list of papers +map_papers = {} +# map_papers['16150867'] = "3-phosphoinositide-dependent protein kinase-1 activates the peroxisome proliferator-activated receptor-gamma and promotes adipocyte differentiation, Yin " +# map_papers['8001151'] = "Stimulation of adipogenesis in fibroblasts by PPAR gamma 2, a lipid-activated transcription factor, Tontonoz" +# map_papers['12021175'] = "Gene expression profile of adipocyte differentiation and its regulation by peroxisome proliferator-activated receptor-gamma agonists, Gerhold" +# map_papers['10339548'] = "A peroxisome proliferator-activated receptor gamma ligand inhibits adipocyte differentiation. 
Oberfield" +# map_papers['7838715'] = "Adipocyte-specific transcription factor ARF6 is a heterodimeric complex of two nuclear hormone receptors, PPAR gamma and RXR alpha, Tontonoz" +# map_papers['10622252'] = "Dominant negative mutations in human PPARgamma associated with severe insulin resistance, diabetes mellitus and hypertension, Barroso" +# map_papers['9806549'] = "A Pro12Ala substitution in PPARgamma2 associated with decreased receptor activity, lower body mass index and improved insulin sensitivity, Deeb" +# map_papers['25157153'] = "Rare variants in PPARG with decreased activity in adipocyte differentiation are associated with increased risk of type 2 diabetes, Majithia" + + +# # 20220529 - new papers +# map_papers['34900790'] = "The role of the PPARG (Pro12Ala) common genetic variant on type 2 diabetes mellitus risk" +# map_papers['35462933'] = "PRDM16 Regulating Adipocyte Transformation and Thermogenesis: A Promising Therapeutic Target for Obesity and Diabetes" +# map_papers['35364246'] = "Therapeutic implications of sonic hedgehog pathway in metabolic disorders: Novel target for effective treatment" +# map_papers['35341481'] = "Loss of thymidine phosphorylase activity disrupts adipocyte differentiation and induces insulin-resistant lipoatrophic diabetes" +# map_papers['35054888'] = "Effects of Isorhamnetin on Diabetes and Its Associated Complications: A Review of In Vitro and In Vivo Studies and a Post Hoc Transcriptome Analysis of Involved Molecular Pathways" +# map_papers['34545810'] = "Impaired mRNA splicing and proteostasis in preadipocytes in obesity-related metabolic disease" +# map_papers['33959308'] = "Curcumin improves adipocytes browning and mitochondrial function in 3T3-L1 cells and obese rodent model" +# map_papers['14684744'] = "Dioxin increases C/EBPbeta transcription by activating cAMP/protein kinase A" +# map_papers['14530861'] = "The FOXC2 -512C>T variant is associated with hypertriglyceridaemia and increased serum C-peptide in Danish Caucasian glucose-tolerant subjects" +# map_papers['12855691'] = "Overexpression of sterol regulatory element-binding protein-1a in mouse adipose tissue produces adipocyte hypertrophy, increased fatty acid secretion, and fatty liver" +# map_papers['12677228'] = "The Role of PPARgamma Ligands as Regulators of the Immune Response" +# map_papers['11928067'] = "Pro12Ala polymorphism in the peroxisome proliferator-activated receptor-gamma2 (PPARgamma2) is associated with higher levels of total cholesterol and LDL-cholesterol in male caucasian type 2 diabetes patients" +# map_papers['27909015'] = "Diabetic human adipose tissue-derived mesenchymal stem cells fail to differentiate in functional adipocytes" +# map_papers['27815534'] = "Biological roles of microRNAs in the control of insulin secretion and action" +# map_papers['27657995'] = "Effects of Streptozotocin-Induced Diabetes on Proliferation and Differentiation Abilities of Mesenchymal Stem Cells Derived from Subcutaneous and Visceral Adipose Tissues" +# map_papers['27493874'] = "Diabetic mice exhibited a peculiar alteration in body composition with exaggerated ectopic fat deposition after muscle injury due to anomalous cell differentiation" +# map_papers['27445976'] = "Cooperation between HMGA1 and HIF-1 Contributes to Hypoxia-Induced VEGF and Visfatin Gene Expression in 3T3-L1 Adipocytes" + +# 20221118 - search for sandrine's chem papers +# (Chou DH-C et al., ACS Med Chem Lett 2011, PMID: 21927648,Chou DH-C et al., J Am Chem Soc 2015, PMID: 26042473,Vetere, Amedeo, et al., Nature 
reviews Drug discovery, PMID: 24525781) + +map_papers['21927648'] = "Chou DH-C et al., ACS Med Chem Lett 2011" +map_papers['26042473'] = "Chou DH-C et al., J Am Chem Soc 2015" +map_papers['24525781'] = "Vetere, Amedeo, et al., Nature reviews Drug discovery" + +def query_biothings(paper_id, paper_name, log=False): + ''' + find the journal if in the results + ''' + # initialize + pubmed_id = 'PMID:' + paper_id + list_results = [] + is_found = False + url_query = url_biothings_senmeddb.format(paper_id) + + # log + if log: + print("looking for pubmed id: {}".format(url_query)) + + # query the service + response = requests.get(url_query) + + # try and catch exception + try: + json_output = response.json() + # if log: + # print("got result: \n{}".format(json_output)) + except ValueError: + print("GOT ERROR: skipping") + + # pick put the data + map_result = {'pubmed_id': paper_id, 'info': paper_name[0:60], 'predicate': None, 'subject': None, 'subject_type': None, 'object': None, 'object_type': None} + if json_output: + if isinstance(json_output, dict): + if json_output.get('hits'): + for child in json_output.get('hits'): + is_found = True + map_result = child.get('predicate') + map_result = {'pubmed_id': paper_id, 'info': paper_name[0:60], 'predicate': child.get('predicate'), + 'subj_umls': child.get('subject').get('umls'), + 'subject': child.get('subject').get('name'), 'subject_type': child.get('subject').get('semantic_type_name'), + 'obj_umls': child.get('object').get('umls'), + 'object': child.get('object').get('name'), 'object_type': child.get('object').get('semantic_type_name'),} + list_results.append(map_result) + + # add to list + if not is_found: + list_results.append(map_result) + + # return + return list_results + +if __name__ == "__main__": + # initialize + count = 0 + list_result = [] + + # loop through the paper ids + for key, value in map_papers.items(): + # test the max count + if count < max_count: + count += 1 + + # get the biothings data for the paper + list_temp = query_biothings(key, value, log=True) + + # add to the results + list_result = list_result + list_temp + + # print the results + print("\n=====results") + for child in list_result: + print(child) + + # create dataframe + df_papers = pd.DataFrame(list_result) + #temporaly display 999 rows + with pd.option_context('display.max_rows', 999): + print (df_papers) + + # write out the file + df_papers.to_csv(file_result, sep='\t') + print("wrote out the file to: {}".format(file_result)) + diff --git a/DccKP/Translator/TranslatorLibraries/__pycache__/translator_libs.cpython-38.pyc b/DccKP/Translator/TranslatorLibraries/__pycache__/translator_libs.cpython-38.pyc index 958f3ec..2e39ba4 100644 Binary files a/DccKP/Translator/TranslatorLibraries/__pycache__/translator_libs.cpython-38.pyc and b/DccKP/Translator/TranslatorLibraries/__pycache__/translator_libs.cpython-38.pyc differ diff --git a/DccKP/Translator/Workflows/Json/Queries/Pathways/PpargPathways/pathwayLipidDiffReactomeQuery.json b/DccKP/Translator/Workflows/Json/Queries/Pathways/PpargPathways/pathwayLipidDiffReactomeQuery.json index 62bcded..ff92296 100644 --- a/DccKP/Translator/Workflows/Json/Queries/Pathways/PpargPathways/pathwayLipidDiffReactomeQuery.json +++ b/DccKP/Translator/Workflows/Json/Queries/Pathways/PpargPathways/pathwayLipidDiffReactomeQuery.json @@ -14,7 +14,6 @@ "categories": [ "biolink:Pathway" ], - "constraints": [], "ids": [ "REACT:R-HSA-381340" ], @@ -24,7 +23,6 @@ "categories": [ "biolink:Disease" ], - "constraints": [], "is_set": false } } diff --git 
a/DccKP/Translator/Workflows/Json/Queries/Pathways/ppargT2dPathwaysQuery.json b/DccKP/Translator/Workflows/Json/Queries/Pathways/ppargT2dPathwaysQuery.json index 1343e83..46fe255 100644 --- a/DccKP/Translator/Workflows/Json/Queries/Pathways/ppargT2dPathwaysQuery.json +++ b/DccKP/Translator/Workflows/Json/Queries/Pathways/ppargT2dPathwaysQuery.json @@ -3,12 +3,10 @@ "query_graph": { "edges": { "e01": { - "constraints": [], "object": "pathway", "subject": "gene" }, "e02": { - "constraints": [], "object": "disease", "subject": "pathway" } @@ -18,7 +16,6 @@ "categories": [ "biolink:Disease" ], - "constraints": [], "ids": [ "MONDO:0005148" ], @@ -28,7 +25,6 @@ "categories": [ "biolink:Gene" ], - "constraints": [], "ids": [ "NCBIGene:5468" ], @@ -38,7 +34,6 @@ "categories": [ "biolink:Pathway" ], - "constraints": [], "is_set": false } } diff --git a/Notes/CheatSheets/Genetics/geneticsTermsCheatSheet.txt b/Notes/CheatSheets/Genetics/geneticsTermsCheatSheet.txt new file mode 100644 index 0000000..5eba448 --- /dev/null +++ b/Notes/CheatSheets/Genetics/geneticsTermsCheatSheet.txt @@ -0,0 +1,20 @@ + + +- genetics significance + - We want to use that for the gene pheWAS plots but use 5e-8 for variants + + + + - tools + - gregor - for annotations/enrichments + - magma - for association stats + - LDSC - LD score regression + - takes into account variants that travel together and their effect sizes + + + +Genetic variation can affect the levels of protein; consider a variant that reduces the amount of mRNA transcribed, this could have a profound effect on the amount of mRNA available to translate into a protein. + +Consider another variant that does not affect the abundance of the mRNA transcribed, but alters one of the many important sequences that are required for translation (co-factor binding sites, ribosomal binding sites, start site, etc.). + +These are just a couple of examples that first came to mind, but there are many more. For instance, a variant may increase expression of a particular microRNA that in turn inhibits the translation of another mRNA molecule. I have not included any references as this is more a logical exercise - doubtless there are many more ways a genetic variant can affect protein abundance. 
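- worked example (added sketch, not from the original notes): applying the 5e-8 variant significance threshold mentioned above to a table of association results; the file name and column names (variant_id, pvalue) are hypothetical placeholders
    import pandas as pd

    GWAS_SIGNIFICANCE = 5e-8

    # load association results (hypothetical file and columns)
    df_assoc = pd.read_csv("associations.tsv", sep="\t")

    # keep only genome-wide significant variants
    df_signif = df_assoc[df_assoc["pvalue"] < GWAS_SIGNIFICANCE]
    print("{} of {} variants pass 5e-8".format(len(df_signif), len(df_assoc)))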
diff --git a/Notes/CheatSheets/ML/pysparkCheatSheet.txt b/Notes/CheatSheets/ML/pysparkCheatSheet.txt index 6c20549..c9ef3c3 100644 --- a/Notes/CheatSheets/ML/pysparkCheatSheet.txt +++ b/Notes/CheatSheets/ML/pysparkCheatSheet.txt @@ -50,3 +50,47 @@ def round_down(x): round_down_udf = udf(round_down, IntegerType()) # 2nd arg is type returned df.select(round_down_udf('fare').alias('int_fare')) + + + +- describe + df_export.printSchema() + df_export.count() + df_export.describe() + df_export.show() + +- data aggregation + df_export.groupBy("chromosome").count().orderBy("chromosome").show(25, False) + + +- select subset of columns + df_export = df_nonnull_load.select("dbSnp", 'chromosome', 'position') + df.where(F.col("count").isNull()).show() + + +- split column into other columns + split_col = pyspark.sql.functions.split(df['my_str_col'], '-') + df = df.withColumn('NAME1', split_col.getItem(0)) + +- export data + df.coalesce(1).write.csv('result.csv') # one file + + df_export.write.mode('overwrite').csv(out_file) + os.system("cat output/test/p* > output/test.csv") # one file; will not work in distributed env + + df_export.write.mode('overwrite').option("delimiter", "\t").csv(out_dir) + + + srcdir = '%s/variants/*/%s' % (s3dir, args.phenotype) + outdir = '%s/out/metaanalysis/variants/%s' % (s3dir, args.phenotype) + + + + + +-- scratch + opts = argparse.ArgumentParser() + opts.add_argument('phenotype') + + + \ No newline at end of file diff --git a/Notes/ML/pytorchCheatSheet.txt b/Notes/CheatSheets/ML/pytorchCheatSheet.txt similarity index 100% rename from Notes/ML/pytorchCheatSheet.txt rename to Notes/CheatSheets/ML/pytorchCheatSheet.txt diff --git a/Notes/CheatSheets/ML/sklearnCheatSheet.txt b/Notes/CheatSheets/ML/sklearnCheatSheet.txt index 2a9d09c..6221014 100644 --- a/Notes/CheatSheets/ML/sklearnCheatSheet.txt +++ b/Notes/CheatSheets/ML/sklearnCheatSheet.txt @@ -6,3 +6,7 @@ model_selection: - StratifiedKFold - takes group information into account for balanced folds for classification - train_test_split() - train/test splitting + +Time series +----------- +- needs to be normalized diff --git a/Notes/CheatSheets/Python/pythonCheatSheet.txt b/Notes/CheatSheets/Python/pythonCheatSheet.txt index c07a654..b106884 100644 --- a/Notes/CheatSheets/Python/pythonCheatSheet.txt +++ b/Notes/CheatSheets/Python/pythonCheatSheet.txt @@ -18,11 +18,21 @@ json: with open('strings.json') as f: d = json.load(f) print(d) + + - display json + print("build trapi payload: \n{}".format(json.dumps(payload, indent=2)) virtual env: ------------ - python3 -m venv tutorial-env +strings: +-------- + +pandas: +------- + - df_results = pd.DataFrame.from_dict(list_gene_chemical) + lists: ------ - chunks = [data[x:x+100] for x in range(0, len(data), 100)] # split list into sublists of size @@ -64,6 +74,14 @@ files: for line in file: print(line.rstrip()) +jupyter: +-------- +- expand seen rows for pandas + from IPython.display import display + pd.options.display.max_rows = 999 + display(df_results) + + misc: @@ -76,12 +94,33 @@ databases: cur = db.cursor() cur.execute(sql).fetchall()/fetchone() + +sqlalchemy: +----------- +- return new rowid + with engine.connect() as conn: + sql_params = data.dict() + sql_params.update({'s3_bucket_id': s3_record_id, 'metadata': json.dumps(data.metadata)}) + res = conn.execute(text(""" + INSERT INTO records (s3_bucket_id, name, metadata, data_source_type, data_source, data_type, genome_build, + ancestry, data_submitter, data_submitter_email, institution, sex, global_sample_size, 
t1d_sample_size, + bmi_adj_sample_size, status, additional_data) VALUES(:s3_bucket_id, :name, :metadata, :data_source_type, + :data_source, :data_type, :genome_build, :ancestry, :data_submitter, :data_submitter_email, :institution, + :sex, :global_sample_size, :t1d_sample_size, :bmi_adj_sample_size, :status, :additional_data) + """), sql_params) + conn.commit() + s3.create_record_directory(s3_record_id) + return s3_record_id, res.lastrowid + objects: -------- - __init__(self) - __repl__(self) - __str__(self) + gene_association: GenePhenotypeAssociation + for gene_association in list_gene_assoc: + debug: ------ - help() diff --git a/Notes/CheatSheets/System/chromeCheatShe3et.txt b/Notes/CheatSheets/System/chromeCheatShe3et.txt new file mode 100644 index 0000000..37e3e2a --- /dev/null +++ b/Notes/CheatSheets/System/chromeCheatShe3et.txt @@ -0,0 +1,7 @@ + + +- to open postman in chrome + - Type chrome://apps/ in your chorme address line. + + + \ No newline at end of file diff --git a/Notes/CheatSheets/System/gitCheatSheet.txt b/Notes/CheatSheets/System/gitCheatSheet.txt index 878196d..72e6c50 100644 --- a/Notes/CheatSheets/System/gitCheatSheet.txt +++ b/Notes/CheatSheets/System/gitCheatSheet.txt @@ -22,6 +22,14 @@ new branch: - git checkout -b - git push -u origin ky_code_documentation +github ssh issue: +----------------- +git@github.com:broadinstitute/genetics-kp-dev.git + +info: +----- + - git rev-parse --abbrev-ref HEAD (find out what branch you're on) + github actions: --------------- - create yaml file in .github/workflows diff --git a/Notes/CheatSheets/System/mysqlCheatSheet.txt b/Notes/CheatSheets/System/mysqlCheatSheet.txt index e7975b1..29eb401 100644 --- a/Notes/CheatSheets/System/mysqlCheatSheet.txt +++ b/Notes/CheatSheets/System/mysqlCheatSheet.txt @@ -1,4 +1,8 @@ +strings: +-------- + +SELECT SUBSTRING("SQL Tutorial", 1, 6) AS ExtractString; @@ -26,8 +30,8 @@ misc: - update comb_node_ontology node join tran_upkeep.data_pathway pathway on node.node_code COLLATE utf8mb4_general_ci = pathway.pathway_code set node.node_name = pathway.pathway_updated_name - where node.node_type_id = 4; - + where node.nodSELECT SUBSTRING("SQL Tutorial", 1, 6) AS ExtractString; + - delete on join delete edge from comb_edge_node edge inner join comb_node_ontology node on edge.target_node_id = node.id diff --git a/Notes/CheatSheets/System/translatorCheatSheet.txt b/Notes/CheatSheets/System/translatorCheatSheet.txt new file mode 100644 index 0000000..4b2e51b --- /dev/null +++ b/Notes/CheatSheets/System/translatorCheatSheet.txt @@ -0,0 +1,6 @@ + + + +- query the ARS command line + - curl -X POST "https://ars-prod.transltr.io/ars/api/submit" -H "accept: application/json" -H "Content-Type: application/json" -d @yourQueryFileHere.json + \ No newline at end of file diff --git a/Notes/ML/2020mlScratchPad.txt b/Notes/ML/2020mlScratchPad.txt index 012252f..7bf6865 100644 --- a/Notes/ML/2020mlScratchPad.txt +++ b/Notes/ML/2020mlScratchPad.txt @@ -1,5 +1,49 @@ +20221107 - mit ml seminar +- with single cell assays + - can measure + - rna Seq + - atac seq + - chromatin accessibility + - chromatin 3d + - gene expression + - want toi translation/predict what other modlity of measurement will be + - ie: from gene expression and chromatin accessibility, predict atac seq + - for learning + - need measurements from same cell for training data + - most assays are destructive, so only get one reading per cell + - use non destructive assay + +- auto encoder + - get representation of batch effects and sequencing depth +- 
peptides + - sequence of amino acids (10-15 size?) + - get observed spectrum from mass spectometer for each peptide + - the order of the amino acids affects the mass spectrometer reading curve + - only 1.4% overlap in peptides between species + - ? -> research +- for transformers in NLP + - learning pairwise relationships between words + - can we use that for spectrometer peaks + +- ml models for genetics mentionned + - babel 2021 + + + +20221027 - broadway - ml in clinical applications +- questions to use ML + - what are protective mech for disease in the genome (like pcsk9) + - how well infer rna expression from dna + - can we combine omics data to predict outcomes and mech insights + - can we discover all endophenotypes (like BMI types) + +- misc + - if unfold DNA, should be 6 feet long + - BMI - ratio of weight to surface area + + 20201223 - pipeline work - wrote test pipeline script using CA housing data - standard scalar didn't help with R2 score diff --git a/Notes/ML/geneticsMlMeetings.txt b/Notes/ML/geneticsMlMeetings.txt new file mode 100644 index 0000000..86453db --- /dev/null +++ b/Notes/ML/geneticsMlMeetings.txt @@ -0,0 +1,532 @@ + + +20230322 - mia images and omics predictions +- notes + - advantage of getting data on cell profile without destroying it + - biomedical imaging as an alternative + - cheaper to produce than omics data + - how can images co,plement omics data + - so biomolecules meaured omics data also reflect/aborb light differently + - so in data, combine stain data with single cell omics data set + + + +20230308 - mia primer, autoencoders for genetics + +- notes + - autoencoders + - variational autenconders + - when combining different types of data in the input layer, make sure the data scale is the same + - swaping AAE for VAE + - get same performance + - AAE learned a different part of the data than the VAE + - if combine, then get improvement + - for VAE, need to pick input variables carefully + - normal NN will ignore extraneous vars + - VAE needs to rebuild all vars, so results skewed by unnecessary vars + - VAEs are stochastic, don't always give the same result when trained + - because of sdampling in the encoding step? + - so need to run experiments multiple times on multiple trained models + + - for t2d model + - trained model, then perturbed data to see what effect it has + +- notes for 2nd talk + - ROPE: region of practical equivalence + - for perturbation slide, the versicolor dataset if the perturbed one + + + + + + + +20230306 - schmidt center - de novo protein design, david baker +- presenter + - david bake, https://www.bakerlab.org/ + - institute for protein design + +- notes + - use rosetta model + - but can also use deep leartning + - more easy to design small proteins that bind to other large targets + - design protein that alters bound protein state when bound + - deep learning + - use reinforcement learning, monte carlo tree search + - custom model + - rosettafold + - baek and dimaio + - second edition of model performed better by modifying loss function + - goes beyond proetein, but also dna + - now rosettafold can model any biomedical molecule system + - ligandMPNN, proteinMPNN + +- ie: + - take flu virus, point to binding section, then let diffusion process design a protein that binds to that area + + + + + +20230301 - ai in rradiology +- notes + - noticed that models of chest xrays could identigfy race of patient + - bias? 
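- added sketch for the 20230308 autoencoder note above ("make sure the data scale is the same" when combining modalities in the input layer); the two arrays are made-up stand-ins for, e.g., expression and chromatin accessibility measured on the same cells
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(0)
    expression = rng.normal(loc=100.0, scale=25.0, size=(500, 2000))   # large-scale values
    accessibility = rng.normal(loc=0.5, scale=0.1, size=(500, 300))    # small-scale values

    # standardize each modality separately, then concatenate as one input matrix
    combined = np.hstack([
        StandardScaler().fit_transform(expression),
        StandardScaler().fit_transform(accessibility),
    ])
    print(combined.shape, round(combined.mean(), 3), round(combined.std(), 3))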
+ + + +20230215 - computationally designed proteins +- talk + - AF stands for alpha fold + - why predicting protein-protein interaction is hard + - very retstrictive set of molecular amino acids + - restrictive set of amino acids + - dynamics of protein + - predicitng p-p interaction + - identify a target site on protein for binding + - model motif based on site + - find matching motif for binding protein + - will find some sections where you can't mutate AA or else deleterious to binding + - no improvments to be made + +- 2nd talk + - if two proteins are near together in 3d space, they tend to mutate together + - see this in multi sequence alignments (MSA) between species + - protein MPNN + - protein message passing neural network + + + +- todo + - research TM score + + +20230201 - mia meeting, brain hie, stanforcd - learning protein evolution + +- study genetic heritability in proteins by training ML + - idea of dependencies of choice of amino acid in one position, influences choice of amino acid in other position + +- protein training models + - protein language models + - take protein sequence, mask 15% of sequnence and train model to predict maked sequneces + +- solve viral escape problem + - flu, hiv, covid + - viral escape can be caused by single amino acid change + - viral escape wants to keep binding of protein but change signature so host doesn't recognize the new virus + - maynard word game + - if want to mutate 'word' to word 'gene', but only one letter at a time and intermediate words need to be valid + +- approach + - train anguage models on sequences of amino acids + - some ML models predicted covid mutations before they became variations of concern + - protein language models learn evolutionary rules + +- for proteins + - fitness curve of the sequence across sequence space + - flu + - evolves about one sequence a year ? + - try to predict next evolution by models (evolutionary velocity) + - models dont' assume selection pressure (from immunity) + +- info + - serpins -> protein family + +- future + - testing antibody changes against future virus sequences + - also improve fitness of the antibody + - can also try to improve catalitic activity of an enzyme + +- algorithms + - markov chain with monte carlo + - initialize random sequence x + - loop for 50k generations + - mutate x to x' + - calculate f(x') for new sequnece + - if f(x') is superior to f(x), replace x with x' + - can also tolerate some drop in f(x') to account for escaping local maximum + - use random loss: bayesian loss? + +- use deep protein ml models (alphafold) + - but define your own fitness functions + - get different resuklts + - do find some sequences that exist in nature + - hie, candido 2022 bioRxiv + +- evolution of viruses go hand in hand with evolution of the hosts + - not well understood, but could use ML models to understand better + +- papers + + +- models + - DeepSequence + - mostly one sequnece change at a time? 
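- added sketch of the MCMC sequence-search loop described in the 20230201 notes above; the fitness function is a toy placeholder, not a real protein language model score
    import random

    AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

    def fitness(seq):
        # placeholder score; a real f(x) would come from a trained sequence model
        return seq.count("A")

    def mcmc_search(length=15, generations=5000, tolerance=0.1):
        x = [random.choice(AMINO_ACIDS) for _ in range(length)]
        f_cur = fitness(x)
        for _ in range(generations):
            x_new = list(x)
            x_new[random.randrange(length)] = random.choice(AMINO_ACIDS)  # mutate x to x'
            f_new = fitness(x_new)
            # accept improvements, and occasionally a drop to escape local maxima
            if f_new >= f_cur or random.random() < tolerance:
                x, f_cur = x_new, f_new
        return "".join(x), f_cur

    print(mcmc_search())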
+ + +---------------------------------------------------------------------- +20201208 - crispr ML +- base editing + - gores after cytisine + - avoids double starnd break, so avoid indels + +- genetic bases + - purines have 2 rings, pyrimidines have one rings + - so hard to convert between the two + +- how specific is crispr gene editing +- dna repair outcomes from cas9 +- base editing outcomes + - base editing very efficient at point mutations + - poitions size of 4-6 nucletides +- more base editor than crispr options (easier to create) +- for wanted goal phenotype, could use multiple base editors + +- ML + - 10k labeled inputs + - use xgboosted tree + - deep conditiona; autoregressive model + - pass freq from previous position to next to predict next position + - deeper decoder with more layers helped a lot + - + +------------------------------------------------------------------------ +20201007 - dyno therapeutics, model guided biological sequence design +- sam sinai, dyno therapeutics + +- process (see figure) + - get data from assays + - use ML models to approximate rest of sequences and annotations + - look for best sequences in the ML model + - then assay those sequences and restart the process + +- models + - feedback gans, deep exploration networks + - feedback nn, dyna-ppo (google) + - use ensemble of models (nn, randim forest) and cuts exploration if uncertainty too high relative to beginning + - BO - bayesian optimization, hard to scale + - + +20200923 - mia zoom - + + + +20180509 - auditing datasets using AI +- + + + + +20180502 - math club, carpenter lab, imaging +- looking at pictures of nultiple cells + - detection problem + - box around each cell + - segmentation problem + - find cell boudary + +- cell detection problem + - for cell detection from slide, detecting all the cells in the image + - keras R-CNN package on github for TF + - uses faster R CNN network design + - faster region convolution neural network + - https://github.com/broadinstitute/keras-rcnn + + +- nucleaus segmentation problem + - find the nuleaus in the cell + - otsu's thesholding problem + - 2 neural net architecture + - u-net and DeepCell + - DL makes fewer errors + - splits (when one nuclei viewed as 2) and merges (when 2 nuclei seen as 1) + - pipleines + - CellProfiler Advanced, CellProfiler basic + - nucleus segmentation challenge + - data science bowl + +- image based profiling + - create signatures of tteratments from images + - cells treated with the same treatment should look similar, not dissimilar from the others + - so segmentation problem + - 3 approaches + - CellProfiler + - Transfer learning + - Weakliy supervised learning + + +- capture single cell heteregeoneity + - how to compare two c3ll populations + - look at their phentoyeps + - profiling challenges + - noise and loss of signal + - mean (or median) profiling + - but lose heteregeoneity information + - complementary measures to mean + - measures of dispersion (might have same mean but be situated fatter out in 3D plot) + + + + +HMDB0004953 + + + + +20180411 - MIA, protein folding problem +- simulation for protein structure + - built around energetic optimization + - folding goes from high energy state to low energy state + + - sequence -> energy landscape -> structure + +- simulator for pretein structure + - back propogation through time + =- how to train your simulator + - function for each pair of amino acids + - start with a straint protein sequence, the end up with 3d model of the folding + + - energy -_ simulator -> protein 
structure + loss function + - energy over time, so various steps to each simulator step + +- protein folding important in designing medications + + +- terms + - homology + - the state of having the same or similar relation, relative position, or structure + + - RNN (recurring neural networks) + - graph neural networks (used in 3d chemistry) + - langevin dynamics (gradient descent with noise) + - gradient descent process can turn chaotic after awhile; chaos causes not being to back propageta in time (used in learning) + - ie: protein could start bouncing between multiple positions that can tend to go apart -> go into chaos, start getting meaningless numbers + - perturbations in the system can become amplified by tthe syste itself + - protein fold: arrangement of secondary structure elements + + + +20171129 - 3d mapping of the human gehnome, prof aiden, Rice University +- dbna 2 meters long +- contac6t mapping + - exploring structurer via proximity + - for town, examine how residents co localize + - for genome, use assays that measure dna/dna contacts in intact nuclei (nuclear ligation assay) + - this contact maps the human genome + +- Hi-C: 3d genome sequencing +- chroms in mamls are not extended, they occupy a very discrete volume + +- 3d features have 1d codes + - weak sense (correlation); there are 1d featres that enable you to predict hwo the genome will fld + - strng sense (causation) + +- some patterns/heat maps are close,ly correlated to open/close chrmatin +- loops in chromatin = peaks in contact map + +- hi c map + - 2d heat map, usually 400kb x 400 kb + - some undercoverd sectionsm, so run some normalizations + - so looks like contact matrix + - usulally find 10k loops + - observations + - most loops are short, less than 2MB + - a lot of loops are conserved accross cell types, and some species + - some change + - loops are anchored at convergent CTCF sites + - 90% pointed at inward notation + - hypothesis + - 2 hooked donuts go up and down dna, and stop at stop signs (pointed in their direction), then get loop + - extrusion enables prediction of loops + - the convergent rule is a code in the strong sense + - test: by editing single CTCF motifs, we can engineer loops + - try using crispr, take out 20-100 bases of the human genome + - by deleting tamdem CTCF sites, we can refold whole chromosomes + - note: multiple groups have proposed interphae loop extrusion model + - observation + - if degrade cohesion, degrade all loops + - just get single line heat map + - use oxin to degrade the cohesion + - if you withdraw the oxin, loops come back + - looks lke the extrusion process slides at 400 bases per second + +- what do loop domains do? 
+ - loops associated with gene activity + - surpisingly, few genes are affected by the loss of cihesin + + + + + + + +- look up + - histone code + + +20171115 - mia, microsoft research with crispr +- crispr + - cas9 does the cutting + - site has a gg + - giude rna (20-30 nucleotides long) and appended gg (where cut willoccur) + - how to test if successful + - flow cytometry (FC) + - drug resisteance assay (RES) + - if gene knocked out, cell survives + +- support vector machines + +- look up + - regression trees + - iteravely combine regression trees + - each step, re weight traing data to give more weight to those with bad predictions that last round + +http://research.microsoft.com/en-us/projects/crispr +https://www.microsoft.com/en-us/research/?from=http%3A%2F%2Fresearch.microsoft.com%2Fen-us%2Fprojects%2Fcrispr + +- cloud resources for the research of off target + - 17k cores, run 3 weeeks of 24 hours a day + +20170426 - single cell analysis +- rapid daat growth + - cell cumbers + - high data volumes + - 10e5 x 10e4 genes + - 30 gig sparse data + + + + +20160920 - probabilistic programming - CU profs +- question + - have data + - have model + - then infer, and now re-evaluate model +- edwardlib.org + + +20160914 - compressed sensing +- if have compoite measurement matrix * random composition matirx = gene expression matrix + - if 2 geners are colrrelated, then don't boter measuring more than e of them' + - can also skip co-regulated genes (co-expressed) + - common in bilogy + +- housekeeping geens: genes that are on in every tissue + - but there are ways (by expressed measurement) what tissue they came from + +- helps to decompress high dimensioal signal + +- note: a gene that might be oin a correlation group in ne cell type could be in naother correlation group in another cell type +- sparse mnf +- this is a problem of matrix factorization +- SMAF - sparse module activity factorization + + + + +20160511 - MIA, David Blei (CU): stochastic variotion inference models +- big quesiton on robustsness +- how to scale stochastic varitional inference + - O(n) for each pass where you check for convergence, if go thrpugh each record to asses vars towards ancestery + - less if sample random samples to build model for each pass where you check for convergence + - black box variotonal inference' + - github.com/bei-lab/edward + - similar to Stan software +- ex used + - the admixture model of Pirtchard + - +- papers + - wainwright and jordan (2008), 300 page paper + - Blai et al, 2016, under review + - Bishop 2006 book, variation inference + - MacKay, Variational inference +- bayesan stats + - for x observation, z latents vars (hidden) + - model: p(x, z|alpha) + - inference: p(z|x) = p(z,x) / p(x) + - probs of Z given x + - where p(x) = SUMover z(p(z,x)) + - marginal distribution +- debate between being bayesianist and frequentist +- PSD model (ad mixture model) + - setup + - data + - i as index of individuals + - l: locus on genome + - y(ilm, ilf): data with phased data on mom/dad for person on lucus l and indiv i + - model populations + - k: # of populations + - BETA k, l: dist of alleles for pop k at locus l + - model + - for each population, BETA k,l ~ distribution for each pop at each locus (4, 1, for each letter) + - for each indix, THETAi ~ dir(ALPHA) -> hidden vars + - for each locus, + - z(ilm) ~ THETAi, -> hidden + - z(ilf) ~ THETAi -> hidden + - y(ilm) ~ BETA z,l.m -> observed + - y(ilf) ~ BETA z,l.f -> observed + - so 2 observed vars, 3 hidden vars (modeled as distributions) +- variational 
inference + - mai idea: posit a family of distribs over the latent vars +- stochastic optimaiation + - the drunk story + - try to get from boston to LA, assuem everyone is drunk + - ask person1 where LA is, walk 700 miles + - ask person you run into, person2, where LA is, walk 350 miles into direction + - repeat until get to LA + - assumption is that if distro of person pointing to LA has mean as LA + - in practice + - smple indiv + - update indiv params of pops assign and + - pretend ths indiv is only person in pop + - then ask what pop params would be for that + - then set pop params 1/2 way between thsi and what pop params were + - repeat +- variation inference vs gib sampler + + + + + + +- look into + - Balding Nichols (model) + - https://en.wikipedia.org/wiki/Balding%E2%80%93Nichols_model + - ELBO + - term to maximize + - CAVI + + + + +20160217 - MIA - gene regulation in space and time +- caussal in ference + - infer regulatory network from gene expression data + - detarermine causal inference from observational data + - desirable properties of causal inference algorithms for genetics + - pc-algorithm, assumes strong faithfulness condition + - consistent algorithm for learning dags + - learn directed graph from conditional independence information + - caual effects are not allowed to cancel oyt + - if medcine is good for kideney, medicine bad for immune system, im good for kidney + - doesn't mean no + on kidney for medicine + - 3 node ex of dag (x1 -> x2, x1 -> x3, x2 -> x3), c is guassian noise + - x1 = c1 + - x2 = @12 * x1 + c2 + - x3 = @13 * x1 + @23 * x2 + c3 +- chromosome territories + - interphase chromosomes coccupy distinct space +- find minimal configuration of ellipsoids + - measure overlap by largest circle I can fit into the overlap + - take norm as weak to avoid solution with a few large overtlaps + - want distributed overlap + - good way to determine chromosome territories + - redict new neighbors as shape of container changes + - put cell in different tissue, see how genes get reorganized + - cell differentiatino and gene expressio +- highly expressed genes tend to be at the center of the nucleus + - also the smaller ones + - genes that are atthe overlap are co regulated and expressed + + + + diff --git a/Notes/ML/mlMeetups.txt b/Notes/ML/mlMeetups.txt index 5977f7c..9735598 100644 --- a/Notes/ML/mlMeetups.txt +++ b/Notes/ML/mlMeetups.txt @@ -1,5 +1,7 @@ + + 20220830 - ML kgalle 4 week course, tampa - process for data cleaing - find duplicate rows diff --git a/Notes/ML/pySparkCheatSheet.txt b/Notes/ML/pySparkCheatSheet.txt deleted file mode 100644 index 35b8ed8..0000000 --- a/Notes/ML/pySparkCheatSheet.txt +++ /dev/null @@ -1,41 +0,0 @@ - - -- describe - df_export.printSchema() - df_export.count() - df_export.describe() - df_export.show() - -- data aggregation - df_export.groupBy("chromosome").count().orderBy("chromosome").show(25, False) - - -- select subset of columns - df_export = df_nonnull_load.select("dbSnp", 'chromosome', 'position') - df.where(F.col("count").isNull()).show() - - -- split column into other columns - split_col = pyspark.sql.functions.split(df['my_str_col'], '-') - df = df.withColumn('NAME1', split_col.getItem(0)) - -- export data - df.coalesce(1).write.csv('result.csv') # one file - - df_export.write.mode('overwrite').csv(out_file) - os.system("cat output/test/p* > output/test.csv") # one file; will not work in distributed env - - df_export.write.mode('overwrite').option("delimiter", "\t").csv(out_dir) - - - srcdir = '%s/variants/*/%s' % (s3dir, 
args.phenotype) - outdir = '%s/out/metaanalysis/variants/%s' % (s3dir, args.phenotype) - - - - - --- scratch - opts = argparse.ArgumentParser() - opts.add_argument('phenotype') - diff --git a/Notes/ML/scikitCheatSheet.txt b/Notes/ML/scikitCheatSheet.txt deleted file mode 100644 index 7346a0b..0000000 --- a/Notes/ML/scikitCheatSheet.txt +++ /dev/null @@ -1,5 +0,0 @@ - - -Time series ------------ -- needs to be normalized \ No newline at end of file diff --git a/Notes/Meetups/2020gdgMeetupts.txt b/Notes/Meetups/2020gdgMeetupts.txt index fd8e862..fafaaf6 100644 --- a/Notes/Meetups/2020gdgMeetupts.txt +++ b/Notes/Meetups/2020gdgMeetupts.txt @@ -1,5 +1,15 @@ + +20221103 - pacific region dev fest +- agenda (of interest) + - Keynote - How to build a career in cloud - vergadia + - A survey of the state of the art in language models for code - goncharov + - Similarity-based machine learning - hapke + - Generative Art & Design with TensorFlow - maynard-reid + - Real-time Machine Learning - chloe he + + 20201205 - gdg silicon valley devfest 2020 1330 - firebase with kyle paul diff --git a/Notes/Meetups/2021geneticsMeetup.txt b/Notes/Meetups/2021geneticsMeetup.txt index 11850ee..4c36918 100644 --- a/Notes/Meetups/2021geneticsMeetup.txt +++ b/Notes/Meetups/2021geneticsMeetup.txt @@ -1,5 +1,35 @@ +20221117 - mpg, gocalo abecasis +- regeneron genome center + - commited to sequencing 7 million samples + +- notes + - for poligenic risk score, mostly use common variants with sorinkling of high effect rare variants + - for cost effective sequencing, so exome with imputation with some gwas as well + - key is to be able to automate the processing by robots + - get increased results with assaying samples with exomes and umputation (samer cost) + - get a little better results with whoel sequencing, but at a much larger cost + +- sardinia study on BCL11A persistent fetal hemoglobin + - fetal hemoglobin is for womb and newborns to better exchange oxygen from host mother + - turned off in adulthood to other type if hemoglobin + - for heterogizous variant, were resistmt to malaria, which is why high concentration of variant in sardinia + - but have serious bad phenotype if homozygous + - eventually, led to vertex sycle cell breathrpugh + +- covid genetics study + - saw ACE2 prmoter signal + - if had this variant, had 40% lower infection and 40% percent lower hospitaliztion + +- use stringent pvalye 5e-08 gor gene using burden test + - get fewer signals, but theyr are more interesting + + +- lookm up + - fetal hemoglobin + - how sardinia gwas study led to sickle cell breakthough + 20220425 - stanford drug discovery - r&d talk #1 - servier diff --git a/Notes/Yoga/myYogaFlows.txt b/Notes/Yoga/myYogaFlows.txt index 94485b1..c031c36 100644 --- a/Notes/Yoga/myYogaFlows.txt +++ b/Notes/Yoga/myYogaFlows.txt @@ -23,9 +23,22 @@ poses: - balance - T pose - T pose to upright, to T pose flow + - tree pose + - bonzai pose + - balance on block + - dancer pose +- lats + - child prayer pose + active poses: ------------- - from T shape pose on one leg, with blocks in front, go back to mountain, tough floating toe, back to T shape + - from , straight leg off to side, the slide back to child pose +transitions: +------------ + - from warrior 1 ro T position + - from warrior 2 to half moon + - T pose to upright w/ knee in chest, back to T diff --git a/PreProcessing/Text/Vectorizing/nlpFunctions.py b/PreProcessing/Text/Vectorizing/nlpFunctions.py new file mode 100644 index 0000000..a7a79c5 --- /dev/null +++ 
b/PreProcessing/Text/Vectorizing/nlpFunctions.py @@ -0,0 +1,36 @@
+
+# imports
+import pandas as pd
+import numpy as np
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+# test
+print("test")
+
+# get data
+file = open("mobydick.txt")
+mobydick = file.read().replace("\n", " ")
+file.close()
+file = open("hamlet.txt")
+hamlet = file.read().replace("\n", " ")
+file.close()
+
+# setup dataframe as bag of words (labels in the same order as the corpus)
+list_labels = ['hamlet', 'moby']
+list_corpus = [hamlet, mobydick]
+
+# vectorize all the words per work (NLTK stop words require nltk.download('stopwords'))
+vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
+counts = vectorizer.fit_transform(list_corpus)
+df = pd.DataFrame(counts.toarray(), index=list_labels, columns=vectorizer.get_feature_names_out())
+
+# print a slice of the term counts
+print(df.iloc[:, 5000:5010])
+
+# show correlation matrix in 2d
+cosinematrix = np.around(cosine_similarity(df), decimals=2)
+df_cosine = pd.DataFrame(data=cosinematrix)
+df_cosine = df_cosine.rename(index={0: 'hamlet', 1: 'moby'})
+df_cosine = df_cosine.rename(columns={0: 'hamlet', 1: 'moby'})
+
+print(df_cosine)
diff --git a/PreProcessing/Text/Vectorizing/spacyLib.py b/PreProcessing/Text/Vectorizing/spacyLib.py new file mode 100644 index 0000000..6c0b061 --- /dev/null +++ b/PreProcessing/Text/Vectorizing/spacyLib.py @@ -0,0 +1,33 @@
+
+# imports
+import spacy
+from collections import Counter
+from spacy import displacy
+
+# load the small English model (assumes: python -m spacy download en_core_web_sm),
+# not the text file, and raise the default 1M character limit for a full novel
+nlp = spacy.load("en_core_web_sm")
+nlp.max_length = 2_000_000
+
+file_name = "mobydick.txt"
+with open(file_name, 'r') as file:
+    doc = nlp(file.read())
+
+print("type of doc: {}".format(type(doc)))
+
+# create tokens
+tokens = [token.text for token in doc]
+
+# remove stop words (and, or, etc.) and punctuation
+words = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
+print("unfiltered word list: {}".format(len(words)))
+
+words = [word for word in words if word != '\n' and word != '\n\n']
+print("filtered word list: {}".format(len(words)))
+
+# word freq
+word_freq = Counter(words)
+print("most common words: {}".format(word_freq.most_common(10)))
+
+# get entities
+entities = [(ent.text, ent.label_) for ent in doc.ents]
+print("found {} entities".format(len(entities)))
+# displacy.render(doc, style='ent')  # uncomment in a notebook to visualize the entities
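
# optional follow-up (added sketch, assumes it runs after spacyLib.py above so that `doc` exists):
# summarize which entity types spaCy tagged most often in the text
from collections import Counter

label_freq = Counter(ent.label_ for ent in doc.ents)
print("most common entity labels: {}".format(label_freq.most_common(5)))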