diff --git a/DataAnalysis/Visualization/Intro/CT.html b/DataAnalysis/Visualization/Intro/CT.html
new file mode 100644
index 0000000..bec05df
--- /dev/null
+++ b/DataAnalysis/Visualization/Intro/CT.html
@@ -0,0 +1,1311 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DataAnalysis/Visualization/Intro/foliumExample.py b/DataAnalysis/Visualization/Intro/foliumExample.py
new file mode 100644
index 0000000..e4eaa8a
--- /dev/null
+++ b/DataAnalysis/Visualization/Intro/foliumExample.py
@@ -0,0 +1,49 @@
+
+
+# https://www.youtube.com/watch?v=xPk7S-Eb4J4
+
+# imports
+import pandas as pd
+import folium
+
+# display all columns when a DataFrame is printed (no column truncation)
+pd.set_option('display.max_columns', None)
+
+# load the data
+# https://github.com/practicalaifab/folium/blob/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv
+# https://raw.githubusercontent.com/practicalaifab/folium/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv
+df = pd.read_csv("https://raw.githubusercontent.com/practicalaifab/folium/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv")
+
+# get list of hoospitals
+
+# filter for only MA hospitals
+ma = df[df['STATE'] == 'MA']
+ma = ma[['NAME', 'LATITUDE', 'LONGITUDE']]
+
+# display
+# map.head()
+
+
+# get the mean lat/lon for the map crteation
+lat_mean = ma['LATITUDE'].mean()
+lon_mean = ma['LONGITUDE'].mean()
+
+# create folium map
+map = folium.Map(location=[lat_mean, lon_mean], zoom_start=15)
+
+# need to creat list of hospitals to put them on the map
+list_hosp = ma.values.tolist()
+
+# loop over list
+for index in list_hosp:
+ # add to map
+ map.add_child(folium.Marker(location=[index[1], index[2]], popup=index[0], icon=folium.Icon(color='green')))
+
+
+# save map as html file
+map.save("ma.html")
+
+
+# df.show()
+
+
diff --git a/DataAnalysis/Visualization/Intro/foliumWithFunction.py b/DataAnalysis/Visualization/Intro/foliumWithFunction.py
new file mode 100644
index 0000000..6415929
--- /dev/null
+++ b/DataAnalysis/Visualization/Intro/foliumWithFunction.py
@@ -0,0 +1,90 @@
+
+
+# https://www.youtube.com/watch?v=xPk7S-Eb4J4
+
+# imports
+import pandas as pd
+import folium
+
+# display all columns when a DataFrame is printed (no column truncation)
+pd.set_option('display.max_columns', None)
+
+# load the data
+# https://github.com/practicalaifab/folium/blob/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv
+# https://raw.githubusercontent.com/practicalaifab/folium/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv
+df = pd.read_csv("https://raw.githubusercontent.com/practicalaifab/folium/codespace-practicalaifab-probable-space-umbrella-r4476xv556q356qr/data/hospitals.csv")
+
+
+# function
+def choose_state(data, state_option):
+ state = data[data['STATE'] == state_option]
+ state = state[['NAME', 'LATITUDE', 'LONGITUDE']]
+
+ # return
+ return state
+
+def plot_state(data):
+ # get lon/lat
+ lat_mean = data['LATITUDE'].mean()
+ lon_mean = data['LONGITUDE'].mean()
+
+ # get the map
+ map = folium.Map(location=[lat_mean, lon_mean], zoom_start=15)
+
+ # populate the map
+ list_hosp = data.values.tolist()
+ for index in list_hosp:
+ # add to map
+ map.add_child(folium.Marker(location=[index[1], index[2]], popup=index[0], icon=folium.Icon(color='green')))
+
+ # return
+ return map
+
+if __name__ == "__main__":
+ state = 'CT'
+ # get the state data
+ df_state = choose_state(df, state)
+
+ print(df_state.info())
+
+ # get the map
+ map = plot_state(df_state)
+
+ # save map as html file
+ map.save("{}.html".format(state))
+
+
+
+# # get list of hoospitals
+
+# # filter for only MA hospitals
+# ma = df[df['STATE'] == 'MA']
+# ma = ma[['NAME', 'LATITUDE', 'LONGITUDE']]
+
+# # display
+# # map.head()
+
+
+# # get the mean lat/lon for the map crteation
+# lat_mean = ma['LATITUDE'].mean()
+# lon_mean = ma['LONGITUDE'].mean()
+
+# # create folium map
+# map = folium.Map(location=[lat_mean, lon_mean], zoom_start=15)
+
+# # need to creat list of hospitals to put them on the map
+# list_hosp = ma.values.tolist()
+
+# # loop over list
+# for index in list_hosp:
+# # add to map
+# map.add_child(folium.Marker(location=[index[1], index[2]], popup=index[0], icon=folium.Icon(color='green')))
+
+
+# # save map as html file
+# map.save("ma.html")
+
+
+# df.show()
+
+
diff --git a/DataAnalysis/Visualization/Intro/ma.html b/DataAnalysis/Visualization/Intro/ma.html
new file mode 100644
index 0000000..833bbcb
--- /dev/null
+++ b/DataAnalysis/Visualization/Intro/ma.html
@@ -0,0 +1,3795 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DccKP/Burden/Json/burdenQuery.json b/DccKP/Burden/Json/burdenQuery.json
new file mode 100644
index 0000000..f86004b
--- /dev/null
+++ b/DccKP/Burden/Json/burdenQuery.json
@@ -0,0 +1,21 @@
+{
+ "covariates": [
+ "C1",
+ "C2",
+ "C3",
+ "SEX"
+ ],
+ "mdv": "mdv27",
+ "dataset_id": "samples_55k_multi",
+ "variants": [
+ "20_16223957_C_T",
+ "8_118184783_C_T",
+ "8_118165282_C_T"
+ ],
+ "ci_level": 0.95,
+ "phenotype": "t2d",
+ "calc_ci": true,
+ "filters": [],
+ "allele_type": "bi",
+ "samples": []
+}
\ No newline at end of file
diff --git a/DccKP/Burden/Json/federatedBurdenQuery.json b/DccKP/Burden/Json/federatedBurdenQuery.json
new file mode 100644
index 0000000..551cdd3
--- /dev/null
+++ b/DccKP/Burden/Json/federatedBurdenQuery.json
@@ -0,0 +1,10 @@
+{
+ "mdv": "mdv25",
+ "variants": [
+ "8_118184783_C_T"
+ ],
+ "ci_level": 0.95,
+ "phenotype": "WAISTC_CM",
+ "calc_ci": true,
+ "samples": []
+}
\ No newline at end of file
diff --git a/DccKP/Burden/Json/federatedBurdenQuery2.json b/DccKP/Burden/Json/federatedBurdenQuery2.json
new file mode 100644
index 0000000..ca45ffb
--- /dev/null
+++ b/DccKP/Burden/Json/federatedBurdenQuery2.json
@@ -0,0 +1,21 @@
+{
+ "covariates": [
+ "C1",
+ "C2",
+ "C3",
+ "SEX"
+ ],
+ "mdv": "mdv27",
+ "dataset_id": "samples_55k_multi",
+ "variants": [
+ "20_16223957_C_T",
+ "8_118184783_C_T",
+ "8_118165282_C_T"
+ ],
+ "ci_level": 0.95,
+ "phenotype": "t2d",
+ "calc_ci": true,
+ "filters": [],
+ "allele_type": "multi",
+ "samples": []
+}
\ No newline at end of file
diff --git a/DccKP/Burden/Json/federatedKpQuery.json b/DccKP/Burden/Json/federatedKpQuery.json
new file mode 100644
index 0000000..9b634b3
--- /dev/null
+++ b/DccKP/Burden/Json/federatedKpQuery.json
@@ -0,0 +1,47 @@
+{
+ "passback": "123abc",
+ "entity": "variant",
+ "limit": 50,
+ "count": false,
+ "properties": {
+ "cproperty": [
+ "MOST_DEL_SCORE",
+ "VAR_ID",
+ "DBSNP_ID"
+ ],
+ "orderBy": [],
+ "dproperty": {
+ "MAF": [
+ "GWAS_OxBB_mdv1"
+ ],
+ "MAC": [
+ "GWAS_OxBB_mdv1"
+ ]
+ },
+ "pproperty": {
+ "BETA": {
+ "GWAS_OxBB_mdv1": [
+ "FG"
+ ]
+ }
+ }
+ },
+ "filters": [
+ {
+ "dataset_id": "blah",
+ "phenotype": "blah",
+ "operand": "CHROM",
+ "operator": "EQ",
+ "value": "20",
+ "operand_type": "STRING"
+ },
+ {
+ "dataset_id": "GWAS_OxBB_mdv1",
+ "phenotype": "blah",
+ "operand": "MAF",
+ "operator": "GT",
+ "value": 0,
+ "operand_type": "FLOAT"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/DccKP/Hail/hailRegionCall.json b/DccKP/Hail/hailRegionCall.json
new file mode 100644
index 0000000..43cfe52
--- /dev/null
+++ b/DccKP/Hail/hailRegionCall.json
@@ -0,0 +1,26 @@
+{
+ "passback": "example",
+ "api_version": 1,
+ "variant_filters": [
+ {
+ "operand": "chrom",
+ "operator": "eq",
+ "value": "10",
+ "operand_type": "string"
+ },
+ {
+ "operand": "pos",
+ "operator": "gte",
+ "value": 114550452,
+ "operand_type": "integer"
+ },
+ {
+ "operand": "pos",
+ "operator": "lte",
+ "value": 115067678,
+ "operand_type": "integer"
+ }
+ ],
+ "limit": 5000,
+ "count": false
+}
\ No newline at end of file
diff --git a/DccKP/Translator/Aggregator/PathwayGenes/createPathwayInformation.sql b/DccKP/Translator/Aggregator/PathwayGenes/createPathwayInformation.sql
index c0e5c2b..4e932d1 100644
--- a/DccKP/Translator/Aggregator/PathwayGenes/createPathwayInformation.sql
+++ b/DccKP/Translator/Aggregator/PathwayGenes/createPathwayInformation.sql
@@ -10,7 +10,7 @@ create table tran_upkeep.data_pathway (
pathway_updated_name varchar(2000),
systematic_name varchar(200),
pmid varchar(200),
- exact_souurce varchar(200),
+ exact_source varchar(200),
msig_url varchar(2000),
ontology_id varchar(200),
gene_count int(9) not null,
diff --git a/DccKP/Translator/Client/SenmedDB/biothingsSenmedDbRetriver.py b/DccKP/Translator/Client/SenmedDB/biothingsSenmedDbRetriver.py
new file mode 100644
index 0000000..a271534
--- /dev/null
+++ b/DccKP/Translator/Client/SenmedDB/biothingsSenmedDbRetriver.py
@@ -0,0 +1,146 @@
+
+# imports
+import json
+import sys
+import logging
+import datetime
+import os
+import requests
+from pathlib import Path
+import re
+import csv
+import pandas as pd
+
+# constants -- dir_code and dir_data are each assigned twice below; only the second value takes effect
+handler = logging.StreamHandler(sys.stdout)  # NOTE(review): not attached to logger in this section
+logger = logging.getLogger(__name__)
+dir_code = "/Users/mduby/Code/WorkspacePython/"
+dir_code = "/home/javaprog/Code/PythonWorkspace/"
+dir_data = "/Users/mduby//Data/Broad/"
+dir_data = "/home/javaprog/Data/Broad/"
+sys.path.insert(0, dir_code + 'MachineLearningPython/DccKP/Translator/TranslatorLibraries')  # must run before the import below
+import translator_libs as tl
+location_servers = dir_code + "MachineLearningPython/DccKP/Translator/Misc/Json/trapiListServices.json"
+date_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+location_results = dir_data + "Translator/Workflows/PathwayPpargT2d/SenmedDb/"
+file_result = location_results + "sandrinePapersSenmedDbUmls.csv"  # output path written by the __main__ block
+url_biothings_senmeddb = "https://biothings.ncats.io/semmeddb/query?q=pmid:{}&size=100"  # {} is filled with the PubMed id
+max_count = 200  # compared against the running paper count in the __main__ loop
+
+# list of papers
+map_papers = {}
+# map_papers['16150867'] = "3-phosphoinositide-dependent protein kinase-1 activates the peroxisome proliferator-activated receptor-gamma and promotes adipocyte differentiation, Yin "
+# map_papers['8001151'] = "Stimulation of adipogenesis in fibroblasts by PPAR gamma 2, a lipid-activated transcription factor, Tontonoz"
+# map_papers['12021175'] = "Gene expression profile of adipocyte differentiation and its regulation by peroxisome proliferator-activated receptor-gamma agonists, Gerhold"
+# map_papers['10339548'] = "A peroxisome proliferator-activated receptor gamma ligand inhibits adipocyte differentiation. Oberfield"
+# map_papers['7838715'] = "Adipocyte-specific transcription factor ARF6 is a heterodimeric complex of two nuclear hormone receptors, PPAR gamma and RXR alpha, Tontonoz"
+# map_papers['10622252'] = "Dominant negative mutations in human PPARgamma associated with severe insulin resistance, diabetes mellitus and hypertension, Barroso"
+# map_papers['9806549'] = "A Pro12Ala substitution in PPARgamma2 associated with decreased receptor activity, lower body mass index and improved insulin sensitivity, Deeb"
+# map_papers['25157153'] = "Rare variants in PPARG with decreased activity in adipocyte differentiation are associated with increased risk of type 2 diabetes, Majithia"
+
+
+# # 20220529 - new papers
+# map_papers['34900790'] = "The role of the PPARG (Pro12Ala) common genetic variant on type 2 diabetes mellitus risk"
+# map_papers['35462933'] = "PRDM16 Regulating Adipocyte Transformation and Thermogenesis: A Promising Therapeutic Target for Obesity and Diabetes"
+# map_papers['35364246'] = "Therapeutic implications of sonic hedgehog pathway in metabolic disorders: Novel target for effective treatment"
+# map_papers['35341481'] = "Loss of thymidine phosphorylase activity disrupts adipocyte differentiation and induces insulin-resistant lipoatrophic diabetes"
+# map_papers['35054888'] = "Effects of Isorhamnetin on Diabetes and Its Associated Complications: A Review of In Vitro and In Vivo Studies and a Post Hoc Transcriptome Analysis of Involved Molecular Pathways"
+# map_papers['34545810'] = "Impaired mRNA splicing and proteostasis in preadipocytes in obesity-related metabolic disease"
+# map_papers['33959308'] = "Curcumin improves adipocytes browning and mitochondrial function in 3T3-L1 cells and obese rodent model"
+# map_papers['14684744'] = "Dioxin increases C/EBPbeta transcription by activating cAMP/protein kinase A"
+# map_papers['14530861'] = "The FOXC2 -512C>T variant is associated with hypertriglyceridaemia and increased serum C-peptide in Danish Caucasian glucose-tolerant subjects"
+# map_papers['12855691'] = "Overexpression of sterol regulatory element-binding protein-1a in mouse adipose tissue produces adipocyte hypertrophy, increased fatty acid secretion, and fatty liver"
+# map_papers['12677228'] = "The Role of PPARgamma Ligands as Regulators of the Immune Response"
+# map_papers['11928067'] = "Pro12Ala polymorphism in the peroxisome proliferator-activated receptor-gamma2 (PPARgamma2) is associated with higher levels of total cholesterol and LDL-cholesterol in male caucasian type 2 diabetes patients"
+# map_papers['27909015'] = "Diabetic human adipose tissue-derived mesenchymal stem cells fail to differentiate in functional adipocytes"
+# map_papers['27815534'] = "Biological roles of microRNAs in the control of insulin secretion and action"
+# map_papers['27657995'] = "Effects of Streptozotocin-Induced Diabetes on Proliferation and Differentiation Abilities of Mesenchymal Stem Cells Derived from Subcutaneous and Visceral Adipose Tissues"
+# map_papers['27493874'] = "Diabetic mice exhibited a peculiar alteration in body composition with exaggerated ectopic fat deposition after muscle injury due to anomalous cell differentiation"
+# map_papers['27445976'] = "Cooperation between HMGA1 and HIF-1 Contributes to Hypoxia-Induced VEGF and Visfatin Gene Expression in 3T3-L1 Adipocytes"
+
+# 20221118 - search for sandrine's chem papers
+# (Chou DH-C et al., ACS Med Chem Lett 2011, PMID: 21927648,Chou DH-C et al., J Am Chem Soc 2015, PMID: 26042473,Vetere, Amedeo, et al., Nature reviews Drug discovery, PMID: 24525781)
+
+map_papers['21927648'] = "Chou DH-C et al., ACS Med Chem Lett 2011"
+map_papers['26042473'] = "Chou DH-C et al., J Am Chem Soc 2015"
+map_papers['24525781'] = "Vetere, Amedeo, et al., Nature reviews Drug discovery"
+
+def query_biothings(paper_id, paper_name, log=False):
+ '''
+ find the journal if in the results
+ '''
+ # initialize
+ pubmed_id = 'PMID:' + paper_id
+ list_results = []
+ is_found = False
+ url_query = url_biothings_senmeddb.format(paper_id)
+
+ # log
+ if log:
+ print("looking for pubmed id: {}".format(url_query))
+
+ # query the service
+ response = requests.get(url_query)
+
+ # try and catch exception
+ try:
+ json_output = response.json()
+ # if log:
+ # print("got result: \n{}".format(json_output))
+ except ValueError:
+ print("GOT ERROR: skipping")
+
+ # pick put the data
+ map_result = {'pubmed_id': paper_id, 'info': paper_name[0:60], 'predicate': None, 'subject': None, 'subject_type': None, 'object': None, 'object_type': None}
+ if json_output:
+ if isinstance(json_output, dict):
+ if json_output.get('hits'):
+ for child in json_output.get('hits'):
+ is_found = True
+ map_result = child.get('predicate')
+ map_result = {'pubmed_id': paper_id, 'info': paper_name[0:60], 'predicate': child.get('predicate'),
+ 'subj_umls': child.get('subject').get('umls'),
+ 'subject': child.get('subject').get('name'), 'subject_type': child.get('subject').get('semantic_type_name'),
+ 'obj_umls': child.get('object').get('umls'),
+ 'object': child.get('object').get('name'), 'object_type': child.get('object').get('semantic_type_name'),}
+ list_results.append(map_result)
+
+ # add to list
+ if not is_found:
+ list_results.append(map_result)
+
+ # return
+ return list_results
+
+if __name__ == "__main__":
+ # initialize: count of papers handled so far and the accumulated result rows
+ count = 0
+ list_result = []
+
+ # loop through the paper ids (key = PubMed id, value = paper description)
+ for key, value in map_papers.items():
+ # test the max count; NOTE(review): presumably the query below is nested under this check -- confirm
+ if count < max_count:
+ count += 1
+
+ # get the biothings data for the paper
+ list_temp = query_biothings(key, value, log=True)
+
+ # add to the results (builds a new list on every iteration)
+ list_result = list_result + list_temp
+
+ # print the results
+ print("\n=====results")
+ for child in list_result:
+ print(child)
+
+ # create dataframe
+ df_papers = pd.DataFrame(list_result)
+ # temporarily display up to 999 rows
+ with pd.option_context('display.max_rows', 999):
+ print (df_papers)
+
+ # write out the file (tab-separated, despite the .csv name in file_result)
+ df_papers.to_csv(file_result, sep='\t')
+ print("wrote out the file to: {}".format(file_result))
+
diff --git a/DccKP/Translator/TranslatorLibraries/__pycache__/translator_libs.cpython-38.pyc b/DccKP/Translator/TranslatorLibraries/__pycache__/translator_libs.cpython-38.pyc
index 958f3ec..2e39ba4 100644
Binary files a/DccKP/Translator/TranslatorLibraries/__pycache__/translator_libs.cpython-38.pyc and b/DccKP/Translator/TranslatorLibraries/__pycache__/translator_libs.cpython-38.pyc differ
diff --git a/DccKP/Translator/Workflows/Json/Queries/Pathways/PpargPathways/pathwayLipidDiffReactomeQuery.json b/DccKP/Translator/Workflows/Json/Queries/Pathways/PpargPathways/pathwayLipidDiffReactomeQuery.json
index 62bcded..ff92296 100644
--- a/DccKP/Translator/Workflows/Json/Queries/Pathways/PpargPathways/pathwayLipidDiffReactomeQuery.json
+++ b/DccKP/Translator/Workflows/Json/Queries/Pathways/PpargPathways/pathwayLipidDiffReactomeQuery.json
@@ -14,7 +14,6 @@
"categories": [
"biolink:Pathway"
],
- "constraints": [],
"ids": [
"REACT:R-HSA-381340"
],
@@ -24,7 +23,6 @@
"categories": [
"biolink:Disease"
],
- "constraints": [],
"is_set": false
}
}
diff --git a/DccKP/Translator/Workflows/Json/Queries/Pathways/ppargT2dPathwaysQuery.json b/DccKP/Translator/Workflows/Json/Queries/Pathways/ppargT2dPathwaysQuery.json
index 1343e83..46fe255 100644
--- a/DccKP/Translator/Workflows/Json/Queries/Pathways/ppargT2dPathwaysQuery.json
+++ b/DccKP/Translator/Workflows/Json/Queries/Pathways/ppargT2dPathwaysQuery.json
@@ -3,12 +3,10 @@
"query_graph": {
"edges": {
"e01": {
- "constraints": [],
"object": "pathway",
"subject": "gene"
},
"e02": {
- "constraints": [],
"object": "disease",
"subject": "pathway"
}
@@ -18,7 +16,6 @@
"categories": [
"biolink:Disease"
],
- "constraints": [],
"ids": [
"MONDO:0005148"
],
@@ -28,7 +25,6 @@
"categories": [
"biolink:Gene"
],
- "constraints": [],
"ids": [
"NCBIGene:5468"
],
@@ -38,7 +34,6 @@
"categories": [
"biolink:Pathway"
],
- "constraints": [],
"is_set": false
}
}
diff --git a/Notes/CheatSheets/Genetics/geneticsTermsCheatSheet.txt b/Notes/CheatSheets/Genetics/geneticsTermsCheatSheet.txt
new file mode 100644
index 0000000..5eba448
--- /dev/null
+++ b/Notes/CheatSheets/Genetics/geneticsTermsCheatSheet.txt
@@ -0,0 +1,20 @@
+
+
+- genetics significance
+ - We want to use that for the gene pheWAS plots but use 5e-8 for variants
+
+
+
+ - tools
+ - gregor - for annotations/enrichments
+ - magma - for association stats
+ - LDSC - LD score regression
+ - takes into account variants that travel together and their effect sizes
+
+
+
+Genetic variation can affect protein levels. Consider a variant that reduces the amount of mRNA transcribed: this could have a profound effect on the amount of mRNA available to translate into a protein.
+
+Consider another variant that does not affect the abundance of the mRNA transcribed, but alters one of the many important sequences that are required for translation (co-factor binding sites, ribosomal binding sites, start site, etc.).
+
+These are just a couple of examples that first came to mind, but there are many more. For instance, a variant may increase expression of a particular microRNA that in turn inhibits the translation of another mRNA molecule. I have not included any references as this is more a logical exercise - doubtless there are many more ways a genetic variant can affect protein abundance.
diff --git a/Notes/CheatSheets/ML/pysparkCheatSheet.txt b/Notes/CheatSheets/ML/pysparkCheatSheet.txt
index 6c20549..c9ef3c3 100644
--- a/Notes/CheatSheets/ML/pysparkCheatSheet.txt
+++ b/Notes/CheatSheets/ML/pysparkCheatSheet.txt
@@ -50,3 +50,47 @@ def round_down(x):
round_down_udf = udf(round_down, IntegerType()) # 2nd arg is type returned
df.select(round_down_udf('fare').alias('int_fare'))
+
+
+
+- describe
+ df_export.printSchema()
+ df_export.count()
+ df_export.describe()
+ df_export.show()
+
+- data aggregation
+ df_export.groupBy("chromosome").count().orderBy("chromosome").show(25, False)
+
+
+- select subset of columns
+ df_export = df_nonnull_load.select("dbSnp", 'chromosome', 'position')
+ df.where(F.col("count").isNull()).show()
+
+
+- split column into other columns
+ split_col = pyspark.sql.functions.split(df['my_str_col'], '-')
+ df = df.withColumn('NAME1', split_col.getItem(0))
+
+- export data
+ df.coalesce(1).write.csv('result.csv') # one file
+
+ df_export.write.mode('overwrite').csv(out_file)
+ os.system("cat output/test/p* > output/test.csv") # one file; will not work in distributed env
+
+ df_export.write.mode('overwrite').option("delimiter", "\t").csv(out_dir)
+
+
+ srcdir = '%s/variants/*/%s' % (s3dir, args.phenotype)
+ outdir = '%s/out/metaanalysis/variants/%s' % (s3dir, args.phenotype)
+
+
+
+
+
+-- scratch
+ opts = argparse.ArgumentParser()
+ opts.add_argument('phenotype')
+
+
+
\ No newline at end of file
diff --git a/Notes/ML/pytorchCheatSheet.txt b/Notes/CheatSheets/ML/pytorchCheatSheet.txt
similarity index 100%
rename from Notes/ML/pytorchCheatSheet.txt
rename to Notes/CheatSheets/ML/pytorchCheatSheet.txt
diff --git a/Notes/CheatSheets/ML/sklearnCheatSheet.txt b/Notes/CheatSheets/ML/sklearnCheatSheet.txt
index 2a9d09c..6221014 100644
--- a/Notes/CheatSheets/ML/sklearnCheatSheet.txt
+++ b/Notes/CheatSheets/ML/sklearnCheatSheet.txt
@@ -6,3 +6,7 @@ model_selection:
- StratifiedKFold - takes group information into account for balanced folds for classification
- train_test_split() - train/test splitting
+
+Time series
+-----------
+- needs to be normalized
diff --git a/Notes/CheatSheets/Python/pythonCheatSheet.txt b/Notes/CheatSheets/Python/pythonCheatSheet.txt
index c07a654..b106884 100644
--- a/Notes/CheatSheets/Python/pythonCheatSheet.txt
+++ b/Notes/CheatSheets/Python/pythonCheatSheet.txt
@@ -18,11 +18,21 @@ json:
with open('strings.json') as f:
d = json.load(f)
print(d)
+
+ - display json
+ print("build trapi payload: \n{}".format(json.dumps(payload, indent=2)))
virtual env:
------------
- python3 -m venv tutorial-env
+strings:
+--------
+
+pandas:
+-------
+ - df_results = pd.DataFrame.from_dict(list_gene_chemical)
+
lists:
------
- chunks = [data[x:x+100] for x in range(0, len(data), 100)] # split list into sublists of size
@@ -64,6 +74,14 @@ files:
for line in file:
print(line.rstrip())
+jupyter:
+--------
+- expand seen rows for pandas
+ from IPython.display import display
+ pd.options.display.max_rows = 999
+ display(df_results)
+
+
misc:
@@ -76,12 +94,33 @@ databases:
cur = db.cursor()
cur.execute(sql).fetchall()/fetchone()
+
+sqlalchemy:
+-----------
+- return new rowid
+ with engine.connect() as conn:
+ sql_params = data.dict()
+ sql_params.update({'s3_bucket_id': s3_record_id, 'metadata': json.dumps(data.metadata)})
+ res = conn.execute(text("""
+ INSERT INTO records (s3_bucket_id, name, metadata, data_source_type, data_source, data_type, genome_build,
+ ancestry, data_submitter, data_submitter_email, institution, sex, global_sample_size, t1d_sample_size,
+ bmi_adj_sample_size, status, additional_data) VALUES(:s3_bucket_id, :name, :metadata, :data_source_type,
+ :data_source, :data_type, :genome_build, :ancestry, :data_submitter, :data_submitter_email, :institution,
+ :sex, :global_sample_size, :t1d_sample_size, :bmi_adj_sample_size, :status, :additional_data)
+ """), sql_params)
+ conn.commit()
+ s3.create_record_directory(s3_record_id)
+ return s3_record_id, res.lastrowid
+
objects:
--------
- __init__(self)
- __repl__(self)
- __str__(self)
+ gene_association: GenePhenotypeAssociation
+ for gene_association in list_gene_assoc:
+
debug:
------
- help(