-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathNominalResultsParser.py
executable file
·122 lines (90 loc) · 4.94 KB
/
NominalResultsParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/python
print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
print " QTLTools NOMINAL RESULTS PARSER"
print ""
print ""
print "* Written by : Tim Bezemer | t.bezemer-2@umcutrecht.nl"
print "* Updated by : Jacco Schaap | jacco_schaap@hotmail.com"
print "* Suggested for by : Sander W. van der Laan | s.w.vanderlaan-2@umcutrecht.nl"
print "* Last update : 2018-03-02"
print "* Name : NominalResultsParser"
print "* Version : v1.2.5"
print ""
print "* Description : In case of an eQTL analysis this script will collect all "
print " analysed genes and list their associated ProbeIDs as well as the"
print " number of variants analysed."
print " In case of a mQTL analysis this script will collect all "
print " analysed CpGs and their associated genes, as well as the "
print " the number of variants analysed."
print " In both cases it will produce a LocusZoom (v1.3+) input file"
print " which contains the variant associated (MarkerName) and the "
print " p-value (P-value)."
print ""
print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
import gzip
import pandas as pd
from sys import argv, exit
from os import mkdir
from os.path import isdir, isfile
from subprocess import call
import numpy as np
fn_nom = argv[1] # Nominal data. Either clumped or normal
def main():
parser()
#if clump == 'Y':
# read_summary()
### created by Tim Bezemer
def parser():
if len(argv) < 1 or not isfile(argv[1]):
print "Invalid filename was supplied."
print "Usage: " + argv[0] + " [filename]"
print "Please make sure the data file contains the following columns:"
print "Locus\tGeneName\tProbeID\tVARIANT\tNominal_P\t"
exit()
###data = pd.read_csv(fn, '\t')
data = pd.read_csv(fn_nom)
print "Checking for/creating directories loci/ and probes/ ..."
if not isdir("_loci"): mkdir("_loci")
if not isdir("_probes"): mkdir("_probes")
# One file per locus (VariantID as name), containing all gene names and associated probes. Count Gene-Probe pairs.
loci_ids = set(data['Locus'])
loci = {}
for l in loci_ids:
print "Generating mapping for locus " + l
loci[l] = dict()
with open("_loci/" + l + ".txt", "w") as locus_mapping:
print >> locus_mapping, "Locus\tGeneName\tProbeID\tN_Variants\tN_Sign_Variants"
GeneNames = list(set(data[data['Locus'] == l]['GeneName']))
print GeneNames
for g in GeneNames:
### print g
loci[l][g] = []
# print "\t* gene " + (g if g != np.nan else "NA")
print "\t* gene " + str(g)
ProbeIDs = list(set(data[(data['Locus'] == l) & (data['GeneName'] == g)]['ProbeID']))
for p in ProbeIDs:
loci[l][g].append(p)
print "\t\t- collecting variants for probe " + p
variants = data[(data['Locus'] == l) & (data['GeneName'] == g) & (data['ProbeID'] == p)][
['VARIANT', 'Nominal_P', 'Chr', 'BP']]
variants_p_below_threshold = data[
(data['Locus'] == l) & (data['GeneName'] == g) & (data['ProbeID'] == p) & (
data['Nominal_P'] < 0.05)][['VARIANT', 'Nominal_P', 'Chr', 'BP']]
variants.rename(columns={"VARIANT": "MarkerName", "Nominal_P": "P-value"}, inplace=True)
variants["MarkerName"] = variants.apply(lambda x: str(x['Chr']) + ":" + str(x['BP']), axis=1)
variants.rename(columns={"Chr": "RSID"}, inplace=True)
variants["RSID"] = data[(data['Locus'] == l) & (data['GeneName'] == g) & (data['ProbeID'] == p)][
'VARIANT']
# variants["RSID"] = variants.apply(lambda x: str(x['VARIANT']), axis=1)
variants = variants.drop('BP', axis=1)
# print "***DEBUG*** show variant on next line (second time):"
# print variants
n_of_variants = len(variants)
n_of_variants_below_threshold = len(variants_p_below_threshold)
# Output locus, gene, probeID, the variant count, and the N of significant hits per gene to the mapping file
print >> locus_mapping, "\t".join([l, g, p, str(n_of_variants), str(n_of_variants_below_threshold)])
# Construct the file containing variants for LocusZoom
with open("_probes/" + "_".join([l, g, p]) + ".lz", "w") as probe_file:
variants.to_csv(probe_file, sep='\t', index=False)
print "Pfieuw. That was a lot! Let's have a beer."
main()