-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalt_models.py
266 lines (231 loc) · 16 KB
/
alt_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/env python3
# Change log
# * v1.0.0 2024-04-11: Initial version.
# Version information
VERSION_NAME = 'AltModels'
VERSION = '1.0.0'
VERSION_DATE = '2024-04-11'
COPYRIGHT = 'Copyright 1979-2024. Sander W. van der Laan | s.w.vanderlaan [at] gmail [dot] com | https://vanderlaanand.science.'
COPYRIGHT_TEXT = f'\nThe MIT License (MIT). \n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and \nassociated documentation files (the "Software"), to deal in the Software without restriction, \nincluding without limitation the rights to use, copy, modify, merge, publish, distribute, \nsublicense, and/or sell copies of the Software, and to permit persons to whom the Software is \nfurnished to do so, subject to the following conditions: \n\nThe above copyright notice and this permission notice shall be included in all copies \nor substantial portions of the Software. \n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, \nINCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR \nPURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS \nBE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, \nTORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE \nOR OTHER DEALINGS IN THE SOFTWARE. \n\nReference: http://opensource.org.'
# Fixed variables
# Some general settings
SOFTWARE = "/hpc/local/Rocky8/dhl_ec/software"
# Path to PLINK
# PLINK19 = "{SOFTWARE}/plink19"
PLINK19 = "/Users/slaan3/bin/plink19"
PLINK2 = "/Users/slaan3/bin/plink2"
# Genetic data paths
# GENDATA = "/hpc/dhl_ec/data/_ae_originals/AEGS_COMBINED_EAGLE2_1000Gp3v5HRCr11/aegs.qc.1kgp3hrcr11.idfix"
GENDATA = "/Users/slaan3/PLINK/_AE_ORIGINALS/AEGS_COMBINED_EAGLE2_1000Gp3v5HRCr11/aegs.qc.1kgp3hrcr11.idfix"
# Import required packages
import os
import argparse
from argparse import RawTextHelpFormatter
import shutil
import time
from datetime import datetime
from datetime import timedelta
import subprocess
def submit_jobs(phenotypes, phenofile, covariates, glm, snp, chromosome, projectdir, outputdir, model=None, interaction=None, interactionparam=None, sex="sex", execute=False, submit=False, mailtype="FAIL", memory="8G", time="00:15:00", email="s.w.vanderlaan-2@umcutrecht.nl", verbose=False):
# Remove any extra characters from paths
phenofile = phenofile[0] # Extract the first element of the list
# covariates = ", ".join(covariates) # Join covariates into a comma-separated string
# selection = selection[0] # Extract the first element of the list
# selectionfile = selectionfile[0] # Extract the first element of the list
# Format project directory and output directory paths
projectdir = os.path.abspath(projectdir)
outputdir = os.path.abspath(outputdir)
# if model is None:
# model = ""
# if sex is None:
# sex = ""
# if interaction is None:
# interaction = ""
# Print extra information
print(f"Running {glm} regression for SNP {snp} on chromosome {chromosome} using model {model}.")
# check sex correction
if sex == "sex":
print(f"By default analyses are corrected for sex (based on the presence of chromosome X).")
elif sex == "no-x-sex":
print(f"Analyses are not corrected for sex (based on the presence of chromosome X).")
else:
print(f"Sex correction: {sex}")
# check interaction
if interaction == "interaction":
print(f"Adding interaction tests using --parameter {interactionparam}.")
else:
print(f"No interaction tests are executed.")
# check if model is provided
if model is None:
print(f"No model is provided. Using default additive {glm} model.")
else:
print(f"Using {model} model for {glm} analysis.")
# print additional information
if verbose:
print(f"PLINK location: {PLINK19}")
print(f"Phenotypes are read from file {phenofile}.")
print(f"The following phenotypes are tested: [{', '.join(phenotypes)}]")
print(f"In addition the following covariates are included: [{', '.join(covariates)}]")
# print(f"Data are filtered on value {selection} in {selectionfile}.")
print(f"Project directory: {projectdir}")
print(f"Output directory: {outputdir}")
if submit:
print(f"Submitting SLURM jobs.")
print(f"Email address: {email}")
print(f"Mail type settings: {mailtype}")
print(f"Memory allocation: {memory}")
print(f"Time allocation: {time}")
else:
print(f"Printing commands.")
# Define PLINK command based on model
# ref: https://www.cog-genomics.org/plink/1.9/input#vcf
# --const-fid converts sample IDs to within-family IDs while setting all family IDs to a single value (default '0').
# ref: https://www.cog-genomics.org/plink/1.9/input#plink_irreg
# --no-sex
# ref: https://www.cog-genomics.org/plink/1.9/assoc#linear
# interaction
if model is None:
if interaction:
if interactionparam is None:
plink_command = f"{PLINK2} --vcf {GENDATA}.chr{chromosome}.vcf.gz --const-fid --glm {sex} {interaction} --no-sex --covar-variance-standardize"
else:
plink_command = f"{PLINK2} --vcf {GENDATA}.chr{chromosome}.vcf.gz --const-fid --glm {sex} {interaction} --parameters {interactionparam} --no-sex --covar-variance-standardize"
else:
plink_command = f"{PLINK19} --vcf {GENDATA}.chr{chromosome}.vcf.gz --const-fid --{glm} {sex} --no-sex --missing-code -9"
else:
if interaction:
if interactionparam is None:
plink_command = f"{PLINK2} --vcf {GENDATA}.chr{chromosome}.vcf.gz --const-fid --glm {model} {sex} {interaction} --no-sex --covar-variance-standardize"
else:
plink_command = f"{PLINK2} --vcf {GENDATA}.chr{chromosome}.vcf.gz --const-fid --glm {model} {sex} {interaction} --parameters {interactionparam} --no-sex --covar-variance-standardize"
else:
plink_command = f"{PLINK19} --vcf {GENDATA}.chr{chromosome}.vcf.gz --const-fid --{glm} {model} {sex} --no-sex --missing-code -9"
# Construct output name
if model is None:
output_name = f"{glm}_chr{snp.replace(':', '-')}"
else:
output_name = f"{glm}_{model}_chr{snp.replace(':', '-')}"
# Submit jobs for each phenotype
for phenotype in phenotypes:
# Command to run PLINK
command = f"{plink_command} --pheno {projectdir}/{phenofile} --pheno-name {phenotype} --covar {projectdir}/{phenofile} --covar-name {', '.join(covariates)} --snp {snp} --out {outputdir}/{output_name}_{phenotype}"
# Submit job or print command
if submit:
os.system(f"sbatch --mail-user {email} --mail-type {mailtype} --mem {memory} --time {time} --job-name {phenotype} --output {outputdir}/slurm-%j.out --error {outputdir}/slurm-%j.err --wrap '{command}'")
else:
print(f"- testing {phenotype}")
if verbose:
print(command)
print()
# Execute the command
if execute:
subprocess.run(command, shell=True)
# Define main function
def main():
parser = argparse.ArgumentParser(description=f'''
+ {VERSION_NAME} v{VERSION} +
This script runs logistic (`--logistic`) or linear regression (`--linear`) in PLINK v1.9 for a
given SNP (`--snp`) and chromosome (`--chr`) and a given list of phenotypes (`--phenotypes`)
and covariates (`--covariates`). The phenotypes and covariates are assumed to be in the same
file (`--phenofile`). By default analyses are corrected for sex (`--sex`).
The analysis will be run within a given project directory (`--projectdir`) and output
directory (`--outputdir`).
Optionally, the script will run the analysis for each phenotype separately while applying
a given genetic model (`--model`). By default the 'additive' model is assumed.
Optionally, the script will run an interaction analysis (`--interaction`), by default
this is done on all combinations, which can be changed using `--interactionparam`. The
interaction analysis is run in PLINK v2.0.
By default the script will print the commands to run the analysis. Use the `--execute` flag to
run the commands. Optionally, the script will submit SLURM jobs for each phenotype. In that case
a user can provide their email address (`--email`) and mail type settings (`--mailtype`). The
memory allocation for each job can be set using the `--memory` flag, and the time allocation
using the `--time` flag.
Use the `--verbose` flag to print extra information.
To get more information on the script, please use the `--help` flag. The `--version` flag will provide
you with the version information.
It is assumed that the genetic data is stored in VCF v4.2 format.
Example usage:
python alt_models.py --glm logistic --model genotypic --phenotypes pheno1 pheno2 pheno3 --phenofile phenocov.txt --covariates cov1 cov2 cov3 --selection "selected" --selectioncol 3 --snp rs123456 --chr 20 --projectdir /hpc/project/super_project --outputdir /hpc/project/super_project/output
''',
epilog=f'''
+ {VERSION_NAME} v{VERSION}. {COPYRIGHT} \n{COPYRIGHT_TEXT}+''',
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--phenotypes", "-ph", nargs="+", help="List of phenotypes. For example: 'phenotype1 phenotype2 phenotype3'.")
parser.add_argument("--phenofile", "-pf", nargs="+", help="File containing the phenotypes. For example: 'phenocov.txt'.")
parser.add_argument("--covariates", "-c", nargs="+", help="List of covariates. For example: 'covariate1 covariate2 covariate3'.")
# parser.add_argument("--selection", "-sel", nargs="+", help="Value to filter data on; assuming this column is present in the phenocov-file. For example: 'selected'. Optional. [NOT IMPLEMENTED YET]")
# parser.add_argument("--selectionfile", "-sf", nargs="+", help="File containing the to-be selected samples. For example: 'selection.txt'. Optional. [NOT IMPLEMENTED YET]")
parser.add_argument("--glm", "-g", choices=["logistic", "linear"], default="logistic", help="Statistical test to use. Options: linear, logistic. Default: logistic.")
parser.add_argument("--model", "-m", choices=["genotypic", "dominant", "recessive"], help="Model to use. Options: genotypic, dominant, recessive. Default: genotypic. Optional")
parser.add_argument("--interaction", "-i", choices=["interaction"], help="Add in interaction analysis. Optional.")
parser.add_argument("--interactionparam", "-ip", choices=["1", "2", "3", "4", "5", "6", "7"], help="Add interaction parameter (see https://www.cog-genomics.org/plink/1.9/assoc#linear); use with `--interaction`. Optional.")
parser.add_argument("--sex", choices=["sex", "no-x-sex"], default="sex", help="By default sex is added as covariate (based on the presence of chromosome X). Default: sex. Optional.")
parser.add_argument("--snp", help="SNP to test. For example: 'rs123456' or '20:123456'.")
parser.add_argument("--chromosome", "-chr", help="Chromosome to analyze. For example: '20'.")
parser.add_argument("--projectdir", "-p", default = "/hpc/project/super_project", help="Project directory. For example: '/hpc/project/super_project'")
parser.add_argument("--outputdir", "-o", default = "/hpc/project/super_project/output", help="Output directory. For example: '/hpc/project/super_project/output'")
parser.add_argument("--execute", "-ex", default=False, help="Execute the command. Default: print commands. Optional.")
parser.add_argument("--submit", "-s", default=False, help="Submit SLURM jobs. Default: print commands. Optional.")
parser.add_argument("--email", "-e", default="s.w.vanderlaan-2@umcutrecht.nl", help="Your email address. Default: s.w.vanderlaan-2@umcutrecht.nl. Optional.")
parser.add_argument("--mailtype", "-mail", default="FAIL", help="Mail type settings. Options: NONE, BEGIN, END, FAIL, REQUEUE, ALL. Default: FAIL. Optional.")
parser.add_argument("--memory", "-mem", default="8G", help="Memory allocation for SLURM job. Default: 8G. Optional.")
parser.add_argument("--time", "-t", default="00:15:00", help="Time allocation for SLURM job. Default: 00:15:00. Optional.")
parser.add_argument('--verbose', '-v', action='store_true', help='Print extra information. Optional.')
parser.add_argument('--version', '-V', action='version', version=f'%(prog)s {VERSION} ({VERSION_DATE}).')
args = parser.parse_args()
# Start the script
print(f"+ {VERSION_NAME} v{VERSION} ({VERSION_DATE}) +")
print(f"\nStarting the testing of alternative models on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")
# Check if required arguments are provided
# if args.interaction == "interaction" and args.interactionparam is None:
# print(f"Error: Interaction analysis is requested, but no parameter is provided (see https://www.cog-genomics.org/plink/1.9/assoc#linear).")
# parser.print_help()
# exit(1)
required_args = ['phenotypes', 'phenofile', 'covariates', 'glm', 'snp', 'chromosome', 'projectdir', 'outputdir']
missing_args = [arg for arg in required_args if not getattr(args, arg)]
if missing_args:
print(f"Error: The following required arguments are missing: {', '.join(missing_args)}")
parser.print_help()
exit(1)
# Check if project directory and output directory exist
if not os.path.exists(args.projectdir):
print(f"Error: Project directory '{args.projectdir}' does not exist.")
exit(1)
if not os.path.exists(args.outputdir):
print(f"Output directory '{args.outputdir}' does not exist. Making it for you.")
os.makedirs(args.outputdir)
# Start the timer
start_time = time.time()
# Get today's date
today_date = datetime.now()
# Format the date as yyyymmdd
formatted_today = today_date.strftime("%Y%m%d")
# Submit jobs or printing command
if args.submit:
submit_jobs(args.phenotypes, args.phenofile, args.covariates, args.glm, args.snp, args.chromosome, args.projectdir, args.outputdir, args.model, args.interaction, args.interactionparam, args.sex, submit=True, mailtype=args.mailtype, memory=args.memory, time=args.time, email=args.email, verbose=args.verbose)
# can probably be removed
# if args.model is None:
# submit_jobs(args.phenotypes, args.phenofile, args.covariates, args.glm, "", args.snp, args.chromosome, args.projectdir, args.outputdir, args.interaction, args.sex, submit=True, mailtype=args.mailtype, memory=args.memory, time=args.time, email=args.email, verbose=args.verbose)
else:
submit_jobs(args.phenotypes, args.phenofile, args.covariates, args.glm, args.snp, args.chromosome, args.projectdir, args.outputdir, args.model, args.interaction, args.interactionparam, args.sex, execute=args.execute, submit=False, verbose=args.verbose)
# can probably be removed
# if args.submit:
# submit_jobs(args.phenotypes, args.phenofile, args.covariates, args.selection, args.selectioncol, args.glm, args.model, args.snp, args.chr, args.projectdir, args.outputdir, args.sex, args.submit, args.mailtype, args.memory, args.time, args.email, args.verbose)
# Calculate the elapsed time in seconds
elapsed_time = time.time() - start_time
# Convert seconds to a timedelta object
time_delta = timedelta(seconds=elapsed_time)
# Extract hours, minutes, seconds, and milliseconds
hours, remainder = divmod(time_delta.seconds, 3600)
minutes, seconds = divmod(remainder, 60)
milliseconds = round(time_delta.microseconds / 1000)
# Print the script execution time in the desired format
formatted_time = f"{hours} hours, {minutes} minutes, {seconds} seconds, {milliseconds} milliseconds"
print(f"\nScript executed on {today_date.strftime('%Y-%m-%d')}. Total execution time was {formatted_time} (minus writing time).")
# Run the main function
if __name__ == "__main__":
main()
# Print the version number
print(f"\n+ {VERSION_NAME} v{VERSION} ({VERSION_DATE}). {COPYRIGHT} +")
print(f"{COPYRIGHT_TEXT}")
# End of file