-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcutrun.sh
executable file
·446 lines (399 loc) · 13.1 KB
/
cutrun.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
#!/bin/bash
### Author: Niek Wit (University of Cambridge) 2021
threads=""
rename_config=""
genome=""
dedup=""
cutoff=0
align_mm=0
big_wig=""
reads_length=""
peak=""
ngsplot=""
qc=""
PICARD=$(find $HOME -name picard.jar) #make sure you only have one copy of picard.jar!
SCRIPT_DIR=$(find $HOME -type d -name "CUT-RUN") #make sure you only have one copy of the CUT-RUN folder!
WORK_DIR=$(pwd)/
function usage {
echo "Usage: $0 [-g <genome build>] [-r OPTIONAL:renames NGS files] [-m <INT> mismatches allowed for alignment (standard is zero) OPTIONAL] [-t <INT> number of CPU threads to be used]"
exit 2
}
while getopts 't:g:rdm:f:blpnq?h' c
do
case $c in
t)
threads=$OPTARG;;
g)
genome="$OPTARG";;
r)
rename_config="rename.config";;
d)
dedup="TRUE";;
m)
align_mm=$OPTARG;;
f)
cutoff=$OPTARG;;
b)
big_wig="TRUE";;
l)
reads_length="TRUE";;
p)
peak="TRUE";;
n)
ngsplot="TRUE";;
q)
qc="TRUE";;
h|?)
usage;;
esac
done
###checking input variables
if [[ $align_mm == 0 ]] || [[ $align_mm == 1 ]];
then
:
else
echo "ERROR: -m parameter must be 0 or 1"
usage
exit 1
fi
if [[ ! $cutoff =~ ^-?[0-9]+$ ]];
then
echo "ERROR: filter cutoff should be an integer."
exit 1
fi
if [[ -z "$threads" ]]
then
threads=$(nproc --all)
elif [[ ! -z "$threads" ]] && [[ ! $threads =~ ^-?[0-9]+$ ]]
then
echo "ERROR: number of threads should be an integer."
exit 1
fi
###renames files if -r flag, and rename template is present
if [[ ! -z "$rename_config" ]] && [[ -e "rename.config" ]];
then
input=$rename_config
while IFS= read -r line
do
original_file=$(echo "$line" | cut -d ";" -f 1) #splits line of config file into original file name
new_file=$(echo "$line" | cut -d ";" -f 2) #splits line of config file into new file name
mv "raw-data/${original_file}" "raw-data/${new_file}"
done < "$input"
fi
###checks if file extension is .fq or .fastq
input_format=$(ls raw-data/*.gz | head -1)
if [[ $input_format == *"fastq"* ]]
then
input_extension="fastq.gz"
elif [[ $input_format == *"fq"* ]]
then
input_extension="fq.gz"
fi
###Fastq quality control
fastqc_folder="fastqc/"
if [[ ! -d "$fastqc_folder" ]];
then
echo "Performing FASTQC"
mkdir fastqc
fastqc -q --threads "$threads" -o fastqc/ raw-data/*$input_extension
echo "Performing MultiQC"
multiqc -o fastqc/ fastqc/ . 2>> multiqc.log
else
echo "FASTQC/MultiQC already performed"
fi
###trimming samples
trim_folder="trim/"
if [[ ! -d "$trim_folder" ]];
then
echo "Performing read trimming"
mkdir -p trim
for read1 in raw-data/*"_1.$input_extension"
do
echo $read1 >> trim.log
read2="${read1%_1."$input_extension"}_2."$input_extension""
trim_galore -j 4 -o ./trim --paired $read1 $read2 2>> trim.log
done
else
echo "Trimming already performed"
fi
###aligning samples
bam_folder="bam/"
if [[ ! -d "$bam_folder" ]];
then
mkdir -p bam
#touch align.log
index_path=$(cat "$SCRIPT_DIR/genome.yml" | shyaml get-value genome.$genome.0 | awk '{print$2}')
black_list_path=$(cat "$SCRIPT_DIR/genome.yml" | shyaml get-value genome.$genome.1 | awk '{print$2}')
for read1 in trim/*"_1_val_1.fq.gz"
do
read2="${read1%_1_val_1.fq.gz}_2_val_2.fq.gz"
file_name="${read1##*/}"
base_name="${file_name%_1_val_1.fq.gz}"
echo "$base_name" >> align.log
align_output="bam/$base_name.bam"
echo "Aligning $base_name"
bowtie2 -p "$threads" --local --very-sensitive-local --no-unal --no-mixed --no-discordant --phred33 -I 10 -X 700 -x "$index_path" -1 "$read1" -2 "$read2" 2>> align.log | samtools view -q 15 -F 260 -bS -@ "$threads" - | bedtools intersect -v -a "stdin" -b "$black_list_path" -nonamecheck | samtools sort -@ "$threads" - > "$align_output"
samtools index -b -@ "$threads" "$align_output"
done
else
echo "Alignment already performed"
fi
#--dovetail --phred33 #Yuan settings
#--local --very-sensitive-local --no-unal --no-mixed --no-discordant --phred33 -I 10 -X 700 #Henikoff settings
###deduplication
dedup_folder="deduplication/"
if [[ "$dedup" == "TRUE" ]] && [[ ! -d "$dedup_folder" ]];
then
mkdir -p deduplication
touch deduplication.log
for bam in bam/*.bam
do
echo "Removing duplicates $bam"
dedup_output="${bam%.bam}-dedup.bam"
dedup_output=deduplication/${dedup_output##*/}
java -jar $PICARD MarkDuplicates INPUT="$bam" OUTPUT="$dedup_output" REMOVE_DUPLICATES=TRUE METRICS_FILE=$dedup_output-metric.txt 2>> deduplication.log
samtools index -b -@$threads "$dedup_output"
done
else
echo "Deduplication already performed/not selected"
fi
###QC analysis of samples
qc_folder="qc/"
qc_dedup_folder="qc-dedup/"
function qc {
sorted_sample_list=$(ls $bam_folder/*.bam | tr "\n" " ")
multiBamSummary bins --numberOfProcessors "$threads" -b $sorted_sample_list -o $qc_folder/multibamsummary.npz 2>> qc.log #deeptools
plotPCA -in $qc_folder/multibamsummary.npz -o $qc_folder/PCA_readCounts_all.png -T "Principle Component Analysis" 2>> qc.log #deeptools
plotCorrelation -in $qc_folder/multibamsummary.npz --corMethod spearman --skipZeros --plotTitle "Spearman Correlation of Read Counts" --colorMap viridis --whatToPlot heatmap -o $qc_folder/heatmap_SpearmanCorr_readCounts.png --outFileCorMatrix SpearmanCorr_readCounts.tab #deeptools
}
if [[ "$qc" == "TRUE" ]];
then
if [[ -d "$bam_folder" ]] && [[ ! -d "$qcfolder" ]];
then
qc
elif [[ -d "$dedup_folder" ]] && [[ ! -d "$qc_folder_dedup" ]];
then
bam_folder=$dedup_folder
qc_folder=$qc_dedup_folder
qc
else
echo "QC analysis already performed"
fi
fi
###generate histogram of reads lengths
function reads_length {
#get frequency of read lengths
base_file="${file%.bam}"
base_file="${base_file##*/}"
out_file="$outdir/read-length-$base_file.txt"
echo "read_length" >> "$out_file"
samtools view -@$threads "$file" | head -n 1000000 | cut -f 10 | perl -ne 'chomp;print length($_) . "\n"' | sort >> "$out_file"
#plot histogram of frequency of read lengths
Rscript $SCRIPT_DIR/read_length.R "$WORK_DIR" "$out_file" "$base_file" "$outdir"
}
reads_length_folder=read-lengths
reads_length_folder_dedup="$reads_length_folder"-dedup
if [[ "$reads_length" == "TRUE" ]] && [[ ! -d "$reads_length_folder" ]];
then
echo "Generating histograms of read lengths"
mkdir "$reads_length_folder"
outdir="$reads_length_folder"
for file in bam/*.bam
do
reads_length
done
elif [[ "$reads_length" == "TRUE" ]] && [[ ! -d "$reads_length_folder_dedup" ]];
then
echo "Generating histograms of read lengths"
mkdir "$reads_length_folder_dedup"
outdir="$reads_length_folder_dedup"
for file in $dedup_folder/*.bam
do
reads_length
done
else
echo "Read length histograms already generated"
fi
###create bigWig files
#load bamCoverage settings:
if [[ "$big_wig" == "TRUE" ]];
then
binsize=$(cat settings.yaml | shyaml get-value bigWig.binSize)
normalizeusing=$(cat settings.yaml | shyaml get-value bigWig.normalizeUsing)
extendreads=$(cat settings.yaml | shyaml get-value bigWig.extendReads)
effectivegenomesize=$(cat settings.yaml | shyaml get-value bigWig.effectiveGenomeSize)
#checking bamCoverage settings:
if [[ ! $binsize =~ ^-?[0-9]+$ ]];
then
echo "ERROR: binSize should be an integer."
exit 1
fi
if [[ $normalizeusing != "RPKM" ]] && [[ $normalizeusing != "CPM" ]] && [[ $normalizeusing != "BPM" ]] && [[ $normalizeusing != "RPGC" ]] && [[ $normalizeusing != "None" ]];
then
echo "ERROR: invalid normalisation method chosen."
echo "Available methods: RPKM, CPM, BPM, RPGC or None"
exit 1
fi
if [[ ! $extendreads =~ ^-?[0-9]+$ ]];
then
echo "ERROR: extendReads should be an integer."
exit 1
fi
if [[ ! $effectivegenomesize =~ ^-?[0-9]+$ ]];
then
echo "ERROR: effectiveGenomeSize should be an integer."
echo "See https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html"
exit 1
fi
#creates bigWig files
bigwig_dir=bigwig_bin$binsize
bigwig_dir_dedup=bigwig_dedup_bin$binsize
function big_wig {
#bigwig_output="${file%-dedupl-sort-bl.bam}-norm.bw"
bigwig_output=${bigwig_output##*/}
bamCoverage -p $threads --binSize "$binsize" --normalizeUsing "$normalizeusing" --extendReads "$extendreads" --effectiveGenomeSize "$effectivegenomesize" -b $file -o $bigwig_dir/$bigwig_output 2>> bigwig.log
}
if [[ -d "$bam_folder" ]] && [[ ! -d "$bigwig_dir" ]];
then
echo "Creating BigWig files"
mkdir -p "$bigwig_dir"
for file in bam/*bam
do
bigwig_output="${file%.bam}-norm.bw"
big_wig
done
elif [[ -d "$dedup_folder" ]] && [[ ! -d "$bigwig_dir_dedup" ]];
then
echo "Creating BigWig files"
mkdir -p "$bigwig_dir_dedup"
bigwig_dir="$bigwig_dir_dedup"
for file in "$dedup_folder"/*bam
do
bigwig_output="${file%.bam}-dedup-norm.bw"
big_wig
done
fi
fi
###filter out reads below cutoff
filter_folder="filter/"
function filter_reads {
file_name="${bam##*/}"
base_name=${file_name%.bam}
samtools view -h -@ "$threads" "$bam" | awk -v x=$cutoff 'length($10) < x || $1 ~ /^@/' | samtools view -bS -@ "$threads" - > "filter/$base_name.${cutoff}bp.bam"
} #function to filter reads out below given cutoff
if [[ "$cutoff" == 0 ]];
then
echo "No size filtering of reads selected"
elif [[ ! -d "$filter_folder" ]] && [[ -d "$dedup_folder" ]];
then
mkdir filter
echo "Filtering reads <${cutoff} bp"
for bam in deduplication/*.bam
do
filter_reads
done
elif [[ ! -d "$filter_folder" ]] && [[ ! -d "$dedup_folder" ]];
then
mkdir filter
echo "Filtering reads <${cutoff} bp"
for bam in bam/*.bam
do
filter_reads
done
else
echo "Size filtering already performed"
fi
###create metagene plots
ngsplot_folder="ngsplot"
ngsplot_folder_dedup="$ngsplot_folder-dedup"
feature=$(cat settings.yaml | shyaml get-value ngsplot.feature)
window=$(cat settings.yaml | shyaml get-value ngsplot.window)
function ngs_plot {
echo "Generating metagene plots and heatmaps"
for file in $bam_folder/*.bam
do
case $file in
*input*) continue;;
*)
ngs_output=${file%.bam}
ngs_output=${ngs_output##*/}
out_dir="$ngsplot_folder/$ngs_output"
mkdir -p "$out_dir"
ngs.plot.r -G "$genome" -R "$feature" -C $file -O "$out_dir/${ngs_output}_$feature" -T $ngs_output -L "$window"
esac
done
}
if [[ "$ngsplot" == "TRUE" ]];
then
#check input variables
if [[ $feature != "tss" ]] && [[ $feature != "tes" ]] && [[ $feature != "genebody" ]] && [[ $feature != "exon" ]] && [[ $feature != "cgi" ]] && [[ $feature != "enhancer" ]] && [[ $feature != "dhs" ]] && [[ $feature != "bed" ]];
then
echo "ERROR: wrong genomic feature selected for ngs.plot.R"
echo "Available options: tss, tes, genebody, exon, cgi, enhancer, dhs or bed"
exit 1
fi
if [[ ! $window =~ ^-?[0-9]+$ ]];
then
echo "ERROR: window size should be an integer for ngs.plot.R"
exit 1
fi
#plot
if [[ -d "$bam_folder" ]] && [[ ! -d "$ngsplot_folder" ]];
then
ngs_plot
elif [[ -d "$dedup_folder" ]] && [[ ! -d "$ngsplot_folder_dedup" ]];
then
bam_folder=$dedup_folder
ngsplot_folder=$ngsplot_folder_dedup
ngs_plot
else
echo "Metagene plots and heatmaps already generated"
fi
fi
###peak calling
#Load MACS2 settings
format=$(cat settings.yaml | shyaml get-value MACS2.format)
macs2_genome=$(cat settings.yaml | shyaml get-value MACS2.genome)
genome_size=$(cat settings.yaml | shyaml get-value MACS2.genome-size)
qvalue=$(cat settings.yaml | shyaml get-value MACS2.qvalue)
extsize=$(cat settings.yaml | shyaml get-value MACS2.extsize)
if [[ "$peak" == "TRUE" ]];
then
#checking MACS2 settings
if [[ $format != "ELAND" ]] && [[ $format != "BED" ]] && [[ $format != "ELANDMULTI" ]] && [[ $format != "ELANDEXPORT" ]] && [[ $format != "SAM" ]] && [[ $format != "BAM" ]] && [[ $format != "BOWTIE" ]] && [[ $format != "BAMPE" ]] && [[ $format != "BEDPE" ]] && [[ $format != "AUTO" ]];
then
echo "ERROR: invalid file format chosen."
echo "Compatible formats: ELAND, BED, ELANDMULTI, ELANDEXPORT, SAM, BAM, BOWTIE, BAMPE or BEDPE"
echo "Automatic detection can be set with AUTO (does not work with BEDPE or BEDPE formats)"
exit 1
fi
if [[ $macs2_genome != "hs" ]] && [[ $macs2_genome != "mm" ]] && [[ $macs2_genome != "ce" ]] && [[ $macs2_genome != "dm" ]]
then
echo "ERROR: invalid genome chosen."
echo "Available genomes: hs, mm, ce and dm"
exit 1
fi
# if [[ ! $genome_size =~ ^-?[0-9]+$ ]];
# then
# echo "ERROR: genome size should be an integer."
# exit 1
# fi
if [[ $qvalue > 0.05 ]];
then
echo "WARNING: q-value is higher than default 0.05"
#exit 1
fi
echo "Calling and annotating peaks"
mkdir -p peaks
sed '1d' macs2-samples.conf > macs2-samples-temp.conf #removes header from settings part
input="macs2-samples-temp.conf"
while IFS= read -r line
do
macs2_sample=$(echo $line | awk '{print$1}')
macs2_input=$(echo $line | awk '{print$2}')
macs2_output_name=$(echo $line | awk '{print$3}')
macs2 callpeak -t "$macs2_sample" -c "$macs2_input" -n "$macs2_output_name" --outdir "peaks/$macs2_output_name" -g "$genome" -f "$format" --qvalue "$qvalue" --extsize "$extsize" -B 2>> peak.log #MACS2
annotatePeaks.pl "peaks/${macs2_output_name}_summits.bed" $genome > "peaks/${macs2_output_name}_annotated_peaks.txt" 2>> peak.log #HOMER
done < "$input"
rm macs2-samples-temp.conf
fi