-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathproseq2.0.bsh
1181 lines (1048 loc) · 52.6 KB
/
proseq2.0.bsh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/bash
#
INPUT_LINE=`echo $@`
while test $# -gt 0; do
case "$1" in
-h|--help)
echo ""
echo "Preprocesses and aligns PRO-seq data."
echo ""
echo "Takes PREFIX.fastq.gz (SE), PREFIX_R1.fastq.gz, PREFIX_R2.fastq.gz (PE)"
echo "or *.fastq.gz in the current working directory as input and writes"
echo "BAM and bigWig files as output to the user-assigned output-dir."
echo "The output bigWig files ending with _minus.bw or _plus.bw are raw read counts without normalization."
echo "The RPM normalized outputs end with a suffix of .rpm.bw."
echo ""
echo "Requirements in current working directory:"
echo "cutadapt 1.8.3, seqtk, prinseq-lite.pl 0.20.2, bwa, samtools, bedtools, and bedGraphToBigWig."
echo ""
echo "bash proseq2.0.bsh [options]"
echo ""
echo "options:"
echo ""
echo "To get help:"
echo "-h, --help Show this brief help menu."
echo ""
echo "Required options:"
echo "-SE, --SEQ=SE Single-end sequencing."
echo "-PE, --SEQ=PE Paired-end sequencing."
echo "-i, --bwa-index=PATH Path to the BWA index of the target genome"
echo " (i.e., bwa index)."
echo "-c, --chrom-info=PATH Location of the chromInfo table."
echo ""
echo "I/O options:"
echo "-I, --fastq=PREFIX Prefix for input files."
echo " Paired-end files require identical prefix"
echo " and end with _R1.fastq.gz and _R2.fastq.gz"
echo " eg: PREFIX_R1.fastq.gz, PREFIX_R2.fastq.gz."
echo "-T, --tmp=PATH Path to a temporary storage directory."
echo "-O, --output-dir=DIR Specify a directory to store output in."
echo ""
echo "Required options for SE"
echo "-G, --SE_READ=RNA_5prime Single-end sequencing from 5' end of"
echo " nascent RNA, like GRO-seq."
echo "-P, --SE_READ=RNA_3prime Single-end sequencing from 3' end of"
echo " nascent RNA, like PRO-seq."
echo ""
echo "Options for PE"
echo "--RNA5=R1_5prime Specify the location of the 5' end of RNA"
echo " [default: R1_5prime]."
echo "--RNA3=R2_5prime Specify the location of the 3' end of RNA"
echo " [default: R2_5prime]."
echo " Available options: R1_5prime: the 5' end of R1 reads"
echo " R2_5prime: the 5' end of R2 reads"
echo "-5, --map5=TRUE Report the 5' end of RNA [default on, --map5=TRUE]."
echo "-3, --map5=FALSE Report the 3' end of RNA,"
echo " only available for PE [default off, --map5=TRUE]."
echo "-s, --opposite-strand=TRUE"
echo " Enable this option if the RNA are at the different strand"
echo " as the reads set at RNA5 [default: disable]."
echo ""
echo "Optional operations:"
echo "--ADAPT_SE=TGGAATTCTCGGGTGCCAAGG"
echo " 3' adapter to be removed from the 3' end of SE reads."
echo " [default:TGGAATTCTCGGGTGCCAAGG]"
echo "--ADAPT1=GATCGTCGGACTGTAGAACTCTGAACG"
echo " 3' adapter to be removed from the 3' end of R2."
echo " [default:GATCGTCGGACTGTAGAACTCTGAACG]"
echo "--ADAPT2=AGATCGGAAGAGCACACGTCTGAACTC"
echo " 3' adapter to be removed from the 3' end of R1."
echo " [default:AGATCGGAAGAGCACACGTCTGAACTC]"
echo ""
echo "--UMI1=0 The length of UMI barcode on the 5' of R1 read. "
echo " [default: 0]"
echo "--UMI2=0 The length of UMI barcode on the 5' of R2 read. "
echo " [default: 0]"
echo "When UMI1 or UMI2 are set > 0, the pipeline will perform PCR deduplicate."
echo ""
echo "--Force_deduplicate=FALSE"
echo " When --Force_deduplicate=TRUE, it will force the pipeline to"
echo " perform PCR deduplicate even there is no UMI barcode"
echo " (i.e. UMI1=0 and UMI2=0). [default: FALSE]"
echo "--ADD_B1=0 The length of additional barcode that will be trimmed"
echo " on the 5' of R1 read. [default: 0]"
echo "--ADD_B2=0 The length of additional barcode that will be trimmed"
echo " on the 5' of R2 read. [default: 0]"
echo "--thread=1 Number of threads can be used [default: 1]"
echo ""
echo "-4DREG Using the pre-defined parameters to get the most reads"
echo " for dREG package. Please use this flag to make the bigWig"
echo " files compatible with dREG algorithm. [default: off, only available to SE] "
echo "-aln Use BWA-backtrack [default: SE uses BWA-backtrack (aln), PE uses BWA-MEM (mem)]"
echo "-mem Use BWA-MEM [default: SE uses BWA-backtrack (aln), PE uses BWA-MEM (mem)]"
echo "--MAP_LENGTH Set a data-set wide length cutoff for mapping (e.g. --MAP_LENGTH=36) "
echo " or map the whole reads after QC (off, --MAP_LENGTH=0). [default: off]"
exit 0
;;
-SE)
export SEQ="SE"
shift
;;
-PE)
export SEQ="PE"
shift
;;
-i)
shift
if test $# -gt 0; then
export BWAIDX=$1
else
echo "no BWA index specified"
exit 1
fi
shift
;;
--bwa-index*)
export BWAIDX=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
-c)
shift
if test $# -gt 0; then
export CHINFO=$1
else
echo "no chromInfo specified"
exit 1
fi
shift
;;
--chrom-info*)
export CHINFO=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
-I)
shift
if test $# -gt 0; then
export FQ_INPUT=$1
else
echo "no input prefix specified."
exit 1
fi
shift
;;
--fastq*)
export FQ_INPUT=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
-T)
shift
if test $# -gt 0; then
export TMPDIR=$1
else
echo "no temp folder specified."
exit 1
fi
shift
;;
--tmp*)
export TMPDIR=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
-O)
shift
if test $# -gt 0; then
export OUTPUT=$1
else
echo "no output dir specified."
exit 1
fi
shift
;;
--output-dir*)
export OUTPUT=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--thread*)
export thread=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--RNA5*) # report location of the 5 prime end of RNA
# acce
export RNA5=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--RNA3*) # report location of the 5 prime end of RNA
# acce
export RNA3=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
-5)
export MAP5="TRUE"
shift
;;
--map5*)
export MAP5=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
-3)
export MAP5="FALSE"
shift
;;
-s)
export OPP="TRUE"
shift
;;
--opposite-strand*)
export OPP=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--ADAPT_SE*)
export ADAPT_SE=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--ADAPT1*)
export ADAPT1=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--ADAPT2*)
export ADAPT2=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--UMI1*)
export UMI1=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--UMI2*)
export UMI2=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--Force_deduplicate*)
export Force_deduplicate=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--ADD_B1*)
export ADD_B1=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--ADD_B2*)
export ADD_B2=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
-G)
export SE_OUTPUT="G"
export RNA5="R1_5prime"
export OPP="FALSE"
#export MAP5="TRUE"
export SE_READ="RNA_5prime"
shift
;;
-P)
export SE_OUTPUT="P"
export RNA3="R1_5prime"
export OPP="TRUE"
#export MAP5="TRUE"
export SE_READ="RNA_3prime"
shift
;;
--SE_READ*)
export SE_READ=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
-4DREG*)
export DREG_MODEL=1
shift
;;
-mem*)
export mem=1
shift
;;
-aln*)
export aln=1
shift
;;
--MAP_LENGTH*)
export map_L=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
--PREMAP_BWAIDX*)
export PREMAP_BWAIDX=`echo $1 | sed -e 's/^[^=]*=//g'`
shift
;;
*)
break
;;
esac
done
## CHECK ARGUMENTS.
if [ -z "$BWAIDX" ]; then
echo "--bwa-index is required."
echo " use bash proseq2.0.bsh --help."
exit 1
fi
if [ -z "$CHINFO" ]; then
echo "--chrom-info is required."
echo " use bash proseq2.0.bsh --help."
exit 1
fi
if [ -z "$SEQ" ]; then
echo "Please specify the input data is SE or PE."
echo " use bash proseq2.0.bsh --help."
exit 1
fi
if [[ "$mem" == 1 && "$aln" == 1 ]]; then
echo "Please choose between -mem or -aln"
echo " use bash proseq2.0.bsh --help."
exit 1
fi
## INPUT & Parameters
# PE
if [[ "$SEQ" == "PE" ]] ; then
if [ -z "$SE_OUTPUT" ]; then
:
else
echo "-G and -P can only be used with -SE."
echo " use bash proseq2.0.bsh --help."
exit 1
fi
if [[ "$FQ_INPUT" == "*.fastq.gz" ]]; then
FQ_INPUT=`ls *.fastq.gz | awk -F"/" '{print $NF}' | rev | cut -d \. -f 3-| cut -d _ -f 2- |rev | sort | uniq`
fi
if [ -z "$FQ_INPUT" ]; then
echo "No input files specified. Using *.fastq.gz"
FQ_INPUT=`ls *.fastq.gz | awk -F"/" '{print $NF}' | rev | cut -d \. -f 3-| cut -d _ -f 2- |rev | sort | uniq`
fi
## Check if input file end with _R1.fastq.gz or _R2.fastq.gz
tmp=${FQ_INPUT}
FQ_INPUT=() #array
for name in ${tmp}
do
if [ ! -f ${name}_R1.fastq.gz ]; then
echo ""
echo "##################################################################################"
echo " File ${name}_R1.fastq.gz was not found! Skip ${name}*.fastq.gz from analysis."
echo " Paired-end files require identical prefix and end with _R1.fastq.gz and _R2.fastq.gz"
echo " eg: PREFIX_R1.fastq.gz, PREFIX_R2.fastq.gz"
echo " Please make sure you have the correct file suffix! "
echo "##################################################################################"
echo ""
else
FQ_INPUT+=(${name}) # append to the array
#echo ${FQ_INPUT}
fi
done
# SE
elif [[ "$SEQ" == "SE" ]] ; then
if [[ "$SE_READ" == "RNA_5prime" ]] ; then
SE_OUTPUT="G"
RNA5="R1_5prime"
OPP="FALSE"
#MAP5="TRUE"
elif [[ "$SE_READ" == "RNA_3prime" ]] ; then
SE_OUTPUT="P"
RNA3="R1_5prime"
OPP="TRUE"
#MAP5="TRUE"
fi
if [ -z "$SE_OUTPUT" ] ; then
echo "Please specify output format for SE [-G or -P]"
exit 1
fi
if [[ "$FQ_INPUT" == "*.fastq.gz" ]]; then
FQ_INPUT=`ls *.fastq.gz | awk -F"/" '{print $NF}' | rev | cut -d \. -f 3-| rev`
elif [ -z "$FQ_INPUT" ]; then
echo "No input files specified. Using *.fastq.gz"
FQ_INPUT=`ls *.fastq.gz | awk -F"/" '{print $NF}' | rev | cut -d \. -f 3-| rev`
elif [[ "$FQ_INPUT" == *.fastq.gz ]]; then
FQ_INPUT=`echo $FQ_INPUT | rev | cut -d \. -f 3-| rev`
fi
else
echo "Please specify the input data is SE or PE."
echo " use bash proseq2.0.bsh --help."
exit 1
fi
# Check input file number
if [[ ${#FQ_INPUT[@]} == 0 ]]; then # if the length of array is 0
echo "##################################################################################"
echo " No files is in the correct format."
echo " Paired-end files require identical prefix and end with _R1.fastq.gz and _R2.fastq.gz"
echo " eg: PREFIX_R1.fastq.gz, PREFIX_R2.fastq.gz"
echo " Please make sure you have the correct file suffix! Aborting."
echo "##################################################################################"
exit 1
fi
if [ -z "$OUTPUT" ]; then
now=$(date +"%m_%d_%Y")
OUTPUT=./My_proseq_output_dir-${now}
echo No output path specified. Using ${OUTPUT}
fi
if [ ! -d $OUTPUT ]; then
mkdir $OUTPUT
fi
if [ -z "$TMPDIR" ]; then
TMPDIR="./"
fi
if [ ! -d $TMPDIR ]; then
mkdir $TMPDIR
fi
# bash generate random 32 character alphanumeric string (upper and lowercase).
tmp=`cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1`
TMPR=$TMPDIR
TMPDIR=$TMPDIR/$tmp
if [ -z "$thread" ]; then
thread=1
fi
if [ -z "$ADAPT_SE" ]; then
ADAPT_SE="TGGAATTCTCGGGTGCCAAGG"
fi
if [ -z "$ADAPT2" ]; then
ADAPT2="AGATCGGAAGAGCACACGTCTGAACTC"
fi
if [ -z "$ADAPT1" ]; then
ADAPT1="GATCGTCGGACTGTAGAACTCTGAACG"
fi
if [ -z "$UMI1" ]; then
UMI1=0
elif [[ $[UMI1] -lt 0 ]]; then
echo "UMI1 only take natural number"
exit 1
fi
if [ -z "$UMI2" ]; then
UMI2=0
elif [[ $[UMI2] -lt 0 ]]; then
echo "UMI2 only take natural number"
exit 1
fi
if [ -z "$Force_deduplicate" ]; then
Force_deduplicate="FALSE"
elif [[ "$Force_deduplicate" == "TRUE" || "$Force_deduplicate" == "FALSE" ]] ; then
:
else
echo "Force_deduplicate only take TRUE or FALSE"
exit 1
fi
if [ -z "$ADD_B1" ]; then
ADD_B1=0
fi
if [ -z "$ADD_B2" ]; then
ADD_B2=0
fi
if [ -z "$map_L" ]; then
map_L=0
fi
if [[ "$SEQ" == "PE" ]]; then
if [[ -z "$RNA5" && -z "$RNA3" ]]; then
RNA5="R1_5prime"
fi
if [[ "$RNA3" == "R1_5prime" ]]; then
RNA5="R2_5prime"
elif [[ "$RNA3" == "R2_5prime" ]]; then
RNA5="R1_5prime"
fi
if [[ "$RNA5" == "R1_5prime" || "$RNA5" == "R2_5prime" ]] ; then
:
else
echo "--RNA5 and --RNA3 value can only be R1_5prime or R2_5prime."
echo " use bash proseq2.0.bsh --help."
exit 1
fi
fi
if [ -z "$OPP" ]; then
OPP="FALSE"
fi
if [ -z "$MAP5" ]; then
MAP5="TRUE"
fi
if [[ "${MAP5}" == "FALSE" && "$SEQ" == "SE" ]] ; then
echo "For single-end (SE), can only report the 5 prime end of reads (--map5=TRUE)"
exit 1
fi
if [ "${MAP5}" == "TRUE" ] ; then
:
elif [ "${MAP5}" == "FALSE" ] ; then
:
else
echo "--map5 value can only be TRUE or FALSE."
echo " use bash proseq2.0.bsh --help."
exit 1
fi
## Check all the bioinformatics tools can be called from current working directory.
for tool in cutadapt seqtk prinseq-lite.pl bwa samtools bedtools bedGraphToBigWig sort-bed
do command -v ${tool} >/dev/null 2>&1 || { echo >&2 ${tool}" is required. Please make sure you can call the bioinformatics tools from your current working directoryb. Aborting."; exit 1; }
done
#exec 1>test.log 2>&1
exec > >(tee ${OUTPUT}/proseq2.0_Run_${tmp}.log)
exec 2>&1
## Print out
echo " "
echo "Processing PRO-seq data ..."
echo "Command line parameters:" ${INPUT_LINE}
echo " "
if [[ ! -z "$DREG_MODEL" ]]; then
echo "dREG compatible mode Yes"
fi
echo "SEQ $SEQ"
if [[ "$SEQ" == "SE" ]]; then
echo "SE_OUTPUT $SE_OUTPUT"
echo "SE_READ $SE_READ"
fi
if [[ "$SEQ" == "PE" ]]; then
echo "Location of 5' of RNA $RNA5"
echo "Location of 3' of RNA $RNA3"
fi
echo "Report 5' ends $MAP5"
echo "Report opposite strand $OPP"
echo ""
echo "Input files/ paths:"
if [[ ! -z "$PREMAP_BWAIDX" ]]; then
echo "bwa index for pre-mapping $PREMAP_BWAIDX"
fi
echo "bwa index $BWAIDX"
echo "chromInfo $CHINFO"
if [[ "$SEQ" == "SE" ]]; then
i=1
for name in ${FQ_INPUT[@]}
do
echo "input file $i ${name}.fastq.gz"
let "i++"
done
fi
if [[ "$SEQ" == "PE" ]]; then
i=1
for name in ${FQ_INPUT[@]}
do
echo "input file pair $i ${name}_R1.fastq.gz, ${name}_R2.fastq.gz"
let "i++"
done
fi
echo "temp folder $TMPDIR"
echo "output-dir $OUTPUT"
echo " "
echo "Optional operations:"
echo "ADAPT_SE $ADAPT_SE"
echo "ADAPT1 $ADAPT1"
echo "ADAPT2 $ADAPT2"
echo "UMI1 barcode length $UMI1"
echo "UMI2 barcode length $UMI2"
echo "ADD_B1 length $ADD_B1"
echo "ADD_B2 length $ADD_B2"
echo "number of threads $thread"
if [[ ${UMI2} != 0 || ${UMI1} != 0 || ${Force_deduplicate} == "TRUE" ]]; then
echo "Remove PCR duplicates TRUE"
else
echo "Remove PCR duplicates FALSE"
fi
if [[ ${map_L} != 0 ]]; then
echo "Trim Read length to ${map_L}nt for mapping"
fi
## Exits .. for debugging.
#exit 1
## Functions
## DOIT!
mkdir ${TMPDIR}
if [[ "$SEQ" == "PE" ]] ; then
#############################################
## Preprocess data. Remove adapters. Trim.
echo " "
echo "Preprocessing fastq files:"
mkdir ${TMPDIR}/noadapt
mkdir ${TMPDIR}/passQC
if [[ ${UMI2} != 0 || ${UMI1} != 0 ]]; then
dedup_L=30
mkdir ${TMPDIR}/noadapt/l${dedup_L}
mkdir ${TMPDIR}/noadapt/l${dedup_L}_nodups
fi
for name in ${FQ_INPUT[@]}
do
old_name=${name}
name=`echo ${old_name} | rev | cut -d \/ -f 1 |rev`
## read stats
echo 'Number of original input reads:' > ${TMPDIR}/${name}.QC.log
echo ${old_name}_R1.fastq.gz >> ${TMPDIR}/${name}.QC.log
zcat ${old_name}_R1.fastq.gz | grep @ -c >> ${TMPDIR}/${name}.QC.log
echo ${old_name}_R2.fastq.gz >> ${TMPDIR}/${name}.QC.log
zcat ${old_name}_R2.fastq.gz | grep @ -c >> ${TMPDIR}/${name}.QC.log
## Remove adapter, UMI barcode, additional barcode, and low quality (q=20) base from 3prime end of reads. Keep read length >=15 after trimmming
# Remove adapter
cutadapt -a ${ADAPT2} -e 0.10 --overlap 2 --output=${TMPDIR}/${name}_trim_R1.fastq --untrimmed-output=${TMPDIR}/${name}_untrim_R1.fastq ${old_name}_R1.fastq.gz -j ${thread}
cutadapt -a ${ADAPT1} -e 0.10 --overlap 2 --output=${TMPDIR}/${name}_trim_R2.fastq --untrimmed-output=${TMPDIR}/${name}_untrim_R2.fastq ${old_name}_R2.fastq.gz -j ${thread}
echo 'Removed adapters' >> ${TMPDIR}/${name}.QC.log
# Read1
# remove UMI2 and ADD_B2 from the 3 prime end of R1
n2=$[UMI2+ADD_B2]
cutadapt --cut -${n2} --minimum-length=10 ${TMPDIR}/${name}_trim_R1.fastq --output=${TMPDIR}/${name}_trim.${n2}Nremoved_R1.fastq -q 20 -j ${thread}
cutadapt --minimum-length=10 ${TMPDIR}/${name}_untrim_R1.fastq --output=${TMPDIR}/${name}_q20trim_R1.fastq -q 20 -j ${thread}
cat ${TMPDIR}/${name}_q20trim_R1.fastq ${TMPDIR}/${name}_trim.${n2}Nremoved_R1.fastq | paste - - - - | LC_ALL=C sort --temporary-directory=${TMPDIR} --parallel=${thread} -k1,1 -S 10G | tr '\t' '\n' > ${TMPDIR}/noadapt/${name}_noadapt_R1.fastq
echo 'Removed UMI2 and ADD_B2 from the 3 prime end of R1' >> ${TMPDIR}/${name}.QC.log
# Read2
# remove UMI1 and ADD_B1 from the 3 prime end of R2
n1=$[UMI1+ADD_B1]
cutadapt --cut -${n1} --minimum-length=10 ${TMPDIR}/${name}_trim_R2.fastq --output=${TMPDIR}/${name}_trim.${n1}Nremoved_R2.fastq -q 20 -j ${thread}
cutadapt --minimum-length=10 ${TMPDIR}/${name}_untrim_R2.fastq --output=${TMPDIR}/${name}_q20trim_R2.fastq -q 20 -j ${thread}
cat ${TMPDIR}/${name}_q20trim_R2.fastq ${TMPDIR}/${name}_trim.${n1}Nremoved_R2.fastq | paste - - - - | LC_ALL=C sort --temporary-directory=${TMPDIR} --parallel=${thread} -k1,1 -S 10G | tr '\t' '\n' > ${TMPDIR}/noadapt/${name}_noadapt_R2.fastq
echo 'Removed UMI1 and ADD_B1 from the 3 prime end of R2' >> ${TMPDIR}/${name}.QC.log
echo 'Number of reads after adapter removal and QC:' >> ${TMPDIR}/${name}.QC.log
echo R1 >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/noadapt/${name}_noadapt_R1.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
echo R2 >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/noadapt/${name}_noadapt_R2.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
rm ${TMPDIR}/${name}_trim_R1.fastq ${TMPDIR}/${name}_untrim_R1.fastq ${TMPDIR}/${name}_q20trim_R1.fastq ${TMPDIR}/${name}_trim.${n2}Nremoved_R1.fastq
rm ${TMPDIR}/${name}_trim_R2.fastq ${TMPDIR}/${name}_untrim_R2.fastq ${TMPDIR}/${name}_q20trim_R2.fastq ${TMPDIR}/${name}_trim.${n1}Nremoved_R2.fastq
## Collapse reads using prinseq-lite.pl. if there are UMI barcodes or $Force_deduplicate=TRUE
if [[ ${UMI2} != 0 || ${UMI1} != 0 ]]; then # if there is UMI barcode, Will perform deduplicates with first ${dedup_L} bp
# Remove PCR duplciates.
# fastq with the first dedup_L nt
seqtk seq -L ${dedup_L} ${TMPDIR}/noadapt/${name}_noadapt_R1.fastq > ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}_R1.fastq
seqtk seq -L ${dedup_L} ${TMPDIR}/noadapt/${name}_noadapt_R2.fastq > ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}_R2.fastq
# deduplicate using the first dedup_L nt
prinseq-lite.pl -fastq ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}_R1.fastq -fastq2 ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}_R2.fastq -derep 1 -out_format 3 -out_bad null -out_good ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup -min_len 15 2> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}_R1.fastq ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}_R2.fastq
# make a list of name from deduplicated fastq
cat ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_1.fastq | awk '(NR%4==1){print substr($1,2)}' > ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_l${dedup_L}.txt
cat ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_1_singletons.fastq | awk '(NR%4==1){print substr($1,2)}' > ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_1_singletons_l${dedup_L}.txt
cat ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_2_singletons.fastq | awk '(NR%4==1){print substr($1,2)}' > ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_2_singletons_l${dedup_L}.txt
rm ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_1.fastq ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_1_singletons.fastq ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_2_singletons.fastq
# generate fastq from the list of name
seqtk subseq ${TMPDIR}/noadapt/${name}_noadapt_R1.fastq ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_l${dedup_L}.txt > ${TMPDIR}/passQC/${name}_dedup_withBarcode_1.fastq
seqtk subseq ${TMPDIR}/noadapt/${name}_noadapt_R2.fastq ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_l${dedup_L}.txt > ${TMPDIR}/passQC/${name}_dedup_withBarcode_2.fastq
seqtk subseq ${TMPDIR}/noadapt/${name}_noadapt_R1.fastq ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_1_singletons_l${dedup_L}.txt > ${TMPDIR}/passQC/${name}_dedup_1_singletons.fastq
seqtk subseq ${TMPDIR}/noadapt/${name}_noadapt_R2.fastq ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_2_singletons_l${dedup_L}.txt > ${TMPDIR}/passQC/${name}_dedup_2_singletons.fastq
rm ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup*l${dedup_L}.txt
# trim the UMI and additional barcode after dereplicate
prinseq-lite.pl -trim_left ${n1} -fastq ${TMPDIR}/passQC/${name}_dedup_withBarcode_1.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_1 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
prinseq-lite.pl -trim_left ${n2} -fastq ${TMPDIR}/passQC/${name}_dedup_withBarcode_2.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_2 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/passQC/${name}_dedup_withBarcode_1.fastq ${TMPDIR}/passQC/${name}_dedup_withBarcode_2.fastq
# min_len 15
prinseq-lite.pl -min_len 15 -fastq ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_1.fastq -fastq2 ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_2.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_QC_end 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_1.fastq ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_2.fastq
echo 'Number of paired reads after PCR duplicates removal and QC:' >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/passQC/${name}_dedup_QC_end_1.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
elif [[ ${Force_deduplicate} == "TRUE" ]]; then # if there is NO UMI barcode, Will perform deduplicates with whole legnth of reads
# Remove PCR duplciates.
prinseq-lite.pl -derep 1 -fastq ${TMPDIR}/noadapt/${name}_noadapt_R1.fastq -fastq2 ${TMPDIR}/noadapt/${name}_noadapt_R2.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_withBarcode 2> ${OUTPUT}/${name}.prinseq-pcrDups.gd
# trim the UMI and additional barcode after dereplicate
prinseq-lite.pl -trim_left ${n1} -fastq ${TMPDIR}/passQC/${name}_dedup_withBarcode_1.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_1 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
prinseq-lite.pl -trim_left ${n2} -fastq ${TMPDIR}/passQC/${name}_dedup_withBarcode_2.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_2 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/passQC/${name}_dedup_withBarcode_1.fastq ${TMPDIR}/passQC/${name}_dedup_withBarcode_2.fastq
# min_len 15
prinseq-lite.pl -min_len 15 -fastq ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_1.fastq -fastq2 ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_2.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_QC_end 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_1.fastq ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved_2.fastq
echo 'Number of paired reads after PCR duplicates removal and QC:' >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/passQC/${name}_dedup_QC_end_1.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
else
# trim the additional barcode ${ADD_B1} and ${ADD_B2} WITHOUT dereplicate. If no barcode, prinseq-lite.pl will remove unpair reads and reads that are length 0
prinseq-lite.pl -trim_left ${ADD_B1} -fastq ${TMPDIR}/noadapt/${name}_noadapt_R1.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_BarcodeRemoved_1 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
prinseq-lite.pl -trim_left ${ADD_B2} -fastq ${TMPDIR}/noadapt/${name}_noadapt_R2.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_BarcodeRemoved_2 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
# min_len 15
prinseq-lite.pl -min_len 15 -fastq ${TMPDIR}/passQC/${name}_BarcodeRemoved_1.fastq -fastq2 ${TMPDIR}/passQC/${name}_BarcodeRemoved_2.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_QC_end 2> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/passQC/${name}_BarcodeRemoved_1.fastq ${TMPDIR}/passQC/${name}_BarcodeRemoved_2.fastq
echo 'Number of paired reads after final QC:' >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/passQC/${name}_QC_end_1.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
fi
cat ${OUTPUT}/${name}.prinseq-pcrDups.gd
done
# if there is a data-set wide length cutoff map_L
if [[ ${map_L} != 0 ]]; then
mkdir ${TMPDIR}/passQC_length_${map_L}
for f in ${TMPDIR}/passQC/*_QC_end_1.fastq
do name=`echo $f |rev |cut -d / -f 1 |cut -d _ -f 4-|rev`
seqtk seq -L ${map_L} ${f} > ${TMPDIR}/passQC_length_${map_L}/${name}_l${map_L}_QC_end_1.fastq
done
for f in ${TMPDIR}/passQC/*_QC_end_2.fastq
do name=`echo $f |rev |cut -d / -f 1 |cut -d _ -f 4-|rev`
seqtk seq -L ${map_L} ${f} > ${TMPDIR}/passQC_length_${map_L}/${name}_l${map_L}_QC_end_2.fastq
done
fi
## Cleanup.
rm -r ${TMPDIR}/noadapt
gzip ${TMPDIR}/passQC*/*.fastq
#############################################
## Align reads.
if [[ ${map_L} != 0 ]]; then
ToMapDir=${TMPDIR}/passQC_length_${map_L}
else
ToMapDir=${TMPDIR}/passQC
fi
echo "Reads for mapping is from : $ToMapDir"
QC_INPUT=`ls ${ToMapDir}/*_QC_end_1.fastq.gz | awk -F"/" '{print $NF}' | rev | cut -d \. -f 3- | cut -d _ -f 2- | rev| sort | uniq`
if [[ ${#QC_INPUT} == 0 ]]; then
echo "#########################################"
echo " Something went wrong with Preprocess."
echo " No fastq.gz files passed."
echo " Aborting."
echo "#########################################"
exit 1
fi
# premap if PREMAP_BWAIDX is given
if [[ ! -z "$PREMAP_BWAIDX" ]]; then
echo " "
echo "Pre-Mapping reads start ..."
mkdir ${TMPDIR}/Unmapped_to_premap
if [[ ! -z "$aln" ]]; then
for name in ${QC_INPUT}
do
bwa aln -t 1 ${PREMAP_BWAIDX} ${ToMapDir}/${name}_1.fastq.gz > ${TMPDIR}/${name}_aln_sa1.sai
bwa aln -t 1 ${PREMAP_BWAIDX} ${ToMapDir}/${name}_2.fastq.gz > ${TMPDIR}/${name}_aln_sa2.sai
bwa sampe -n 1 -f ${ToMapDir}/${name}_end.sam ${PREMAP_BWAIDX} ${TMPDIR}/${name}_aln_sa1.sai ${TMPDIR}/${name}_aln_sa2.sai ${ToMapDir}/${name}_1.fastq.gz ${ToMapDir}/${name}_2.fastq.gz
samtools view -bF 0x2 ${ToMapDir}/${name}_end.sam| samtools sort -n -@ 1 - > ${TMPDIR}/Unmapped_to_premap/${name}_unmapped.bam
rm ${TMPDIR}/${name}_aln_sa1.sai ${TMPDIR}/${name}_aln_sa2.sai ${ToMapDir}/${name}_end.sam
bamToFastq -i ${TMPDIR}/Unmapped_to_premap/${name}_unmapped.bam -fq ${TMPDIR}/Unmapped_to_premap/${name}_1.fastq -fq2 ${TMPDIR}/Unmapped_to_premap/${name}_2.fastq
gzip ${TMPDIR}/Unmapped_to_premap/${name}_1.fastq ${TMPDIR}/Unmapped_to_premap/${name}_2.fastq
rm ${TMPDIR}/Unmapped_to_premap/${name}_unmapped.bam
done
else #defaul use mem
for name in ${QC_INPUT}
do
bwa mem -k 19 -t 1 ${PREMAP_BWAIDX} ${ToMapDir}/${name}_1.fastq.gz ${ToMapDir}/${name}_2.fastq.gz |\
samtools view -bF 0x2 | samtools sort -n -@ 1 - > ${TMPDIR}/Unmapped_to_premap/${name}_unmapped.bam
bamToFastq -i ${TMPDIR}/Unmapped_to_premap/${name}_unmapped.bam -fq ${TMPDIR}/Unmapped_to_premap/${name}_1.fastq -fq2 ${TMPDIR}/Unmapped_to_premap/${name}_2.fastq
gzip ${TMPDIR}/Unmapped_to_premap/${name}_1.fastq ${TMPDIR}/Unmapped_to_premap/${name}_2.fastq
rm ${TMPDIR}/Unmapped_to_premap/${name}_unmapped.bam
done
fi
ToMapDir=${TMPDIR}/Unmapped_to_premap
echo " "
echo "Pre-Mapping reads end"
fi
# main map
echo " "
echo "Mapping reads:"
echo "Reads for mapping is from : $ToMapDir"
if [[ ! -z "$aln" ]]; then # elif -aln was given, use aln
for name in ${QC_INPUT}
do
## Align using BWA aln
bwa aln -t ${thread} ${BWAIDX} ${ToMapDir}/${name}_1.fastq.gz > ${TMPDIR}/${name}_aln_sa1.sai
bwa aln -t ${thread} ${BWAIDX} ${ToMapDir}/${name}_2.fastq.gz > ${TMPDIR}/${name}_aln_sa2.sai
bwa sampe -n 1 -f ${ToMapDir}/${name}_end.sam ${BWAIDX} ${TMPDIR}/${name}_aln_sa1.sai ${TMPDIR}/${name}_aln_sa2.sai ${ToMapDir}/${name}_1.fastq.gz ${ToMapDir}/${name}_2.fastq.gz
samtools view -bf 0x2 -q 20 ${ToMapDir}/${name}_end.sam | samtools sort -n -@ ${thread} - > ${TMPDIR}/${name}.sort.bam
rm ${TMPDIR}/${name}_aln_sa1.sai ${TMPDIR}/${name}_aln_sa2.sai ${ToMapDir}/${name}_end.sam
done
else #defaul use mem
for name in ${QC_INPUT}
do
## Align using BWA.
bwa mem -k 19 -t ${thread} ${BWAIDX} ${ToMapDir}/${name}_1.fastq.gz ${ToMapDir}/${name}_2.fastq.gz | \
samtools view -bf 0x2 -q 20 - | samtools sort -n -@ ${thread} - > ${TMPDIR}/${name}.sort.bam
done
fi
for name in ${QC_INPUT}
do
cp ${TMPDIR}/${name}.sort.bam ${OUTPUT}
done
## Cleanup
find ${TMPDIR} -name "*.sort.bam" -size -1024k -delete
#############################################
## Write out the bigWigs.
echo " "
echo "Writing bigWigs:"
for f in ${TMPDIR}/*.sort.bam
do
j=`echo $f | awk -F"/" '{print $NF}' | rev | cut -d \. -f 3- |rev `
echo $j > ${OUTPUT}/${j}.align.log
if [ "${RNA5}" == "R1_5prime" ] ; then
if [ "${OPP}" == "FALSE" ] ; then
if [ "${MAP5}" == "TRUE" ] ; then ## report The 5' end of the RNA. Danko lab leChRO-Seq protocol is on the 5' of _R1 readl, same strand of R1 ($9)
bedtools bamtobed -bedpe -mate1 -i $f 2> ${TMPDIR}/kill.warnings | awk 'BEGIN{OFS="\t"} ($9 == "+") {print $1,$2,$2+1,$7,$8,$9}; ($9 == "-") {print $1,$3-1,$3,$7,$8,$9}' | gzip > ${TMPDIR}/$j.bed.gz
else ## report The 3' end of the RNA. Danko lab leChRO-Seq protocol is on the 5 prime of _R2 read, opposite strand of R2 (R2 strand $10, R1 strand $9)
bedtools bamtobed -bedpe -mate1 -i $f 2> ${TMPDIR}/kill.warnings | awk 'BEGIN{OFS="\t"} ($10 == "-") {print $1,$6-1,$6,$7,$8,$9}; ($10 == "+") {print $1,$5,$5+1,$7,$8,$9}' | gzip > ${TMPDIR}/$j.bed.gz
fi
elif [ "${OPP}" == "TRUE" ] ; then
if [ "${MAP5}" == "TRUE" ] ; then ## report The 5' end of the RNA.
bedtools bamtobed -bedpe -mate1 -i $f 2> ${TMPDIR}/kill.warnings | awk 'BEGIN{OFS="\t"} ($9 == "+") {print $1,$2,$2+1,$7,$8,$10}; ($9 == "-") {print $1,$3-1,$3,$7,$8,$10}' | gzip > ${TMPDIR}/$j.bed.gz
else ## report The 3' end of the RNA.
bedtools bamtobed -bedpe -mate1 -i $f 2> ${TMPDIR}/kill.warnings | awk 'BEGIN{OFS="\t"} ($10 == "-") {print $1,$6-1,$6,$7,$8,$10}; ($10 == "+") {print $1,$5,$5+1,$7,$8,$10}' | gzip > ${TMPDIR}/$j.bed.gz
fi
fi
elif [ "${RNA5}" == "R2_5prime" ] ; then
if [ "${OPP}" == "FALSE" ] ; then
if [ "${MAP5}" == "TRUE" ] ; then #report the 5 prime end of RNA, in Engreitz data is 5 prime end of R2, same strand
bedtools bamtobed -bedpe -mate1 -i $f 2> ${TMPDIR}/${j}.kill.warnings | awk 'BEGIN{OFS="\t"} ($10 == "+") {print $1,$5,$5+1,$7,$8,$10}; ($10 == "-") {print $1,$6-1,$6,$7,$8,$10}'|gzip > ${TMPDIR}/${j}.bed.gz
else ## report the 3-prime end of the RNA, in Engreitz data is the 5' end of R1 read, but opposite strand
bedtools bamtobed -bedpe -mate1 -i $f 2> ${TMPDIR}/${j}.kill.warnings | awk 'BEGIN{OFS="\t"} ($9 == "+") {print $1,$2,$2+1,$7,$8,$10}; ($9 == "-") {print $1,$3-1,$3,$7,$8,$10}' |gzip > ${TMPDIR}/${j}.bed.gz
fi
elif [ "${OPP}" == "TRUE" ] ; then
if [ "${MAP5}" == "TRUE" ] ; then #report the 5 prime end of RNA, in Engreitz data is 5 prime end of R2, same strand
bedtools bamtobed -bedpe -mate1 -i $f 2> ${TMPDIR}/${j}.kill.warnings | awk 'BEGIN{OFS="\t"} ($10 == "+") {print $1,$5,$5+1,$7,$8,$9}; ($10 == "-") {print $1,$6-1,$6,$7,$8,$9}'|gzip > ${TMPDIR}/${j}.bed.gz
else ## report the 3-prime end of the RNA, in Engreitz data is the 5' end of R1 read, but opposite strand
bedtools bamtobed -bedpe -mate1 -i $f 2> ${TMPDIR}/${j}.kill.warnings | awk 'BEGIN{OFS="\t"} ($9 == "+") {print $1,$2,$2+1,$7,$8,$9}; ($9 == "-") {print $1,$3-1,$3,$7,$8,$9}' |gzip > ${TMPDIR}/${j}.bed.gz
fi
fi
fi
echo 'Number of mappable reads:' >> ${OUTPUT}/${j}.align.log
readCount=`zcat ${TMPDIR}/$j.bed.gz | grep "" -c`
echo ${readCount} >> ${OUTPUT}/${j}.align.log
## Remove rRNA and reverse the strand (PRO-seq).
zcat ${TMPDIR}/$j.bed.gz | grep "rRNA\|chrM" -v | grep "_" -v | sort-bed - | gzip > ${TMPDIR}/$j.nr.rs.bed.gz
echo 'Number of mappable reads (excluding rRNA):' >> ${OUTPUT}/${j}.align.log
echo `zcat ${TMPDIR}/$j.nr.rs.bed.gz | grep "" -c` >> ${OUTPUT}/${j}.align.log
## Convert to bedGraph ... Can't gzip these, unfortunately.
bedtools genomecov -bg -i ${TMPDIR}/$j.nr.rs.bed.gz -g ${CHINFO} -strand + > ${TMPDIR}/$j\_plus.bedGraph
bedtools genomecov -bg -i ${TMPDIR}/$j.nr.rs.bed.gz -g ${CHINFO} -strand - > ${TMPDIR}/$j\_minus.noinv.bedGraph
## Invert minus strand.
cat ${TMPDIR}/$j\_minus.noinv.bedGraph | awk 'BEGIN{OFS="\t"} {print $1,$2,$3,-1*$4}' > ${TMPDIR}/$j\_minus.bedGraph ## Invert read counts on the minus strand.
## normalized by RPM
cat ${TMPDIR}/$j\_plus.bedGraph | awk 'BEGIN{OFS="\t"} {print $1,$2,$3,$4*1000*1000/'$readCount'/1}' > ${TMPDIR}/$j\_plus.rpm.bedGraph
cat ${TMPDIR}/$j\_minus.bedGraph | awk 'BEGIN{OFS="\t"} {print $1,$2,$3,$4*1000*1000/'$readCount'/1}' > ${TMPDIR}/$j\_minus.rpm.bedGraph
## Then to bigWig (nomalized and non-nomrmalized ones)
bedGraphToBigWig ${TMPDIR}/$j\_plus.rpm.bedGraph ${CHINFO} ${OUTPUT}/$j\_plus.rpm.bw
bedGraphToBigWig ${TMPDIR}/$j\_minus.rpm.bedGraph ${CHINFO} ${OUTPUT}/$j\_minus.rpm.bw
bedGraphToBigWig ${TMPDIR}/$j\_plus.bedGraph ${CHINFO} ${OUTPUT}/$j\_plus.bw
bedGraphToBigWig ${TMPDIR}/$j\_minus.bedGraph ${CHINFO} ${OUTPUT}/$j\_minus.bw
rm ${TMPDIR}/$j.nr.rs.bed.gz ${TMPDIR}/$j.bed.gz ${TMPDIR}/$j*.bedGraph
done
fi #*/
if [[ "$SEQ" == "SE" ]] ; then
#############################################
## Preprocess data. Remove adapters. Trim.
echo " "
echo "Preprocessing fastq files:"
mkdir ${TMPDIR}/noadapt
mkdir ${TMPDIR}/passQC
if [[ ${UMI2} != 0 || ${UMI1} != 0 ]]; then
dedup_L=30
mkdir ${TMPDIR}/noadapt/l${dedup_L}
mkdir ${TMPDIR}/noadapt/l${dedup_L}_nodups
fi
for name in ${FQ_INPUT[@]}
do
old_name=${name}
name=`echo ${old_name} | rev | cut -d \/ -f 1| rev`
## read stats
echo ${old_name} > ${TMPDIR}/${name}.QC.log
echo 'Number of original input reads:' >> ${TMPDIR}/${name}.QC.log
zcat ${old_name}.fastq.gz | grep @ -c >> ${TMPDIR}/${name}.QC.log
## Remove adapter, UMI barcode, additional barcode, and low quality (q=20) base from 3prime end of reads. Keep read length >=15 after trimmming
# Remove adapter
cutadapt -a ${ADAPT_SE} -e 0.10 --overlap 2 --output=${TMPDIR}/${name}_trim.fastq --untrimmed-output=${TMPDIR}/${name}_untrim.fastq ${old_name}.fastq.gz -j ${thread}
# Read1
# remove UMI2 and ADD_B2 from the 3 prime end of R1
n2=$[UMI2+ADD_B2]
n1=$[UMI1+ADD_B1]
cutadapt --cut -${n2} --minimum-length=10 ${TMPDIR}/${name}_trim.fastq --output=${TMPDIR}/${name}_trim.${n2}Nremoved.fastq -q 20 -j ${thread}
cutadapt --minimum-length=10 ${TMPDIR}/${name}_untrim.fastq --output=${TMPDIR}/${name}_q20trim.fastq -q 20 -j ${thread}
cat ${TMPDIR}/${name}_q20trim.fastq ${TMPDIR}/${name}_trim.${n2}Nremoved.fastq | paste - - - - |LC_ALL=C sort --temporary-directory=${TMPDIR} --parallel=10 -k1,1 -S 10G | tr '\t' '\n' > ${TMPDIR}/noadapt/${name}_noadapt.fastq
echo 'Number of reads after adapter removal and QC:' >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/noadapt/${name}_noadapt.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
rm ${TMPDIR}/${name}_trim.fastq ${TMPDIR}/${name}_untrim.fastq ${TMPDIR}/${name}_q20trim.fastq ${TMPDIR}/${name}_trim.${n2}Nremoved.fastq
## Collapse reads using prinseq-lite.pl. if there are UMI barcodes
if [[ ${UMI2} != 0 || ${UMI1} != 0 ]]; then
# Remove PCR duplciates.
# fastq with the first dedup_L nt
seqtk seq -L ${dedup_L} ${TMPDIR}/noadapt/${name}_noadapt.fastq > ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}.fastq
# deduplicate using the first dedup_L nt
prinseq-lite.pl -fastq ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}.fastq -derep 1 -out_format 3 -out_bad null -out_good ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup -min_len 15 2> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/noadapt/l${dedup_L}/${name}_noadapt_l${dedup_L}.fastq
# make a list of name from deduplicated fastq
cat ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup.fastq | awk '(NR%4==1){print substr($1,2)}' > ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_l${dedup_L}.txt
rm ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup.fastq
# generate fastq from the list of name
seqtk subseq ${TMPDIR}/noadapt/${name}_noadapt.fastq ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_l${dedup_L}.txt > ${TMPDIR}/passQC/${name}_dedup_withBarcode.fastq
rm ${TMPDIR}/noadapt/l${dedup_L}_nodups/${name}_dedup_l${dedup_L}.txt
# trim the UMI and additional barcode after dereplicate
prinseq-lite.pl -trim_left ${n1} -fastq ${TMPDIR}/passQC/${name}_dedup_withBarcode.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
# min_len 15
prinseq-lite.pl -min_len 15 -fastq ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_QC_end 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/passQC/${name}_dedup_withBarcode.fastq ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved.fastq
echo 'Number of reads after PCR duplicates removal and QC:' >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/passQC/${name}_dedup_QC_end.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
elif [[ ${Force_deduplicate} == "TRUE" ]]; then
# Remove PCR duplciates.
prinseq-lite.pl -derep 1 -fastq ${TMPDIR}/noadapt/${name}_noadapt.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_withBarcode 2> ${OUTPUT}/${name}.prinseq-pcrDups.gd
# trim the UMI and additional barcode after dereplicate
prinseq-lite.pl -trim_left ${n1} -fastq ${TMPDIR}/passQC/${name}_dedup_withBarcode.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
# min_len 15
prinseq-lite.pl -min_len 15 -fastq ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_dedup_QC_end 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/passQC/${name}_dedup_withBarcode.fastq ${TMPDIR}/passQC/${name}_dedup_BarcodeRemoved.fastq
echo 'Number of reads after PCR duplicates removal and QC:' >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/passQC/${name}_dedup_QC_end.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
else
# trim the additional barcode WITHOUT dereplicate. If no barcode, prinseq-lite.pl will remove unpair reads and reads that are length 0
prinseq-lite.pl -trim_left ${ADD_B1} -fastq ${TMPDIR}/noadapt/${name}_noadapt.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_BarcodeRemoved 2>> ${OUTPUT}/${name}.prinseq-pcrDups.gd
# min_len 15
prinseq-lite.pl -min_len 15 -fastq ${TMPDIR}/passQC/${name}_BarcodeRemoved.fastq -out_format 3 -out_bad null -out_good ${TMPDIR}/passQC/${name}_QC_end 2> ${OUTPUT}/${name}.prinseq-pcrDups.gd
rm ${TMPDIR}/passQC/${name}_BarcodeRemoved.fastq
echo 'Number of reads after final QC:' >> ${TMPDIR}/${name}.QC.log
cat ${TMPDIR}/passQC/${name}_QC_end.fastq | grep @ -c >> ${TMPDIR}/${name}.QC.log
fi
cat ${OUTPUT}/${name}.prinseq-pcrDups.gd
done
# if there is a data-set wide length cutoff map_L
if [[ ${map_L} != 0 ]]; then
mkdir ${TMPDIR}/passQC_length_${map_L}
for f in ${TMPDIR}/passQC/*_QC_end.fastq
do name=`echo $f |rev |cut -d / -f 1 |cut -d _ -f 3-|rev`
seqtk seq -L ${map_L} ${f} > ${TMPDIR}/passQC_length_${map_L}/${name}_l${map_L}_QC_end.fastq
done
fi
## Cleanup.
rm -r ${TMPDIR}/noadapt
gzip ${TMPDIR}/passQC*/*.fastq
#############################################
## Align reads.
if [[ ${map_L} != 0 ]]; then
ToMapDir=${TMPDIR}/passQC_length_${map_L}
else
ToMapDir=${TMPDIR}/passQC
fi