-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathproject.yaml
200 lines (165 loc) · 7.35 KB
/
project.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# N E X T F L O W
##########################################################################
##### SAMPLE RIBOFLOW ARGUMENTS FILE WITH RNASEQ AND METADATA ########
##########################################################################
# Tested on version 19.04.1
# Perform fastqc at several stages of the pipeline
do_fastqc: true
# Check existence of fastq.gz files and bowtie2 reference files
do_check_file_existence: true
# Remove duplicate reads based on their length
# and mapped position
deduplicate: true
# If you additionally have RNA-Seq data
# that you want to pair with your ribosome profiling data,
# set this flag to true
# AND PROVIDE the RNA-Seq data
# under the key rnaseq in this file. See below.
# If you don't have RNA-Seq data, set this flag to false
do_rnaseq: true
# If you don't have metadata, set do_metadata to false.
# If you have metadata, provide yaml files for the experiments
# under input -> metadata below.
do_metadata: true
# These arguments are used for clipping adapters by cutadapt.
# (see https://cutadapt.readthedocs.io/en/stable/guide.html )
clip_arguments: '-u 1 -a CTGTAGGCACCATCAAT --overlap=4 --trimmed-only --maximum-length=40 --minimum-length=15 --quality-cutoff=28'
# If you don't want to perform any adapter clipping,
# you can comment out the above option and use the option below.
#clip_arguments: '--quality-cutoff=0'
# Transcriptome alignments are filtered based on mapping quality.
# This is the threshold that the alignments need to pass for
# downstream quantification
mapping_quality_cutoff: 2
###############################################################################
# Arguments for the aligner for
# corresponding steps
alignment_arguments:
  # bowtie2 arguments for rtRNA filtering step
  filter: '-L 15 --no-unal --norc'
  # bowtie2 arguments for transcriptome alignment step
  transcriptome: '-L 15 --norc --no-unal'
  # hisat2 arguments
  # use -k 1 so that each aligned read is reported once.
  # otherwise, our read length analysis values might be inflated.
  genome: '--no-unal -k 1'
###############################################################################
# RiboPy parameters for ribo file generation.
ribo:
  ref_name: "appris-v1"
  metagene_radius: 50
  left_span: 35
  right_span: 10
  # Range of read lengths (min and max), given as a nested mapping.
  read_length:
    min: 28
    max: 32
  # NOTE(review): placed directly under ribo (sibling of read_length) — confirm
  coverage: true
###############################################################################
# Output folder settings
# These entries typically don't need modifications.
# Note that everything is placed as a subfolder under the *base* folder
# *base* gives the actual folder location
# The other parameters are folder names that are going to be under the *base*
output:
  individual_lane_directory: 'individual'
  merged_lane_directory: 'merged'
  intermediates:
    # base is the root folder for the intermediate files
    base: 'intermediates'
    clip: 'clip'
    log: 'log'
    transcriptome_alignment: 'transcriptome_alignment'
    filter: 'filter'
    genome_alignment: 'genome_alignment'
    bam_to_bed: 'bam_to_bed'
    quality_filter: 'quality_filter'
    # alignment_ribo folder contains the bed files
    # that are used as input to RiboPy to create ribo files.
    alignment_ribo: 'alignment_ribo'
  output:
    # base is the root folder for the output files
    base: 'output'
    log: 'log'
    fastqc: 'fastqc'
    ribo: 'ribo'
###############################################################################
# In this example we have two experiments with the names
# GSM1606107 and GSM1606108
# These names are first introduced when providing fastq files
# for ribosome profiling data. (input -> fastq -> GSM1606107) and (input -> fastq -> GSM1606108)
#
# If metadata or RNA-Seq data are provided, they must match these names
# See below as an example.
input:
  reference:
    # filter indicates bowtie2 index files
    # * is used as a wild card to match all bowtie2 index files:
    # human_rtRNA.1.bt2, human_rtRNA.2.bt2, ....
    filter: ./rf_sample_data/filter/human_rtRNA*
    # transcriptome indicates bowtie2 index files
    # Generated from isoform sequences.
    transcriptome: ./rf_sample_data/transcriptome/appris_human_24_01_2019_selected*
    # Main annotation file.
    # CDS and UTR regions are defined in this file.
    regions: ./rf_sample_data/annotation/appris_human_24_01_2019_actual_regions.bed
    # Transcript lengths
    transcript_lengths: ./rf_sample_data/annotation/appris_human_24_01_2019_selected.lengths.tsv
    ## Genome Alignment Reference
    # Sequences that are NOT aligned to the transcriptome
    # are mapped to the genome
    # This parameter (and the corresponding step) is optional.
    # Comment the line below to skip this step
    #genome: ./rf_sample_data/genome/mock_hg38*
    # Reads NOT aligned to the genome are mapped to this reference
    # This parameter (and the corresponding step) is optional.
    # Comment the line below to skip this step
    #post_genome: ./rf_sample_data/post_genome/post_genome*
  # This will be prefixed to the file paths below
  # You can leave it as empty "" if you provide complete paths.
  fastq_base: ""
  fastq:
    # We have two ribosome profiling experiments called
    # GSM1606107 and GSM1606108
    GSM1606107:
      - ./rf_sample_data/fastq/ribosome_profiling/GSM1606107/SRR1795425.fastq.gz
      - ./rf_sample_data/fastq/ribosome_profiling/GSM1606107/SRR1795426.fastq.gz
    GSM1606108:
      - ./rf_sample_data/fastq/ribosome_profiling/GSM1606108/SRR1795427.fastq.gz
      - ./rf_sample_data/fastq/ribosome_profiling/GSM1606108/SRR1795428.fastq.gz
  ## INPUTS BELOW THIS LINE ARE OPTIONAL
  # This is the metadata file stored at the root ribo file
  # In this example, we are storing this yaml file
  # Any valid yaml file can be stored as metadata.
  root_meta: "./project.yaml"
  # The following metadata is stored under individual experiments.
  metadata:
    # This will be prefixed to the file paths below
    # You can leave it as empty "" if you provide complete paths.
    base: ""
    # file keys (left hand side of ":")
    # must match experiment names for ribosome profiling data above.
    files:
      GSM1606108: ./rf_sample_data/metadata/GSM1606108.yml
      GSM1606107: ./rf_sample_data/metadata/GSM1606107.yml
###############################################################################
# If you have no RNA-Seq data to process,
# remove "rnaseq" node from this yaml tree
rnaseq:
  clip_arguments: '-u 1 --quality-cutoff=28'
  # This will be prefixed to the file paths below
  # You can leave it as empty "" if you provide complete paths.
  fastq_base: ""
  deduplicate: false
  filter_arguments: '-L 15 --no-unal'
  # NOTE(review): the key is spelled "bt2_argumments" (double "m"); it is kept
  # unchanged because the consumer may look it up by this exact name — confirm.
  bt2_argumments: "-L 15 --no-unal"
  # Keys must match the experiment names for the ribosome profiling data
  # In this particular example, valid keys are
  # GSM1606107 and GSM1606108
  fastq:
    GSM1606107:
      - ./rf_sample_data/fastq/rnaseq/GSM1606099/SRR1795409.fastq.gz
      - ./rf_sample_data/fastq/rnaseq/GSM1606099/SRR1795410.fastq.gz
    GSM1606108:
      - ./rf_sample_data/fastq/rnaseq/GSM1606100/SRR1795411.fastq.gz
      - ./rf_sample_data/fastq/rnaseq/GSM1606100/SRR1795412.fastq.gz