# project.yaml

# N E X T F L O W 
##########################################################################
#####   SAMPLE RIBOFLOW ARGUMENTS FILE WITH RNASEQ AND METADATA   ########
########################################################################## 

# Tested on Nextflow version 19.04.1

# Perform fastqc at several stages of the pipeline
do_fastqc: true

# Check the existence of fastq.gz files and bowtie2 reference files.
do_check_file_existence: true

# Remove duplicate reads based on their length
# and mapped position.
deduplicate: true

# Pairing RNA-Seq data with the ribosome profiling data:
#   - if you have RNA-Seq data, set this flag to true
#     AND PROVIDE the RNA-Seq data under the key "rnaseq"
#     in this file (see below);
#   - if you don't have RNA-Seq data, set this flag to false.
do_rnaseq: true

# Per-experiment metadata:
#   - if you have metadata, provide yaml files for the experiments
#     under input -> metadata below;
#   - if you don't have metadata, set do_metadata to false.
do_metadata: true

# These arguments are used for clipping adapters by cutadapt.
# (see https://cutadapt.readthedocs.io/en/stable/guide.html )
clip_arguments: '-u 1 -a CTGTAGGCACCATCAAT --overlap=4 --trimmed-only --maximum-length=40 --minimum-length=15 --quality-cutoff=28'

# If you don't want to perform any adapter clipping,
# you can comment out the option above and use the option below instead.
#clip_arguments: '--quality-cutoff=0'

# Transcriptome alignments are filtered based on mapping quality.
# This is the threshold that the alignments need to pass for
# downstream quantification.
mapping_quality_cutoff: 2

###############################################################################
# Arguments for the aligner for 
# corresponding steps
alignment_arguments:
  # Step: rtRNA filtering (aligner: bowtie2)
  filter: '-L 15 --no-unal --norc'

  # Step: transcriptome alignment (aligner: bowtie2)
  transcriptome: '-L 15 --norc --no-unal'

  # Step: genome alignment (aligner: hisat2)
  # "-k 1" is used so that each aligned read is reported once;
  # otherwise, the read length analysis values might be inflated.
  genome: '--no-unal -k 1'

###############################################################################
# RiboPy parameters for ribo file generation.
ribo:
  # NOTE(review): the exact meaning of the values below is defined by
  # RiboPy, not by this file; consult the RiboPy documentation before
  # changing them.
  ref_name: "appris-v1"
  metagene_radius: 50
  left_span: 35
  right_span: 10
  # Presumably the bounds on the read lengths kept in the ribo file;
  # verify against RiboPy.
  read_length:
    min: 28
    max: 32
  coverage: true
   
###############################################################################
# Output folder settings
# These entries typically don't need modifications.
# Note that everything is placed as a subfolder under the *base* folder
# *base* gives the actual folder location
# The other parameters are folder names that are going to be under the *base* 
output:
  individual_lane_directory: 'individual'
  merged_lane_directory: 'merged'
  intermediates:
    # base is the root folder for the intermediate files
    base: 'intermediates'
    clip: 'clip'
    log: 'log'
    transcriptome_alignment: 'transcriptome_alignment'
    filter: 'filter'
    genome_alignment: 'genome_alignment'
    bam_to_bed: 'bam_to_bed'
    quality_filter: 'quality_filter'
    # alignment_ribo folder contains the bed files
    # that are used as input to RiboPy to create ribo files.
    alignment_ribo: 'alignment_ribo'
  output:
    # base is the root folder for the output files
    base: 'output'
    log: 'log'
    fastqc: 'fastqc'
    ribo: 'ribo'
      
###############################################################################
# In this example we have two experiments with the names
# GSM1606107 and GSM1606108
# These names are first introduced when providing fastq files 
# for ribosome profiling data. (input -> fastq -> GSM1606107) and (input -> fastq -> GSM1606108)
# 
# If metadata or RNA-Seq data are provided, they must match these names
# See below as an example. 


input:
  reference:
    # bowtie2 index files for the filter step.
    # The trailing * is used as a wildcard to match all bowtie2 index files:
    # human_rtRNA.1.bt2, human_rtRNA.2.bt2, ....
    filter: ./rf_sample_data/filter/human_rtRNA*

    # bowtie2 index files for the transcriptome,
    # generated from isoform sequences.
    transcriptome: ./rf_sample_data/transcriptome/appris_human_24_01_2019_selected*

    # Main annotation file.
    # CDS and UTR regions are defined in this file.
    regions: ./rf_sample_data/annotation/appris_human_24_01_2019_actual_regions.bed

    # Transcript lengths
    transcript_lengths: ./rf_sample_data/annotation/appris_human_24_01_2019_selected.lengths.tsv

    ## Genome Alignment Reference
    # Sequences that are NOT aligned to the transcriptome
    # are mapped to the genome.
    # This parameter (and the corresponding step) is optional:
    # uncomment the line below to enable the step,
    # keep it commented to skip the step.
    #genome: ./rf_sample_data/genome/mock_hg38*

    # Reads NOT aligned to the genome are mapped to this reference.
    # This parameter (and the corresponding step) is optional:
    # uncomment the line below to enable the step,
    # keep it commented to skip the step.
    #post_genome: ./rf_sample_data/post_genome/post_genome*

  # This will be prefixed to the fastq file paths below.
  # You can leave it as empty "" if you provide complete paths.
  fastq_base: ""
  fastq:
    # We have two ribosome profiling experiments called
    # GSM1606107 and GSM1606108.
    GSM1606107:
      - ./rf_sample_data/fastq/ribosome_profiling/GSM1606107/SRR1795425.fastq.gz
      - ./rf_sample_data/fastq/ribosome_profiling/GSM1606107/SRR1795426.fastq.gz

    GSM1606108:
      - ./rf_sample_data/fastq/ribosome_profiling/GSM1606108/SRR1795427.fastq.gz
      - ./rf_sample_data/fastq/ribosome_profiling/GSM1606108/SRR1795428.fastq.gz

  ## INPUTS BELOW THIS LINE ARE OPTIONAL

  # This is the metadata file stored at the root of the ribo file.
  # In this example, we are storing this yaml file.
  # Any valid yaml file can be stored as metadata.
  root_meta: "./project.yaml"

  # The following metadata is stored under individual experiments.
  metadata:
    # This will be prefixed to the metadata file paths below.
    # You can leave it as empty "" if you provide complete paths.
    base: ""

    # File keys (left hand side of ":")
    # must match the experiment names for the ribosome profiling data above.
    files:
      GSM1606108: ./rf_sample_data/metadata/GSM1606108.yml
      GSM1606107: ./rf_sample_data/metadata/GSM1606107.yml

###############################################################################
# If you have no RNA-Seq data to process,
# remove "rnaseq" node from this yaml tree  
rnaseq:
  clip_arguments: '-u 1 --quality-cutoff=28'

  # This will be prefixed to the fastq file paths below.
  # You can leave it as empty "" if you provide complete paths.
  fastq_base: ""

  deduplicate: false
  filter_arguments: '-L 15 --no-unal'
  # NOTE(review): this key is spelled with a double "m"; confirm which
  # spelling the pipeline reads before renaming it.
  bt2_argumments: "-L 15  --no-unal"

  # Keys must match the experiment names for the ribosome profiling data.
  # In this particular example, the valid keys are
  # GSM1606107 and GSM1606108.
  # The RNA-Seq files listed under each key sit in their own directories
  # (GSM1606099 and GSM1606100 here).
  fastq:
    GSM1606107:
      - ./rf_sample_data/fastq/rnaseq/GSM1606099/SRR1795409.fastq.gz
      - ./rf_sample_data/fastq/rnaseq/GSM1606099/SRR1795410.fastq.gz
    GSM1606108:
      - ./rf_sample_data/fastq/rnaseq/GSM1606100/SRR1795411.fastq.gz
      - ./rf_sample_data/fastq/rnaseq/GSM1606100/SRR1795412.fastq.gz