forked from unavailable-2374/Genome-Wide-Annotation-Pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconf_all_defaults.txt
91 lines (68 loc) · 5.66 KB
/
conf_all_defaults.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
## Set the detail parameters of the modules during GETA pipeline through this config file.
# the parameters set below will rewrite the default threashold.
# the template conf_for_small_genome.txt in GETA software was the default values.
# the anotated lines start with # will be ingnored. the blank lines will be ignored.
[RepeatMasker]
-e ncbi -gff
# During the step of RepeatMasker: -species was passed by --RM_species; -pa was passed by --CPU.
[trimmomatic]
TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50 TOPHRED33
# note: must start with the adapter file name.
[hisat2-build]
-p 1
# note: the multi threads parameter -p was set to 1 by default, which was not passed by --CPU as usual.
[hisat2]
--min-intronlen 20 --max-intronlen 20000 --dta --score-min L,0.0,-0.4
# During the step of hisat2: -x was set to "genome"; -p was passed by --CPU;
[sam2transfrag]
--fraction 0.05 --min_expressed_base_depth 2 --max_expressed_base_depth 50 --min_junction_depth 2 --max_junction_depth 50 --min_fragment_count_per_transfrags 10 --min_intron_length 20
# During the step of sam2transfrag: --no_strand_specific was passed reversely by --strand_specific;
# If RNA-Seq reads containig single-end data, the --no_PE_sequencing should be added. Otherwise, the alignment of single-end will be ignored.
# The defalut value means: the dynamic coverage threshold of mapping regions (come frome SAM file) was determinated by the maximum base depth of each region * 0.05, as well as this threshold should between 2 and 50.
[TransDecoder.LongOrfs]
-m 100 -G universal
# note: the strand specific parameter -S will be set automatically if transcripts had introns which could determinated the direction.
[TransDecoder.Predict]
--retain_long_orfs_mode dynamic
# note: when predicting the ORF of transfrag with single exon, the parameter --train will be set automatically to the result of transfrag with multi exons.
[homolog_genewise]
--coverage_ratio 0.4 --evalue 1e-9
# During the step of homolog_genewise: --cpu was passed by --CPU; --max_gene_length, --segmentSize and --overlapSize were automatically calculated.
[homolog_genewiseGFF2GFF3]
--min_score 15 --gene_prefix genewise --filterMiddleStopCodon
# During the step of homolog_genewiseGFF2GFF3: --input_genewise_start_info was fixed to "genewise.start_info.txt"; --output_start_and_stop_hints_of_augustus was fixed to "genewise.start_stop_hints.gff".
[geneModels2AugusutsTrainingInput]
--min_evalue 1e-9 --min_identity 0.8 --min_coverage_ratio 0.8 --min_cds_num 2 --min_cds_length 450 --min_cds_exon_ratio 0.60
# During the step of geneModels2AugusutsTrainingInput: --cpu was passed by --CPU; --out_prefix was set to ati.
# if gene models number of ati.filter2.gff3 was few, recommed to decrease --min_cds_exon_ratio and increase --min_coverage_ratio.
[BGM2AT]
--min_gene_number_for_augustus_training 500 --gene_number_for_accuracy_detection 200 --min_gene_number_of_optimize_augustus_chunk 50 --max_gene_number_of_optimize_augustus_chunk 200
# During the step of BGM2AT: --flanking_length was automatically calculated; --stopAfterFirstEtraining was set or not set at different steps; --onlytrain_GFF3 was set to a intermediate file.
[prepareAugusutusHints]
--margin 20
# note: to obtain the exonpart or cdspart hints, the margin 20bp will be removed.
[paraAugusutusWithHints]
--gene_prefix augustus --min_intron_len 20
# During the step of paraAugusutusWithHints: --species was passed by --augustus_species; --cpu was passed by --CPU; --segmentSize and --overlapSize were automatically calculated.
[paraCombineGeneModels]
--overlap 30 --min_augustus_transcriptSupport_percentage 10.0 --min_augustus_intronSupport_number 1 --min_augustus_intronSupport_ratio 0.01
# During the step of paraAugusutusWithHints: --cpu was passed by --CPU.
[pickout_better_geneModels_from_evidence]
--overlap_ratio 0.2 --ratio1 2 --ratio2 1.5 --ratio3 0.85 --ratio4 0.85
# 本步骤用于挑选完全来自evidence的基因模型。当evidence基因模型的CDS重叠比例低于--overlap_ratio参数值,则挑选之;当 CDS重叠区碱基数 / evidence基因模型CDS总碱基数 >= 阈值[0.2],认为基因有重叠,进一步挑选:当evidence基因模型的CDS范围(首尾CDS坐标差)> ref基因模型的CDS范围 * 系数阈值[2],则挑选出来;当evidence基因模型的CDS长度 > ref基因模型 * 系数阈值[1.5],则挑选出来;当完整evidence基因模型(根据GFF3文件中的Integrity=complete信息)的CDS范围(首尾CDS坐标差)> ref基因模型的CDS范围 * 系数阈值[0.85],则挑选出来;当完整evidence基因模型的CDS长度 > ref基因模型的CDS长度 * 系数阈值[0.85],则挑选出来。
[fillingEndsOfGeneModels]
--start_codon ATG --stop_codon TAG,TGA,TAA
[alternative_splicing_analysis]
--min_intron_depth 1 --min_base_depth_ratio_for_ref_specific_intron 0.3 --min_intron_depth_ratio_for_evidence_specific_intron 0.2 --min_base_depth_ratio_for_common_intron 0.2 --min_gene_depth 10 --min_transcript_confidence_for_output 0.05 --transcript_num_for_output_when_all_low_confidence 8 --added_mRNA_ID_prefix t
[GFF3_extract_TranscriptID_for_filtering]
--min_CDS_ratio 0.3 --min_CDS_length 600 --max_repeat_overlap_ratio 0.3 --ignore_repeat_Name Simple_repeat,Low_complexity,Satellite,Unknown,Tandem_repeat
[para_hmmscan]
--evalue1 1e-5 --evalue2 1e-3 --hmm_length 80 --coverage 0.25 --no_cut_ga --chunk 20 --hmmscan_cpu 2
[para_blast]
--chunk 10 --blast-threads 1 --evalue 1e-3 --max-target-seqs 20 --completed_ratio 0.9
[dimanod]
--sensitive --max-target-seqs 20 --evalue 1e-5 --id 10 --index-chunks 1 --block-size 5
[parsing_blast_result.pl]
--evalue 1e-9 --identity 0.1 --CIP 0.4 --subject-coverage 0.4 --query-coverage 0.4
[get_valid_geneModels]
# None