-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFastProteinExonerate_v220221.sh
127 lines (101 loc) · 6.36 KB
/
FastProteinExonerate_v220221.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/bin/bash
#Takes a protein and DNA input file, pblats the proteins against the DNA sequences, then predicts genes in the matched sections using exconerate protein2genome
CONDASH=/data/miniconda3/etc/profile.d/conda.sh
if [ ! -f $CONDASH ]
then
echo "Conda executable does not exist in "$CONDASH" Please adjust set $CONDASH and rerun. Exiting..."
exit
else
source $CONDASH
fi
ENVIRO=$(conda env list | grep -v "#" | awk '{print $1}' | sed '/^$/d' | grep "proteinexonerate")
if [ "$ENVIRO" == "proteinexonerate" ]
then
echo "Conda environment \"proteinexonerate\" exist. Activating environment"
conda activate proteinexonerate
else
echo "Creating conda environment \"proteinexonerate\" "
conda create -n proteinexonerate -c bioconda pblat bedtools exonerate gffread
conda activate proteinexonerate
fi
if [ "$#" -lt "4" ]
then
echo "Usage: FastProteinExonerate_v220221.sh <protein file> <DNA file> <n cores> <maxIntron>"
exit
fi
if [[ "$1" == "-h" ]]
then
echo "Usage: FastProteinExonerate_v220221.sh <protein file> <DNA file> <n cores> <maxIntron>"
exit
fi
if [[ "$1" == "--help" ]]
then
echo "Usage: FastProteinExonerate_v220221.sh <protein file> <DNA file> <n cores> <maxIntron>"
exit
fi
echo "$(date): Starting run"
echo "$(date): Running pblat and Exonerate for protein data set $1 against DNA sequences $2 using $3 cores and maxIntron length $4"
if [ -d "protExon" ]
then
echo "$(date): Removing existing folder protExon"
rm -rf protExon
fi
mkdir protExon && cd protExon
#clean input protein data set (read in from command line via $1)"
echo "$(date): Cleaning protein input $1"
awk '{print $1}' ../$1 | sed 's/|/_/g' | awk 'BEGIN{getline; print $0; seq="";}{if($0 ~ "^>"){print seq; seq=""; print $0;}else{seq=seq""$0}}END{print seq}' > cleaned_proteins.fasta
#run pblat with number of threads $3
echo "$(date): Run pblat with $3 cores: $1 against $2"
pblat -threads=$3 -q=prot -t=dnax -fine -maxIntron=$4 ../$2 cleaned_proteins.fasta protein_out.psl 1>pblat.log 2>pblat.err
#Pick top hit only
echo "$(date): Picking best pblat hit for each protein"
awk -F"\t" '{if($0 ~ /^psLayout/){for(i=1; i < 5; i++){getline;}}else{print $0}}' protein_out.psl | sort -t$'\t' -k10,10 -k1,1nr -k15,15nr | sort -u -k10,10 --merge > best_hits_protein_out.psl
#create coordinate file with +- 500nt distances
echo "$(date): Adjusting match coordinates and creating lookup files"
cut -f 10,14-17 best_hits_protein_out.psl | awk -F"\t" '{printf $1"\t"$2"\t"$3; if($4-500 < 0){printf "\t0"}else{printf "\t"$4-500}; if($5+500 > $3-1){print "\t"$3-1}else{print "\t"$5+500};}' | sed 's/|/_/g' > coord.info.tsv
#awk -F"\t" '{if(){}; }' > tmp && mv tmp coord.info.tsv
#cut -f 10,14-17 best_hits_protein_out.psl | awk -F"\t" '{printf $0; if($4-500 < 0){printf "\t0"}else{printf "\t"$4-500}; if($5+500 > $3){print "\t"$3}else{print "\t"$5+500};}' | sed 's/|/_/g' > coord.info.tsv
cut -f2,4,5 coord.info.tsv | sort | uniq > match_coord.bed
#extract scaffold regions using bedtools
echo "$(date): Using bedtools to extract the match regions from $2"
bedtools getfasta -fi ../$2 -bed match_coord.bed | sed 's/:/__/' > match_sections.fasta
#prepare the script that creates input files, runs exonerate, cleans up gff etc.
echo "$(date): Creating run.sh script. This script will be very large and contains sequence data, do not attempt to less/more/cat it"
#IF YOUR conda.sh IS SOMEWHERE ELSE MODIFY THE NEXT LINE ACCORDINGLY
echo "source $CONDASH" > run.sh
echo "conda activate proteinexonerate" >> run.sh
echo "echo \"\$(date): Start run.sh\"" >> run.sh
awk -F"\t" 'BEGIN{
while(getline < "match_sections.fasta"){
a=$1; getline < "match_sections.fasta"; seq[a]=$1;
};
while(getline < "cleaned_proteins.fasta"){
b=$1; getline < "cleaned_proteins.fasta"; prot[b]=$1;
};
}{
q=$1; t=$2; len=$3; sn=$4; en=$5;
q_name=q".fasta"
t_comb=t"__"sn"-"en
t_name=t_comb".fasta"
print("echo \">"t_comb"\" > "t_name);
print("echo \""seq[">"t_comb]"\" >> "t_name);
print("echo \">"q"\" > "q_name);
print("echo \""prot[">"q]"\" >> "q_name);
print("exonerate --minintron 20 --model protein2genome -S FALSE --showtargetgff TRUE --showvulgar FALSE --showalignment no --singlepass FALSE --bestn 1 -q "q_name" -t "t_name" > "q".gff")
#print("rm "q_name" "t_name)
print("sed '\''s/\\tgene_id 0 /\\tID="q"/'\'' "q".gff | awk -F\"\\t\" '\''{if($3 == \"gene\"){print $0; gsub(/gene/,\"mRNA\",$3); for(i=1; i<NF; i++){printf $i\"\\t\"}; print $NF}else{print $0}}'\'' | awk -F\"\\t\" '\''{if($9 !~ /^ID=/ && $9 !~ /^Parent=/){print $0\"\\tParent="q";\"$9}else{print $0\"\\t\"$9}}'\'' | cut -f1-8,10 > tmp; mv tmp "q".gff")
print("grep -v -e \"^-- completed exonerate analysis$\" -e \"^#\" -e \"^Command line:\" -e \"^Hostname:\" "q".gff | awk -F\"\\t\" '\''{if($3 != \"similarity\"){$4=$4+"sn"; $5=$5+"sn"; printf \""t"\\t\"; for(i=2; i<NF; i++){printf $i\"\\t\"}; print $NF}}'\'' >> final.gff")
#print("rm "q".gff")
}' coord.info.tsv >> run.sh
#NORUN #print("sed '\''s/\\tgene_id 0 /\\tID="q"/'\'' "q".gff | awk -F\"\\t\" '\''{if($3 == \"gene\"){print $0; gsub(/gene/,\"mRNA\",$3); gsub(/^ID=/,\"Parent=\",$9); for(i=1; i<NF; i++){printf $i\"\\t\"}; print $NF}else{print $0}}'\'' | awk -F\"\\t\" '\''{if($9 !~ /^ID=/ && $9 !~ /^Parent=/){print $0\"\\tParent="q";\"$9}else{print $0\"\\t\"$9}}'\'' | cut -f1-8,10 > tmp; mv tmp "q".gff")
echo "echo \"\$(date): End run.sh\"" >> run.sh
echo "$(date): run.sh created"
echo "$(date): Size of run sh: $(du -h run.sh | awk '{print $1}')"
echo "$(date): Executing run.sh. Log file: run.log. Error file: err.log"
bash run.sh 1> run.log 2> run.err
awk -F"\t" '{if($3 == "mRNA" || $3 == "cds" || $3 == "exon"){print $0}}' final.gff | sed 's/\tcds\t/\tCDS\t/' | awk -F"\t" '{if($9 ~ /^ID=/){print $0"\t"$9}else{split($9,attr,"[;=]"); count[$3""attr[2]]++; print $0"\tID="attr[2]"_"$3"_"count[$3""attr[2]]"; "$9}}' | cut -f 1-8,10 > tmp; mv tmp final.gff
#awk -F"\t" '{if($3 == "gene" || $3 == "mRNA" || $3 == "cds" || $3 == "exon"){print $0}}' final.gff | sed 's/\tcds\t/\tCDS\t/' > tmp; mv tmp final.gff
echo "$(date): Creating CDS and protein files"
gffread final.gff -g ../$2 -y final.proteins.fa
gffread final.gff -g ../$2 -x final.cds.fa
echo "$(date): Done running run.sh. Analysis complete. Output in final.gff, final.cds.fa, final.proteins.fa"