-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfunTFBS
executable file
·124 lines (101 loc) · 4.35 KB
/
funTFBS
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
#bedtools -version
set -e
DIR=$(dirname $(readlink -f $0))
PATH=$PATH:$DIR/src
function do_usage {
cat << EOF
Program: FunTFBS (The Tool for identifying TFBS with transcriptional regulatory functions)
Version: v1.1.0
Usage: funTFBS -t TFBS.bed -m motifs -f format -p phyloP.bg -g genome.fa -o output
-t [TFBS.bed] the file containing candidate TFBS in bed format (with strand information)
-m [motifs] the file containing binding motifs in specified format.
-f [format] the format of bidning motifs, could be: meme/beeml/chen/jaspar-pfm/jaspar-sites/jaspar-cm/transfac/uniprobe.
-p [phyloP.bg] the file containing PhyloP scores in bedGraph format.
-g [genome.fa] the file containing genomic sequence in fasta format.
-o [output] the output directory.
-h show this help information.
Example: funTFBS -t demo/test_TFBS.bed -m demo/Ath.meme -f meme -p demo/test_PhyloP.bed -g demo/Ath_test.fa -o test
EOF
exit
}
if [[ $# -eq 0 ]]; then do_usage; fi
while getopts :t:m:f:p:g:o:h option
do
case "$option" in
t) TFBS=$OPTARG;;
m) motifs=$OPTARG;;
f) mtform=$OPTARG;;
p) phyloP=$OPTARG;;
g) genome=$OPTARG;;
o) output_dir=$OPTARG;;
h) do_usage;;
\?) echo "Option -$OPTARG not recognized."; do_usage;;
esac
done
mkdir -p $output_dir $output_dir/motifs
# format TFBS bed file
bedtools getfasta -s -fi $genome -bed $TFBS -fo $output_dir/TFBS.fa
paste <(cut -f 1-6 $TFBS) <(grep -v '^>' $output_dir/TFBS.fa) > $output_dir/TFBS.bed # skip sort (No need)
N=`cat $output_dir/TFBS.bed | wc -l`
echo "TFBS before filtering: $N"
# format motifs
echo "motif format: $mtform"
case $mtform in
meme) cat $motifs > $output_dir/input.meme;;
beeml) beeml2meme $motifs > $output_dir/input.meme;;
chen) chen2meme $motifs > $output_dir/input.meme;;
jaspar-pfm) jaspar2meme -bundle $motifs > $output_dir/input.meme;;
jaspar-sites) jaspar2meme $output_dir/motifs > $output_dir/input.meme;;
jaspar-cm) jaspar2meme -cm $output_dir/motifs > $output_dir/input.meme;;
transfac) transfac2meme $motifs > $output_dir/input.meme;;
uniprobe) uniprobe2meme $motifs > $output_dir/input.meme;;
esac
# mtFreq
#ls $motif_dir/*.meme | while read fl
ls $output_dir/input.meme | while read fl
do
do_meme2freqBatch.pl $fl
done > $output_dir/mtFreq.txt
# check missed motif
cut -f 1 $output_dir/mtFreq.txt | sort -u > $output_dir/TF_in_motif.txt
cut -f 4 $output_dir/TFBS.bed | sort -u > $output_dir/TF_in_BS.txt
comm -1 -3 $output_dir/TF_in_motif.txt $output_dir/TF_in_BS.txt > $output_dir/TF_missed.txt
if [[ -s $output_dir/TF_missed.txt ]]
then
echo "Missed TF motif information for:"
cat $output_dir/TF_missed.txt
touch $output_dir/TFBS_filtered.bed
exit
fi
do_seq2mtFreq.pl $output_dir/mtFreq.txt $output_dir/TFBS.bed > $output_dir/TFBS.mtFreq
# extend (No need)
# phyloP
# step by step
echo "Step1"
awk -F "\t" '{$4=$4"_"NR; print $0}' OFS="\t" $output_dir/TFBS.bed > $output_dir/TFBS.bed_numed
bedtools makewindows -b $output_dir/TFBS.bed_numed -w 1 -i src | sort -T $output_dir -k 1,1 -k 2,2n > $output_dir/tmp1.bed
echo "Step2"
bedtools map -a $output_dir/tmp1.bed -b $phyloP -c 4 -o collapse > $output_dir/tmp2.bed_pre
cat $output_dir/tmp2.bed_pre | sort -T $output_dir -s -k 4,4 | sed 's/\.$/NA/g' > $output_dir/tmp2.bed
echo "Step3"
bedtools groupby -i $output_dir/tmp2.bed -g 4 -c 5 -o collapse > $output_dir/TFBS.pp_collapsed
sed 's/_\([0-9]\+\)\t/\t\1\t/' $output_dir/TFBS.pp_collapsed | sort -T $output_dir -k 2,2n | cut -f 3 > $output_dir/TFBS.pp_sorted
paste <(cut -f 1-7 $output_dir/TFBS.bed) $output_dir/TFBS.pp_sorted > $output_dir/TFBS.pp0
do_revMinus.pl $output_dir/TFBS.pp0 > $output_dir/TFBS.pp # reverse the value in "-" strand
#rm -f $output_dir/TFBS_2bp.pp0 $output_dir/TFBS_2bp.ppe # XXX
# calculate correlation
funTFBS.r $output_dir/TFBS.bed $output_dir/TFBS.mtFreq $output_dir/TFBS.pp $output_dir
n=`cat $output_dir/TFBS_filtered.bed | wc -l`
echo "TFBS after filtering: $n"
# clean tmp files
function do_clean {
rm -rf $output_dir/motifs
rm -f $output_dir/input.meme $output_dir/TFBS.fa $output_dir/mtFreq.txt $output_dir/TF_missed.txt $output_dir/TF_in_motif.txt $output_dir/TF_in_BS.txt
rm -f $output_dir/TFBS.bed_numed $output_dir/tmp[12].bed* $output_dir/TFBS.pp_* $output_dir/TFBS.pp0
rm -f $output_dir/TFBS.bed $output_dir/TFBS.mtFreq $output_dir/TFBS.pp
#rm -f $output_dir/TFBS_unfiltered.bed
}
do_clean
echo "$output_dir/TFBS_filtered.bed"
#echo "OK"