-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path2b_check_censor_output.sbatch
62 lines (51 loc) · 2.01 KB
/
2b_check_censor_output.sbatch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/bin/bash
# Want to keep only the nucleotide sequences that CENSOR recognises as your repeat element of interest (e.g. L1s)
# And not some other repeat (e.g. CR1 or hAT)
# Example usage
#
# SPECIES=Yarrowia.lipolytica \
# FILE=Yarrowia.lipolytica_L1_combined.fasta \
# GENOME=test_genome/YarrowiaLipolytica_ASM252v1.fa \
# ELEMENT=L1
# QUERY=test_query/known_L1_elements_from_repbase.txt \
# CENSORDIR=results/censored \
# sbatch 2b_check_censor_output.sbatch
#SBATCH -p short
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --time=12:00:00
#SBATCH --mem=32GB
# Load the necessary modules
module load bedtools
# First, extract map file lines in the right orientation (d, not c)
# Print the header (which contains coords and strand) and column 4 (hit name)
# Put into BED-like format
cat $CENSORDIR/"$FILE".map \
| awk '{if ($7=="d") print $0 }' \
| awk '{print $1 "\t" $4}' \
| sed 's/:/\t/g' \
| sed 's/(-)/.rev/g' \
| sed 's/(+)/.fwd/g' \
| awk '{gsub(/-/,"\t",$2); print}' \
| sed 's/.rev/\t-/g' \
| sed 's/.fwd/\t+/g' \
| awk '{print $1 "\t" $2 "\t" $3 "\t" $5 "\t" "1" "\t" $4}' \
> "$FILE".rearranged.tmp
# Sort, merge and rearrange columns
bedtools sort -i "$FILE".rearranged.tmp \
| bedtools merge -s -i - -c 4,5,6 -o distinct,distinct,distinct \
| awk '{print $4 "\t" $1 "\t" $2 "\t" $3 "\t" $6}' \
> "$FILE".merged.tmp
# Extract sequences which correctly identified as L1s
awk 'NR==FNR { a[$1] = $1; next} { for (k in a) if ($1 ~ a[k]) { print $0; break } }' "$QUERY" "$FILE".merged.tmp \
| awk '{print $2 "\t" $3 "\t" $4 "\t" $1 "\t" "1" "\t" $5}' \
> "$SPECIES"_"$ELEMENT"_censored.bed
# Extract FASTA from L1-only merged BED file
bedtools getfasta -s -fi "$GENOME" -bed "$SPECIES"_"$ELEMENT"_censored.bed -fo "$SPECIES"_"$ELEMENT"_censored.tmp
# Sort sequences by length
usearch -sortbylength "$SPECIES"_"$ELEMENT"_censored.tmp -fastaout "$SPECIES"_"$ELEMENT"_censored.fasta
# Move files to censor dir
mv "$SPECIES"_"$ELEMENT"_censored.bed $CENSORDIR
mv "$SPECIES"_"$ELEMENT"_censored.fasta $CENSORDIR
# Remove unnecessary files
rm *.tmp