-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtelocal_gen_tables.R
92 lines (73 loc) · 2.84 KB
/
telocal_gen_tables.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
## ---------------------------
##
## Script name: telocal_gen_tables.R
##
## Purpose of script: loading gene and TE cntTable files generated by TElocal, splitting gene and te counts, merging into a common count table (separate for genes and tes)
##
## Author: Natalia Savytska
##
## Date Created: 2020-08-20
##
## Copyright (c) Natalia Savytska, 2020
## Email: savytskanat@gmail.com
##
## ---------------------------
##
## Notes:
##
##
## ---------------------------
## Obtaining common count table for TElocal output
library(tidyverse)
library(data.table)
library(stringr)
library(purrr)
library(zeallot)
#temp = list.files(pattern="*.csv")
#myfiles = lapply(temp, read.delim)
# strips colnames from "junk" identifiers and adds "TElocal_" prefix
# _v3_MM_100Aligned.out.bam
# genesep extracts gene counts from common table generated by TElocal
genesep<-function(df){
ge_cnt<-df[df$gene.TE %in% grep(":",df$gene.TE,value=TRUE, invert=TRUE),]
cname<-str_remove(colnames(df),"_v[0-9]_MM_go_multiAligned.out.bam")
colnames(ge_cnt)<-cname
print(cname)
return(ge_cnt)
}
# tesep extracts TE counts from common table generated by TElocal
tesep<-function(df){
te_cnt<-df[df$gene.TE %in% grep(":",df$gene.TE,value=TRUE),]
cname<-str_remove(colnames(df),"_v[0-9]_MM_go_multiAligned.out.bam")
colnames(te_cnt)<-cname
print(cname)
return(te_cnt)
}
# telocal_load is function for reading in and creating a common table for all telocal count tables under one, provided as input argument, directory path.
# loads and merges all the tables, created one common table for TE loci counts, one common table for TE subfamily level counts and one common table for gene counts
#list(x, y, z) %>% reduce(left_join, by = "i")
telocal_load<-function(mypath){
temp<-list.files(path=mypath, pattern="*.cntTable")
temp<-paste0(mypath,temp)
myfiles<-lapply(temp,read.delim)
mytes<-lapply(myfiles,tesep)
mygenes<-lapply(myfiles,genesep)
te_tab<-mytes %>% purrr::reduce(full_join, by="gene.TE")
te_tab<-as.data.table(te_tab,key="gene.TE")
gene_tab<-mygenes %>% purrr::reduce(full_join, by="gene.TE")
gene_tab<-as.data.table(gene_tab,key="gene.TE")
colnames(te_tab)[1]<-"ID"
colnames(gene_tab)[1]<-"ID"
te_tab[,ID:=str_split_fixed(ID,":" ,2)[,1]]
tel_tab2<-separate(te_tab, ID, sep="_dup", into=c("subid","ID"))
tel_tab2[,ID:=NULL]
te_sub<-colnames(tel_tab2)[!colnames(tel_tab2) %in% c("ID","subid")]
te_sub2<-tel_tab2[,lapply(.SD,sum), by = .(subid),.SDcols=te_sub]
return(list(te_tab,te_sub2,gene_tab))
}
# Example
c(te_tab,te_sub2, gene_tab) %<-% telocal_load(mypath)
# Writing the generated output tables to the files.
fwrite(te_tab,file="TE_loci_counts.txt", sep="\t",quote="auto", col.names=TRUE)
fwrite(te_sub2,file="TE_subfamily_counts.txt", sep="\t",quote="auto", col.names=TRUE)
fwrite(gene_tab,file="gene_counts.txt", sep="\t",quote="auto", col.names=TRUE)