transcript_counts_Salmon

Create PCA of read counts using Salmon and DESeq2

1. Index the transcriptomes with Salmon

sbatch -n 14 --mem 96G -t 4:00:00 -p Lewis,BioCompute --wrap="salmon index -t pan_trans_cds.fa -i pan_trans_cds_index -k 31 --keepDuplicates -p 14"

For B. rapa:
sbatch -n 14 --mem 96G -t 4:00:00 -p Lewis,BioCompute --wrap="salmon index -t pan_A_cds.fa -i pan_A_cds_index -k 31 --keepDuplicates -p 14"

2. Run salmon quant to get read counts

run salmon_count.sh as:

sh salmon_count.sh sample.txt

if the PE fq file named as 2109.pair_1.fq.gz, 2109.pair_2.fq.gz; 2110.pair_1.fq.gz, 2110.pair_2.fq.gz..., the sample.txt will be:

2109
2110

and the salmon_count.sh is:

while read line
do
	sbatch -n 14 --mem 96G -t 2:00:00 -p Lewis,BioCompute --wrap="salmon quant -l A -i ../pan_trans_cds_index -1 '$line'.pair_1.fq.gz -2 '$line'.pair_2.fq.gz -o quants/'$line'_quant -p 14 --seqBias --validateMappings"
	###for B. rapa change "pan_trans_cds_index" to "pan_A_cds_index"
done <$1

split B. napus quant.sf into A- and C- subgenomes, run split_AC_quant.sh as:

!!only B. napus need this step

sh split_AC_quant.sh sample.txt

and the split_AC_quant.sh is:

while read line
do
	cd "$line"_quant
	head -n1 quant.sf > head.txt
	awk '{if($1 ~ /^Cab/ || $1 ~ /^BnaA/)print}' quant.sf | cat head.txt - > ../chrA_"$line"_quant.sf
	awk '{if($1 ~ /^Bo/ || $1 ~ /^BnaC/)print}' quant.sf | cat head.txt - > ../chrC_"$line"_quant.sf
	cd ..
done<$1

3. Use tximport and DESeq2 to get PCA

Prepare packages for tximport and DESeq2

module load r/r-3.6.0-python-2.7.14-tk
export R_LIBS=~/R/:${R_LIBS} #~/R/: your own R library path
srun --pty -p Interactive -c8 --mem 64G R

###Install R packages only at the first time, and then load them for R (in R cmd model)###
install.packages("BiocManager")
install.packages("jsonlite")
install.packages("plyr", lib="~/R/")
install.packages("ggplot2")
install.packages("Hmisc")
BiocManager::install("DESeq2")
BiocManager::install("tximport")
install.packages("RColorBrewer")
BiocManager::install("apeglm")
##########################################################################################

#Load library
library(ggplot2)
library(DESeq2)
library(tximport)
library(RColorBrewer)
library(apeglm)

B. napus only R script

#Calculate
samples <- read.table("phenotype_Bn_noAdmix.txt", header = TRUE)
files <- file.path("/storage/htc/pireslab/salmon/b_napus/quants", samples$sample, "quant.sf")
trans2gene <- read.csv("trans2gene_all")
txi <- tximport(files, type = "salmon", tx2gene = trans2gene)
write.csv(txi$counts, "txi.csv") #use this if you just want a table of counts per sample
dds <- DESeqDataSetFromTximport(txi, colData = samples, design = ~1)
keep <- rowSums(counts(dds)) >= 2
dds <- dds[keep,]
colSums(counts(dds)) #this just checks how many reads per sample
dds <- estimateSizeFactors(dds)  #I think this is correcting for library size, which is important for all Brassica dataset
dds_vst <- vst(dds) #normalized with respect to library size or other normalization factors.
pcaData <- plotPCA(dds_vst, intgroup=c("group"), ntop = 500, returnData=TRUE)  #PCA plot; MMbary's cmd: pcaData <- plotPCA(dds_vst, intgroup=c("phenotype", "group"), ntop = 500, returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

#Plot
cols <-c("WEAm"='yellow', "WEsA"='orange', "S"='red', "R"='blue', "SK"='purple', "WeA"='green')
PCA_Expression <- ggplot(pcaData, aes(PC1, PC2, color=group)) +
  						geom_point(size=3) +
 						xlab(paste0("PC1: ",percentVar[1],"% variance")) +
 						ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
 						scale_color_manual(values=cols) +
 						ggtitle("Expression Variation between different B. napus") +
 	 					coord_fixed()
ggsave("PCA_Expression_Bnapus.pdf", PCA_Expression, width = 8.5, height = 9)

A subgenome (B. rapa and B. napus) R script

#Calculate
samples <- read.table("phyenotype_A_noAdmix.txt", header = TRUE)
files <- file.path("/storage/htc/pireslab/salmon/sub_A_quant", samples$sample, "quant.sf")
trans2gene <- read.csv("trans2gene_A")
txi <- tximport(files, type = "salmon", tx2gene = trans2gene)
write.csv(txi$counts, "txi.csv") #use this if you just want a table of counts per sample
dds <- DESeqDataSetFromTximport(txi, colData = samples, design = ~1)
keep <- rowSums(counts(dds)) >= 2
dds <- dds[keep,]
colSums(counts(dds)) #this just checks how many reads per sample
dds <- estimateSizeFactors(dds)  #I think this is correcting for library size, which is important for all Brassica dataset
dds_vst <- vst(dds) #normalized with respect to library size or other normalization factors.
pcaData <- plotPCA(dds_vst, intgroup=c("phenotype", "group"), ntop = 500, returnData=TRUE)  #PCA plot
percentVar <- round(100 * attr(pcaData, "percentVar"))

#Plot
cols <-c("WEAm"='yellow', "WEsA"='orange', "S"='red', "R"='blue', "SK"='purple', "WeA"='green', "Chinensis"='cyan2', "EU-CA"='bisque3', "Pekinensis"='darkolivegreen3', "Sarson"='brown', "Sylvestris"='deeppink')
PCA_Expression <- ggplot(pcaData, aes(PC1, PC2, color=group.1, shape=phenotype)) +
  						geom_point(size=2) +
 						xlab(paste0("PC1: ",percentVar[1],"% variance")) +
 						ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
 						scale_color_manual(values=cols) +
 						ggtitle("Expression Variation among A subgenome") +
 	 					coord_fixed()
ggsave("PCA_Expression_Asub.pdf", PCA_Expression, width = 8.5, height = 9)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

README.md

README.md

transcript_counts_Salmon

1. Index the transcriptomes with Salmon

2. Run salmon quant to get read counts

3. Use tximport and DESeq2 to get PCA

Files

README.md

Latest commit

History

README.md

File metadata and controls

transcript_counts_Salmon

1. Index the transcriptomes with Salmon

2. Run salmon quant to get read counts

3. Use tximport and DESeq2 to get PCA