Merge pull request #18 from Russel88/dev

v.1.0.1 - Addition of relative abundance file and changes to README
Russel88 · Jan 9, 2025 · a71f340 · a71f340
2 parents dd3725e + 984867a
commit a71f340
Show file tree

Hide file tree

Showing 5 changed files with 148 additions and 137 deletions.
diff --git a/README.md b/README.md
@@ -136,8 +136,14 @@ This is what MAGinator does with your input (if you want to see all parameters r
     * This can be changed with the --multi option.
 * Pick non-redundant genes that are only found in one MAG cluster each
 * Fit signature gene model and use the resulting signature genes to get the abundance of each MAG cluster
+    * Use --num_signature_genes to define the number of signature genes used for the detection of a MAG cluster
     * Use --min_mapped_signature_genes to change minimum number of signature genes to be detected in the sample to be included in the analysis
     * Use --min_samples to alter the number of samples with the MAG cluster present in order to perform signature gene refinement
+* Selection of genes used for abundance calculations
+    * Use --abundance_calculation to select the method:
+        * "sum": abundance is the sum of reads per bp across the total number of signature genes (num_signature_genes) used for the abundance calculation
+        * "ot_trun" - one-tailed truncation: abundance is the average of reads per bp across the signature genes, excluding the most abundant signature genes as indicated by --tail_percentage
+        * "tt_trun" - two-tailed truncation: abundance is the average of reads per bp across the signature genes, excluding the most AND LEAST abundant signature genes as indicated by --tail_percentage
 * Prepare for generation of phylogenies for each MAG cluster by finding outgroups and marker genes which will be used for rooting the phylogenies
 * Use the read mappings to collect SNV information for each signature gene and marker gene for each sample
 * Align signature and marker genes, concatenate alignments and infer phylogenetic trees for each MAG cluster
@@ -151,6 +157,7 @@ This is what MAGinator does with your input (if you want to see all parameters r
 
 * abundance/
     * abundance_phyloseq.RData - Phyloseq object for R, with absolute abundance and taxonomic data
+    * relative_abundance_phyloseq.RData - Phyloseq object for R, with relative abundance and taxonomic data
 * clusters/
     * <cluster>/<bin>.fa - Fasta files with nucleotide sequence of bins
 * genes/

diff --git a/maginator/main.py b/maginator/main.py
@@ -66,7 +66,7 @@ def cli():
     # Parameters
     app = ap.add_argument_group('parameters')
     app.add_argument('--binsize', help='Minimum bin size for inclusion [%(default)s].', default=200000, type=int)
-    app.add_argument('--mgs_collections', help='If set, bin clusters will be aggregated to metagenomic species.', action='store_true')
+    app.add_argument('--mgs_collections', help='If set, bin clusters will be aggregated to metagenomic species (MGS).', action='store_true')
     app.add_argument('--annotation_prevalence', help='Minimum prevalence of taxonomic assignment in a cluster of bins to call consensus [%(default)s]', default=0.75, type=float)
     app.add_argument('--clustering_coverage', help='Alignment coverage for clustering of genes with MMseqs2 [%(default)s]', default=0.8, type=float)
     app.add_argument('--clustering_min_seq_id', help='Sequence identity threshold for clustering of genes with MMseqs2 [%(default)s]', default=0.95, type=float)
@@ -75,12 +75,12 @@ def cli():
     app.add_argument('--min_identity',help='Minimum percentage of identity for a read to be included [%(default)s]', default=95, type=int)	
     app.add_argument('--min_map', help='Minimum percentage of mapped bases for a read to be included [%(default)s]', default=80, type=int)
     app.add_argument('--multi',help='Method used by msamtools to treat multihit inserts [%(default)s]',default='proportional',type=str,choices=['proportional','ignore','all','equal'])
-    app.add_argument('--abundance_calculation', help='Method employed to calculate the absolute abundances [%(default)s]', default='ot_trun', type=str, choices=['sum', 'ot_trun','tt_trun'])
-    app.add_argument('--tail_percentage', help='Percentage range for the tail of the truncated mean or low_avg method [%(default)s]', default=10, type=float)
-    app.add_argument('--min_gtdb_markers', help='Minimum GTDBtk marker genes shared between MGS and outgroup for rooting trees [%(default)s]', default=10, type=int)
-    app.add_argument('--marker_gene_cluster_prevalence', help='Minimum prevalence of marker genes to be selected for rooting of MGS trees [%(default)s]', default=0.5, type=float)
+    app.add_argument('--abundance_calculation', help='Method employed to calculate the absolute abundances. sum - uses the summed read counts to all signature genes. ot_trunc - one tailed truncation uses the average of reads per bp across signature genes but excluding the most abundant (defined by --tail-percentage). tt_trunc - two tailed truncation uses the average of reads per bp across signature genes but excluding the most AND LEAST abundant (defined by --tail-percentage)  [%(default)s]', default='tt_trun', type=str, choices=['sum', 'ot_trun','tt_trun'])
+    app.add_argument('--tail_percentage', help='Percentage range for the tail of the one- or two-sided truncated method [%(default)s]', default=10, type=float)
+    app.add_argument('--min_gtdb_markers', help='Minimum GTDBtk marker genes shared between the MAG cluster and outgroup for rooting trees [%(default)s]', default=10, type=int)
+    app.add_argument('--marker_gene_cluster_prevalence', help='Minimum prevalence of marker genes to be selected for rooting of MAG cluster trees [%(default)s]', default=0.5, type=float)
     app.add_argument('--min_mapped_signature_genes', help='Minimum number of signature genes with reads mapped for the sample to be included in the refinement [%(default)s]', default=3, type=int)
-    app.add_argument('--num_signature_genes',help='NUmber of signature genes used for the detection of a MGS [%(default)s]', default=100, type=int)
+    app.add_argument('--num_signature_genes', help='Number of signature genes used for the detection of a MAG cluster [%(default)s]', default=100, type=int)
     app.add_argument('--min_samples', help='Minimum number of samples containing the MAG cluster (more than "min_mapped_signature_genes" present) for the MAG cluster to identidy SG [%(default)s]', default=3, type=int)
     app.add_argument('--min_af', help='Minimim allele frequency for calling a base when creating phylogenies [%(default)s]', default=0.8, type=float)
     app.add_argument('--min_depth', help='Minimim read depth for calling a base when creating phylogenies [%(default)s]', default=2, type=int)

diff --git a/maginator/workflow/scripts/abundance_profiles.R b/maginator/workflow/scripts/abundance_profiles.R
@@ -1,130 +1,133 @@
-# Converting the read count matrix to abundance profiles with GTDB-tk annotation
-
-# Initializing 
-library(phyloseq)
-library(MASS)
-library(stringr)
-
-# Loading relevant input
-GeneLengths <- readRDS(snakemake@input[["R_gene_lengths"]]) # The gene lengths
-sg_files <- snakemake@input[["screened_clusters"]]
-screened_clusters <- do.call("rbind", lapply(sg_files, readRDS))
-#load(snakemake@input[["MGS_object"]]) # contain the SGs of the Clusters
-Clusterlist <- readRDS(snakemake@input[["R_clusters"]]) # read count mat ofclusters
-taxonomy <- read.csv(snakemake@input[["annotation"]], header=FALSE, sep="\t") # the taxonomy 
-stat <- snakemake@params[["stat"]] # the statistic used to calculate the abundance
-pctg <- as.integer(snakemake@params[["percentage"]])/100 # percentage of the SG distribution that will be left out to calculate abundances
-colnames(taxonomy) <- c("Cluster","Taxonomy")
-
-#setting important variables
-gene_index <- seq(1,length(GeneLengths))
-gene_names <- names(GeneLengths)
-n.mapped.minimum <- as.integer(snakemake@params[["min_genes"]]) #The number of reads that needs reads that map to count the cluster as present
-n.genes <- as.integer(snakemake@params[["n_genes"]]) # number of signature genes
-
-# inserting NA for the Clusters that do not have a annotation
-taxmat <- matrix("NA", nrow = length(names(Clusterlist)), ncol = 7)
-colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
-rownames(taxmat) <- names(Clusterlist)
-MGSids <- unlist(lapply(str_split(taxonomy$Cluster,","), '[[',1))
-
-for (cluster in names(Clusterlist)){
-  cluster_no <- str_split(cluster, "Cluster")[[1]][2]
-
-  if (cluster_no %in% MGSids){ # if the cluster is annotated
-    tax <- taxonomy$Taxonomy[lapply(str_split(as.character(taxonomy$Cluster),","), '[[',1)==cluster_no]
-    tax <- substr(strsplit(tax, ";")[[1]], 4, nchar(strsplit(tax, ";")[[1]]))
-    row <- rep("NA", 7)
-    row[1:length(tax)] <- as.character(tax)
-    taxmat[cluster,] <- row
-  } 
-}
-
-write.table(taxmat, file=snakemake@output[["tax_matrix"]], row.names=TRUE, col.names=FALSE, sep="\t",quote = FALSE)
-
-# Identifying the "Maintax"
-taxonomy$MainTax <- rep("NA", length(taxonomy[,1]))
-count <- 0 
-for (MGS in taxonomy$Cluster){
-  if (length(taxonomy$Taxonomy[taxonomy$Cluster==MGS])==0){
-    na_df <- data.frame(MGS, "NA;NA;NA", "NA")
-    colnames(na_df) <- c("Cluster", "Taxonomy", "MainTax")
-    taxonomy <- rbind(taxonomy, na_df)
-    count <- count + 1
-  } else {
-    main_tax <- paste(tail(strsplit(as.character(taxonomy$Taxonomy[taxonomy$Cluster==MGS]),";")[[1]], n=2)[1], tail(strsplit(as.character(taxonomy$Taxonomy[taxonomy$Cluster==MGS]),";")[[1]], n=2)[2])
-    taxonomy$MainTax[taxonomy$Cluster == MGS] <- main_tax
-  }
-}
-rownames(taxonomy) <- taxonomy$Cluster # set the clusterID as rownames (for the phyloseq)
-taxonomy$Cluster <- NULL #removing the column witht the clusterID
-
-
-# Prepare a table containing the signature geneID's in column1 and the corresponding cluster in column2
-sg_cluster <- matrix("NA", nrow = length(names(Clusterlist))*100, ncol = 2)
-sg_cluster[,2]<- sort(rep(names(Clusterlist), 100))
-
-# The read counts are normalized according to gene lengths
-final.read.matrix <- matrix(NA, nrow=dim(Clusterlist[[1]])[2], ncol=length(Clusterlist))
-sample.names <- colnames(Clusterlist[[1]]) 
-rownames(final.read.matrix) <- sample.names
-colnames(final.read.matrix) <- names(Clusterlist) 
-
-final.Clusterlist <- Clusterlist
-sg_reads <- list()
-for (id in names(Clusterlist)){  
-  # Repeat for the final SG
-  final.gene.names <- screened_clusters[,3][screened_clusters[,'id']==id][1][[1]]$best
-  final.colsum <- colSums(Clusterlist[[id]][final.gene.names, ])
-  final.mapped <- names(final.colsum[final.colsum >= n.mapped.minimum])
-  final.not_mapped <- sample.names[!sample.names %in% final.mapped]
-  final.Clusterlist[[id]][,final.not_mapped] <- 0 # setting the counts to 0 if less than n.mapped.minimum genes have reads that map
-
-  # The readcounts are divided by the gene length
-  final.reads <- final.Clusterlist[[id]][final.gene.names, ] / GeneLengths[final.gene.names]
-  sg_reads[[id]] <- final.reads
-  # summing the read counts for the id/cluster/MGS
-  if (stat == "sum"){
-    abundance <- colSums(final.reads)
-  } else if (stat == "tt_trun"){ # Obtain the truncated average of the read counts
-    # Calculate truncated mean for each column
-    abundance <- apply(final.reads, 2, function(x) {
-      if (all(x == 0)) { # If all values are 0, the truncated mean would return NaN
-        return(0)
-      } else {
-       quantiles <- quantile(x, probs = c(pctg, 1-pctg))
-        return(mean(x[x >= quantiles[1] & x <= quantiles[2]])) # Should it be also equal??
-      }
-    })
-  } else if (stat == "ot_trun"){ #One tailed truncated mean (only truncated in the top X genes)
-    # Calculate truncated mean for each column
-    abundance <- apply(final.reads, 2, function(x) {
-      if (all(x == 0)) { # If all values are 0, the truncated mean would return NaN
-        return(0)
-      } else {
-        quantiles <- quantile(x, probs = c(1-pctg))  # Should it be also equal??
-        return(mean(x[x <= quantiles]))
-      }
-    })
-
-  }
-  final.read.matrix[, id] <- abundance
-  if (length(final.gene.names)>0){
-  if (length(final.gene.names)!=n.genes){
-  final.gene.names<-c(final.gene.names, rep("NA", (100-length(final.gene.names))))}
-  sg_cluster[sg_cluster[,2]==id] <- matrix(c(final.gene.names, rep(id, length(final.gene.names))), ncol=2)
-}}
-
-write.csv(final.read.matrix,"Absolute_counts.tsv",sep="\t")
-
-final.abundance <- final.read.matrix
-#/rowSums(final.read.matrix)
-
-final.otu.table <- otu_table(final.abundance, taxa_are_rows = FALSE)
-tax.table <- tax_table(taxmat)
-
-final.physeq <-  phyloseq(final.otu.table, tax.table)
-
-save(final.physeq, file = snakemake@output[["physeq_abundance"]])
-write.table(sg_cluster, file = snakemake@output[["sg_cluster"]], row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE)
-saveRDS(sg_reads, snakemake@output[["sg_reads"]])
+# Converting the read count matrix to abundance profiles with GTDB-tk annotation
+
+# Initializing 
+library(phyloseq)
+library(MASS)
+library(stringr)
+
+# Loading relevant input
+# All paths and parameters are handed over by Snakemake through the `snakemake` object.
+GeneLengths <- readRDS(snakemake@input[["R_gene_lengths"]]) # The gene lengths (looked up by gene name further down)
+sg_files <- snakemake@input[["screened_clusters"]]
+# Every screened-cluster file is read and the results are stacked row-wise into one table
+screened_clusters <- do.call("rbind", lapply(sg_files, readRDS))
+#load(snakemake@input[["MGS_object"]]) # contain the SGs of the Clusters
+Clusterlist <- readRDS(snakemake@input[["R_clusters"]]) # per-cluster read count matrices (rows are indexed by gene name, columns are the samples)
+taxonomy <- read.csv(snakemake@input[["annotation"]], header=FALSE, sep="\t") # the taxonomy 
+stat <- snakemake@params[["stat"]] # the statistic used to calculate the abundance
+# Fraction of the SG distribution that will be left out to calculate abundances.
+# as.numeric() instead of as.integer(): the percentage may be fractional (e.g. 2.5),
+# and as.integer() would silently truncate it to 2 before the division.
+pctg <- as.numeric(snakemake@params[["percentage"]])/100
+colnames(taxonomy) <- c("Cluster","Taxonomy")
+
+#setting important variables
+gene_index <- seq(1,length(GeneLengths))
+gene_names <- names(GeneLengths)
+n.mapped.minimum <- as.integer(snakemake@params[["min_genes"]]) # Threshold a sample has to reach for the cluster to be counted as present
+n.genes <- as.integer(snakemake@params[["n_genes"]]) # number of signature genes
+
+# Taxonomy matrix: one row per cluster and one column per rank.
+# Every cell starts as "NA" so clusters without an annotation keep the placeholder.
+taxmat <- matrix("NA", nrow = length(names(Clusterlist)), ncol = 7)
+rownames(taxmat) <- names(Clusterlist)
+colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
+MGSids <- unlist(lapply(str_split(taxonomy$Cluster,","), '[[',1))
+
+# First comma-separated ID of every annotation row; does not change inside the loop
+annotation_ids <- lapply(str_split(as.character(taxonomy$Cluster),","), '[[',1)
+
+for (cluster in names(Clusterlist)){
+  # The part of the name after "Cluster" is the ID used in the annotation file
+  cluster_no <- str_split(cluster, "Cluster")[[1]][2]
+
+  # Unannotated clusters keep their "NA" row
+  if (!(cluster_no %in% MGSids)) next
+
+  tax <- taxonomy$Taxonomy[annotation_ids==cluster_no]
+  # Split the lineage on ";" and cut the first three characters off every rank (substr starts at position 4)
+  ranks <- strsplit(tax, ";")[[1]]
+  tax <- substr(ranks, 4, nchar(ranks))
+  # Ranks missing at the end of the lineage stay "NA"
+  row <- rep("NA", 7)
+  row[1:length(tax)] <- as.character(tax)
+  taxmat[cluster,] <- row
+}
+
+write.table(taxmat, file = snakemake@output[["tax_matrix"]], sep = "\t",
+            row.names = TRUE, col.names = FALSE, quote = FALSE)
+
+# Identifying the "Maintax": the last two ranks of a cluster's lineage, joined by a space
+taxonomy$MainTax <- rep("NA", nrow(taxonomy))
+count <- 0 
+for (MGS in taxonomy$Cluster){
+  cluster_tax <- taxonomy$Taxonomy[taxonomy$Cluster==MGS]
+  if (length(cluster_tax)==0){
+    # No lineage found for this cluster: append a placeholder row and tally it
+    na_df <- data.frame(MGS, "NA;NA;NA", "NA")
+    colnames(na_df) <- c("Cluster", "Taxonomy", "MainTax")
+    taxonomy <- rbind(taxonomy, na_df)
+    count <- count + 1
+  } else {
+    # Split the lineage once and keep its two trailing ranks
+    last_two <- tail(strsplit(as.character(cluster_tax),";")[[1]], n=2)
+    taxonomy$MainTax[taxonomy$Cluster == MGS] <- paste(last_two[1], last_two[2])
+  }
+}
+rownames(taxonomy) <- taxonomy$Cluster # set the clusterID as rownames (for the phyloseq)
+taxonomy$Cluster <- NULL # removing the column with the clusterID
+
+
+# Prepare a table containing the signature geneID's in column1 and the corresponding cluster in column2.
+# Every cluster owns n.genes rows. The block size used to be hard-coded to 100, which
+# misaligned the table whenever n.genes was set to anything else.
+sg_cluster <- matrix("NA", nrow = length(names(Clusterlist))*n.genes, ncol = 2)
+sg_cluster[,2]<- sort(rep(names(Clusterlist), n.genes))
+
+# The read counts are normalized according to gene lengths
+# final.read.matrix holds one abundance value per sample (row) and cluster (column)
+final.read.matrix <- matrix(NA, nrow=dim(Clusterlist[[1]])[2], ncol=length(Clusterlist))
+sample.names <- colnames(Clusterlist[[1]]) 
+rownames(final.read.matrix) <- sample.names
+colnames(final.read.matrix) <- names(Clusterlist) 
+
+final.Clusterlist <- Clusterlist
+sg_reads <- list()
+for (id in names(Clusterlist)){  
+  # Final signature genes of this cluster: the "best" entry of the first screened_clusters row with this id
+  final.gene.names <- screened_clusters[,3][screened_clusters[,'id']==id][1][[1]]$best
+  # drop = FALSE keeps the two-dimensional shape when a cluster has a single signature gene;
+  # without it the subset collapses to a vector and colSums()/apply() below fail
+  final.colsum <- colSums(Clusterlist[[id]][final.gene.names, , drop = FALSE])
+  # NOTE(review): this compares the summed read count over the signature genes with
+  # n.mapped.minimum, not the number of genes that have reads mapped - confirm which is intended
+  final.mapped <- names(final.colsum[final.colsum >= n.mapped.minimum])
+  final.not_mapped <- sample.names[!sample.names %in% final.mapped]
+  final.Clusterlist[[id]][,final.not_mapped] <- 0 # samples below the threshold get all counts of this cluster set to 0
+
+  # The readcounts are divided by the gene length
+  final.reads <- final.Clusterlist[[id]][final.gene.names, , drop = FALSE] / GeneLengths[final.gene.names]
+  sg_reads[[id]] <- final.reads
+  # Collapse the per-gene values into one abundance per sample for the id/cluster/MGS
+  if (stat == "sum"){
+    abundance <- colSums(final.reads)
+  } else if (stat == "tt_trun"){ # Two tailed truncated mean
+    # Calculate truncated mean for each column
+    abundance <- apply(final.reads, 2, function(x) {
+      if (all(x == 0)) { # If all values are 0, the truncated mean would return NaN
+        return(0)
+      } else {
+        # Keep the values between the pctg and 1-pctg quantiles (both bounds inclusive)
+        quantiles <- quantile(x, probs = c(pctg, 1-pctg))
+        return(mean(x[x >= quantiles[1] & x <= quantiles[2]]))
+      }
+    })
+  } else if (stat == "ot_trun"){ #One tailed truncated mean (only truncated in the top X genes)
+    # Calculate truncated mean for each column
+    abundance <- apply(final.reads, 2, function(x) {
+      if (all(x == 0)) { # If all values are 0, the truncated mean would return NaN
+        return(0)
+      } else {
+        # Keep the values up to and including the 1-pctg quantile
+        quantiles <- quantile(x, probs = c(1-pctg))
+        return(mean(x[x <= quantiles]))
+      }
+    })
+  } else {
+    # Fail loudly instead of re-using the abundance of the previous cluster
+    stop(paste0("Unsupported abundance statistic: ", stat))
+  }
+  final.read.matrix[, id] <- abundance
+  if (length(final.gene.names)>0){
+    if (length(final.gene.names)!=n.genes){
+      # Pad with "NA" so the gene list fills the n.genes rows reserved for this cluster
+      final.gene.names<-c(final.gene.names, rep("NA", (n.genes-length(final.gene.names))))
+    }
+    # Column 2 already holds the cluster id for these rows, so only the gene column is filled in
+    sg_cluster[sg_cluster[,2]==id, 1] <- final.gene.names
+  }
+}
+
+# Tab-separated table of the absolute abundances (samples in rows, clusters in columns).
+# write.csv() ignores `sep` (with a warning) and always writes commas, so write.table() is
+# used instead; col.names=NA keeps the empty header cell above the row names, as write.csv() did.
+write.table(final.read.matrix, "Absolute_counts.tsv", sep="\t", col.names=NA)
+
+final.abundance <- final.read.matrix
+# Relative abundance: every sample (row) is scaled by its own total.
+# A sample in which no cluster was detected has a total of 0; dividing by 1 instead
+# leaves its row at 0 rather than turning it into NaN (0/0).
+sample.totals <- rowSums(final.read.matrix)
+relative.abundance <- final.read.matrix / ifelse(sample.totals > 0, sample.totals, 1)
+
+# Samples are rows in both tables, hence taxa_are_rows = FALSE
+final.otu.table <- otu_table(final.abundance, taxa_are_rows = FALSE)
+relative.otu.table <- otu_table(relative.abundance, taxa_are_rows = FALSE)
+tax.table <- tax_table(taxmat)
+
+# One phyloseq object per abundance type, both with the same taxonomy table
+final.physeq <-  phyloseq(final.otu.table, tax.table)
+relative.physeq <- phyloseq(relative.otu.table, tax.table)
+
+# save() stores the objects under their variable names (final.physeq / relative.physeq)
+save(final.physeq, file = snakemake@output[["physeq_abundance"]])
+save(relative.physeq, file = snakemake@output[["physeq_rel_abundance"]])
+write.table(sg_cluster, file = snakemake@output[["sg_cluster"]], row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE)
+saveRDS(sg_reads, snakemake@output[["sg_reads"]])
diff --git a/maginator/workflow/signature_genes.Snakefile b/maginator/workflow/signature_genes.Snakefile
@@ -72,6 +72,7 @@ rule abundance_profile:
         annotation = os.path.join(WD, 'tabs', 'metagenomicspecies.tab')
     output:
         physeq_abundance = os.path.join(WD, 'abundance', 'abundance_phyloseq.RData'),
+        physeq_rel_abundance = os.path.join(WD, 'abundance', 'relative_abundance_phyloseq.RData'),
         tax_matrix = os.path.join(WD, 'tabs', 'tax_matrix.tsv'),
         sg_cluster = os.path.join(WD, 'tabs', 'signature_genes_cluster.tsv'),
         sg_reads = os.path.join(WD, 'tabs', 'signature_genes_counts.rds')

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="maginator", 
-    version="1.0.0",
+    version="1.0.1",
     author="Jakob Russel & Trine Zachariasen",
     author_email="russel2620@gmail.com,trine_zachariasen@hotmail.com",
     description="MAGinator: Abundance, strain, and functional profiling of MAGs",