fixup! Add pca-beta process

genialis · Jun 27, 2024 · 95d4750 · 95d4750
1 parent 3a7890b
commit 95d4750
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 17 deletions.
diff --git a/resolwe_bio/processes/clustering/pca.py b/resolwe_bio/processes/clustering/pca.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 import pandas as pd
-from sklearn.decomposition import PCA as plotPCA
+from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 
 from resolwe.process import (
@@ -41,13 +41,12 @@ def get_pca(expressions=pd.DataFrame(), gene_labels=[], error=None):
         all_components = [[], []]
         all_explained_variance_ratios = [0.0, 0.0]
     else:
-        pca = plotPCA(n_components=2, whiten=True)
+        pca = PCA(n_components=2, whiten=True)
         pca_expressions = pca.fit_transform(expressions.transpose())
 
         coordinates = [
             t[:2].tolist() if len(t) > 1 else [t[0], 0.0] for t in pca_expressions
         ]
-        print(coordinates)
         all_components = [
             component_top_factors(component=component, allgenes_array=gene_labels)
             for component in pca.components_
@@ -85,11 +84,8 @@ def save_pca(result={}, sample_ids=[], max_size=10):
         "all_explained_variance_ratios": result["all_explained_variance_ratios"],
     }
 
-    if output_fn:
-        with open(output_fn, "w") as outfile:
-            json.dump(data, outfile, separators=(",", ":"), allow_nan=False)
-    else:
-        print(json.dumps(data, separators=(",", ":"), allow_nan=False))
+    with open(output_fn, "w") as outfile:
+        json.dump(data, outfile, separators=(",", ":"), allow_nan=False)
 
 
 def read_csv(fname):
@@ -182,16 +178,16 @@ def run_pca(
     save_pca(result, sample_ids)
 
 
-class PCA(Process):
-    """Principal component analysis process."""
+class PrincipalComponentAnalysis(Process):
+    """Principal component analysis process (beta)."""
 
     slug = "pca-beta"
-    name = "PCA"
+    name = "Principal component analysis (beta)"
     requirements = {
         "expression-engine": "jinja",
         "executor": {
             "docker": {
-                "image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0",
+                "image": "public.ecr.aws/genialis/resolwebio/common:4.1.1",
             },
         },
         "resources": {
@@ -201,7 +197,7 @@ class PCA(Process):
         },
     }
     data_name = "{{ alignment|name|default('?') }}"
-    version = "1.0.0"
+    version = "0.1.0"
     process_type = "data:pca"
     category = "Enrichment and Clustering"
     scheduling_class = SchedulingClass.INTERACTIVE
@@ -218,7 +214,8 @@ class Input:
             label="Gene subset",
             required=False,
             placeholder="new gene id, e.g. ENSG00000185982 (ENSEMBL database)",
-            description="Specify at least two genes or leave this field empty.",
+            description="Specify at least two genes or leave this field empty. "
+            "To subset your analysis to only a subset of genes, specify them here.",
         )
         source = StringField(
             label="Gene ID database of selected genes",
@@ -243,17 +240,18 @@ class Input:
             default=True,
             description="Filter the input expression matrix to remove genes "
             "with low expression values. Only genes with 80th percentile above "
-            "read count of 100 are retained.",
+            "read count of 100 are retained. "
+            "This option only works if raw counts are available.",
         )
         log2 = BooleanField(
             label="Log-transform expressions",
             default=True,
-            description="Transform expressions with log2(x + 1) before PCA.",
+            description="Transform expressions with log2(x + 1) before performing PCA.",
         )
         standard_scaler = BooleanField(
             label="Transform input data using StandardScaler",
             default=True,
-            description="Apply the StandardScaler transformation before PCA.",
+            description="Apply the StandardScaler transformation before performing PCA.",
         )
 
     class Output:

diff --git a/resolwe_bio/tests/files/pca_plot_beta_4.json.gz b/resolwe_bio/tests/files/pca_plot_beta_4.json.gz
diff --git a/resolwe_bio/tests/processes/test_pca.py b/resolwe_bio/tests/processes/test_pca.py
@@ -78,6 +78,39 @@ def test_beta_pca(self):
         self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"])
         self.assertEqual(len(pca.process_warning), 0)
 
+        with self.preparation_stage():
+            expression_1 = self.prepare_expression(
+                f_exp="exp_1_tpm.tab.gz",
+                f_type="TPM",
+                source="DICTYBASE",
+                species="Dictyostelium discoideum",
+            )
+            expression_2 = self.prepare_expression(
+                f_exp="exp_2_tpm.tab.gz",
+                f_type="TPM",
+                source="DICTYBASE",
+                species="Dictyostelium discoideum",
+            )
+
+        inputs = {
+            "exps": [expression_1.pk, expression_2.pk],
+            "source": "DICTYBASE",
+            "species": "Dictyostelium discoideum",
+        }
+        pca = self.run_process("pca-beta", inputs)
+        saved_json, test_json = self.get_json(
+            "pca_plot_beta_4.json.gz", pca.output["pca"]
+        )
+        self.assertAlmostEqualGeneric(
+            test_json["flot"]["data"], saved_json["flot"]["data"]
+        )
+        self.assertAlmostEqualGeneric(
+            test_json["explained_variance_ratios"],
+            saved_json["explained_variance_ratios"],
+        )
+        self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"])
+        self.assertEqual(len(pca.process_warning), 0)
+
     @with_resolwe_host
     @tag_process("pca")
     def test_pca(self):