Update the pca process with the functionality from the deprecated pca-beta process
genialis · Oct 6, 2024 · 4f57d8d · 4f57d8d
1 parent d1097d8
commit 4f57d8d
Show file tree

Hide file tree

Showing 18 changed files with 15 additions and 426 deletions.
diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst
@@ -28,6 +28,8 @@ Changed
 - Optimize resource usage in processes ``bbduk-single``, ``bbduk-paired``,
   ``upload-fastq-single``, ``upload-fastq-paired``, 
   ``files-to-fastq-single`` and ``files-to-fastq-paired``
+- Update the ``pca`` process with the functionality from the 
+  deprecated ``pca-beta`` process
 
 Fixed
 -----

diff --git a/resolwe_bio/processes/clustering/pca.py b/resolwe_bio/processes/clustering/pca.py
@@ -181,8 +181,8 @@ def run_pca(
 class PrinicipalComponentAnalysis(Process):
     """Principal component analysis process (beta)."""
 
-    slug = "pca-beta"
-    name = "Principal component analysis (beta)"
+    slug = "pca"
+    name = "Principal component analysis (PCA)"
     requirements = {
         "expression-engine": "jinja",
         "executor": {
@@ -197,7 +197,7 @@ class PrinicipalComponentAnalysis(Process):
         },
     }
     data_name = "PCA"
-    version = "0.1.1"
+    version = "3.0.0"
     process_type = "data:pca"
     category = "Enrichment and Clustering"
     scheduling_class = SchedulingClass.INTERACTIVE

diff --git a/resolwe_bio/processes/clustering/pca.yml b/resolwe_bio/processes/clustering/pca.yml
diff --git a/resolwe_bio/tests/files/clustering_NCBI.tab.gz b/resolwe_bio/tests/files/clustering_NCBI.tab.gz
diff --git a/resolwe_bio/tests/files/clustering_NCBI_1.tab.gz b/resolwe_bio/tests/files/clustering_NCBI_1.tab.gz
diff --git a/resolwe_bio/tests/files/clustering_NCBI_2.tab.gz b/resolwe_bio/tests/files/clustering_NCBI_2.tab.gz
diff --git a/resolwe_bio/tests/files/pca_exp_noname1.tab.gz b/resolwe_bio/tests/files/pca_exp_noname1.tab.gz
diff --git a/resolwe_bio/tests/files/pca_exp_noname2.tab.gz b/resolwe_bio/tests/files/pca_exp_noname2.tab.gz
diff --git a/resolwe_bio/tests/files/pca_plot.json.gz b/resolwe_bio/tests/files/pca_plot.json.gz
diff --git a/resolwe_bio/tests/files/pca_plot2.json.gz b/resolwe_bio/tests/files/pca_plot2.json.gz
diff --git a/resolwe_bio/tests/files/pca_plot_beta_2.json.gz b/resolwe_bio/tests/files/pca_plot_2.json.gz
diff --git a/resolwe_bio/tests/files/pca_plot_beta_3.json.gz b/resolwe_bio/tests/files/pca_plot_3.json.gz
diff --git a/resolwe_bio/tests/files/pca_plot_beta_4.json.gz b/resolwe_bio/tests/files/pca_plot_4.json.gz
diff --git a/resolwe_bio/tests/files/pca_plot_beta.json.gz b/resolwe_bio/tests/files/pca_plot_beta.json.gz
diff --git a/resolwe_bio/tests/files/pca_plot_ncbi.json.gz b/resolwe_bio/tests/files/pca_plot_ncbi.json.gz
diff --git a/resolwe_bio/tests/files/pca_plot_single_sample.json.gz b/resolwe_bio/tests/files/pca_plot_single_sample.json.gz
diff --git a/resolwe_bio/tests/processes/test_pca.py b/resolwe_bio/tests/processes/test_pca.py
@@ -22,8 +22,8 @@ def round_elements(element, places=2):
 
 class PcaProcessorTestCase(KBBioProcessTestCase):
     @with_resolwe_host
-    @tag_process("pca-beta")
-    def test_beta_pca(self):
+    @tag_process("pca")
+    def test_pca(self):
         with self.preparation_stage():
             expression_1 = self.prepare_expression(
                 f_rc="exp_1_rc.tab.gz",
@@ -59,10 +59,8 @@ def test_beta_pca(self):
             "source": "DICTYBASE",
             "species": "Dictyostelium discoideum",
         }
-        pca = self.run_process("pca-beta", inputs)
-        saved_json, test_json = self.get_json(
-            "pca_plot_beta.json.gz", pca.output["pca"]
-        )
+        pca = self.run_process("pca", inputs)
+        saved_json, test_json = self.get_json("pca_plot.json.gz", pca.output["pca"])
         self.assertAlmostEqualGeneric(
             round_elements(test_json["flot"]["data"]),
             round_elements(saved_json["flot"]["data"]),
@@ -83,10 +81,8 @@ def test_beta_pca(self):
             "DPU_G0067102",
             "DPU_G0067112",
         ]  # all non-zero
-        pca = self.run_process("pca-beta", inputs)
-        saved_json, test_json = self.get_json(
-            "pca_plot_beta_2.json.gz", pca.output["pca"]
-        )
+        pca = self.run_process("pca", inputs)
+        saved_json, test_json = self.get_json("pca_plot_2.json.gz", pca.output["pca"])
         self.assertAlmostEqualGeneric(
             round_elements(test_json["flot"]["data"]),
             round_elements(saved_json["flot"]["data"]),
@@ -96,10 +92,8 @@ def test_beta_pca(self):
 
         inputs["low_expression_filter"] = False
         inputs["standard_scaler"] = False
-        pca = self.run_process("pca-beta", inputs)
-        saved_json, test_json = self.get_json(
-            "pca_plot_beta_3.json.gz", pca.output["pca"]
-        )
+        pca = self.run_process("pca", inputs)
+        saved_json, test_json = self.get_json("pca_plot_3.json.gz", pca.output["pca"])
         self.assertAlmostEqualGeneric(
             round_elements(test_json["flot"]["data"]),
             round_elements(saved_json["flot"]["data"]),
@@ -112,164 +106,11 @@ def test_beta_pca(self):
             "source": "DICTYBASE",
             "species": "Dictyostelium discoideum",
         }
-        pca = self.run_process("pca-beta", inputs)
-        saved_json, test_json = self.get_json(
-            "pca_plot_beta_4.json.gz", pca.output["pca"]
-        )
+        pca = self.run_process("pca", inputs)
+        saved_json, test_json = self.get_json("pca_plot_4.json.gz", pca.output["pca"])
         self.assertAlmostEqualGeneric(
             round_elements(test_json["flot"]["data"]),
             round_elements(saved_json["flot"]["data"]),
         )
 
         self.assertEqual(len(pca.process_warning), 1)
-
-    @with_resolwe_host
-    @tag_process("pca")
-    def test_pca(self):
-        with self.preparation_stage():
-            expression_1 = self.prepare_expression(
-                f_rc="exp_1_rc.tab.gz",
-                f_exp="exp_1_tpm.tab.gz",
-                f_type="TPM",
-                source="DICTYBASE",
-                species="Dictyostelium discoideum",
-            )
-            expression_2 = self.prepare_expression(
-                f_rc="exp_2_rc.tab.gz",
-                f_exp="exp_2_tpm.tab.gz",
-                f_type="TPM",
-                source="DICTYBASE",
-                species="Dictyostelium discoideum",
-            )
-            expression_noname_1 = self.prepare_expression(
-                f_rc="pca_exp_noname1.tab.gz",
-                f_exp="pca_exp_noname1.tab.gz",
-                f_type="TPM",
-                source="DICTYBASE",
-                species="Dictyostelium discoideum",
-            )
-            expression_noname_2 = self.prepare_expression(
-                f_rc="pca_exp_noname2.tab.gz",
-                f_exp="pca_exp_noname2.tab.gz",
-                f_type="TPM",
-                source="DICTYBASE",
-                species="Dictyostelium discoideum",
-            )
-
-        inputs = {
-            "exps": [expression_1.pk, expression_2.pk],
-            "source": "DICTYBASE",
-            "species": "Dictyostelium discoideum",
-        }
-        pca = self.run_process("pca", inputs)
-        saved_json, test_json = self.get_json("pca_plot.json.gz", pca.output["pca"])
-        self.assertAlmostEqualGeneric(
-            test_json["flot"]["data"], saved_json["flot"]["data"]
-        )
-        self.assertAlmostEqualGeneric(
-            test_json["explained_variance_ratios"],
-            saved_json["explained_variance_ratios"],
-        )
-        self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"])
-        self.assertEqual(len(pca.process_warning), 0)
-
-        inputs = {
-            "exps": [expression_1.pk, expression_2.pk],
-            "genes": ["DPU_G0067098", "DPU_G0067100", "DPU_G0067104"],  # all zero
-            "source": "DICTYBASE",
-            "species": "Dictyostelium discoideum",
-        }
-        pca = self.run_process("pca", inputs)
-        saved_json, test_json = self.get_json("pca_plot2.json.gz", pca.output["pca"])
-        self.assertAlmostEqualGeneric(
-            test_json["flot"]["data"], saved_json["flot"]["data"]
-        )
-        self.assertAlmostEqualGeneric(
-            test_json["explained_variance_ratios"],
-            saved_json["explained_variance_ratios"],
-        )
-        self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"])
-        self.assertEqual(len(pca.process_warning), 0)
-
-        inputs = {
-            "exps": [expression_1.pk],
-            "source": "DICTYBASE",
-            "species": "Dictyostelium discoideum",
-        }
-        pca = self.run_process("pca", inputs)
-        saved_json, test_json = self.get_json(
-            "pca_plot_single_sample.json.gz", pca.output["pca"]
-        )
-        self.assertAlmostEqualGeneric(
-            test_json["flot"]["data"], saved_json["flot"]["data"]
-        )
-        self.assertAlmostEqualGeneric(
-            test_json["explained_variance_ratios"],
-            saved_json["explained_variance_ratios"],
-        )
-        self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"])
-        self.assertEqual(len(pca.process_warning), 0)
-
-        inputs = {
-            "exps": [
-                expression_noname_1.pk,
-                expression_noname_2.pk,
-            ],
-            "source": "DICTYBASE",
-            "species": "Dictyostelium discoideum",
-        }
-        pca = self.run_process("pca", inputs)
-
-    @with_resolwe_host
-    @tag_process("pca")
-    def test_pca_ncbi(self):
-        with self.preparation_stage():
-            expression_1 = self.prepare_expression(
-                f_exp="clustering_NCBI.tab.gz",
-                f_type="rc",
-                name="Expression",
-                source="NCBI",
-                species="Homo sapiens",
-            )
-            expression_2 = self.prepare_expression(
-                f_exp="clustering_NCBI_1.tab.gz",
-                f_type="rc",
-                name="Expression",
-                source="NCBI",
-                species="Homo sapiens",
-            )
-            expression_3 = self.prepare_expression(
-                f_exp="clustering_NCBI_2.tab.gz",
-                f_type="rc",
-                name="Expression",
-                source="NCBI",
-                species="Homo sapiens",
-            )
-
-        inputs = {
-            "exps": [
-                expression_1.pk,
-                expression_2.pk,
-                expression_3.pk,
-            ],
-            "genes": [
-                "1",
-                "503538",
-                "56934",
-                "29974",
-                "2",
-                "144571",
-                "3",
-                "abc",
-                "lll",
-            ],
-            "source": "NCBI",
-            "species": "Homo sapiens",
-        }
-        pca = self.run_process("pca", inputs)
-        saved_json, test_json = self.get_json(
-            "pca_plot_ncbi.json.gz", pca.output["pca"]
-        )
-        self.assertCountEqual(
-            test_json["zero_gene_symbols"], saved_json["zero_gene_symbols"]
-        )