Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Update the pca process with the functionality from the deprecated pca-beta process #1391

Merged
merged 1 commit into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ Changed
- Optimize resource usage in processes ``bbduk-single``, ``bbduk-paired``,
``upload-fastq-single``, ``upload-fastq-paired``,
``files-to-fastq-single`` and ``files-to-fastq-paired``
- Update the ``pca`` process with the functionality from the
deprecated ``pca-beta`` process

Fixed
-----
Expand Down
6 changes: 3 additions & 3 deletions resolwe_bio/processes/clustering/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,8 @@ def run_pca(
class PrinicipalComponentAnalysis(Process):
"""Principal component analysis process (beta)."""

slug = "pca-beta"
name = "Principal component analysis (beta)"
slug = "pca"
name = "Principal component analysis (PCA)"
requirements = {
"expression-engine": "jinja",
"executor": {
Expand All @@ -197,7 +197,7 @@ class PrinicipalComponentAnalysis(Process):
},
}
data_name = "PCA"
version = "0.1.1"
version = "3.0.0"
process_type = "data:pca"
category = "Enrichment and Clustering"
scheduling_class = SchedulingClass.INTERACTIVE
Expand Down
109 changes: 0 additions & 109 deletions resolwe_bio/processes/clustering/pca.yml

This file was deleted.

Binary file removed resolwe_bio/tests/files/clustering_NCBI.tab.gz
Binary file not shown.
Binary file removed resolwe_bio/tests/files/clustering_NCBI_1.tab.gz
Binary file not shown.
Binary file removed resolwe_bio/tests/files/clustering_NCBI_2.tab.gz
Binary file not shown.
Binary file removed resolwe_bio/tests/files/pca_exp_noname1.tab.gz
Binary file not shown.
Binary file removed resolwe_bio/tests/files/pca_exp_noname2.tab.gz
Binary file not shown.
Binary file modified resolwe_bio/tests/files/pca_plot.json.gz
Binary file not shown.
Binary file removed resolwe_bio/tests/files/pca_plot2.json.gz
Binary file not shown.
Binary file removed resolwe_bio/tests/files/pca_plot_beta.json.gz
Binary file not shown.
Binary file removed resolwe_bio/tests/files/pca_plot_ncbi.json.gz
Binary file not shown.
Binary file removed resolwe_bio/tests/files/pca_plot_single_sample.json.gz
Binary file not shown.
179 changes: 10 additions & 169 deletions resolwe_bio/tests/processes/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def round_elements(element, places=2):

class PcaProcessorTestCase(KBBioProcessTestCase):
@with_resolwe_host
@tag_process("pca-beta")
def test_beta_pca(self):
@tag_process("pca")
def test_pca(self):
with self.preparation_stage():
expression_1 = self.prepare_expression(
f_rc="exp_1_rc.tab.gz",
Expand Down Expand Up @@ -59,10 +59,8 @@ def test_beta_pca(self):
"source": "DICTYBASE",
"species": "Dictyostelium discoideum",
}
pca = self.run_process("pca-beta", inputs)
saved_json, test_json = self.get_json(
"pca_plot_beta.json.gz", pca.output["pca"]
)
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
Expand All @@ -83,10 +81,8 @@ def test_beta_pca(self):
"DPU_G0067102",
"DPU_G0067112",
] # all non-zero
pca = self.run_process("pca-beta", inputs)
saved_json, test_json = self.get_json(
"pca_plot_beta_2.json.gz", pca.output["pca"]
)
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot_2.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
Expand All @@ -96,10 +92,8 @@ def test_beta_pca(self):

inputs["low_expression_filter"] = False
inputs["standard_scaler"] = False
pca = self.run_process("pca-beta", inputs)
saved_json, test_json = self.get_json(
"pca_plot_beta_3.json.gz", pca.output["pca"]
)
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot_3.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
Expand All @@ -112,164 +106,11 @@ def test_beta_pca(self):
"source": "DICTYBASE",
"species": "Dictyostelium discoideum",
}
pca = self.run_process("pca-beta", inputs)
saved_json, test_json = self.get_json(
"pca_plot_beta_4.json.gz", pca.output["pca"]
)
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot_4.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
round_elements(test_json["flot"]["data"]),
round_elements(saved_json["flot"]["data"]),
)

self.assertEqual(len(pca.process_warning), 1)

@with_resolwe_host
@tag_process("pca")
def test_pca(self):
with self.preparation_stage():
expression_1 = self.prepare_expression(
f_rc="exp_1_rc.tab.gz",
f_exp="exp_1_tpm.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_2 = self.prepare_expression(
f_rc="exp_2_rc.tab.gz",
f_exp="exp_2_tpm.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_noname_1 = self.prepare_expression(
f_rc="pca_exp_noname1.tab.gz",
f_exp="pca_exp_noname1.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)
expression_noname_2 = self.prepare_expression(
f_rc="pca_exp_noname2.tab.gz",
f_exp="pca_exp_noname2.tab.gz",
f_type="TPM",
source="DICTYBASE",
species="Dictyostelium discoideum",
)

inputs = {
"exps": [expression_1.pk, expression_2.pk],
"source": "DICTYBASE",
"species": "Dictyostelium discoideum",
}
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
test_json["flot"]["data"], saved_json["flot"]["data"]
)
self.assertAlmostEqualGeneric(
test_json["explained_variance_ratios"],
saved_json["explained_variance_ratios"],
)
self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"])
self.assertEqual(len(pca.process_warning), 0)

inputs = {
"exps": [expression_1.pk, expression_2.pk],
"genes": ["DPU_G0067098", "DPU_G0067100", "DPU_G0067104"], # all zero
"source": "DICTYBASE",
"species": "Dictyostelium discoideum",
}
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json("pca_plot2.json.gz", pca.output["pca"])
self.assertAlmostEqualGeneric(
test_json["flot"]["data"], saved_json["flot"]["data"]
)
self.assertAlmostEqualGeneric(
test_json["explained_variance_ratios"],
saved_json["explained_variance_ratios"],
)
self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"])
self.assertEqual(len(pca.process_warning), 0)

inputs = {
"exps": [expression_1.pk],
"source": "DICTYBASE",
"species": "Dictyostelium discoideum",
}
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json(
"pca_plot_single_sample.json.gz", pca.output["pca"]
)
self.assertAlmostEqualGeneric(
test_json["flot"]["data"], saved_json["flot"]["data"]
)
self.assertAlmostEqualGeneric(
test_json["explained_variance_ratios"],
saved_json["explained_variance_ratios"],
)
self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"])
self.assertEqual(len(pca.process_warning), 0)

inputs = {
"exps": [
expression_noname_1.pk,
expression_noname_2.pk,
],
"source": "DICTYBASE",
"species": "Dictyostelium discoideum",
}
pca = self.run_process("pca", inputs)

@with_resolwe_host
@tag_process("pca")
def test_pca_ncbi(self):
with self.preparation_stage():
expression_1 = self.prepare_expression(
f_exp="clustering_NCBI.tab.gz",
f_type="rc",
name="Expression",
source="NCBI",
species="Homo sapiens",
)
expression_2 = self.prepare_expression(
f_exp="clustering_NCBI_1.tab.gz",
f_type="rc",
name="Expression",
source="NCBI",
species="Homo sapiens",
)
expression_3 = self.prepare_expression(
f_exp="clustering_NCBI_2.tab.gz",
f_type="rc",
name="Expression",
source="NCBI",
species="Homo sapiens",
)

inputs = {
"exps": [
expression_1.pk,
expression_2.pk,
expression_3.pk,
],
"genes": [
"1",
"503538",
"56934",
"29974",
"2",
"144571",
"3",
"abc",
"lll",
],
"source": "NCBI",
"species": "Homo sapiens",
}
pca = self.run_process("pca", inputs)
saved_json, test_json = self.get_json(
"pca_plot_ncbi.json.gz", pca.output["pca"]
)
self.assertCountEqual(
test_json["zero_gene_symbols"], saved_json["zero_gene_symbols"]
)
Loading
Loading