From 4f57d8db2dcc42e7529621b1885fa9130651e4ea Mon Sep 17 00:00:00 2001 From: Janez Kokosar Date: Sun, 6 Oct 2024 22:33:41 +0200 Subject: [PATCH] Update the pca process with the functionality from the deprecated pca-beta process --- docs/CHANGELOG.rst | 2 + resolwe_bio/processes/clustering/pca.py | 6 +- resolwe_bio/processes/clustering/pca.yml | 109 ----------- .../tests/files/clustering_NCBI.tab.gz | Bin 154 -> 0 bytes .../tests/files/clustering_NCBI_1.tab.gz | Bin 158 -> 0 bytes .../tests/files/clustering_NCBI_2.tab.gz | Bin 156 -> 0 bytes .../tests/files/pca_exp_noname1.tab.gz | Bin 66 -> 0 bytes .../tests/files/pca_exp_noname2.tab.gz | Bin 66 -> 0 bytes resolwe_bio/tests/files/pca_plot.json.gz | Bin 381 -> 302 bytes resolwe_bio/tests/files/pca_plot2.json.gz | Bin 230 -> 0 bytes ...plot_beta_2.json.gz => pca_plot_2.json.gz} | Bin ...plot_beta_3.json.gz => pca_plot_3.json.gz} | Bin ...plot_beta_4.json.gz => pca_plot_4.json.gz} | Bin resolwe_bio/tests/files/pca_plot_beta.json.gz | Bin 302 -> 0 bytes resolwe_bio/tests/files/pca_plot_ncbi.json.gz | Bin 343 -> 0 bytes .../files/pca_plot_single_sample.json.gz | Bin 194 -> 0 bytes resolwe_bio/tests/processes/test_pca.py | 179 +----------------- resolwe_bio/tools/pca.py | 145 -------------- 18 files changed, 15 insertions(+), 426 deletions(-) delete mode 100644 resolwe_bio/processes/clustering/pca.yml delete mode 100644 resolwe_bio/tests/files/clustering_NCBI.tab.gz delete mode 100644 resolwe_bio/tests/files/clustering_NCBI_1.tab.gz delete mode 100644 resolwe_bio/tests/files/clustering_NCBI_2.tab.gz delete mode 100644 resolwe_bio/tests/files/pca_exp_noname1.tab.gz delete mode 100644 resolwe_bio/tests/files/pca_exp_noname2.tab.gz delete mode 100644 resolwe_bio/tests/files/pca_plot2.json.gz rename resolwe_bio/tests/files/{pca_plot_beta_2.json.gz => pca_plot_2.json.gz} (100%) rename resolwe_bio/tests/files/{pca_plot_beta_3.json.gz => pca_plot_3.json.gz} (100%) rename resolwe_bio/tests/files/{pca_plot_beta_4.json.gz => pca_plot_4.json.gz} (100%) delete mode 100644 resolwe_bio/tests/files/pca_plot_beta.json.gz delete mode 100644 resolwe_bio/tests/files/pca_plot_ncbi.json.gz delete mode 100644 resolwe_bio/tests/files/pca_plot_single_sample.json.gz delete mode 100755 resolwe_bio/tools/pca.py diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index 03fc74975..a1cb834d8 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -28,6 +28,8 @@ Changed - Optimize resource usage in processes ``bbduk-single``, ``bbduk-paired``, ``upload-fastq-single``, ``upload-fastq-paired``, ``files-to-fastq-single`` and ``files-to-fastq-paired`` +- Update the ``pca`` process with the functionality from the + deprecated ``pca-beta`` process Fixed ----- diff --git a/resolwe_bio/processes/clustering/pca.py b/resolwe_bio/processes/clustering/pca.py index 2002bcf54..28b4a4485 100644 --- a/resolwe_bio/processes/clustering/pca.py +++ b/resolwe_bio/processes/clustering/pca.py @@ -181,8 +181,8 @@ def run_pca( class PrinicipalComponentAnalysis(Process): """Principal component analysis process (beta).""" - slug = "pca-beta" - name = "Principal component analysis (beta)" + slug = "pca" + name = "Principal component analysis (PCA)" requirements = { "expression-engine": "jinja", "executor": { @@ -197,7 +197,7 @@ class PrinicipalComponentAnalysis(Process): }, } data_name = "PCA" - version = "0.1.1" + version = "3.0.0" process_type = "data:pca" category = "Enrichment and Clustering" scheduling_class = SchedulingClass.INTERACTIVE diff --git a/resolwe_bio/processes/clustering/pca.yml b/resolwe_bio/processes/clustering/pca.yml deleted file mode 100644 index ef2295f81..000000000 --- a/resolwe_bio/processes/clustering/pca.yml +++ /dev/null @@ -1,109 +0,0 @@ -# === -# PCA -# === ---- - -- slug: pca - name: PCA - requirements: - expression-engine: jinja - executor: - docker: - image: public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0 - resources: - cores: 1 - memory: 4096 - storage: 10 - data_name: 'PCA' - version: 2.4.2 - type: data:pca - category: Enrichment and Clustering - persistence: TEMP - scheduling_class: interactive - description: | - Principal component analysis (PCA) - input: - - name: exps - label: Expressions - type: list:data:expression - - name: genes - label: Gene subset - type: list:basic:string - required: false - - name: source - label: Gene ID database of selected genes - description: This field is required if gene subset is set. - type: basic:string - required: false - - name: species - label: Species - type: basic:string - description: | - Species latin name. This field is required if gene subset is set. - allow_custom_choice: true - required: false - choices: - - label: Homo sapiens - value: Homo sapiens - - label: Mus musculus - value: Mus musculus - - label: Rattus norvegicus - value: Rattus norvegicus - - label: Dictyostelium discoideum - value: Dictyostelium discoideum - - label: Odocoileus virginianus texanus - value: Odocoileus virginianus texanus - - label: Solanum tuberosum - value: Solanum tuberosum - output: - - name: pca - label: PCA - type: basic:json - run: - runtime: polyglot - language: bash - program: | - - {% for e in exps %} - - {% if e.source != (exps|first).source %} - re-warning "All expression data must be annotated by the same genome database." - re-error "Sample {{ exps|first|name }} has {{ (exps|first).source }} gene IDs, while sample {{ e|name }} has {{ e.source }} gene IDs." - {% endif %} - - {% if e.species != (exps|first).species %} - re-warning "All expressions must be of the same Species." - re-error "Sample {{ exps|first|name }} is {{ (exps|first).species }}, while sample {{ e|name }} is {{ e.species }}." - {% endif %} - - {% if e.exp_type != (exps|first).exp_type %} - re-warning "All expressions must be of the same Expression type." - re-error "Expression {{ exps|first|name }} has {{ (exps|first).exp_type }} expression type, while sample {{ e|name }} has {{ e.exp_type }} expression type." - {% endif %} - - {% if e.exp_type != (exps|first).exp_type %} - re-warning "All expressions must be of the same Expression type." - re-error "Expression {{ exps|first|name }} has {{ (exps|first).feature_type }} feature type, while sample {{ e|name }} has {{ e.feature_type }} feature type." - {% endif %} - - {% if genes %} - {%if e.source != source %} - re-warning "Selected genes must be annotated by the same genome database as all expression files." - re-error "Gene IDs are from {{ source }} database while sample {{ e|name }} has gene IDs from {{ e.source }} database." - {% endif %} - {%if e.species != species %} - re-warning "Selected genes must be from the same species as all expression files." - re-error "Selected genes are {{ species }}, while expression {{ e|name }} is {{ e.species }}" - {% endif %} - {% endif %} - - {% endfor %} - - pca.py \ - --sample-files {% for exp in exps %} {{ exp.exp.file }} {% endfor %} \ - --sample-ids {% for exp in exps %} {{ exp|sample_id }} {% endfor %} \ - {% if genes %} --gene-labels {% for gene in genes %} {{ gene }} {% endfor %} {% endif %} \ - --output-fn pca.json - re-checkrc "Principal component analysis failed." - - re-save pca pca.json diff --git a/resolwe_bio/tests/files/clustering_NCBI.tab.gz b/resolwe_bio/tests/files/clustering_NCBI.tab.gz deleted file mode 100644 index 58000dc86d944e65714f80d5d95470cc607f3398..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 154 zcmV;L0A>FliwFqyuc24~17mD;b97~LX>MmMm8K@3AJkL!9oPpD`TxjcP~Ijp-v3?{jn36027G5b4CMxj=j zO9JT~B^uEEt62?gX;ng(M+;Hi^kFuPnjzd6Aw#mjT+AF2wq~v-q|(NM#jn*oIR9+p M3qCC>!nOba0AMshx&QzG diff --git a/resolwe_bio/tests/files/clustering_NCBI_2.tab.gz b/resolwe_bio/tests/files/clustering_NCBI_2.tab.gz deleted file mode 100644 index d47911b6410f925fe5d5cd8fff90bea9d191d537..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 156 zcmV;N0Av3jiwFplu%TE017mD;b97~LX>Mm;;7K}Yqy?dpUx~$?>WO6~`PihOH@I{)q zGCJ2^*#a}Cq9xr1EwTl=x+?@U^#W5ch0Lmy*(_R-(&|&T0d+_+iwFpWNke7=|8QerUvO-1bYEg+bYU)Pb8l_{12#hVzn4#%h;wQ4f*bs910=vP`Y*#PrtkO?zl5~4{;_>#hEm2cIi=7P%2Oq zq|%gXDYz1w;z8r}rl_`$@(QZz#FArhdmoYk**zDOB;|CgU{)sA4Z+*CnQ{=iDX_jI z`4GDy_P(DNxvI#^?#;eJvPd?mF2xW^F##wI)kdOFIo&OWzrn!dZYxyARya*1H&SRY zs1yowZ+fLTXyh6VaLKe}Ysu~*)b+A67lQq_!+-fP+7I7*JDs|H49ooa!ykv>+TJ(z z(~Yg`58jS$ZsYZ6N=Ro@g5V4VDh`x^K)+?|XQ6*BzkCA#0RR63006p0)hq)50KQF; ALI3~& literal 381 zcmV-@0fPP?iwFpQbQWR)|8QerUvO-1bS`RhZ*BnWk}+=rF${(O$|*A-+ez%CYb$kS z>2yM;0BLp7afbj^P{e<4aul_NTiT^dr$jHG?L6xp)pOJB)Ln5@%diXSQkG+w2nYg; zLv(D7(5Sf4+-y{ox9R{%vxKUB6Xvl=1Jv}sK$VokSB0{&4Xbq%t9rT3A5lF;YIaJ; zqFb%IHnzL|sVvpQ^r?D;GzKBprm5PoFl((5ycU!H_ep{0L?vs?&T6Z{Sp%B+JB3JR z3Y>9PtQALwf#eDU?neR|K_;ysH$!2%ZU<-Xy4(%RW>_9<0{+q@>o-pGKC~C3y|YZ7 z=-TPW)>z1avFMT&@n*FZ5c}hGN-8whCSLp*+-6BXqxIjGyj?S zNoL;SrmJ3J8>{VMHSd}}Hu)j$*G;I~xU62orVi~QR-3S^yB@#^j0&D0qgX&O-s$8m bjx}9!_}lW+I{*Lx|NjF3n?us9o&^8^k}0_b diff --git a/resolwe_bio/tests/files/pca_plot2.json.gz b/resolwe_bio/tests/files/pca_plot2.json.gz deleted file mode 100644 index 3d57a3f9d9bbb42807f1fbb8173b27fd094fb0ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 230 zcmVgw|8QerUvO-1bTTe#b8l_{%#lwE!Y~ZQze>qjBop!9Rl%zl zPi2(oq7=GzY(>+iwFpWNke7=|8QerUvO-1bYEg+bYU)Pb8l_{12#hVzn4#%h;wQ4f*bs910=vP`Y*#PrtkO?zl5~4{;_>#hEm2cIi=7P%2Oq zq|%gXDYz1w;z8r}rl_`$@(QZz#FArhdmoYk**zDOB;|CgU{)sA4Z+*CnQ{=iDX_jI z`4GDy_P(DNxvI#^?#;eJvPd?mF2xW^F##wI)kdOFIo&OWzrn!dZYxyARya*1H&SRY zs1yowZ+fLTXyh6VaLKe}Ysu~*)b+A67lQq_!+-fP+7I7*JDs|H49ooa!ykv>+TJ(z z(~Yg`58jS$ZsYZ6N=Ro@g5V4VDh`x^K)+?|XQ6*BzkCA#0RR63006p0)hq)50KQF; ALI3~& diff --git a/resolwe_bio/tests/files/pca_plot_ncbi.json.gz b/resolwe_bio/tests/files/pca_plot_ncbi.json.gz deleted file mode 100644 index 2667cd6522212525440d78489fc69fe71f236305..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 343 zcmV-d0jT~TiwFpk4B1!!18`$uUvO-1bYE^`VredFb8l_{)lji+10f9EEA<~v%n7mq z+t9T?(ESvhNJ6WV$X%42s-#i=y+As2?N|m3e$V!MzNz;-FY2YcscVkgl*?1Sy>>>e zI(7M-!J65}C6<5RuR)U-wCLG;YXY0F>R_|#dejaW5r6?9IOBivrNUbq#A84s?ASG= zK5j z%s?i9NWtZSJ4aGUMQU|WrJCY`#1)Qj90E($*G55Gi!3Fx4Rm0&Z?HgoQHqBq$>Zfi@HBQeFGvOX(zk_007(&rf&cM diff --git a/resolwe_bio/tests/files/pca_plot_single_sample.json.gz b/resolwe_bio/tests/files/pca_plot_single_sample.json.gz deleted file mode 100644 index 0ae0f6b55eac0158af896a4eb2765eb45f15ddea..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 194 zcmV;z06qU7iwFp^$H`j)|8QerUvO-1bYF96Zf9&|Uvpt?aBO8RYIARH0E~{i3d1lA zMSsO;4khW>wLj3^7^B2$F~|=bQ}Q7HUfYz?rO-u)bFU8C;m#q$MeJZOV_Br^PkkrK zP^uHKFpk89!|f^tC}jO6=SpUDkx5%K*LxU19feN8#o)>3MrD!7W<+NFE88dq{UA?z wUfmeD#WkQMGFneYKQUSK6X}H6hF17I{I>sBK3)I-0RR630F#r{EBpWe030@1>Hq)$ diff --git a/resolwe_bio/tests/processes/test_pca.py b/resolwe_bio/tests/processes/test_pca.py index 41d51d439..8e135092b 100644 --- a/resolwe_bio/tests/processes/test_pca.py +++ b/resolwe_bio/tests/processes/test_pca.py @@ -22,8 +22,8 @@ def round_elements(element, places=2): class PcaProcessorTestCase(KBBioProcessTestCase): @with_resolwe_host - @tag_process("pca-beta") - def test_beta_pca(self): + @tag_process("pca") + def test_pca(self): with self.preparation_stage(): expression_1 = self.prepare_expression( f_rc="exp_1_rc.tab.gz", @@ -59,10 +59,8 @@ def test_beta_pca(self): "source": "DICTYBASE", "species": "Dictyostelium discoideum", } - pca = self.run_process("pca-beta", inputs) - saved_json, test_json = self.get_json( - "pca_plot_beta.json.gz", pca.output["pca"] - ) + pca = self.run_process("pca", inputs) + saved_json, test_json = self.get_json("pca_plot.json.gz", pca.output["pca"]) self.assertAlmostEqualGeneric( round_elements(test_json["flot"]["data"]), round_elements(saved_json["flot"]["data"]), @@ -83,10 +81,8 @@ def test_beta_pca(self): "DPU_G0067102", "DPU_G0067112", ] # all non-zero - pca = self.run_process("pca-beta", inputs) - saved_json, test_json = self.get_json( - "pca_plot_beta_2.json.gz", pca.output["pca"] - ) + pca = self.run_process("pca", inputs) + saved_json, test_json = self.get_json("pca_plot_2.json.gz", pca.output["pca"]) self.assertAlmostEqualGeneric( round_elements(test_json["flot"]["data"]), round_elements(saved_json["flot"]["data"]), @@ -96,10 +92,8 @@ def test_beta_pca(self): inputs["low_expression_filter"] = False inputs["standard_scaler"] = False - pca = self.run_process("pca-beta", inputs) - saved_json, test_json = self.get_json( - "pca_plot_beta_3.json.gz", pca.output["pca"] - ) + pca = self.run_process("pca", inputs) + saved_json, test_json = self.get_json("pca_plot_3.json.gz", pca.output["pca"]) self.assertAlmostEqualGeneric( round_elements(test_json["flot"]["data"]), round_elements(saved_json["flot"]["data"]), @@ -112,164 +106,11 @@ def test_beta_pca(self): "source": "DICTYBASE", "species": "Dictyostelium discoideum", } - pca = self.run_process("pca-beta", inputs) - saved_json, test_json = self.get_json( - "pca_plot_beta_4.json.gz", pca.output["pca"] - ) + pca = self.run_process("pca", inputs) + saved_json, test_json = self.get_json("pca_plot_4.json.gz", pca.output["pca"]) self.assertAlmostEqualGeneric( round_elements(test_json["flot"]["data"]), round_elements(saved_json["flot"]["data"]), ) self.assertEqual(len(pca.process_warning), 1) - - @with_resolwe_host - @tag_process("pca") - def test_pca(self): - with self.preparation_stage(): - expression_1 = self.prepare_expression( - f_rc="exp_1_rc.tab.gz", - f_exp="exp_1_tpm.tab.gz", - f_type="TPM", - source="DICTYBASE", - species="Dictyostelium discoideum", - ) - expression_2 = self.prepare_expression( - f_rc="exp_2_rc.tab.gz", - f_exp="exp_2_tpm.tab.gz", - f_type="TPM", - source="DICTYBASE", - species="Dictyostelium discoideum", - ) - expression_noname_1 = self.prepare_expression( - f_rc="pca_exp_noname1.tab.gz", - f_exp="pca_exp_noname1.tab.gz", - f_type="TPM", - source="DICTYBASE", - species="Dictyostelium discoideum", - ) - expression_noname_2 = self.prepare_expression( - f_rc="pca_exp_noname2.tab.gz", - f_exp="pca_exp_noname2.tab.gz", - f_type="TPM", - source="DICTYBASE", - species="Dictyostelium discoideum", - ) - - inputs = { - "exps": [expression_1.pk, expression_2.pk], - "source": "DICTYBASE", - "species": "Dictyostelium discoideum", - } - pca = self.run_process("pca", inputs) - saved_json, test_json = self.get_json("pca_plot.json.gz", pca.output["pca"]) - self.assertAlmostEqualGeneric( - test_json["flot"]["data"], saved_json["flot"]["data"] - ) - self.assertAlmostEqualGeneric( - test_json["explained_variance_ratios"], - saved_json["explained_variance_ratios"], - ) - self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"]) - self.assertEqual(len(pca.process_warning), 0) - - inputs = { - "exps": [expression_1.pk, expression_2.pk], - "genes": ["DPU_G0067098", "DPU_G0067100", "DPU_G0067104"], # all zero - "source": "DICTYBASE", - "species": "Dictyostelium discoideum", - } - pca = self.run_process("pca", inputs) - saved_json, test_json = self.get_json("pca_plot2.json.gz", pca.output["pca"]) - self.assertAlmostEqualGeneric( - test_json["flot"]["data"], saved_json["flot"]["data"] - ) - self.assertAlmostEqualGeneric( - test_json["explained_variance_ratios"], - saved_json["explained_variance_ratios"], - ) - self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"]) - self.assertEqual(len(pca.process_warning), 0) - - inputs = { - "exps": [expression_1.pk], - "source": "DICTYBASE", - "species": "Dictyostelium discoideum", - } - pca = self.run_process("pca", inputs) - saved_json, test_json = self.get_json( - "pca_plot_single_sample.json.gz", pca.output["pca"] - ) - self.assertAlmostEqualGeneric( - test_json["flot"]["data"], saved_json["flot"]["data"] - ) - self.assertAlmostEqualGeneric( - test_json["explained_variance_ratios"], - saved_json["explained_variance_ratios"], - ) - self.assertAlmostEqualGeneric(test_json["components"], saved_json["components"]) - self.assertEqual(len(pca.process_warning), 0) - - inputs = { - "exps": [ - expression_noname_1.pk, - expression_noname_2.pk, - ], - "source": "DICTYBASE", - "species": "Dictyostelium discoideum", - } - pca = self.run_process("pca", inputs) - - @with_resolwe_host - @tag_process("pca") - def test_pca_ncbi(self): - with self.preparation_stage(): - expression_1 = self.prepare_expression( - f_exp="clustering_NCBI.tab.gz", - f_type="rc", - name="Expression", - source="NCBI", - species="Homo sapiens", - ) - expression_2 = self.prepare_expression( - f_exp="clustering_NCBI_1.tab.gz", - f_type="rc", - name="Expression", - source="NCBI", - species="Homo sapiens", - ) - expression_3 = self.prepare_expression( - f_exp="clustering_NCBI_2.tab.gz", - f_type="rc", - name="Expression", - source="NCBI", - species="Homo sapiens", - ) - - inputs = { - "exps": [ - expression_1.pk, - expression_2.pk, - expression_3.pk, - ], - "genes": [ - "1", - "503538", - "56934", - "29974", - "2", - "144571", - "3", - "abc", - "lll", - ], - "source": "NCBI", - "species": "Homo sapiens", - } - pca = self.run_process("pca", inputs) - saved_json, test_json = self.get_json( - "pca_plot_ncbi.json.gz", pca.output["pca"] - ) - self.assertCountEqual( - test_json["zero_gene_symbols"], saved_json["zero_gene_symbols"] - ) diff --git a/resolwe_bio/tools/pca.py b/resolwe_bio/tools/pca.py deleted file mode 100755 index 71cc2a433..000000000 --- a/resolwe_bio/tools/pca.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -"""Principal components analysis.""" - -import argparse -import json - -import numpy as np -import pandas as pd -from resolwe_runtime_utils import send_message, warning -from sklearn.decomposition import PCA - - -def get_args(): - """Parse command-line arguments.""" - parser = argparse.ArgumentParser(description="PCA") - parser.add_argument( - "--sample-files", "-f", nargs="+", help="Sample file names", required=True - ) - parser.add_argument( - "--sample-ids", "-i", nargs="+", help="Sample IDs", required=True - ) - parser.add_argument("--gene-labels", "-g", nargs="+", help="Filter genes by label") - parser.add_argument( - "--components", "-c", help="Number of PCA components", type=int, default=2 - ) - parser.add_argument("--output-fn", "-o", help="Output file name") - return parser.parse_args() - - -def component_top_factors(component, allgenes_array, max_size=20): - """Return top 20 absolute factors.""" - abs_component = np.abs(component) - size = min(component.size, max_size) - unordered_ixs = np.argpartition(abs_component, -size)[-size:] - ixs = unordered_ixs[np.argsort(abs_component[unordered_ixs])[::-1]] - if ixs.size == 0: - return [] - return list(zip(np.array(allgenes_array)[ixs].tolist(), component[ixs].tolist())) - - -def get_pca(expressions=pd.DataFrame(), n_components=2, gene_labels=[]): - """Compute PCA.""" - if not gene_labels: - gene_labels = expressions.index - skipped_gene_labels = list(set(gene_labels).difference(expressions.index)) - - if expressions.shape[0] < 2 or expressions.shape[1] < 2: - coordinates = [[0.0, 0.0] for i in range(expressions.shape[1])] - all_components = [[], []] - all_explained_variance_ratios = [0.0, 0.0] - else: - pca = PCA(n_components=n_components, whiten=True) - pca_expressions = pca.fit_transform(expressions.transpose()) - - coordinates = [ - t[:2].tolist() if len(t) > 1 else [t[0], 0.0] for t in pca_expressions - ] - all_components = [ - component_top_factors(component, gene_labels) - for component in pca.components_ - ] - if np.isnan(pca.explained_variance_ratio_).any(): - all_explained_variance_ratios = [0.0 for _ in pca.explained_variance_ratio_] - else: - all_explained_variance_ratios = pca.explained_variance_ratio_.tolist() - - result = { - "coordinates": coordinates, - "all_components": all_components, - "all_explained_variance_ratios": all_explained_variance_ratios, - "skipped_gene_labels": skipped_gene_labels, - "warning": None, - } - - if expressions.empty: - send_message( - warning( - "Gene selection and filtering resulted in no genes. Please select different samples or genes." - ) - ) - - return result - - -def save_pca(result={}, sample_ids=[], output_fn=None, max_size=10): - """Save PCA.""" - data = { - "flot": { - "data": result["coordinates"], - "xlabel": "PC 1", - "ylabel": "PC 2", - "sample_ids": sample_ids, - }, - "zero_gene_symbols": result["skipped_gene_labels"], - "components": result["all_components"][:max_size], - "all_components": result["all_components"], - "explained_variance_ratios": result["all_explained_variance_ratios"][:max_size], - "all_explained_variance_ratios": result["all_explained_variance_ratios"], - } - - if output_fn: - with open(output_fn, "w") as outfile: - json.dump(data, outfile, separators=(",", ":"), allow_nan=False) - else: - print(json.dumps(data, separators=(",", ":"), allow_nan=False)) - - -def read_csv(fname): - """Read CSV file and return Pandas DataFrame.""" - csv = pd.read_csv( - filepath_or_buffer=fname, - sep="\t", - header=0, - index_col=0, - dtype={ - 0: str, - 1: float, - }, - keep_default_na=False, - ) - csv.index = csv.index.map(str) - return csv - - -def get_csv(fnames): - """Read CSV files and return Pandas DataFrame.""" - expressions = [read_csv(fname) for fname in fnames] - return pd.concat(expressions, axis=1, join="inner") - - -def main(): - """Read data, run PCA, and output results.""" - args = get_args() - expressions = get_csv(args.sample_files) - - if args.gene_labels: - gene_labels = set(args.gene_labels).intersection(expressions.index) - expressions = expressions.loc[gene_labels] - - result = get_pca(expressions, args.components, args.gene_labels) - save_pca(result, args.sample_ids, args.output_fn) - - -if __name__ == "__main__": - main()