diff --git a/poetry.lock b/poetry.lock index c313ca5a6..5ec26c272 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2621,6 +2621,39 @@ files = [ {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, ] +[[package]] +name = "polars" +version = "0.18.13" +description = "Blazingly fast DataFrame library" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "polars-0.18.13-cp38-abi3-macosx_10_7_x86_64.whl", hash = "sha256:d71167aea2968d7f354f2553a56369684b66dca48efb7dc0963fee7041bfc267"}, + {file = "polars-0.18.13-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:eaa55c2bfab114718f9605d3149d58d7f92f95533da1e23559994b7a12f9b3b2"}, + {file = "polars-0.18.13-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:543d136666b8be18f679587b48bdc45b4541f332a9050f0ee90449cbf3d01a35"}, + {file = "polars-0.18.13-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d052e18686c8a9b9a68c8360ad90e53886990460dc65aeb8f72d4c54859d398f"}, + {file = "polars-0.18.13-cp38-abi3-win_amd64.whl", hash = "sha256:4b340a193144b2f5276b9c3c538784da80d9aca28ad5f27b7c183cdc292876bc"}, + {file = "polars-0.18.13.tar.gz", hash = "sha256:b00d1c7700969c47f3202c5be54d074d99df51acb51943dc4b60cdcf759940fd"}, +] + +[package.extras] +adbc = ["adbc_driver_sqlite"] +all = ["polars[adbc,cloudpickle,connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,pydantic,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] +cloudpickle = ["cloudpickle"] +connectorx = ["connectorx"] +deltalake = ["deltalake (>=0.10.0)"] +fsspec = ["fsspec"] +matplotlib = ["matplotlib"] +numpy = ["numpy (>=1.16.0)"] +pandas = ["pandas", "pyarrow (>=7.0.0)"] +pyarrow = ["pyarrow (>=7.0.0)"] +pydantic = ["pydantic"] +sqlalchemy = ["pandas", "sqlalchemy"] +timezone = ["backports.zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "prefixcommons" version = "0.1.12" @@ -4293,4 +4326,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0.0" -content-hash = "f7e918756caa973314f7051b9b5d0d09b4945dd98e50232f45d4f0e385fcb6ea" +content-hash = "c04e3e48c27ef475fb44556640cfdf5e23c1a373db8abcabfd5b8f59f3a1ba4c" diff --git a/pyproject.toml b/pyproject.toml index 6714147d1..c6dee7b90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ plotly = "^5.13.0" seaborn = "^0.12.2" matplotlib = "^3.7.0" pyserde = "^0.9.8" +polars = "^0.18.13" [tool.poetry.dev-dependencies] pytest = "^7.2.0" diff --git a/src/pheval/cli.py b/src/pheval/cli.py index ed6b4d163..c905150f7 100644 --- a/src/pheval/cli.py +++ b/src/pheval/cli.py @@ -9,6 +9,7 @@ from .cli_pheval_utils import ( create_spiked_vcfs_command, scramble_phenopackets_command, + semsim_comparison_command, semsim_convert_command, semsim_scramble_command, update_phenopackets_command, @@ -54,6 +55,7 @@ def pheval_utils(): pheval_utils.add_command(semsim_convert_command) pheval_utils.add_command(scramble_phenopackets_command) pheval_utils.add_command(update_phenopackets_command) +pheval_utils.add_command(semsim_comparison_command) pheval_utils.add_command(create_spiked_vcfs_command) pheval_utils.add_command(benchmark) pheval_utils.add_command(benchmark_comparison) diff --git a/src/pheval/cli_pheval_utils.py b/src/pheval/cli_pheval_utils.py index 59e5f6013..51246004e 100644 --- a/src/pheval/cli_pheval_utils.py +++ b/src/pheval/cli_pheval_utils.py @@ -9,7 +9,7 @@ from pheval.prepare.create_spiked_vcf import spike_vcfs from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError from pheval.prepare.update_phenopacket import update_phenopackets -from pheval.utils.semsim_utils import percentage_diff, semsim_heatmap_plot +from pheval.utils.semsim_utils import semsim_comparison from pheval.utils.utils import semsim_convert, semsim_scramble @@ -116,18 +116,11 @@ def scramble_phenopackets_command( @click.command("semsim-comparison") @click.option( - "--semsim-left", - "-L", - required=True, - metavar="FILE", - help="Path to the first semantic similarity profile.", -) -@click.option( - "--semsim-right", - "-R", + "--input", + "-i", + multiple=True, required=True, - metavar="FILE", - help="Path to the second semantic similarity profile.", + help="Semsim inputs file", ) @click.option( "--score-column", @@ -142,42 +135,39 @@ def scramble_phenopackets_command( "--analysis", "-a", required=True, - type=click.Choice(["heatmap", "percentage_diff"], case_sensitive=False), + type=click.Choice(["heatmap", "percentage_diff", "distribution"], case_sensitive=False), help="""There are two types of analysis: heatmap - Generates a heatmap plot that shows the differences between the semantic similarity profiles using the score column for this purpose. Defaults to "heatmap". - percentage_diff - Calculates the score column percentage difference between the semantic similarity profiles""", + percentage_diff - Calculates the score column percentage difference between the semantic similarity profiles + distribution - Plot showing the semsim score's distributions""", ) @click.option( - "--output", - "-o", - metavar="FILE", - default="percentage_diff.semsim.tsv", - help="Output path for the difference tsv. Defaults to percentage_diff.semsim.tsv", + "--output-dir", + "-O", + metavar="output_dir", + default=".", + help="Output path directory for the comparisons", ) -def semsim_comparison( - semsim_left: Path, - semsim_right: Path, +def semsim_comparison_command( + input: List[Path], score_column: str, analysis: str, - output: Path = "percentage_diff.semsim.tsv", + output_dir: Path, ): - """Compares two semantic similarity profiles + """Compares semantic similarity profiles Args: - semsim-left (Path): File path of the first semantic similarity profile - semsim-right (Path): File path of the second semantic similarity profile - output (Path): Output path for the difference tsv. Defaults to "percentage_diff.semsim.tsv". + input (List[Path]): File paths semantic similarity profiles + output-dir (Path): Output directory path for the comparisons. score_column (str): Score column that will be computed (e.g. jaccard_similarity) - analysis (str): There are two types of analysis: + analysis (str): There are three types of analysis: heatmap - Generates a heatmap plot that shows the differences between the semantic similarity profiles using the score column for this purpose. Defaults to "heatmap". percentage_diff - Calculates the score column percentage difference between the semantic similarity profiles. + distribution - Plot showing the semsim score's distributions """ - if analysis == "heatmap": - return semsim_heatmap_plot(semsim_left, semsim_right, score_column) - if analysis == "percentage_diff": - percentage_diff(semsim_left, semsim_right, score_column, output) + semsim_comparison(input, score_column, analysis, output_dir) @click.command("update-phenopackets") diff --git a/src/pheval/utils/semsim_utils.py b/src/pheval/utils/semsim_utils.py index 1451d533b..fd6c02358 100644 --- a/src/pheval/utils/semsim_utils.py +++ b/src/pheval/utils/semsim_utils.py @@ -1,66 +1,142 @@ """ Contains all pheval utility methods """ +import logging +import subprocess +from itertools import combinations from pathlib import Path +from typing import List -import numpy import pandas as pd import plotly.express as px +import polars as pl +import seaborn as sns +from matplotlib import pyplot as plt + +from tqdm import tqdm import pheval.utils.file_utils as file_utils +info_log = logging.getLogger("info") + + +def semsim_comparison(input: List[Path], score_column: str, analysis: str, output: Path): + """Makes a paired semantic similarity profiles comparison based on a chosen score column + + Args: + input (List[Path]): semsim profiles path's + score_column (str): Score column that will be computed (e.g. jaccard_similarity) + analysis (str): There are three types of analysis: + heatmap - Generates a heatmap plot that shows the differences between the semantic similarity profiles using the + score column for this purpose. Defaults to "heatmap". + percentage_diff - Calculates the score column percentage difference between the semantic similarity profiles. + distribution - Plot showing the semsim score's distributions + + """ + for s in set(combinations(input, 2)): + semsim_left = s[0] + semsim_right = s[1] + if analysis == "heatmap": + semsim_heatmap_plot(semsim_left, semsim_right, score_column) + if analysis == "percentage_diff": + percentage_diff(semsim_left, semsim_right, score_column, output) + if analysis == "distribution": + distribution(input, score_column, output) + -def filter_non_0_score(data: pd.DataFrame, col: str) -> pd.DataFrame: +def filter_non_0_score(data: pl.DataFrame, col: str) -> pd.DataFrame: """Removes rows that have value equal to 0 based on the given column passed by col parameter Args: - data (pd.DataFrame): Dirty dataframe + data (pl.DataFrame): Dirty dataframe col (str): Column to be filtered Returns: - pd.DataFrame: Filtered dataframe + pl.DataFrame: Filtered dataframe """ - return data[data[col] != 0] + return data.filter(pl.col(col) != 0) -def parse_semsim(df: pd.DataFrame, cols: list) -> pd.DataFrame: +def parse_semsim(df: pl.DataFrame, cols: list) -> pd.DataFrame: """Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Args: - df (pd.DataFrame): semantic similarity profile dataframe + df (pl.DataFrame): semantic similarity profile dataframe cols (list): list of columns that will be selected on semsim data Returns: pd.Dataframe: parsed semantic similarity dataframe """ - df[cols[-1]] = pd.to_numeric(df[cols[-1]], errors="coerce") - df.replace("None", numpy.nan).dropna(subset=cols[-1], inplace=True) + df.with_columns(pl.col(cols[-1]).cast(pl.Float64)) + df[cols[-1]].set(df[cols[-1]].is_null(), None) + + df.drop_nulls(cols[-1]) return df def diff_semsim( - semsim_left: pd.DataFrame, semsim_right: pd.DataFrame, score_column: str, absolute_diff: bool -) -> pd.DataFrame: + semsim_left: pl.DataFrame, semsim_right: pl.DataFrame, score_column: str, absolute_diff: bool +) -> pl.DataFrame: """Calculates score difference between two semantic similarity profiles Args: - semsim_left (pd.DataFrame): first semantic similarity dataframe - semsim_right (pd.DataFrame): second semantic similarity dataframe + semsim_left (pl.DataFrame): first semantic similarity dataframe + semsim_right (pl.DataFrame): second semantic similarity dataframe score_column (str): Score column that will be computed (e.g. jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: - pd.DataFrame: A dataframe with terms and its scores differences + pl.DataFrame: A dataframe with terms and its scores differences """ - df = pd.merge(semsim_left, semsim_right, on=["subject_id", "object_id"], how="outer") + df = semsim_left.join(semsim_right, on=["subject_id", "object_id"], how="outer") + df.drop_nulls(score_column) + df.drop_nulls(f"{score_column}_right") if absolute_diff: - df["diff"] = df[f"{score_column}_x"] - df[f"{score_column}_y"] - return df[["subject_id", "object_id", "diff"]] - df["diff"] = df.apply( - lambda row: get_percentage_diff(row[f"{score_column}_x"], row[f"{score_column}_y"]), axis=1 + df = df.with_columns((pl.col(score_column) - pl.col(f"{score_column}_right")).alias("diff")) + return df[["subject_id", "object_id", f"{score_column}", f"{score_column}_right", "diff"]] + df = df.with_columns( + # horizontal sum with a custom apply + pl.struct([score_column, f"{score_column}_right"]) + .apply(lambda x: get_percentage_diff(x[score_column], x[f"{score_column}_right"])) + .alias("diff") + ) + return df[["subject_id", "object_id", f"{score_column}", f"{score_column}_right", "diff"]] + + +def distribution(input: List[Path], score_column: str, output: Path): + df_list = [] + plt.rcParams["figure.autolayout"] = True + plt.rcParams["figure.figsize"] = [20, 3.50 * len(input)] + _, axes = plt.subplots(len(input), 1) + for idx, i in enumerate(input): + print(f"Reading {Path(i).stem}") + df = pl.read_csv(i, separator="\t") + df = df[["subject_id", "object_id", f"{score_column}"]] + df = df.with_columns(semsim=pl.lit(Path(i).stem)) + df_list.append(df) + axes[idx].ticklabel_format(style="plain", axis="both") + axes[idx].set_xlabel(score_column) + sns.histplot(df[score_column], bins=20, ax=axes[idx]).set_title(Path(i).stem) + plt.setp(axes, ylim=axes[0].get_ylim()) + print("Concatenating data") + df_concat = pl.concat(df_list) + print(f"Saving plot in {output}/bars.png") + plt.savefig(f"{output}/bars.png") + plt.clf() + graph = sns.histplot( + df_concat, + x=score_column, + bins=10, + multiple="dodge", + fill=True, + kde=True, + alpha=0.5, + hue="semsim", ) - return df[["subject_id", "object_id", f"{score_column}_x", f"{score_column}_y", "diff"]] + graph.ticklabel_format(style="plain", axis="both") + print(f"Saving plot in {output}/dist.png") + plt.savefig(f"{output}/dist.png") def percentage_diff(semsim_left: Path, semsim_right: Path, score_column: str, output: Path): @@ -72,8 +148,24 @@ def percentage_diff(semsim_left: Path, semsim_right: Path, score_column: str, ou score_column (str): Score column that will be computed (e.g. jaccard_similarity) output (Path): Output path for the difference tsv file """ - clean_df = semsim_analysis(semsim_left, semsim_right, score_column, absolute_diff=False) - clean_df.sort_values(by="diff", ascending=False).to_csv(output, sep="\t", index=False) + fname_left = Path(semsim_left).stem + fname_right = Path(semsim_right).stem + fname = f"{output}/{fname_left}-{fname_right}.diff.tsv" + Path(fname).unlink(missing_ok=True) + for idx, clean_df in enumerate( + semsim_analysis(semsim_left, semsim_right, score_column, absolute_diff=False) + ): + ( + clean_df.drop_nulls("diff") + .sort("diff", descending=True) + .rename( + { + score_column: f"{fname_left}_{score_column}", + f"{score_column}_right": f"{fname_right}_{score_column}", + } + ) + .write_csv(fname, has_header=idx == 0, separator="\t") + ) def semsim_heatmap_plot(semsim_left: Path, semsim_right: Path, score_column: str): @@ -87,12 +179,13 @@ def semsim_heatmap_plot(semsim_left: Path, semsim_right: Path, score_column: str clean_df = semsim_analysis(semsim_left, semsim_right, score_column) df = clean_df.pivot(index="subject_id", columns="object_id", values="diff") fig = px.imshow(df, text_auto=True) + fig.update_layout( + title=f"{Path(semsim_left).stem} - {Path(semsim_right).stem}", xaxis_nticks=36 + ) fig.show() -def semsim_analysis( - semsim_left: Path, semsim_right: Path, score_column: str, absolute_diff=True -) -> pd.DataFrame: +def semsim_analysis(semsim_left: Path, semsim_right: Path, score_column: str, absolute_diff=True): """semsim_analysis Args: @@ -102,22 +195,35 @@ def semsim_analysis( absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. - Returns: - [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles + Yields: + pd.DataFrame: DataFrame with the differences between two semantic similarity profiles """ validate_semsim_file_comparison(semsim_left, semsim_right) cols = ["subject_id", "object_id", score_column] - semsim_left = pd.read_csv(semsim_left, sep="\t") - semsim_right = pd.read_csv(semsim_right, sep="\t") - file_utils.ensure_columns_exists( - cols=cols, - err_message="must exist in semsim dataframes", - dataframes=[semsim_left, semsim_right], - ) - semsim_left = parse_semsim(semsim_left, cols) - semsim_right = parse_semsim(semsim_right, cols) - diff_df = diff_semsim(semsim_left, semsim_right, score_column, absolute_diff) - return filter_non_0_score(diff_df, "diff") + batch_size = 100000 + count = int(subprocess.check_output(["wc", "-l", semsim_left]).split()[0]) + reader_left = pl.read_csv_batched(semsim_left, separator="\t", batch_size=batch_size) + reader_right = pl.read_csv_batched(semsim_right, separator="\t", batch_size=batch_size) + # file_utils.ensure_columns_exists( + # cols=cols, + # err_message="must exist in semsim dataframes", + # dataframes=[reader_left, reader_right], + # ) + batches_left = reader_left.next_batches(5) + batches_right = reader_right.next_batches(5) + with tqdm(total=count - 1) as bar: + while batches_left or batches_right: + for input_data in zip(batches_left, batches_right): + semsim_left = parse_semsim(input_data[0], cols) + semsim_right = parse_semsim(input_data[1], cols) + diff_df = diff_semsim(semsim_left, semsim_right, score_column, absolute_diff) + bar.update(input_data[0].shape[0]) + if not absolute_diff: + yield diff_df + else: + yield filter_non_0_score(diff_df, "diff") + batches_left = reader_left.next_batches(5) + batches_right = reader_right.next_batches(5) def validate_semsim_file_comparison(semsim_left: Path, semsim_right: Path): @@ -145,6 +251,8 @@ def get_percentage_diff(current_number: float, previous_number: float) -> float: float: percentage difference between two numbers """ try: + if not current_number or not previous_number: + return None if current_number == previous_number: return "{:.2%}".format(0) if current_number > previous_number: diff --git a/testdata/semsim/hp-mp.semsim.tsv b/testdata/semsim/hp-mp.semsim.tsv index 2e97d2f2e..4b16b2ad6 100644 --- a/testdata/semsim/hp-mp.semsim.tsv +++ b/testdata/semsim/hp-mp.semsim.tsv @@ -1,10 +1,10 @@ subject_id subject_label subject_source object_id object_label object_source ancestor_id ancestor_label ancestor_source object_information_content subject_information_content ancestor_information_content jaccard_similarity dice_similarity phenodigm_score -HP:0000001 None None HP:0000236 None None HP:0000001 None None None None 0.6926096656076508 0.05263157894736842 None 0.19092705490615916 -HP:0000001 None None HP:0000309 None None HP:0000001 None None None None 0.6926096656076508 0.08333333333333333 None 0.24024460895922492 -HP:0000001 None None HP:0000322 None None HP:0000001 None None None None 0.6926096656076508 0.034482758620689655 None 0.15454155401543365 -HP:0000001 None None HP:0000735 None None HP:0000001 None None None None 0.6926096656076508 0.07142857142857142 None 0.22242328783644721 -HP:0000001 None None HP:0000826 None None HP:0000001 None None None None 0.6926096656076508 0.06666666666666667 None 0.21488131074427277 -HP:0000001 None None HP:0000853 None None HP:0000001 None None None None 0.6926096656076508 0.08333333333333333 None 0.24024460895922492 -HP:0000001 None None HP:0000938 None None HP:0000001 None None None None 0.6926096656076508 0.05555555555555555 None 0.19615890180152568 -HP:0000001 None None HP:0001144 None None HP:0000001 None None None None 0.6926096656076508 0.0625 None 0.20805793448094734 -HP:0000001 None None HP:0001443 None None HP:0000001 None None None None 0.6926096656076508 0.13 None 0.26317478329195043 +HP:0000001 None None HP:0000236 None None HP:0000001 None None None None 0.6926096656076508 0.02 None 0.19092705490615916 +HP:0000001 None None HP:0000309 None None HP:0000001 None None None None 0.6926096656076508 0.01 None 0.24024460895922492 +HP:0000001 None None HP:0000322 None None HP:0000001 None None None None 0.6926096656076508 0.04 None 0.15454155401543365 +HP:0000001 None None HP:0000735 None None HP:0000001 None None None None 0.6926096656076508 0.09 None 0.22242328783644721 +HP:0000001 None None HP:0000826 None None HP:0000001 None None None None 0.6926096656076508 0.01 None 0.21488131074427277 +HP:0000001 None None HP:0000853 None None HP:0000001 None None None None 0.6926096656076508 0.02 None 0.24024460895922492 +HP:0000001 None None HP:0000938 None None HP:0000001 None None None None 0.6926096656076508 0.09 None 0.19615890180152568 +HP:0000001 None None HP:0001144 None None HP:0000001 None None None None 0.6926096656076508 0.01 None 0.20805793448094734 +HP:0000001 None None HP:0001443 None None HP:0000001 None None None None 0.6926096656076508 0.9 None 0.26317478329195043 diff --git a/testdata/semsim/hp-mp3.semsim.tsv b/testdata/semsim/hp-mp3.semsim.tsv new file mode 100644 index 000000000..7564003d9 --- /dev/null +++ b/testdata/semsim/hp-mp3.semsim.tsv @@ -0,0 +1,10 @@ +subject_id subject_label subject_source object_id object_label object_source ancestor_id ancestor_label ancestor_source object_information_content subject_information_content ancestor_information_content jaccard_similarity dice_similarity phenodigm_score +HP:0000001 None None HP:0000236 None None HP:0000001 None None None None 0.6927082271817895 0.01 None 0.19617285846668062 +HP:0000001 None None HP:0000309 None None HP:0000001 None None None None 0.6927082271817895 0.09 None 0.24026170231329516 +HP:0000001 None None HP:0000322 None None HP:0000001 None None None None 0.6927082271817895 0.01 None 0.1545525496149303 +HP:0000001 None None HP:0000735 None None HP:0000001 None None None None 0.6927082271817895 0.08 None 0.222439113207218 +HP:0000001 None None HP:0000826 None None HP:0000001 None None None None 0.6927082271817895 0.02 None 0.21489659950493856 +HP:0000001 None None HP:0000853 None None HP:0000001 None None None None 0.6927082271817895 0.04 None 0.24026170231329516 +HP:0000001 None None HP:0000938 None None HP:0000001 None None None None 0.6927082271817895 0.01 None 0.19617285846668062 +HP:0000001 None None HP:0001144 None None HP:0000001 None None None None 0.6927082271817895 0.02 None 0.20807273775980803 +HP:0000001 None None HP:0001443 None None HP:0000001 None None None None 0.6927082271817895 0.5 None 0.26319350812316583 diff --git a/tests/test_cli.py b/tests/test_cli.py index b65449bb0..a9024822d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -8,7 +8,7 @@ from click.testing import CliRunner from pheval.cli_pheval import run -from pheval.cli_pheval_utils import semsim_comparison +from pheval.cli_pheval_utils import semsim_comparison_command class TestCommandLineInterface(unittest.TestCase): @@ -54,15 +54,18 @@ def test_cli_runner(self): def test_semsim_heatmap(self): """test_semsim_heatmap""" - semsim_left = "./testdata/semsim/hp-mp.semsim.tsv" - semsim_right = "./testdata/semsim/hp-mp2.semsim.tsv" + semsim_1 = "./testdata/semsim/hp-mp.semsim.tsv" + semsim_2 = "./testdata/semsim/hp-mp2.semsim.tsv" + semsim_3 = "./testdata/semsim/hp-mp3.semsim.tsv" result = self.runner.invoke( - semsim_comparison, + semsim_comparison_command, [ - "--semsim-left", - semsim_left, - "--semsim-right", - semsim_right, + "--input", + semsim_1, + "--input", + semsim_2, + "--input", + semsim_3, "-c", "jaccard_similarity", "-a", @@ -79,11 +82,11 @@ def test_semsim_heatmap_invalid_col(self): semsim_left = "./testdata/semsim/hp-mp.semsim.tsv" semsim_right = "./testdata/semsim/hp-mp2.semsim.tsv" result = self.runner.invoke( - semsim_comparison, + semsim_comparison_command, [ - "--semsim-left", + "--input", semsim_left, - "--semsim-right", + "--input", semsim_right, "-c", "invalid_col", @@ -100,11 +103,11 @@ def test_semsim_heatmap_invalid_file(self): semsim_left = "./testdata/semsim/hp-mpx.semsim.tsv" semsim_right = "./testdata/semsim/hp-mp2.semsim.tsv" result = self.runner.invoke( - semsim_comparison, + semsim_comparison_command, [ - "--semsim-left", + "--input", semsim_left, - "--semsim-right", + "--input", semsim_right, "-c", "jaccard_similarity", @@ -121,11 +124,11 @@ def test_semsim_heatmap_invalid_equal_file(self): semsim_left = "./testdata/semsim/hp-mp.semsim.tsv" semsim_right = "./testdata/semsim/hp-mp.semsim.tsv" result = self.runner.invoke( - semsim_comparison, + semsim_comparison_command, [ - "--semsim-left", + "--input", semsim_left, - "--semsim-right", + "--input", semsim_right, "-c", "jaccard_similarity", @@ -137,3 +140,56 @@ def test_semsim_heatmap_invalid_equal_file(self): self.assertEqual(errmsg, str(result.exception)) logging.info("ERR=%s", result.exception) self.assertEqual(1, result.exit_code) + + def test_semsim_distribution_plot(self): + """test_semsim_distribution_plot""" + semsim_1 = "./testdata/semsim/hp-mp.semsim.tsv" + semsim_2 = "./testdata/semsim/hp-mp2.semsim.tsv" + semsim_3 = "./testdata/semsim/hp-mp3.semsim.tsv" + Path("./results").mkdir(parents=True, exist_ok=True) + result = self.runner.invoke( + semsim_comparison_command, + [ + "--input", + semsim_1, + "--input", + semsim_2, + "--input", + semsim_3, + "-c", + "jaccard_similarity", + "-a", + "distribution", + "-O", + "./results", + ], + ) + err = result.stderr + self.assertEqual(None, result.exception) + logging.info("ERR=%s", err) + self.assertEqual(0, result.exit_code) + + def test_semsim_diff(self): + """test_semsim_distribution_plot""" + semsim_1 = "/home/vinicius/workspace/monarch-semantic-similarity-profiles/profiles/upheno2-lattice-hp-mp.semsimian.0.tsv" + semsim_2 = "/home/vinicius/workspace/monarch-semantic-similarity-profiles/profiles/upheno2-lattice-hp-hp.semsimian.0.tsv" + Path("./results").mkdir(parents=True, exist_ok=True) + result = self.runner.invoke( + semsim_comparison_command, + [ + "--input", + semsim_1, + "--input", + semsim_2, + "-c", + "jaccard_similarity", + "-a", + "percentage_diff", + "-O", + "./results", + ], + ) + err = result.stderr + self.assertEqual(None, result.exception) + logging.info("ERR=%s", err) + self.assertEqual(0, result.exit_code)