From 2ced69ee6a6c6405e4b646d8f0952c9782ad9187 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 15 Jan 2025 10:16:38 -0500 Subject: [PATCH] Refactor paper ranking workflow (#1358) --- .github/workflows/paper_ranking.yml | 4 +- pyproject.toml | 7 + src/bioregistry/analysis/paper_ranking.py | 400 +++++++++++------- .../analysis/paper_ranking_requirements.txt | 6 - tests/{ => resources}/mock_pubmed_data.json | 0 tests/resources/mock_search.json | 7 + tests/test_paper_ranking.py | 91 ++-- tox.ini | 8 + 8 files changed, 307 insertions(+), 216 deletions(-) delete mode 100644 src/bioregistry/analysis/paper_ranking_requirements.txt rename tests/{ => resources}/mock_pubmed_data.json (100%) create mode 100644 tests/resources/mock_search.json diff --git a/.github/workflows/paper_ranking.yml b/.github/workflows/paper_ranking.yml index 35913e803..f181f2608 100644 --- a/.github/workflows/paper_ranking.yml +++ b/.github/workflows/paper_ranking.yml @@ -20,8 +20,9 @@ jobs: - name: Install dependencies run: | + # TODO update to using uv python -m pip install --upgrade pip - pip install -r src/bioregistry/analysis/paper_ranking_requirements.txt + pip install .[paper-ranking] - name: Set Date Variables id: set-date-variables @@ -39,6 +40,7 @@ jobs: id: run-ranking-script run: | echo "PYTHONPATH=$PYTHONPATH" # Verify PYTHONPATH + # TODO update to using python -m python src/bioregistry/analysis/paper_ranking.py --start-date ${{ env.START_DATE }} --end-date ${{ env.END_DATE }} - name: Upload Full List as Artifact diff --git a/pyproject.toml b/pyproject.toml index a2f7fcfa4..699dc000f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,6 +136,13 @@ web = [ "curies[fastapi]", "a2wsgi", ] +paper-ranking = [ + "indra", + "pandas", + "scikit-learn", + "tabulate", + "more_itertools", +] # See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#urls diff --git a/src/bioregistry/analysis/paper_ranking.py b/src/bioregistry/analysis/paper_ranking.py index 1bd334635..f646678fc 100644 --- a/src/bioregistry/analysis/paper_ranking.py +++ b/src/bioregistry/analysis/paper_ranking.py @@ -1,39 +1,80 @@ -"""Train a TF-IDF classifier and use it to score the relevance of new PubMed papers to the Bioregistry.""" +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "bioregistry[paper-ranking]", +# ] +# +# [tool.uv.sources] +# bioregistry = { path = "../../../" } +# /// + +"""Train a TF-IDF classifier and use it to score the relevance of new PubMed papers to the Bioregistry. + +Run with: + +1. ``python -m bioregistry.analysis.paper_ranking`` +2. ``tox -e paper-ranking`` +3. 
``uv run --script paper_ranking.py`` +""" from __future__ import annotations import datetime import json +import logging +import textwrap from collections import defaultdict +from collections.abc import Iterable from pathlib import Path +from typing import Any, NamedTuple, Optional, Union import click -import indra.literature.pubmed_client as pubmed_client import numpy as np import pandas as pd +from more_itertools import chunked from numpy.typing import NDArray from sklearn.base import ClassifierMixin from sklearn.ensemble import RandomForestClassifier from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression +from sklearn.linear_model._base import LinearClassifierMixin from sklearn.metrics import matthews_corrcoef, roc_auc_score from sklearn.model_selection import cross_val_predict, train_test_split from sklearn.svm import SVC, LinearSVC from sklearn.tree import DecisionTreeClassifier +from tqdm import tqdm +from typing_extensions import TypeAlias from bioregistry.constants import BIOREGISTRY_PATH, CURATED_PAPERS_PATH +logger = logging.getLogger(__name__) + HERE = Path(__file__).parent.resolve() ROOT = HERE.parent.parent.parent.resolve() DIRECTORY = ROOT.joinpath("exports", "analyses", "paper_ranking") DIRECTORY.mkdir(exist_ok=True, parents=True) -URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRPtP-tcXSx8zvhCuX6fqz_\ -QvHowyAoDahnkixARk9rFTe0gfBN9GfdG6qTNQHHVL0i33XGSp_nV9XM/pub?output=csv" +URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRPtP-tcXSx8zvhCuX6fqz_QvHowyAoDahnkixARk9rFTe0gfBN9GfdG6qTNQHHVL0i33XGSp_nV9XM/pub?output=csv" + +XTrain: TypeAlias = NDArray[np.float64] +YTrain: TypeAlias = NDArray[np.float64] +XTest: TypeAlias = NDArray[np.str_] +YTest: TypeAlias = NDArray[np.str_] + +ClassifierHint: TypeAlias = Union[ClassifierMixin, LinearClassifierMixin] +Classifiers: TypeAlias = list[tuple[str, ClassifierHint]] + +DEFAULT_SEARCH_TERMS = [ + "database", + "ontology", + "resource", + "vocabulary", + "nomenclature", +] -def load_bioregistry_json(path: Path | None = None) -> pd.DataFrame: +def get_publications_from_bioregistry(path: Optional[Path] = None) -> pd.DataFrame: """Load bioregistry data from a JSON file, extracting publication details and fetching abstracts if missing. :param path: Path to the bioregistry JSON file. 
@@ -41,34 +82,26 @@ def load_bioregistry_json(path: Path | None = None) -> pd.DataFrame: """ if path is None: path = BIOREGISTRY_PATH - try: - data = json.loads(path.read_text(encoding="utf-8")) - except json.JSONDecodeError as e: - click.echo(f"JSONDecodeError: {e.msg}") - click.echo(f"Error at line {e.lineno}, column {e.colno}") - click.echo(f"Error at position {e.pos}") - return pd.DataFrame() + records = json.loads(path.read_text(encoding="utf-8")) publications = [] - pmids_to_fetch = [] - for entry in data.values(): - if "publications" in entry: - for pub in entry["publications"]: - pmid = pub.get("pubmed") - title = pub.get("title") - if pmid: - pmids_to_fetch.append(pmid) - publications.append({"pubmed": pmid, "title": title, "abstract": "", "label": 1}) - - fetched_metadata = {} - for chunk in [pmids_to_fetch[i : i + 200] for i in range(0, len(pmids_to_fetch), 200)]: - fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) - - for pub in publications: - if pub["pubmed"] in fetched_metadata: - pub["abstract"] = fetched_metadata[pub["pubmed"]].get("abstract", "") + pubmeds = set() + for record in records.values(): + # TODO replace with usage of bioregistry code, this is duplicate logic + # see Resource.get_publications() + for publication in record.get("publications", []): + pubmed = publication.get("pubmed") + if pubmed: + pubmeds.add(pubmed) + publications.append({"pubmed": pubmed, "title": publication.get("title"), "label": 1}) + + pubmed_to_metadata = _get_metadata_for_ids(sorted(pubmeds)) + for publication in publications: + publication["abstract"] = pubmed_to_metadata.get(publication["pubmed"], {}).get( + "abstract", "" + ) - click.echo(f"Got {len(publications):,} publications from the bioregistry") + logger.info(f"Got {len(publications):,} publications from the bioregistry") return pd.DataFrame(publications) @@ -84,10 +117,8 @@ def load_curated_papers(file_path: Path = CURATED_PAPERS_PATH) -> pd.DataFrame: curated_df["title"] = "" curated_df["abstract"] = "" - pmids_to_fetch = curated_df["pubmed"].tolist() - fetched_metadata = {} - for chunk in [pmids_to_fetch[i : i + 200] for i in range(0, len(pmids_to_fetch), 200)]: - fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) + pubmeds = curated_df["pubmed"].tolist() + fetched_metadata = _get_metadata_for_ids(pubmeds) for index, row in curated_df.iterrows(): if row["pubmed"] in fetched_metadata: @@ -98,32 +129,52 @@ def load_curated_papers(file_path: Path = CURATED_PAPERS_PATH) -> pd.DataFrame: return curated_df -def fetch_pubmed_papers(curated_pmids: set[int]) -> pd.DataFrame: - """Fetch PubMed papers from the last 30 days using specific search terms, excluding curated papers. +def _get_metadata_for_ids(pubmed_ids: Iterable[Union[int, str]]) -> dict[str, dict[str, Any]]: + """Get metadata for articles in PubMed, wrapping the INDRA client.""" + from indra.literature import pubmed_client - :param curated_pmids: List containing already curated PMIDs - :return: DataFrame containing PubMed paper details. 
- """ - click.echo("Starting fetch_pubmed_papers") + fetched_metadata = {} + for chunk in chunked( + tqdm(pubmed_ids, unit="article", unit_scale=True, desc="Getting metadata"), 200 + ): + fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) + return fetched_metadata - search_terms = ["database", "ontology", "resource", "vocabulary", "nomenclature"] - paper_to_terms: defaultdict[str, list[str]] = defaultdict(list) - for term in search_terms: - pubmed_ids = pubmed_client.get_ids(term, use_text_word=True, reldate=30) - for pubmed_id in pubmed_ids: - if pubmed_id not in curated_pmids: +def _get_ids(term: str, use_text_word: bool, relative_date: int) -> set[str]: + from indra.literature import pubmed_client + + return { + str(pubmed_id) + for pubmed_id in pubmed_client.get_ids( + term, use_text_word=use_text_word, reldate=relative_date + ) + } + + +def _search( + terms: list[str], pubmed_ids_to_filter: set[str], relative_date: int +) -> dict[str, list[str]]: + paper_to_terms: defaultdict[str, list[str]] = defaultdict(list) + for term in tqdm(terms, desc="Searching PubMed", unit="search term", leave=False): + for pubmed_id in _get_ids(term, use_text_word=True, relative_date=relative_date): + if pubmed_id not in pubmed_ids_to_filter: paper_to_terms[pubmed_id].append(term) + return dict(paper_to_terms) + - all_pmids = list(paper_to_terms.keys()) - click.echo(f"{len(all_pmids):,} articles found") - if not all_pmids: - click.echo(f"No articles found for the last 30 days with the search terms: {search_terms}") - return pd.DataFrame() +def fetch_pubmed_papers(*, pubmed_ids_to_filter: set[str], relative_date: int) -> pd.DataFrame: + """Fetch PubMed papers from the last 30 days using specific search terms, excluding curated papers. - papers = {} - for chunk in [all_pmids[i : i + 200] for i in range(0, len(all_pmids), 200)]: - papers.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) + :param pubmed_ids_to_filter: List containing already curated PMIDs + :param relative_date: the number of recent days to search + :return: DataFrame containing PubMed paper details. + """ + paper_to_terms = _search( + DEFAULT_SEARCH_TERMS, pubmed_ids_to_filter=pubmed_ids_to_filter, relative_date=relative_date + ) + + papers = _get_metadata_for_ids(paper_to_terms) records = [] for pubmed_id, paper in papers.items(): @@ -137,7 +188,7 @@ def fetch_pubmed_papers(curated_pmids: set[int]) -> pd.DataFrame: "title": title, "abstract": abstract, "year": paper.get("publication_date", {}).get("year"), - "search_terms": paper_to_terms.get(pubmed_id), + "search_terms": paper_to_terms[pubmed_id], } ) @@ -145,7 +196,7 @@ def fetch_pubmed_papers(curated_pmids: set[int]) -> pd.DataFrame: return pd.DataFrame(records) -def load_curation_data() -> pd.DataFrame: +def load_google_curation_df() -> pd.DataFrame: """Download and load curation data from a Google Sheets URL. :return: DataFrame containing curated publication details. 
@@ -156,9 +207,7 @@ def load_curation_data() -> pd.DataFrame: df = df[["pubmed", "title", "abstract", "label"]] pmids_to_fetch = df[df["abstract"] == ""].pubmed.tolist() - fetched_metadata = {} - for chunk in [pmids_to_fetch[i : i + 200] for i in range(0, len(pmids_to_fetch), 200)]: - fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) + fetched_metadata = _get_metadata_for_ids(pmids_to_fetch) for index, row in df.iterrows(): if row["pubmed"] in fetched_metadata: @@ -168,7 +217,7 @@ def load_curation_data() -> pd.DataFrame: return df -def _map_labels(s: str) -> int | None: +def _map_labels(s: str) -> Optional[int]: """Map labels to binary values. :param s: Label value. @@ -181,10 +230,7 @@ def _map_labels(s: str) -> int | None: return None -Classifiers = list[tuple[str, ClassifierMixin]] - - -def train_classifiers(x_train: NDArray[np.float64], y_train: NDArray[np.str_]) -> Classifiers: +def train_classifiers(x_train: XTrain, y_train: YTrain) -> Classifiers: """Train multiple classifiers on the training data. :param x_train: Training features. @@ -198,36 +244,54 @@ def train_classifiers(x_train: NDArray[np.float64], y_train: NDArray[np.str_]) - ("svc", LinearSVC()), ("svm", SVC(kernel="rbf", probability=True)), ] - for _, clf in classifiers: + for _, clf in tqdm(classifiers, desc="Training classifiers"): clf.fit(x_train, y_train) return classifiers def generate_meta_features( - classifiers: Classifiers, x_train: NDArray[np.float64], y_train: NDArray[np.str_] + classifiers: Classifiers, x_train: XTrain, y_train: YTrain, cv: int = 5 ) -> pd.DataFrame: """Generate meta-features for training a meta-classifier using cross-validation predictions. + .. todo:: explain what this approach is doing and why. What is a meta-feature? + :param classifiers: List of trained classifiers. :param x_train: Training features. :param y_train: Training labels. :return: DataFrame containing meta-features. """ - meta_features = pd.DataFrame() + df = pd.DataFrame() for name, clf in classifiers: - if hasattr(clf, "predict_proba"): - predictions = cross_val_predict(clf, x_train, y_train, cv=5, method="predict_proba")[ - :, 1 - ] - else: - predictions = cross_val_predict(clf, x_train, y_train, cv=5, method="decision_function") - meta_features[name] = predictions - return meta_features - - -def evaluate_meta_classifier( - meta_clf: ClassifierMixin, x_test_meta: NDArray[np.float64], y_test: NDArray[np.str_] -) -> tuple[float, float]: + df[name] = _cross_val_predict(clf, x_train, y_train, cv=cv) + return df + + +def _cross_val_predict( + clf: ClassifierHint, x_train: XTrain, y_train: YTrain, cv: int +) -> NDArray[np.float64]: + if not hasattr(clf, "predict_proba"): + return cross_val_predict(clf, x_train, y_train, cv=cv, method="decision_function") + return cross_val_predict(clf, x_train, y_train, cv=cv, method="predict_proba")[:, 1] + + +def _predict(clf: ClassifierHint, x: NDArray[np.float64]) -> NDArray[np.float64]: + if hasattr(clf, "predict_proba"): + return clf.predict_proba(x)[:, 1] + else: + return clf.decision_function(x) + + +class MetaClassifierEvaluationResults(NamedTuple): + """A tuple for meta classifier results.""" + + mcc: float + roc_auc: float + + +def _evaluate_meta_classifier( + meta_clf: ClassifierMixin, x_test_meta: XTest, y_test: YTest +) -> MetaClassifierEvaluationResults: """Evaluate meta-classifier using MCC and AUC-ROC scores. :param meta_clf: Trained meta-classifier. 
@@ -237,14 +301,8 @@ def evaluate_meta_classifier( """ y_pred = meta_clf.predict(x_test_meta) mcc = matthews_corrcoef(y_test, y_pred) - roc_auc = roc_auc_score(y_test, meta_clf.predict_proba(x_test_meta)[:, 1]) - return mcc, roc_auc - - -def truncate_text(text: str, max_length: int) -> str: - """Truncate text to a specified maximum length.""" - # FIXME replace with builtin textwrap function - return text if len(text) <= max_length else text[:max_length] + "..." + roc_auc = roc_auc_score(y_test, _predict(meta_clf, x_test_meta)) + return MetaClassifierEvaluationResults(mcc, roc_auc) def predict_and_save( @@ -252,7 +310,7 @@ def predict_and_save( vectorizer: TfidfVectorizer, classifiers: Classifiers, meta_clf: ClassifierMixin, - filename: str | Path, + path: str | Path, ) -> None: """Predict and save scores for new data using trained classifiers and meta-classifier. @@ -260,21 +318,19 @@ def predict_and_save( :param vectorizer: Trained TF-IDF vectorizer. :param classifiers: List of trained classifiers. :param meta_clf: Trained meta-classifier. - :param filename: Filename to save the predictions. + :param path: Path to save the predictions. """ x_meta = pd.DataFrame() x_transformed = vectorizer.transform(df.title + " " + df.abstract) for name, clf in classifiers: - if hasattr(clf, "predict_proba"): - x_meta[name] = clf.predict_proba(x_transformed)[:, 1] - else: - x_meta[name] = clf.decision_function(x_transformed) + x_meta[name] = _predict(clf, x_transformed) - df["meta_score"] = meta_clf.predict_proba(x_meta)[:, 1] + df["meta_score"] = _predict(meta_clf, x_meta) df = df.sort_values(by="meta_score", ascending=False) - df["abstract"] = df["abstract"].apply(lambda x: truncate_text(x, 25)) - df.to_csv(DIRECTORY.joinpath(filename), sep="\t", index=False) - click.echo(f"Wrote predicted scores to {DIRECTORY.joinpath(filename)}") + df["abstract"] = df["abstract"].apply(lambda x: textwrap.shorten(x, 25)) + path = Path(path).resolve() + df.to_csv(path, sep="\t", index=False) + click.echo(f"Wrote predicted scores to {path}") def _first_of_month() -> str: @@ -282,11 +338,51 @@ def _first_of_month() -> str: return datetime.date(today.year, today.month, 1).isoformat() +def _get_meta_results( + classifiers: Classifiers, x_train: XTrain, x_test: XTest, y_train: YTrain, y_test: YTest +) -> tuple[LogisticRegression, MetaClassifierEvaluationResults]: + meta_features = generate_meta_features(classifiers, x_train, y_train) + meta_clf = LogisticRegression() + meta_clf.fit(meta_features, y_train) + + x_test_meta = pd.DataFrame() + for name, clf in classifiers: + x_test_meta[name] = _predict(clf, x_test) + + return meta_clf, _evaluate_meta_classifier(meta_clf, x_test_meta.to_numpy(), y_test) + + +def _get_evaluation_df( + classifiers: Classifiers, x_train: XTrain, x_test: XTest, y_train: YTrain, y_test: YTest +) -> tuple[LogisticRegression, pd.DataFrame]: + scores = [] + for name, clf in tqdm(classifiers, desc="evaluating"): + y_pred = clf.predict(x_test) + try: + mcc = matthews_corrcoef(y_test, y_pred) + except ValueError as e: + tqdm.write(click.style(f"{clf} failed to calculate MCC: {e}", fg="yellow")) + mcc = None + roc_auc = roc_auc_score(y_test, _predict(clf, x_test)) + if not mcc and not roc_auc: + continue + scores.append((name, mcc or float("nan"), roc_auc or float("nan"))) + + meta_clf, meta_clf_results = _get_meta_results( + classifiers, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test + ) + scores.append(("meta_classifier", meta_clf_results.mcc, meta_clf_results.roc_auc)) + + evaluation_df 
= pd.DataFrame(scores, columns=["classifier", "mcc", "auc_roc"]).round(3) + return meta_clf, evaluation_df + + @click.command() @click.option( "--bioregistry-file", type=Path, help="Path to the bioregistry.json file", + default=BIOREGISTRY_PATH, ) @click.option( "--start-date", @@ -307,12 +403,33 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None: :param start_date: The start date of the period for which papers are being ranked. :param end_date: The end date of the period for which papers are being ranked. """ - publication_df = load_bioregistry_json(bioregistry_file) - curation_df = load_curation_data() - curated_papers_df = load_curated_papers(CURATED_PAPERS_PATH) + runner( + bioregistry_file=bioregistry_file, + curated_papers_path=CURATED_PAPERS_PATH, + start_date=start_date, + end_date=end_date, + output_path=DIRECTORY, + ) + + +def runner( + *, + bioregistry_file: Path, + curated_papers_path: Path, + start_date: str, + end_date: str, + include_remote: bool = True, + output_path: Path, +) -> None: + """Run functionality directly.""" + publication_df = get_publications_from_bioregistry(bioregistry_file) + curated_papers_df = load_curated_papers(curated_papers_path) + + curated_dfs = [curated_papers_df] + if include_remote: + curated_dfs.append(load_google_curation_df()) - # Combine all data sources - df = pd.concat([curation_df, publication_df, curated_papers_df]) + df = pd.concat([publication_df, *curated_dfs]) df["abstract"] = df["abstract"].fillna("") df["title_abstract"] = df["title"] + " " + df["abstract"] @@ -329,47 +446,11 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None: classifiers = train_classifiers(x_train, y_train) - click.echo("Scoring individual classifiers") - scores = [] - for name, clf in classifiers: - y_pred = clf.predict(x_test) - try: - mcc = matthews_corrcoef(y_test, y_pred) - except ValueError as e: - click.secho(f"{clf} failed to calculate MCC: {e:.2f}", fg="yellow") - mcc = None - try: - if hasattr(clf, "predict_proba"): - roc_auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1]) - else: - roc_auc = roc_auc_score(y_test, clf.decision_function(x_test)) - except AttributeError as e: - click.secho(f"{clf} failed to calculate AUC-ROC: {e}", fg="yellow") - roc_auc = None - if not mcc and not roc_auc: - continue - scores.append((name, mcc or float("nan"), roc_auc or float("nan"))) - - evaluation_df = pd.DataFrame(scores, columns=["classifier", "mcc", "auc_roc"]).round(3) + meta_clf, evaluation_df = _get_evaluation_df( + classifiers, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test + ) click.echo(evaluation_df.to_markdown(index=False)) - - meta_features = generate_meta_features(classifiers, x_train, y_train) - meta_clf = LogisticRegression() - meta_clf.fit(meta_features, y_train) - - x_test_meta = pd.DataFrame() - for name, clf in classifiers: - if hasattr(clf, "predict_proba"): - x_test_meta[name] = clf.predict_proba(x_test)[:, 1] - else: - x_test_meta[name] = clf.decision_function(x_test) - - mcc, roc_auc = evaluate_meta_classifier(meta_clf, x_test_meta.to_numpy(), y_test) - click.echo(f"Meta-Classifier MCC: {mcc:.2f}, AUC-ROC: {roc_auc:.2f}") - new_row = {"classifier": "meta_classifier", "mcc": mcc, "auc_roc": roc_auc} - evaluation_df = pd.concat([evaluation_df, pd.DataFrame([new_row])], ignore_index=True) - - evaluation_path = DIRECTORY.joinpath("evaluation.tsv") + evaluation_path = output_path.joinpath("evaluation.tsv") click.echo(f"Writing evaluation to {evaluation_path}") 
evaluation_df.to_csv(evaluation_path, sep="\t", index=False) @@ -377,13 +458,11 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None: lr_clf = classifiers[1][1] importances_df = ( pd.DataFrame( - list( - zip( - vectorizer.get_feature_names_out(), - vectorizer.idf_, - random_forest_clf.feature_importances_, - lr_clf.coef_[0], - ) + zip( + vectorizer.get_feature_names_out(), + vectorizer.idf_, + random_forest_clf.feature_importances_, + lr_clf.coef_[0], ), columns=["word", "idf", "rf_importance", "lr_importance"], ) @@ -391,20 +470,21 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None: .round(4) ) click.echo(importances_df.head(15).to_markdown(index=False)) - - importance_path = DIRECTORY.joinpath("importances.tsv") + importance_path = output_path.joinpath("importances.tsv") click.echo(f"Writing feature (word) importances to {importance_path}") importances_df.to_csv(importance_path, sep="\t", index=False) # These have already been curated and will therefore be filtered out - curated_pmids = set(curated_papers_df["pubmed"]).union( - publication_df["pubmed"], curation_df["pubmed"] - ) - - new_pub_df = fetch_pubmed_papers(curated_pmids) - if not new_pub_df.empty: - filename = f"predictions_{start_date}_to_{end_date}.tsv" - predict_and_save(new_pub_df, vectorizer, classifiers, meta_clf, filename) + curated_pubmed_ids: set[str] = {str(pubmed) for pubmed in df["pubmed"] if pd.notna(pubmed)} + + # FIXME the fetch_pubmed_papers function should + # take into account the start and end date. as + predictions_df = fetch_pubmed_papers(pubmed_ids_to_filter=curated_pubmed_ids, relative_date=30) + if not predictions_df.empty: + # TODO update the way naming this file works, see discussion on + # https://github.com/biopragmatics/bioregistry/pull/1350 + predictions_path = output_path.joinpath(f"predictions_{start_date}_to_{end_date}.tsv") + predict_and_save(predictions_df, vectorizer, classifiers, meta_clf, predictions_path) if __name__ == "__main__": diff --git a/src/bioregistry/analysis/paper_ranking_requirements.txt b/src/bioregistry/analysis/paper_ranking_requirements.txt deleted file mode 100644 index 8768d28cd..000000000 --- a/src/bioregistry/analysis/paper_ranking_requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -click -curies -indra -pandas -scikit-learn -tabulate diff --git a/tests/mock_pubmed_data.json b/tests/resources/mock_pubmed_data.json similarity index 100% rename from tests/mock_pubmed_data.json rename to tests/resources/mock_pubmed_data.json diff --git a/tests/resources/mock_search.json b/tests/resources/mock_search.json new file mode 100644 index 000000000..95f371be2 --- /dev/null +++ b/tests/resources/mock_search.json @@ -0,0 +1,7 @@ +{ + "database": [], + "ontology": [], + "resource": [], + "vocabulary": [], + "nomenclature": [] +} \ No newline at end of file diff --git a/tests/test_paper_ranking.py b/tests/test_paper_ranking.py index 327ad7b95..9acd7588f 100644 --- a/tests/test_paper_ranking.py +++ b/tests/test_paper_ranking.py @@ -2,68 +2,61 @@ import datetime import json +import tempfile import unittest +import unittest.mock from pathlib import Path -from unittest.mock import patch -from click.testing import CliRunner +from bioregistry.analysis.paper_ranking import runner +from bioregistry.constants import BIOREGISTRY_PATH, CURATED_PAPERS_PATH -from bioregistry.analysis.paper_ranking import main +HERE = Path(__file__).parent.resolve() +RESOURCES = HERE.joinpath("resources") +MOCK_DATA_PATH = RESOURCES.joinpath("mock_pubmed_data.json") 
+MOCK_SEARCH_PATH = RESOURCES.joinpath("mock_search.json") class TestPaperRanking(unittest.TestCase): """Tests the paper ranking model.""" - def setUp(self): - """Set up the test case with paths for the files.""" - root_dir = root_dir = Path(__file__).resolve().parent.parent - self.bioregistry_file = root_dir / "src" / "bioregistry" / "data" / "bioregistry.json" - self.output_directory = root_dir / "exports" / "analyses" / "paper_ranking" - self.mock_data_path = root_dir / "tests" / "mock_pubmed_data.json" - - # Check if bioregistry and mock data files exists - self.assertTrue(self.mock_data_path.exists(), "Mock data file does not exist") - self.assertTrue(self.bioregistry_file.exists(), "Bioregistry file does not exist") - - @patch("bioregistry.analysis.paper_ranking.pubmed_client.get_metadata_for_ids") - def test_pipeline(self, mock_get_metadata_for_ids): + @unittest.mock.patch("bioregistry.analysis.paper_ranking._get_metadata_for_ids") + @unittest.mock.patch("bioregistry.analysis.paper_ranking._get_ids") + def test_pipeline(self, mock_get_metadata_for_ids, mock_get_ids): """Smoke test to ensure pipeline runs successfully without error.""" + # set the data that gets returned by each of the INDRA-wrapping + # funcs using JSON files in the tests/resources/ folder + mock_get_metadata_for_ids.return_value = json.loads(MOCK_DATA_PATH.read_text()) + mock_get_ids.return_value = {} + + # these are dummy values, since we will mock + # the functions that use them start_date = datetime.date.today().isoformat() end_date = datetime.date.today().isoformat() - # Mock return value for get_metadata_for_ids - with open(self.mock_data_path, "r", encoding="utf-8") as file: - mock_data = json.load(file) - - mock_get_metadata_for_ids.return_value = mock_data - - runner = CliRunner() - - result = runner.invoke( - main, - [ - "--bioregistry-file", - str(self.bioregistry_file), - "--start-date", - start_date, - "--end-date", - end_date, - ], - ) - - # Check if the pipeline ran successfully - self.assertEqual(result.exit_code, 0, f"Pipeline failed with: {result.exit_code}") - - # Check if the output directory exists - self.assertTrue(self.output_directory.exists(), f"{self.output_directory} does not exist") - - # Check if the evaluation file was created - evaluation_file = self.output_directory.joinpath("evaluation.tsv") - self.assertTrue(evaluation_file.exists(), f"{evaluation_file} was not created") - - # Check if the importances file was created - importances_file = self.output_directory.joinpath("importances.tsv") - self.assertTrue(importances_file.exists(), f"{importances_file} was not created") + with tempfile.TemporaryDirectory() as directory: + directory_ = Path(directory) + + runner( + # TODO create test data + bioregistry_file=BIOREGISTRY_PATH, + # TODO create test data + curated_papers_path=CURATED_PAPERS_PATH, + start_date=start_date, + end_date=end_date, + include_remote=False, + output_path=directory_, + ) + + # TODO ideally the tests check the actual functionality, and not the I/O, + # using some test data instead of live real data, which changes over time + + # Check if the evaluation file was created + evaluation_file = directory_.joinpath("evaluation.tsv") + self.assertTrue(evaluation_file.exists(), f"{evaluation_file} was not created") + + # Check if the importances file was created + importances_file = directory_.joinpath("importances.tsv") + self.assertTrue(importances_file.exists(), f"{importances_file} was not created") if __name__ == "__main__": diff --git a/tox.ini b/tox.ini index 
e1959b2c6..5094443cd 100644 --- a/tox.ini +++ b/tox.ini @@ -61,6 +61,14 @@ passenv = NDEX_USERNAME NDEX_PASSWORD +[testenv:paper-ranking] +description = Run the paper ranking workflow to identify new potential prefixes and other curations +commands = + python -m bioregistry.analysis.paper_ranking +usedevelop = true +extras = + paper-ranking + [testenv:coverage-clean] deps = coverage skip_install = true
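
A minimal usage sketch of the new ``runner()`` entrypoint introduced in this patch (assuming the package is installed with the new ``paper-ranking`` extra; the output directory name and the dates below are illustrative placeholders, not values from the patch)::

    from pathlib import Path

    from bioregistry.analysis.paper_ranking import runner
    from bioregistry.constants import BIOREGISTRY_PATH, CURATED_PAPERS_PATH

    # Collect evaluation.tsv, importances.tsv, and the predictions TSV in a scratch folder
    output = Path("paper_ranking_output")
    output.mkdir(parents=True, exist_ok=True)

    runner(
        bioregistry_file=BIOREGISTRY_PATH,        # registry JSON whose publications are the positive (label=1) examples
        curated_papers_path=CURATED_PAPERS_PATH,  # path to the manually curated papers file
        start_date="2025-01-01",                  # as of this patch, only used to name the predictions file (see FIXME above)
        end_date="2025-01-31",
        include_remote=False,                     # skip downloading the Google Sheets curation data
        output_path=output,
    )

This mirrors how the updated smoke test in ``tests/test_paper_ranking.py`` drives the workflow, bypassing the click CLI so the output directory can be controlled directly.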