From 2ced69ee6a6c6405e4b646d8f0952c9782ad9187 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 15 Jan 2025 10:16:38 -0500 Subject: [PATCH] Refactor paper ranking workflow (#1358) --- .github/workflows/paper_ranking.yml | 4 +- pyproject.toml | 7 + src/bioregistry/analysis/paper_ranking.py | 400 +++++++++++------- .../analysis/paper_ranking_requirements.txt | 6 - tests/{ => resources}/mock_pubmed_data.json | 0 tests/resources/mock_search.json | 7 + tests/test_paper_ranking.py | 91 ++-- tox.ini | 8 + 8 files changed, 307 insertions(+), 216 deletions(-) delete mode 100644 src/bioregistry/analysis/paper_ranking_requirements.txt rename tests/{ => resources}/mock_pubmed_data.json (100%) create mode 100644 tests/resources/mock_search.json diff --git a/.github/workflows/paper_ranking.yml b/.github/workflows/paper_ranking.yml index 35913e803..f181f2608 100644 --- a/.github/workflows/paper_ranking.yml +++ b/.github/workflows/paper_ranking.yml @@ -20,8 +20,9 @@ jobs: - name: Install dependencies run: | + # TODO update to using uv python -m pip install --upgrade pip - pip install -r src/bioregistry/analysis/paper_ranking_requirements.txt + pip install .[paper-ranking] - name: Set Date Variables id: set-date-variables @@ -39,6 +40,7 @@ jobs: id: run-ranking-script run: | echo "PYTHONPATH=$PYTHONPATH" # Verify PYTHONPATH + # TODO update to using python -m python src/bioregistry/analysis/paper_ranking.py --start-date ${{ env.START_DATE }} --end-date ${{ env.END_DATE }} - name: Upload Full List as Artifact diff --git a/pyproject.toml b/pyproject.toml index a2f7fcfa4..699dc000f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,6 +136,13 @@ web = [ "curies[fastapi]", "a2wsgi", ] +paper-ranking = [ + "indra", + "pandas", + "scikit-learn", + "tabulate", + "more_itertools", +] # See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#urls diff --git a/src/bioregistry/analysis/paper_ranking.py b/src/bioregistry/analysis/paper_ranking.py index 1bd334635..f646678fc 100644 --- a/src/bioregistry/analysis/paper_ranking.py +++ b/src/bioregistry/analysis/paper_ranking.py @@ -1,39 +1,80 @@ -"""Train a TF-IDF classifier and use it to score the relevance of new PubMed papers to the Bioregistry.""" +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "bioregistry[paper-ranking]", +# ] +# +# [tool.uv.sources] +# bioregistry = { path = "../../../" } +# /// + +"""Train a TF-IDF classifier and use it to score the relevance of new PubMed papers to the Bioregistry. + +Run with: + +1. ``python -m bioregistry.analysis.paper_ranking`` +2. ``tox -e paper-ranking`` +3. 
``uv run --script paper_ranking.py`` +""" from __future__ import annotations import datetime import json +import logging +import textwrap from collections import defaultdict +from collections.abc import Iterable from pathlib import Path +from typing import Any, NamedTuple, Optional, Union import click -import indra.literature.pubmed_client as pubmed_client import numpy as np import pandas as pd +from more_itertools import chunked from numpy.typing import NDArray from sklearn.base import ClassifierMixin from sklearn.ensemble import RandomForestClassifier from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression +from sklearn.linear_model._base import LinearClassifierMixin from sklearn.metrics import matthews_corrcoef, roc_auc_score from sklearn.model_selection import cross_val_predict, train_test_split from sklearn.svm import SVC, LinearSVC from sklearn.tree import DecisionTreeClassifier +from tqdm import tqdm +from typing_extensions import TypeAlias from bioregistry.constants import BIOREGISTRY_PATH, CURATED_PAPERS_PATH +logger = logging.getLogger(__name__) + HERE = Path(__file__).parent.resolve() ROOT = HERE.parent.parent.parent.resolve() DIRECTORY = ROOT.joinpath("exports", "analyses", "paper_ranking") DIRECTORY.mkdir(exist_ok=True, parents=True) -URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRPtP-tcXSx8zvhCuX6fqz_\ -QvHowyAoDahnkixARk9rFTe0gfBN9GfdG6qTNQHHVL0i33XGSp_nV9XM/pub?output=csv" +URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRPtP-tcXSx8zvhCuX6fqz_QvHowyAoDahnkixARk9rFTe0gfBN9GfdG6qTNQHHVL0i33XGSp_nV9XM/pub?output=csv" + +XTrain: TypeAlias = NDArray[np.float64] +YTrain: TypeAlias = NDArray[np.float64] +XTest: TypeAlias = NDArray[np.str_] +YTest: TypeAlias = NDArray[np.str_] + +ClassifierHint: TypeAlias = Union[ClassifierMixin, LinearClassifierMixin] +Classifiers: TypeAlias = list[tuple[str, ClassifierHint]] + +DEFAULT_SEARCH_TERMS = [ + "database", + "ontology", + "resource", + "vocabulary", + "nomenclature", +] -def load_bioregistry_json(path: Path | None = None) -> pd.DataFrame: +def get_publications_from_bioregistry(path: Optional[Path] = None) -> pd.DataFrame: """Load bioregistry data from a JSON file, extracting publication details and fetching abstracts if missing. :param path: Path to the bioregistry JSON file. 
@@ -41,34 +82,26 @@ def load_bioregistry_json(path: Path | None = None) -> pd.DataFrame: """ if path is None: path = BIOREGISTRY_PATH - try: - data = json.loads(path.read_text(encoding="utf-8")) - except json.JSONDecodeError as e: - click.echo(f"JSONDecodeError: {e.msg}") - click.echo(f"Error at line {e.lineno}, column {e.colno}") - click.echo(f"Error at position {e.pos}") - return pd.DataFrame() + records = json.loads(path.read_text(encoding="utf-8")) publications = [] - pmids_to_fetch = [] - for entry in data.values(): - if "publications" in entry: - for pub in entry["publications"]: - pmid = pub.get("pubmed") - title = pub.get("title") - if pmid: - pmids_to_fetch.append(pmid) - publications.append({"pubmed": pmid, "title": title, "abstract": "", "label": 1}) - - fetched_metadata = {} - for chunk in [pmids_to_fetch[i : i + 200] for i in range(0, len(pmids_to_fetch), 200)]: - fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) - - for pub in publications: - if pub["pubmed"] in fetched_metadata: - pub["abstract"] = fetched_metadata[pub["pubmed"]].get("abstract", "") + pubmeds = set() + for record in records.values(): + # TODO replace with usage of bioregistry code, this is duplicate logic + # see Resource.get_publications() + for publication in record.get("publications", []): + pubmed = publication.get("pubmed") + if pubmed: + pubmeds.add(pubmed) + publications.append({"pubmed": pubmed, "title": publication.get("title"), "label": 1}) + + pubmed_to_metadata = _get_metadata_for_ids(sorted(pubmeds)) + for publication in publications: + publication["abstract"] = pubmed_to_metadata.get(publication["pubmed"], {}).get( + "abstract", "" + ) - click.echo(f"Got {len(publications):,} publications from the bioregistry") + logger.info(f"Got {len(publications):,} publications from the bioregistry") return pd.DataFrame(publications) @@ -84,10 +117,8 @@ def load_curated_papers(file_path: Path = CURATED_PAPERS_PATH) -> pd.DataFrame: curated_df["title"] = "" curated_df["abstract"] = "" - pmids_to_fetch = curated_df["pubmed"].tolist() - fetched_metadata = {} - for chunk in [pmids_to_fetch[i : i + 200] for i in range(0, len(pmids_to_fetch), 200)]: - fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) + pubmeds = curated_df["pubmed"].tolist() + fetched_metadata = _get_metadata_for_ids(pubmeds) for index, row in curated_df.iterrows(): if row["pubmed"] in fetched_metadata: @@ -98,32 +129,52 @@ def load_curated_papers(file_path: Path = CURATED_PAPERS_PATH) -> pd.DataFrame: return curated_df -def fetch_pubmed_papers(curated_pmids: set[int]) -> pd.DataFrame: - """Fetch PubMed papers from the last 30 days using specific search terms, excluding curated papers. +def _get_metadata_for_ids(pubmed_ids: Iterable[Union[int, str]]) -> dict[str, dict[str, Any]]: + """Get metadata for articles in PubMed, wrapping the INDRA client.""" + from indra.literature import pubmed_client - :param curated_pmids: List containing already curated PMIDs - :return: DataFrame containing PubMed paper details. 
- """ - click.echo("Starting fetch_pubmed_papers") + fetched_metadata = {} + for chunk in chunked( + tqdm(pubmed_ids, unit="article", unit_scale=True, desc="Getting metadata"), 200 + ): + fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) + return fetched_metadata - search_terms = ["database", "ontology", "resource", "vocabulary", "nomenclature"] - paper_to_terms: defaultdict[str, list[str]] = defaultdict(list) - for term in search_terms: - pubmed_ids = pubmed_client.get_ids(term, use_text_word=True, reldate=30) - for pubmed_id in pubmed_ids: - if pubmed_id not in curated_pmids: +def _get_ids(term: str, use_text_word: bool, relative_date: int) -> set[str]: + from indra.literature import pubmed_client + + return { + str(pubmed_id) + for pubmed_id in pubmed_client.get_ids( + term, use_text_word=use_text_word, reldate=relative_date + ) + } + + +def _search( + terms: list[str], pubmed_ids_to_filter: set[str], relative_date: int +) -> dict[str, list[str]]: + paper_to_terms: defaultdict[str, list[str]] = defaultdict(list) + for term in tqdm(terms, desc="Searching PubMed", unit="search term", leave=False): + for pubmed_id in _get_ids(term, use_text_word=True, relative_date=relative_date): + if pubmed_id not in pubmed_ids_to_filter: paper_to_terms[pubmed_id].append(term) + return dict(paper_to_terms) + - all_pmids = list(paper_to_terms.keys()) - click.echo(f"{len(all_pmids):,} articles found") - if not all_pmids: - click.echo(f"No articles found for the last 30 days with the search terms: {search_terms}") - return pd.DataFrame() +def fetch_pubmed_papers(*, pubmed_ids_to_filter: set[str], relative_date: int) -> pd.DataFrame: + """Fetch PubMed papers from the last 30 days using specific search terms, excluding curated papers. - papers = {} - for chunk in [all_pmids[i : i + 200] for i in range(0, len(all_pmids), 200)]: - papers.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) + :param pubmed_ids_to_filter: List containing already curated PMIDs + :param relative_date: the number of recent days to search + :return: DataFrame containing PubMed paper details. + """ + paper_to_terms = _search( + DEFAULT_SEARCH_TERMS, pubmed_ids_to_filter=pubmed_ids_to_filter, relative_date=relative_date + ) + + papers = _get_metadata_for_ids(paper_to_terms) records = [] for pubmed_id, paper in papers.items(): @@ -137,7 +188,7 @@ def fetch_pubmed_papers(curated_pmids: set[int]) -> pd.DataFrame: "title": title, "abstract": abstract, "year": paper.get("publication_date", {}).get("year"), - "search_terms": paper_to_terms.get(pubmed_id), + "search_terms": paper_to_terms[pubmed_id], } ) @@ -145,7 +196,7 @@ def fetch_pubmed_papers(curated_pmids: set[int]) -> pd.DataFrame: return pd.DataFrame(records) -def load_curation_data() -> pd.DataFrame: +def load_google_curation_df() -> pd.DataFrame: """Download and load curation data from a Google Sheets URL. :return: DataFrame containing curated publication details. 
@@ -156,9 +207,7 @@ def load_curation_data() -> pd.DataFrame: df = df[["pubmed", "title", "abstract", "label"]] pmids_to_fetch = df[df["abstract"] == ""].pubmed.tolist() - fetched_metadata = {} - for chunk in [pmids_to_fetch[i : i + 200] for i in range(0, len(pmids_to_fetch), 200)]: - fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True)) + fetched_metadata = _get_metadata_for_ids(pmids_to_fetch) for index, row in df.iterrows(): if row["pubmed"] in fetched_metadata: @@ -168,7 +217,7 @@ def load_curation_data() -> pd.DataFrame: return df -def _map_labels(s: str) -> int | None: +def _map_labels(s: str) -> Optional[int]: """Map labels to binary values. :param s: Label value. @@ -181,10 +230,7 @@ def _map_labels(s: str) -> int | None: return None -Classifiers = list[tuple[str, ClassifierMixin]] - - -def train_classifiers(x_train: NDArray[np.float64], y_train: NDArray[np.str_]) -> Classifiers: +def train_classifiers(x_train: XTrain, y_train: YTrain) -> Classifiers: """Train multiple classifiers on the training data. :param x_train: Training features. @@ -198,36 +244,54 @@ def train_classifiers(x_train: NDArray[np.float64], y_train: NDArray[np.str_]) - ("svc", LinearSVC()), ("svm", SVC(kernel="rbf", probability=True)), ] - for _, clf in classifiers: + for _, clf in tqdm(classifiers, desc="Training classifiers"): clf.fit(x_train, y_train) return classifiers def generate_meta_features( - classifiers: Classifiers, x_train: NDArray[np.float64], y_train: NDArray[np.str_] + classifiers: Classifiers, x_train: XTrain, y_train: YTrain, cv: int = 5 ) -> pd.DataFrame: """Generate meta-features for training a meta-classifier using cross-validation predictions. + .. todo:: explain what this approach is doing and why. What is a meta-feature? + :param classifiers: List of trained classifiers. :param x_train: Training features. :param y_train: Training labels. :return: DataFrame containing meta-features. """ - meta_features = pd.DataFrame() + df = pd.DataFrame() for name, clf in classifiers: - if hasattr(clf, "predict_proba"): - predictions = cross_val_predict(clf, x_train, y_train, cv=5, method="predict_proba")[ - :, 1 - ] - else: - predictions = cross_val_predict(clf, x_train, y_train, cv=5, method="decision_function") - meta_features[name] = predictions - return meta_features - - -def evaluate_meta_classifier( - meta_clf: ClassifierMixin, x_test_meta: NDArray[np.float64], y_test: NDArray[np.str_] -) -> tuple[float, float]: + df[name] = _cross_val_predict(clf, x_train, y_train, cv=cv) + return df + + +def _cross_val_predict( + clf: ClassifierHint, x_train: XTrain, y_train: YTrain, cv: int +) -> NDArray[np.float64]: + if not hasattr(clf, "predict_proba"): + return cross_val_predict(clf, x_train, y_train, cv=cv, method="decision_function") + return cross_val_predict(clf, x_train, y_train, cv=cv, method="predict_proba")[:, 1] + + +def _predict(clf: ClassifierHint, x: NDArray[np.float64]) -> NDArray[np.float64]: + if hasattr(clf, "predict_proba"): + return clf.predict_proba(x)[:, 1] + else: + return clf.decision_function(x) + + +class MetaClassifierEvaluationResults(NamedTuple): + """A tuple for meta classifier results.""" + + mcc: float + roc_auc: float + + +def _evaluate_meta_classifier( + meta_clf: ClassifierMixin, x_test_meta: XTest, y_test: YTest +) -> MetaClassifierEvaluationResults: """Evaluate meta-classifier using MCC and AUC-ROC scores. :param meta_clf: Trained meta-classifier. 
@@ -237,14 +301,8 @@ def evaluate_meta_classifier( """ y_pred = meta_clf.predict(x_test_meta) mcc = matthews_corrcoef(y_test, y_pred) - roc_auc = roc_auc_score(y_test, meta_clf.predict_proba(x_test_meta)[:, 1]) - return mcc, roc_auc - - -def truncate_text(text: str, max_length: int) -> str: - """Truncate text to a specified maximum length.""" - # FIXME replace with builtin textwrap function - return text if len(text) <= max_length else text[:max_length] + "..." + roc_auc = roc_auc_score(y_test, _predict(meta_clf, x_test_meta)) + return MetaClassifierEvaluationResults(mcc, roc_auc) def predict_and_save( @@ -252,7 +310,7 @@ def predict_and_save( vectorizer: TfidfVectorizer, classifiers: Classifiers, meta_clf: ClassifierMixin, - filename: str | Path, + path: str | Path, ) -> None: """Predict and save scores for new data using trained classifiers and meta-classifier. @@ -260,21 +318,19 @@ def predict_and_save( :param vectorizer: Trained TF-IDF vectorizer. :param classifiers: List of trained classifiers. :param meta_clf: Trained meta-classifier. - :param filename: Filename to save the predictions. + :param path: Path to save the predictions. """ x_meta = pd.DataFrame() x_transformed = vectorizer.transform(df.title + " " + df.abstract) for name, clf in classifiers: - if hasattr(clf, "predict_proba"): - x_meta[name] = clf.predict_proba(x_transformed)[:, 1] - else: - x_meta[name] = clf.decision_function(x_transformed) + x_meta[name] = _predict(clf, x_transformed) - df["meta_score"] = meta_clf.predict_proba(x_meta)[:, 1] + df["meta_score"] = _predict(meta_clf, x_meta) df = df.sort_values(by="meta_score", ascending=False) - df["abstract"] = df["abstract"].apply(lambda x: truncate_text(x, 25)) - df.to_csv(DIRECTORY.joinpath(filename), sep="\t", index=False) - click.echo(f"Wrote predicted scores to {DIRECTORY.joinpath(filename)}") + df["abstract"] = df["abstract"].apply(lambda x: textwrap.shorten(x, 25)) + path = Path(path).resolve() + df.to_csv(path, sep="\t", index=False) + click.echo(f"Wrote predicted scores to {path}") def _first_of_month() -> str: @@ -282,11 +338,51 @@ def _first_of_month() -> str: return datetime.date(today.year, today.month, 1).isoformat() +def _get_meta_results( + classifiers: Classifiers, x_train: XTrain, x_test: XTest, y_train: YTrain, y_test: YTest +) -> tuple[LogisticRegression, MetaClassifierEvaluationResults]: + meta_features = generate_meta_features(classifiers, x_train, y_train) + meta_clf = LogisticRegression() + meta_clf.fit(meta_features, y_train) + + x_test_meta = pd.DataFrame() + for name, clf in classifiers: + x_test_meta[name] = _predict(clf, x_test) + + return meta_clf, _evaluate_meta_classifier(meta_clf, x_test_meta.to_numpy(), y_test) + + +def _get_evaluation_df( + classifiers: Classifiers, x_train: XTrain, x_test: XTest, y_train: YTrain, y_test: YTest +) -> tuple[LogisticRegression, pd.DataFrame]: + scores = [] + for name, clf in tqdm(classifiers, desc="evaluating"): + y_pred = clf.predict(x_test) + try: + mcc = matthews_corrcoef(y_test, y_pred) + except ValueError as e: + tqdm.write(click.style(f"{clf} failed to calculate MCC: {e}", fg="yellow")) + mcc = None + roc_auc = roc_auc_score(y_test, _predict(clf, x_test)) + if not mcc and not roc_auc: + continue + scores.append((name, mcc or float("nan"), roc_auc or float("nan"))) + + meta_clf, meta_clf_results = _get_meta_results( + classifiers, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test + ) + scores.append(("meta_classifier", meta_clf_results.mcc, meta_clf_results.roc_auc)) + + evaluation_df 
= pd.DataFrame(scores, columns=["classifier", "mcc", "auc_roc"]).round(3) + return meta_clf, evaluation_df + + @click.command() @click.option( "--bioregistry-file", type=Path, help="Path to the bioregistry.json file", + default=BIOREGISTRY_PATH, ) @click.option( "--start-date", @@ -307,12 +403,33 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None: :param start_date: The start date of the period for which papers are being ranked. :param end_date: The end date of the period for which papers are being ranked. """ - publication_df = load_bioregistry_json(bioregistry_file) - curation_df = load_curation_data() - curated_papers_df = load_curated_papers(CURATED_PAPERS_PATH) + runner( + bioregistry_file=bioregistry_file, + curated_papers_path=CURATED_PAPERS_PATH, + start_date=start_date, + end_date=end_date, + output_path=DIRECTORY, + ) + + +def runner( + *, + bioregistry_file: Path, + curated_papers_path: Path, + start_date: str, + end_date: str, + include_remote: bool = True, + output_path: Path, +) -> None: + """Run functionality directly.""" + publication_df = get_publications_from_bioregistry(bioregistry_file) + curated_papers_df = load_curated_papers(curated_papers_path) + + curated_dfs = [curated_papers_df] + if include_remote: + curated_dfs.append(load_google_curation_df()) - # Combine all data sources - df = pd.concat([curation_df, publication_df, curated_papers_df]) + df = pd.concat([publication_df, *curated_dfs]) df["abstract"] = df["abstract"].fillna("") df["title_abstract"] = df["title"] + " " + df["abstract"] @@ -329,47 +446,11 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None: classifiers = train_classifiers(x_train, y_train) - click.echo("Scoring individual classifiers") - scores = [] - for name, clf in classifiers: - y_pred = clf.predict(x_test) - try: - mcc = matthews_corrcoef(y_test, y_pred) - except ValueError as e: - click.secho(f"{clf} failed to calculate MCC: {e:.2f}", fg="yellow") - mcc = None - try: - if hasattr(clf, "predict_proba"): - roc_auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1]) - else: - roc_auc = roc_auc_score(y_test, clf.decision_function(x_test)) - except AttributeError as e: - click.secho(f"{clf} failed to calculate AUC-ROC: {e}", fg="yellow") - roc_auc = None - if not mcc and not roc_auc: - continue - scores.append((name, mcc or float("nan"), roc_auc or float("nan"))) - - evaluation_df = pd.DataFrame(scores, columns=["classifier", "mcc", "auc_roc"]).round(3) + meta_clf, evaluation_df = _get_evaluation_df( + classifiers, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test + ) click.echo(evaluation_df.to_markdown(index=False)) - - meta_features = generate_meta_features(classifiers, x_train, y_train) - meta_clf = LogisticRegression() - meta_clf.fit(meta_features, y_train) - - x_test_meta = pd.DataFrame() - for name, clf in classifiers: - if hasattr(clf, "predict_proba"): - x_test_meta[name] = clf.predict_proba(x_test)[:, 1] - else: - x_test_meta[name] = clf.decision_function(x_test) - - mcc, roc_auc = evaluate_meta_classifier(meta_clf, x_test_meta.to_numpy(), y_test) - click.echo(f"Meta-Classifier MCC: {mcc:.2f}, AUC-ROC: {roc_auc:.2f}") - new_row = {"classifier": "meta_classifier", "mcc": mcc, "auc_roc": roc_auc} - evaluation_df = pd.concat([evaluation_df, pd.DataFrame([new_row])], ignore_index=True) - - evaluation_path = DIRECTORY.joinpath("evaluation.tsv") + evaluation_path = output_path.joinpath("evaluation.tsv") click.echo(f"Writing evaluation to {evaluation_path}") 
evaluation_df.to_csv(evaluation_path, sep="\t", index=False) @@ -377,13 +458,11 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None: lr_clf = classifiers[1][1] importances_df = ( pd.DataFrame( - list( - zip( - vectorizer.get_feature_names_out(), - vectorizer.idf_, - random_forest_clf.feature_importances_, - lr_clf.coef_[0], - ) + zip( + vectorizer.get_feature_names_out(), + vectorizer.idf_, + random_forest_clf.feature_importances_, + lr_clf.coef_[0], ), columns=["word", "idf", "rf_importance", "lr_importance"], ) @@ -391,20 +470,21 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None: .round(4) ) click.echo(importances_df.head(15).to_markdown(index=False)) - - importance_path = DIRECTORY.joinpath("importances.tsv") + importance_path = output_path.joinpath("importances.tsv") click.echo(f"Writing feature (word) importances to {importance_path}") importances_df.to_csv(importance_path, sep="\t", index=False) # These have already been curated and will therefore be filtered out - curated_pmids = set(curated_papers_df["pubmed"]).union( - publication_df["pubmed"], curation_df["pubmed"] - ) - - new_pub_df = fetch_pubmed_papers(curated_pmids) - if not new_pub_df.empty: - filename = f"predictions_{start_date}_to_{end_date}.tsv" - predict_and_save(new_pub_df, vectorizer, classifiers, meta_clf, filename) + curated_pubmed_ids: set[str] = {str(pubmed) for pubmed in df["pubmed"] if pd.notna(pubmed)} + + # FIXME the fetch_pubmed_papers function should + # take into account the start and end date. as + predictions_df = fetch_pubmed_papers(pubmed_ids_to_filter=curated_pubmed_ids, relative_date=30) + if not predictions_df.empty: + # TODO update the way naming this file works, see discussion on + # https://github.com/biopragmatics/bioregistry/pull/1350 + predictions_path = output_path.joinpath(f"predictions_{start_date}_to_{end_date}.tsv") + predict_and_save(predictions_df, vectorizer, classifiers, meta_clf, predictions_path) if __name__ == "__main__": diff --git a/src/bioregistry/analysis/paper_ranking_requirements.txt b/src/bioregistry/analysis/paper_ranking_requirements.txt deleted file mode 100644 index 8768d28cd..000000000 --- a/src/bioregistry/analysis/paper_ranking_requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -click -curies -indra -pandas -scikit-learn -tabulate diff --git a/tests/mock_pubmed_data.json b/tests/resources/mock_pubmed_data.json similarity index 100% rename from tests/mock_pubmed_data.json rename to tests/resources/mock_pubmed_data.json diff --git a/tests/resources/mock_search.json b/tests/resources/mock_search.json new file mode 100644 index 000000000..95f371be2 --- /dev/null +++ b/tests/resources/mock_search.json @@ -0,0 +1,7 @@ +{ + "database": [], + "ontology": [], + "resource": [], + "vocabulary": [], + "nomenclature": [] +} \ No newline at end of file diff --git a/tests/test_paper_ranking.py b/tests/test_paper_ranking.py index 327ad7b95..9acd7588f 100644 --- a/tests/test_paper_ranking.py +++ b/tests/test_paper_ranking.py @@ -2,68 +2,61 @@ import datetime import json +import tempfile import unittest +import unittest.mock from pathlib import Path -from unittest.mock import patch -from click.testing import CliRunner +from bioregistry.analysis.paper_ranking import runner +from bioregistry.constants import BIOREGISTRY_PATH, CURATED_PAPERS_PATH -from bioregistry.analysis.paper_ranking import main +HERE = Path(__file__).parent.resolve() +RESOURCES = HERE.joinpath("resources") +MOCK_DATA_PATH = RESOURCES.joinpath("mock_pubmed_data.json") 
+MOCK_SEARCH_PATH = RESOURCES.joinpath("mock_search.json") class TestPaperRanking(unittest.TestCase): """Tests the paper ranking model.""" - def setUp(self): - """Set up the test case with paths for the files.""" - root_dir = root_dir = Path(__file__).resolve().parent.parent - self.bioregistry_file = root_dir / "src" / "bioregistry" / "data" / "bioregistry.json" - self.output_directory = root_dir / "exports" / "analyses" / "paper_ranking" - self.mock_data_path = root_dir / "tests" / "mock_pubmed_data.json" - - # Check if bioregistry and mock data files exists - self.assertTrue(self.mock_data_path.exists(), "Mock data file does not exist") - self.assertTrue(self.bioregistry_file.exists(), "Bioregistry file does not exist") - - @patch("bioregistry.analysis.paper_ranking.pubmed_client.get_metadata_for_ids") - def test_pipeline(self, mock_get_metadata_for_ids): + @unittest.mock.patch("bioregistry.analysis.paper_ranking._get_metadata_for_ids") + @unittest.mock.patch("bioregistry.analysis.paper_ranking._get_ids") + def test_pipeline(self, mock_get_metadata_for_ids, mock_get_ids): """Smoke test to ensure pipeline runs successfully without error.""" + # set the data that gets returned by each of the INDRA-wrapping + # funcs using JSON files in the tests/resources/ folder + mock_get_metadata_for_ids.return_value = json.loads(MOCK_DATA_PATH.read_text()) + mock_get_ids.return_value = {} + + # these are dummy values, since we will mock + # the functions that use them start_date = datetime.date.today().isoformat() end_date = datetime.date.today().isoformat() - # Mock return value for get_metadata_for_ids - with open(self.mock_data_path, "r", encoding="utf-8") as file: - mock_data = json.load(file) - - mock_get_metadata_for_ids.return_value = mock_data - - runner = CliRunner() - - result = runner.invoke( - main, - [ - "--bioregistry-file", - str(self.bioregistry_file), - "--start-date", - start_date, - "--end-date", - end_date, - ], - ) - - # Check if the pipeline ran successfully - self.assertEqual(result.exit_code, 0, f"Pipeline failed with: {result.exit_code}") - - # Check if the output directory exists - self.assertTrue(self.output_directory.exists(), f"{self.output_directory} does not exist") - - # Check if the evaluation file was created - evaluation_file = self.output_directory.joinpath("evaluation.tsv") - self.assertTrue(evaluation_file.exists(), f"{evaluation_file} was not created") - - # Check if the importances file was created - importances_file = self.output_directory.joinpath("importances.tsv") - self.assertTrue(importances_file.exists(), f"{importances_file} was not created") + with tempfile.TemporaryDirectory() as directory: + directory_ = Path(directory) + + runner( + # TODO create test data + bioregistry_file=BIOREGISTRY_PATH, + # TODO create test data + curated_papers_path=CURATED_PAPERS_PATH, + start_date=start_date, + end_date=end_date, + include_remote=False, + output_path=directory_, + ) + + # TODO ideally the tests check the actual functionality, and not the I/O, + # using some test data instead of live real data, which changes over time + + # Check if the evaluation file was created + evaluation_file = directory_.joinpath("evaluation.tsv") + self.assertTrue(evaluation_file.exists(), f"{evaluation_file} was not created") + + # Check if the importances file was created + importances_file = directory_.joinpath("importances.tsv") + self.assertTrue(importances_file.exists(), f"{importances_file} was not created") if __name__ == "__main__": diff --git a/tox.ini b/tox.ini index 
e1959b2c6..5094443cd 100644 --- a/tox.ini +++ b/tox.ini @@ -61,6 +61,14 @@ passenv = NDEX_USERNAME NDEX_PASSWORD +[testenv:paper-ranking] +description = Run the paper ranking workflow to identify new potential prefixes and other curations +commands = + python -m bioregistry.analysis.paper_ranking +usedevelop = true +extras = + paper-ranking + [testenv:coverage-clean] deps = coverage skip_install = true
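
A minimal usage sketch of the new ``runner()`` entrypoint introduced in this patch (assuming the package is installed with the new ``paper-ranking`` extra; the output directory name and the dates below are illustrative placeholders, not values from the patch)::

    from pathlib import Path

    from bioregistry.analysis.paper_ranking import runner
    from bioregistry.constants import BIOREGISTRY_PATH, CURATED_PAPERS_PATH

    # Collect evaluation.tsv, importances.tsv, and the predictions TSV in a scratch folder
    output = Path("paper_ranking_output")
    output.mkdir(parents=True, exist_ok=True)

    runner(
        bioregistry_file=BIOREGISTRY_PATH,        # registry JSON whose publications are the positive (label=1) examples
        curated_papers_path=CURATED_PAPERS_PATH,  # path to the manually curated papers file
        start_date="2025-01-01",                  # as of this patch, only used to name the predictions file (see FIXME above)
        end_date="2025-01-31",
        include_remote=False,                     # skip downloading the Google Sheets curation data
        output_path=output,
    )

This mirrors how the updated smoke test in ``tests/test_paper_ranking.py`` drives the workflow, bypassing the click CLI so the output directory can be controlled directly.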