From bef7b4c44795314ef1ed4b5727d8a8832a91dcdd Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 9 Sep 2023 11:35:00 +0200 Subject: [PATCH 1/8] Add ability to specify strictness References #63 --- src/curies/api.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index b270ea33..fd9eefb3 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -1037,6 +1037,13 @@ def standardize_curie(self, curie: str) -> Optional[str]: return None return self.format_curie(norm_prefix, identifier) + def standardize_curie_strict(self, curie: str) -> str: + """Standardize a CURIE and error if not possible.""" + norm_curie = self.standardize_curie(curie) + if norm_curie is None: + raise ExpansionError(curie) + return norm_curie + def standardize_uri(self, uri: str) -> Optional[str]: """Standardize a URI. @@ -1076,6 +1083,8 @@ def pd_compress( df: "pandas.DataFrame", column: Union[str, int], target_column: Union[None, str, int] = None, + *, + strict: bool = False, ) -> None: """Convert all URIs in the given column to CURIEs. @@ -1083,13 +1092,16 @@ def pd_compress( :param column: The column in the dataframe containing URIs to convert to CURIEs. :param target_column: The column to put the results in. Defaults to input column. """ - df[column if target_column is None else target_column] = df[column].map(self.compress) + func = self.compress_strict if strict else self.compress + df[column if target_column is None else target_column] = df[column].map(func) def pd_expand( self, df: "pandas.DataFrame", column: Union[str, int], + *, target_column: Union[None, str, int] = None, + strict: bool = False ) -> None: """Convert all CURIEs in the given column to URIs. @@ -1097,7 +1109,8 @@ def pd_expand( :param column: The column in the dataframe containing CURIEs to convert to URIs. :param target_column: The column to put the results in. Defaults to input column. """ - df[column if target_column is None else target_column] = df[column].map(self.expand) + func = self.expand_strict if strict else self.expand + df[column if target_column is None else target_column] = df[column].map(func) def pd_standardize_prefix( self, @@ -1122,6 +1135,7 @@ def pd_standardize_curie( *, column: Union[str, int], target_column: Union[None, str, int] = None, + strict: bool = False, ) -> None: r"""Standardize all CURIEs in the given column. @@ -1144,9 +1158,8 @@ def pd_standardize_curie( >>> converter = curies.get_bioregistry_converter() >>> converter.pd_standardize_curie(df, column="object_id") """ - df[column if target_column is None else target_column] = df[column].map( - self.standardize_curie - ) + func = self.standardize_curie_strict if strict else self.standardize_curie + df[column if target_column is None else target_column] = df[column].map(func) def pd_standardize_uri( self, From 51e94704278581ae1df8409e56e3d2973039d7ab Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 9 Sep 2023 11:44:07 +0200 Subject: [PATCH 2/8] Update api.py --- src/curies/api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index fd9eefb3..9a92c6a4 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -1098,10 +1098,10 @@ def pd_compress( def pd_expand( self, df: "pandas.DataFrame", - column: Union[str, int], *, + column: Union[str, int], target_column: Union[None, str, int] = None, - strict: bool = False + strict: bool = False, ) -> None: """Convert all CURIEs in the given column to URIs. 
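As a quick orientation for the strict variants introduced in PATCH 1, here is a minimal usage sketch (not taken from the diff: the prefix map and CURIEs are made-up examples, and the exception type is assumed to match the ExpansionError raised above):

import curies
import pandas as pd

# A tiny converter with a single known prefix (hypothetical example data).
converter = curies.Converter.from_prefix_map(
    {"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_"}
)

# The existing non-strict method returns None for CURIEs it cannot standardize.
assert converter.standardize_curie("GO:0032571") is None

# The new strict variant raises instead of silently returning None.
try:
    converter.standardize_curie_strict("GO:0032571")
except Exception as error:  # the patch raises ExpansionError here
    print("could not standardize:", error)

# The same opt-in behavior is exposed on the pandas helpers.
df = pd.DataFrame({"curie": ["CHEBI:138488", "GO:0032571"]})
converter.pd_expand(df.copy(), column="curie")                 # unknown CURIEs become None
# converter.pd_expand(df.copy(), column="curie", strict=True)  # would raise on GO:0032571
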
@@ -1118,6 +1118,7 @@ def pd_standardize_prefix( *, column: Union[str, int], target_column: Union[None, str, int] = None, + strict: bool = False, ) -> None: """Standardize all prefixes in the given column. @@ -1167,6 +1168,7 @@ def pd_standardize_uri( *, column: Union[str, int], target_column: Union[None, str, int] = None, + strict: bool = False, ) -> None: """Standardize all URIs in the given column. From 25619804d0a7dc5ed2837b89d984683aa0a660e4 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Sep 2023 12:04:01 +0200 Subject: [PATCH 3/8] Add initial data science utilities --- notebooks/Data Science Demo.ipynb | 102 ++++++++++++++++++++ src/curies/api.py | 149 +++++++++++++++++++++++++++++- tests/test_data_science.py | 20 ++++ 3 files changed, 266 insertions(+), 5 deletions(-) create mode 100644 notebooks/Data Science Demo.ipynb create mode 100644 tests/test_data_science.py diff --git a/notebooks/Data Science Demo.ipynb b/notebooks/Data Science Demo.ipynb new file mode 100644 index 00000000..8ec01757 --- /dev/null +++ b/notebooks/Data Science Demo.ipynb @@ -0,0 +1,102 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "cfb169ac-d6e7-4132-9ffc-a14edf8a918f", + "metadata": {}, + "outputs": [], + "source": [ + "import curies\n", + "import pandas as pd\n", + "import itertools as itt\n", + "import pystow" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3091dc17-b60d-4cc1-94a9-c523b3cce4e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 73.4 ms, sys: 3.43 ms, total: 76.9 ms\n", + "Wall time: 333 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "obo_converter = curies.get_obo_converter()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e138e4-31f5-4c0d-ba0f-9849586af00c", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "bioregistry_converter = curies.get_bioregistry_converter()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5301bafc-15eb-45bc-adf6-6281d6da1b3e", + "metadata": {}, + "outputs": [], + "source": [ + "commit = \"faca4fc335f9a61902b9c47a1facd52a0d3d2f8b\"\n", + "url = f\"https://raw.githubusercontent.com/mapping-commons/disease-mappings/{commit}/mappings/doid.sssom.tsv\"\n", + "df = pystow.ensure_csv(\"tmp\", url=url, read_csv_kwargs=dict(comment='#'))\n", + "df.head()[[\"subject_id\", \"predicate_id\", \"object_id\"]].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53ae14ad-1665-472f-a849-f6e2fa95fde4", + "metadata": {}, + "outputs": [], + "source": [ + "obo_converter.pd_standardize_curie(df.copy(), column=\"object_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e028a67-634a-4b2e-ad16-aca23fc47e28", + "metadata": {}, + "outputs": [], + "source": [ + "bioregistry_converter.pd_standardize_curie(df.copy(), column=\"object_id\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/curies/api.py b/src/curies/api.py index 4049092b..02f51695 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -3,9 +3,12 @@ """Data structures 
and algorithms for :mod:`curies`.""" import csv +import dataclasses import itertools as itt import json -from collections import defaultdict +import random +import typing +from collections import Counter, defaultdict from pathlib import Path from typing import ( TYPE_CHECKING, @@ -378,6 +381,124 @@ def _prepare(data: LocationOr[X]) -> X: return data +@dataclasses.dataclass +class Report: + converter: "Converter" + stayed: int + updated: int + failures: Mapping[str, typing.Counter[str]] = dataclasses.field(repr=False) + + def count_prefixes(self) -> typing.Counter[str]: + """Count the frequency of each failing prefix.""" + return Counter({prefix: len(counter) for prefix, counter in self.failures.items()}) + + def get_df(self) -> "pandas.DataFrame": + """Summarize standardization issues in a dataframe.""" + import pandas as pd + + rows = [ + ( + prefix, + sum(counter.values()), + ", ".join(sorted(set(random.choices(list(counter), k=5)))), + ) + for prefix, counter in sorted(self.failures.items(), key=lambda p: p[0].casefold()) + ] + return pd.DataFrame(rows, columns=["prefix", "count", "examples"]) + + def get_suggestions(self) -> Dict[str, str]: + """Get a mapping from missing prefix to suggestion text.""" + try: + import bioregistry + except ImportError: + bioregistry = None + + norm_to_prefix = defaultdict(set) + + def _norm(s: str) -> str: + for x in "_.- ": + s = s.replace(x, "") + return s.casefold() + + for record in self.converter.records: + for p in record._all_prefixes: + norm_to_prefix[_norm(p)].add(p) + + rv = {} + for prefix, c in self.failures.items(): + if prefix in {"url", "uri", "iri"}: + rv[prefix] = "is an incorrect way of encoding a URI" + continue + if prefix in {"urn"}: + rv[ + prefix + ] = "means data is encoded using URNs, which isn't explicitly handled by this package." + continue + if prefix in {"http", "https", "ftp"}: + rv[prefix] = "entries are not CURIEs, try and compressing your data first." + continue + if len(c) == 1: + first = list(c)[0] + if first == prefix: + rv[prefix] = f"is not a valid CURIE" + continue + elif first.lower() == f"{prefix.lower()}:{prefix.lower()}": + rv[prefix] = f"has a double prefix annotation: {first}" + continue + correct = sorted(norm_to_prefix.get(_norm(prefix), [])) + if correct: + rv[prefix] = f"is a case/punctuation variant. Try using {_list(correct)}" + continue + + if bioregistry is not None: + norm_prefix = bioregistry.normalize_prefix(prefix) + if norm_prefix: + rv[prefix] = ( + f"appears in Bioregistry under [`{norm_prefix}`](https://bioregistry." + f"io/{norm_prefix}). Consider chaining your converter with the Bioregistry using " + f"[`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html)." + ) + continue + + # TODO check for bananas? + rv[prefix] = ( + "can either be added to the converter if it is local to the project, " + f"or if it is globally useful, contributed to the Bioregistry" + ) + return rv + + def _repr_markdown_(self): + try: + import bioregistry + except ImportError: + bioregistry = None + + failures = sum(len(c) for c in self.failures.values()) + total = self.stayed + self.updated + failures + text = "## Summary\n" + if bioregistry is None: + text += "\nInstall the Bioregistry with `pip install bioregistry` for more detailed suggestions\n\n" + text = ( + f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}), " + f"resulted in {self.updated:,} updates ({self.updated/total:.1%}), and {failures:,} failures " + f"({failures/total:.1%}). 
Here's a breakdown of the prefixes that weren't possible to standardize:\n\n" + ) + text += self.get_df().to_markdown(index=False) + text += "\n\n## Suggestions\n\n" + for prefix, suggestion in self.get_suggestions().items(): + text += f"- {prefix} {suggestion}\n" + return text + + +def _list(correct: Sequence[str]) -> str: + if len(correct) == 1: + return f"`{correct[0]}`" + if len(correct) == 2: + return f"`{correct[0]}` or `{correct[1]}`" + x = ", ".join(f"`{v}`" for v in correct[:-1]) + return f"{x}, or `{correct[-1]}`" + + class Converter: """A cached prefix map data structure. @@ -1194,7 +1315,7 @@ def pd_standardize_curie( column: Union[str, int], target_column: Union[None, str, int] = None, strict: bool = False, - ) -> None: + ) -> Report: r"""Standardize all CURIEs in the given column. :param df: A pandas DataFrame @@ -1216,8 +1337,26 @@ def pd_standardize_curie( >>> converter = curies.get_bioregistry_converter() >>> converter.pd_standardize_curie(df, column="object_id") """ - func = self.standardize_curie_strict if strict else self.standardize_curie - df[column if target_column is None else target_column] = df[column].map(func) + norm_curies = [] + failures = defaultdict(Counter) + stayed = 0 + updated = 0 + for curie in df[column]: + norm_curie = self.standardize_curie(curie) + if norm_curie is None: + failures[curie.split(":")[0]][curie] += 1 + elif curie == norm_curie: + stayed += 1 + else: + updated += 1 + norm_curies.append(norm_curie) + report = Report(converter=self, failures=failures, stayed=stayed, updated=updated) + if strict and failures: + raise ValueError( + f"Some CURIEs couldn't be standardized and strict mode is enabled. Either set `strict=False`, and entries that can't be parsed will be given `None`, or try and improve your context to better cover your data. Here's the report:\n\n{report.get_text()}" + ) + df[column if target_column is None else target_column] = norm_curies + return report def pd_standardize_uri( self, @@ -1354,7 +1493,7 @@ def _in(a: str, bs: Iterable[str], case_sensitive: bool) -> bool: return any(nfa == b.casefold() for b in bs) -def chain(converters: Sequence[Converter], *, case_sensitive: bool = True) -> Converter: +def chain(converters: Iterable[Converter], *, case_sensitive: bool = True) -> Converter: """Chain several converters. 
:param converters: A list or tuple of converters diff --git a/tests/test_data_science.py b/tests/test_data_science.py new file mode 100644 index 00000000..9fddb0a3 --- /dev/null +++ b/tests/test_data_science.py @@ -0,0 +1,20 @@ +import unittest + +import pandas as pd + +import curies + + +class TestDataScience(unittest.TestCase): + """""" + + def test_case_mismatch(self): + data = ["EFO:1", "nope:nope"] + df = pd.DataFrame([(row,) for row in data], columns=["curie"]) + + converter = curies.Converter.from_prefix_map({"efo": "https://identifiers.org/efo:"}) + with self.assertRaises(ValueError): + converter.pd_standardize_curie(df, column="curie", strict=True) + + results = converter.pd_standardize_curie(df, column="curie") + suggestions = results.get_suggestions() From d0707f18c965d2c7692cf7f9eee58ffa8cbfbac6 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Sep 2023 12:32:39 +0200 Subject: [PATCH 4/8] Enrich bioregistry EPM with case variants --- src/curies/sources.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/curies/sources.py b/src/curies/sources.py index 13195360..61bfbfd2 100644 --- a/src/curies/sources.py +++ b/src/curies/sources.py @@ -4,7 +4,7 @@ from typing import Any -from .api import Converter +from .api import Converter, Record __all__ = [ "get_obo_converter", @@ -61,6 +61,19 @@ def get_bioregistry_converter(web: bool = False, **kwargs: Any) -> Converter: pass else: epm = bioregistry.manager.get_curies_records() # pragma: no cover + for record in epm: # pragma: no cover + # Remove this after https://github.com/biopragmatics/bioregistry/issues/935 is fixed + _augment_curie_prefix_synonyms(record) # pragma: no cover return Converter.from_extended_prefix_map(epm) # pragma: no cover url = f"{BIOREGISTRY_CONTEXTS}/bioregistry.epm.json" return Converter.from_extended_prefix_map(url, **kwargs) + + +def _augment_curie_prefix_synonyms(record: Record): + new_prefix_synonyms = set() + for s in record._all_prefixes: + new_prefix_synonyms.add(s) + new_prefix_synonyms.add(s.lower()) + new_prefix_synonyms.add(s.upper()) + new_prefix_synonyms.difference_update(record.prefix) + record.prefix_synonyms = sorted(new_prefix_synonyms) From 944bf67b11c06c0ae70ed89f3acb181410cf5e05 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Sep 2023 13:24:18 +0200 Subject: [PATCH 5/8] Improve feedback --- notebooks/Data Science Demo.ipynb | 198 ++++++++++++++++++++++++++++-- src/curies/api.py | 79 +++++++++--- src/curies/sources.py | 2 +- 3 files changed, 249 insertions(+), 30 deletions(-) diff --git a/notebooks/Data Science Demo.ipynb b/notebooks/Data Science Demo.ipynb index 8ec01757..75c2ee2b 100644 --- a/notebooks/Data Science Demo.ipynb +++ b/notebooks/Data Science Demo.ipynb @@ -23,8 +23,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 73.4 ms, sys: 3.43 ms, total: 76.9 ms\n", - "Wall time: 333 ms\n" + "CPU times: user 30 ms, sys: 4.59 ms, total: 34.6 ms\n", + "Wall time: 646 ms\n" ] } ], @@ -35,47 +35,223 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "d4e138e4-31f5-4c0d-ba0f-9849586af00c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7.08 s, sys: 69.5 ms, total: 7.15 s\n", + "Wall time: 7.22 s\n" + ] + } + ], "source": [ "%%time\n", "bioregistry_converter = curies.get_bioregistry_converter()" ] }, + { + "cell_type": "markdown", + "id": "f94b0791-ab75-481b-9e83-8990f0fbc4f1", + 
"metadata": {}, + "source": [ + "# Disease Ontology SSSOM Demo" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "5301bafc-15eb-45bc-adf6-6281d6da1b3e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([['DOID:8717', 'oboInOwl:hasDbXref', 'NCI:C50706'],\n", + " ['DOID:8717', 'oboInOwl:hasDbXref', 'MESH:D003668'],\n", + " ['DOID:8717', 'oboInOwl:hasDbXref', 'ICD9CM:707.0'],\n", + " ['DOID:8717', 'oboInOwl:hasDbXref',\n", + " 'SNOMEDCT_US_2021_09_01:28103007'],\n", + " ['DOID:8717', 'oboInOwl:hasDbXref', 'UMLS_CUI:C0011127']],\n", + " dtype=object)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "commit = \"faca4fc335f9a61902b9c47a1facd52a0d3d2f8b\"\n", "url = f\"https://raw.githubusercontent.com/mapping-commons/disease-mappings/{commit}/mappings/doid.sssom.tsv\"\n", - "df = pystow.ensure_csv(\"tmp\", url=url, read_csv_kwargs=dict(comment='#'))\n", + "df = pystow.ensure_csv(\"tmp\", url=url, read_csv_kwargs=dict(comment=\"#\"))\n", "df.head()[[\"subject_id\", \"predicate_id\", \"object_id\"]].values" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "53ae14ad-1665-472f-a849-f6e2fa95fde4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/markdown": [ + "## Summary\n", + "\n", + "Standardization was not necessary for 2 (0.0%), resulted in 0 updates (0.0%), and 34,522 failures (100.0%) in column `object_id`. Here's a breakdown of the prefixes that weren't possible to standardize:\n", + "\n", + "| prefix | count | examples |\n", + "|:-----------------------|--------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "| EFO | 131 | EFO:0000195, EFO:0000612, EFO:0000729, EFO:0003914, EFO:0004222 |\n", + "| GARD | 2030 | GARD:1224, GARD:4771, GARD:6464, GARD:7179, GARD:7475 |\n", + "| ICD10CM | 3666 | ICD10CM:E75.0, ICD10CM:H35.42, ICD10CM:I75, ICD10CM:K59.39, ICD10CM:Q38.1 |\n", + "| ICD9CM | 2266 | ICD9CM:335.22, ICD9CM:368.51, ICD9CM:375.15, ICD9CM:618.8, ICD9CM:622.2 |\n", + "| ICDO | 361 | ICDO:8050/3, ICDO:8051/3, ICDO:8290/0, ICDO:8470/3, ICDO:8920/3 |\n", + "| KEGG | 41 | KEGG:05210, KEGG:05219, KEGG:05221, KEGG:05310, KEGG:H02296 |\n", + "| MEDDRA | 41 | MEDDRA:10001229, MEDDRA:10036794, MEDDRA:10066387, MEDDRA:10068842 |\n", + "| MESH | 3847 | MESH:C562745, MESH:D003882, MESH:D008288, MESH:D009072, MESH:D015270 |\n", + "| NCI | 4788 | NCI:C27472, NCI:C3406, NCI:C39860, NCI:C4296, NCI:C84886 |\n", + "| OMIM | 5539 | OMIM:154800, OMIM:229050, OMIM:255300, OMIM:614465, OMIM:615725 |\n", + "| ORDO | 2023 | ORDO:2554, ORDO:295195, ORDO:397593, ORDO:733, ORDO:79257 |\n", + "| SNOMEDCT_US_2020_03_01 | 6 | SNOMEDCT_US_2020_03_01:236818008, SNOMEDCT_US_2020_03_01:254828009, SNOMEDCT_US_2020_03_01:52564001 |\n", + "| SNOMEDCT_US_2020_09_01 | 1 | SNOMEDCT_US_2020_09_01:1112003 |\n", + "| SNOMEDCT_US_2021_07_31 | 10 | SNOMEDCT_US_2021_07_31:205329008, SNOMEDCT_US_2021_07_31:268180007, SNOMEDCT_US_2021_07_31:75931002, SNOMEDCT_US_2021_07_31:785879009, SNOMEDCT_US_2021_07_31:86249007 |\n", + "| SNOMEDCT_US_2021_09_01 | 5088 | SNOMEDCT_US_2021_09_01:128925001, SNOMEDCT_US_2021_09_01:254916002, SNOMEDCT_US_2021_09_01:267572005, SNOMEDCT_US_2021_09_01:389261002, SNOMEDCT_US_2021_09_01:94069006 |\n", + "| UMLS_CUI | 6890 | UMLS_CUI:C0085574, UMLS_CUI:C0153212, UMLS_CUI:C0282492, 
UMLS_CUI:C1332356, UMLS_CUI:C1838329 |\n", + "\n", + "## Suggestions\n", + "\n", + "- NCI appears in Bioregistry under [`ncit`](https://bioregistry.io/ncit). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- MESH appears in Bioregistry under [`mesh`](https://bioregistry.io/mesh). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- ICD9CM appears in Bioregistry under [`icd9cm`](https://bioregistry.io/icd9cm). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- SNOMEDCT_US_2021_09_01 appears in Bioregistry under [`snomedct`](https://bioregistry.io/snomedct). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- UMLS_CUI appears in Bioregistry under [`umls`](https://bioregistry.io/umls). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- ICD10CM appears in Bioregistry under [`icd10cm`](https://bioregistry.io/icd10cm). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- ORDO appears in Bioregistry under [`orphanet.ordo`](https://bioregistry.io/orphanet.ordo). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- GARD appears in Bioregistry under [`gard`](https://bioregistry.io/gard). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- OMIM appears in Bioregistry under [`omim`](https://bioregistry.io/omim). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- ICDO appears in Bioregistry under [`icdo`](https://bioregistry.io/icdo). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- EFO appears in Bioregistry under [`efo`](https://bioregistry.io/efo). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- MEDDRA appears in Bioregistry under [`meddra`](https://bioregistry.io/meddra). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- KEGG appears in Bioregistry under [`kegg`](https://bioregistry.io/kegg). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- SNOMEDCT_US_2021_07_31 appears in Bioregistry under [`snomedct`](https://bioregistry.io/snomedct). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- SNOMEDCT_US_2020_03_01 appears in Bioregistry under [`snomedct`](https://bioregistry.io/snomedct). 
Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", + "- SNOMEDCT_US_2020_09_01 appears in Bioregistry under [`snomedct`](https://bioregistry.io/snomedct). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n" + ], + "text/plain": [ + "Report(converter=, column='object_id', nones=0, stayed=2, updated=0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "obo_converter.pd_standardize_curie(df.copy(), column=\"object_id\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, + "id": "245227da-d4e2-4ede-9844-bd448ef0e54b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bioregistry_converter" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "4e028a67-634a-4b2e-ad16-aca23fc47e28", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/markdown": [ + "Standardization was successfully applied to all 36,730 CURIEs in column `object_id`." + ], + "text/plain": [ + "Report(converter=, column='object_id', nones=0, stayed=0, updated=36730)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "bioregistry_converter.pd_standardize_curie(df.copy(), column=\"object_id\")" ] + }, + { + "cell_type": "markdown", + "id": "4fa4f1f8-e2cc-4230-8a36-2f8eb9d8b93f", + "metadata": {}, + "source": [ + "# Mixed CURIEs and URIs demo" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7239d782-e952-40fc-9a0a-5ae0753fdb22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Summary\n", + "\n", + "Standardization was not necessary for 1 (20.0%), resulted in 1 updates (20.0%), and 2 failures (40.0%) in column `0`. 
Here's a breakdown of the prefixes that weren't possible to standardize:\n", + "\n", + "| prefix | count | examples |\n", + "|:------------|--------:|:---------------------------------------|\n", + "| http | 1 | http://purl.obolibrary.org/obo/CHEBI_2 |\n", + "| not_a_curie | 1 | not_a_curie |\n", + "\n", + "## Suggestions\n", + "\n", + "- http entries are not CURIEs, try and compressing your data first.\n", + "- not_a_curie is not a valid CURIE\n" + ], + "text/plain": [ + "Report(converter=, column=0, nones=1, stayed=1, updated=1)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mixed_df = pd.DataFrame(\n", + " [\n", + " (\"chebi:1\",),\n", + " (\"http://purl.obolibrary.org/obo/CHEBI_2\",),\n", + " (\"CHEBI:3\",),\n", + " (\"not_a_curie\",),\n", + " (None,),\n", + " ]\n", + ")\n", + "bioregistry_converter.pd_standardize_curie(mixed_df, column=0)" + ] } ], "metadata": { diff --git a/src/curies/api.py b/src/curies/api.py index 02f51695..ad3b6dff 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -383,7 +383,11 @@ def _prepare(data: LocationOr[X]) -> X: @dataclasses.dataclass class Report: + """A report on CURIEs standardization.""" + converter: "Converter" + column: str | int + nones: int stayed: int updated: int failures: Mapping[str, typing.Counter[str]] = dataclasses.field(repr=False) @@ -400,7 +404,7 @@ def get_df(self) -> "pandas.DataFrame": ( prefix, sum(counter.values()), - ", ".join(sorted(set(random.choices(list(counter), k=5)))), + ", ".join(sorted(set(random.choices(list(counter), k=5)))), # noqa:S311 ) for prefix, counter in sorted(self.failures.items(), key=lambda p: p[0].casefold()) ] @@ -440,7 +444,7 @@ def _norm(s: str) -> str: if len(c) == 1: first = list(c)[0] if first == prefix: - rv[prefix] = f"is not a valid CURIE" + rv[prefix] = "is not a valid CURIE" continue elif first.lower() == f"{prefix.lower()}:{prefix.lower()}": rv[prefix] = f"has a double prefix annotation: {first}" @@ -456,39 +460,58 @@ def _norm(s: str) -> str: rv[prefix] = ( f"appears in Bioregistry under [`{norm_prefix}`](https://bioregistry." f"io/{norm_prefix}). Consider chaining your converter with the Bioregistry using " - f"[`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html)." + "[`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html)." ) continue # TODO check for bananas? rv[prefix] = ( "can either be added to the converter if it is local to the project, " - f"or if it is globally useful, contributed to the Bioregistry" + "or if it is globally useful, contributed to the Bioregistry" ) return rv - def _repr_markdown_(self): + def get_markdown(self) -> str: + """Get markdown text.""" try: import bioregistry except ImportError: bioregistry = None failures = sum(len(c) for c in self.failures.values()) - total = self.stayed + self.updated + failures - text = "## Summary\n" + total = self.nones + self.stayed + self.updated + failures + df = self.get_df() + + # TODO write # CURIEs, # unique CURIEs, and # unique prefixes + text = "## Summary\n\n" + if 0 == len(df.index): + if not self.stayed: + return f"Standardization was successfully applied to all {self.updated:,} CURIEs in column `{self.column}`." 
+ return ( + f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}) CURIEs " + f"and resulted in updates for {self.updated:,} ({self.updated/total:.1%}) CURIEs in column `{self.column}`" + ) + if bioregistry is None: text += "\nInstall the Bioregistry with `pip install bioregistry` for more detailed suggestions\n\n" - text = ( + text += ( f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}), " f"resulted in {self.updated:,} updates ({self.updated/total:.1%}), and {failures:,} failures " - f"({failures/total:.1%}). Here's a breakdown of the prefixes that weren't possible to standardize:\n\n" + f"({failures/total:.1%}) in column `{self.column}`. Here's a breakdown of the prefixes that " + f"weren't possible to standardize:\n\n" ) - text += self.get_df().to_markdown(index=False) - text += "\n\n## Suggestions\n\n" - for prefix, suggestion in self.get_suggestions().items(): - text += f"- {prefix} {suggestion}\n" + text += df.to_markdown(index=False) + + suggestions = self.get_suggestions() + if suggestions: + text += "\n\n## Suggestions\n\n" + for prefix, suggestion in suggestions.items(): + text += f"- {prefix} {suggestion}\n" return text + def _repr_markdown_(self) -> str: + return self.get_markdown() + def _list(correct: Sequence[str]) -> str: if len(correct) == 1: @@ -1337,12 +1360,25 @@ def pd_standardize_curie( >>> converter = curies.get_bioregistry_converter() >>> converter.pd_standardize_curie(df, column="object_id") """ - norm_curies = [] - failures = defaultdict(Counter) + import pandas as pd + + norm_curies: List[Optional[str]] = [] + failures: DefaultDict[str, Counter[str]] = defaultdict(Counter) stayed = 0 updated = 0 + nones = 0 + invalid = 0 for curie in df[column]: - norm_curie = self.standardize_curie(curie) + if pd.isna(curie): + nones += 1 + norm_curies.append(None) + continue + try: + norm_curie = self.standardize_curie(curie) + except ValueError: + # happens on an invalid curie, i.e., without a : + invalid += 1 + norm_curie = None if norm_curie is None: failures[curie.split(":")[0]][curie] += 1 elif curie == norm_curie: @@ -1350,10 +1386,17 @@ def pd_standardize_curie( else: updated += 1 norm_curies.append(norm_curie) - report = Report(converter=self, failures=failures, stayed=stayed, updated=updated) + report = Report( + converter=self, + failures=failures, + nones=nones, + stayed=stayed, + updated=updated, + column=column, + ) if strict and failures: raise ValueError( - f"Some CURIEs couldn't be standardized and strict mode is enabled. Either set `strict=False`, and entries that can't be parsed will be given `None`, or try and improve your context to better cover your data. Here's the report:\n\n{report.get_text()}" + f"Some CURIEs couldn't be standardized and strict mode is enabled. Either set `strict=False`, and entries that can't be parsed will be given `None`, or try and improve your context to better cover your data. 
Here's the report:\n\n{report.get_markdown()}" ) df[column if target_column is None else target_column] = norm_curies return report diff --git a/src/curies/sources.py b/src/curies/sources.py index 61bfbfd2..190b7008 100644 --- a/src/curies/sources.py +++ b/src/curies/sources.py @@ -69,7 +69,7 @@ def get_bioregistry_converter(web: bool = False, **kwargs: Any) -> Converter: return Converter.from_extended_prefix_map(url, **kwargs) -def _augment_curie_prefix_synonyms(record: Record): +def _augment_curie_prefix_synonyms(record: Record) -> None: new_prefix_synonyms = set() for s in record._all_prefixes: new_prefix_synonyms.add(s) From efb9f4728a75b8f73ee51788c9f2b76941ee79f4 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 21 Sep 2023 20:33:20 +0200 Subject: [PATCH 6/8] Reorg data sciecnce code --- notebooks/Data Science Demo.ipynb | 88 +++++++-------- src/curies/__init__.py | 2 + src/curies/api.py | 163 +++------------------------- src/curies/report.py | 174 ++++++++++++++++++++++++++++++ tests/test_data_science.py | 8 +- 5 files changed, 241 insertions(+), 194 deletions(-) create mode 100644 src/curies/report.py diff --git a/notebooks/Data Science Demo.ipynb b/notebooks/Data Science Demo.ipynb index 75c2ee2b..7943aed1 100644 --- a/notebooks/Data Science Demo.ipynb +++ b/notebooks/Data Science Demo.ipynb @@ -23,8 +23,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 30 ms, sys: 4.59 ms, total: 34.6 ms\n", - "Wall time: 646 ms\n" + "CPU times: user 185 ms, sys: 108 ms, total: 293 ms\n", + "Wall time: 917 ms\n" ] } ], @@ -43,8 +43,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 7.08 s, sys: 69.5 ms, total: 7.15 s\n", - "Wall time: 7.22 s\n" + "CPU times: user 6.73 s, sys: 63 ms, total: 6.79 s\n", + "Wall time: 6.8 s\n" ] } ], @@ -104,46 +104,46 @@ "\n", "Standardization was not necessary for 2 (0.0%), resulted in 0 updates (0.0%), and 34,522 failures (100.0%) in column `object_id`. 
Here's a breakdown of the prefixes that weren't possible to standardize:\n", "\n", - "| prefix | count | examples |\n", - "|:-----------------------|--------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", - "| EFO | 131 | EFO:0000195, EFO:0000612, EFO:0000729, EFO:0003914, EFO:0004222 |\n", - "| GARD | 2030 | GARD:1224, GARD:4771, GARD:6464, GARD:7179, GARD:7475 |\n", - "| ICD10CM | 3666 | ICD10CM:E75.0, ICD10CM:H35.42, ICD10CM:I75, ICD10CM:K59.39, ICD10CM:Q38.1 |\n", - "| ICD9CM | 2266 | ICD9CM:335.22, ICD9CM:368.51, ICD9CM:375.15, ICD9CM:618.8, ICD9CM:622.2 |\n", - "| ICDO | 361 | ICDO:8050/3, ICDO:8051/3, ICDO:8290/0, ICDO:8470/3, ICDO:8920/3 |\n", - "| KEGG | 41 | KEGG:05210, KEGG:05219, KEGG:05221, KEGG:05310, KEGG:H02296 |\n", - "| MEDDRA | 41 | MEDDRA:10001229, MEDDRA:10036794, MEDDRA:10066387, MEDDRA:10068842 |\n", - "| MESH | 3847 | MESH:C562745, MESH:D003882, MESH:D008288, MESH:D009072, MESH:D015270 |\n", - "| NCI | 4788 | NCI:C27472, NCI:C3406, NCI:C39860, NCI:C4296, NCI:C84886 |\n", - "| OMIM | 5539 | OMIM:154800, OMIM:229050, OMIM:255300, OMIM:614465, OMIM:615725 |\n", - "| ORDO | 2023 | ORDO:2554, ORDO:295195, ORDO:397593, ORDO:733, ORDO:79257 |\n", - "| SNOMEDCT_US_2020_03_01 | 6 | SNOMEDCT_US_2020_03_01:236818008, SNOMEDCT_US_2020_03_01:254828009, SNOMEDCT_US_2020_03_01:52564001 |\n", - "| SNOMEDCT_US_2020_09_01 | 1 | SNOMEDCT_US_2020_09_01:1112003 |\n", - "| SNOMEDCT_US_2021_07_31 | 10 | SNOMEDCT_US_2021_07_31:205329008, SNOMEDCT_US_2021_07_31:268180007, SNOMEDCT_US_2021_07_31:75931002, SNOMEDCT_US_2021_07_31:785879009, SNOMEDCT_US_2021_07_31:86249007 |\n", - "| SNOMEDCT_US_2021_09_01 | 5088 | SNOMEDCT_US_2021_09_01:128925001, SNOMEDCT_US_2021_09_01:254916002, SNOMEDCT_US_2021_09_01:267572005, SNOMEDCT_US_2021_09_01:389261002, SNOMEDCT_US_2021_09_01:94069006 |\n", - "| UMLS_CUI | 6890 | UMLS_CUI:C0085574, UMLS_CUI:C0153212, UMLS_CUI:C0282492, UMLS_CUI:C1332356, UMLS_CUI:C1838329 |\n", + "| prefix | count | examples |\n", + "|:-----------------------|--------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "| EFO | 131 | EFO:0000274, EFO:0001071, EFO:0001075, EFO:0001422, EFO:0004705 |\n", + "| GARD | 2030 | GARD:2562, GARD:5721, GARD:6291, GARD:7065, GARD:8378 |\n", + "| ICD10CM | 3666 | ICD10CM:A21.0, ICD10CM:C03, ICD10CM:K72, ICD10CM:K82.4, ICD10CM:N30.0 |\n", + "| ICD9CM | 2266 | ICD9CM:214.4, ICD9CM:232.4, ICD9CM:377.75, ICD9CM:428.2, ICD9CM:745.6 |\n", + "| ICDO | 361 | ICDO:8300/0, ICDO:8840/3, ICDO:9442/1, ICDO:9530/0, ICDO:9590/3 |\n", + "| KEGG | 41 | KEGG:05016, KEGG:05133, KEGG:05142, KEGG:05222, KEGG:05414 |\n", + "| MEDDRA | 41 | MEDDRA:10001229, MEDDRA:10015487, MEDDRA:10021312, MEDDRA:10059200, MEDDRA:10060740 |\n", + "| MESH | 3847 | MESH:D002128, MESH:D005141, MESH:D009198, MESH:D011040, MESH:D017240 |\n", + "| NCI | 4788 | NCI:C26913, NCI:C27390, NCI:C27871, NCI:C40284, NCI:C6081 |\n", + "| OMIM | 5539 | OMIM:209700, OMIM:222300, OMIM:530000, OMIM:613021, OMIM:618224 |\n", + "| ORDO | 2023 | ORDO:139441, ORDO:2510, ORDO:255229, ORDO:420702, ORDO:48652 |\n", + "| SNOMEDCT_US_2020_03_01 | 6 | SNOMEDCT_US_2020_03_01:236818008, SNOMEDCT_US_2020_03_01:778024005, SNOMEDCT_US_2020_03_01:8757006 |\n", + "| SNOMEDCT_US_2020_09_01 | 1 | SNOMEDCT_US_2020_09_01:1112003 |\n", + "| SNOMEDCT_US_2021_07_31 | 10 | 
SNOMEDCT_US_2021_07_31:268180007, SNOMEDCT_US_2021_07_31:703536004, SNOMEDCT_US_2021_07_31:721311006, SNOMEDCT_US_2021_07_31:75931002 |\n", + "| SNOMEDCT_US_2021_09_01 | 5088 | SNOMEDCT_US_2021_09_01:111359004, SNOMEDCT_US_2021_09_01:155748004, SNOMEDCT_US_2021_09_01:238113006, SNOMEDCT_US_2021_09_01:38804009, SNOMEDCT_US_2021_09_01:92585006 |\n", + "| UMLS_CUI | 6890 | UMLS_CUI:C0031347, UMLS_CUI:C0206724, UMLS_CUI:C0276007, UMLS_CUI:C0392492, UMLS_CUI:C1515285 |\n", "\n", "## Suggestions\n", "\n", - "- NCI appears in Bioregistry under [`ncit`](https://bioregistry.io/ncit). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- MESH appears in Bioregistry under [`mesh`](https://bioregistry.io/mesh). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- ICD9CM appears in Bioregistry under [`icd9cm`](https://bioregistry.io/icd9cm). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- SNOMEDCT_US_2021_09_01 appears in Bioregistry under [`snomedct`](https://bioregistry.io/snomedct). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- UMLS_CUI appears in Bioregistry under [`umls`](https://bioregistry.io/umls). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- ICD10CM appears in Bioregistry under [`icd10cm`](https://bioregistry.io/icd10cm). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- ORDO appears in Bioregistry under [`orphanet.ordo`](https://bioregistry.io/orphanet.ordo). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- GARD appears in Bioregistry under [`gard`](https://bioregistry.io/gard). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- OMIM appears in Bioregistry under [`omim`](https://bioregistry.io/omim). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- ICDO appears in Bioregistry under [`icdo`](https://bioregistry.io/icdo). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- EFO appears in Bioregistry under [`efo`](https://bioregistry.io/efo). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- MEDDRA appears in Bioregistry under [`meddra`](https://bioregistry.io/meddra). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- KEGG appears in Bioregistry under [`kegg`](https://bioregistry.io/kegg). 
Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- SNOMEDCT_US_2021_07_31 appears in Bioregistry under [`snomedct`](https://bioregistry.io/snomedct). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- SNOMEDCT_US_2020_03_01 appears in Bioregistry under [`snomedct`](https://bioregistry.io/snomedct). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n", - "- SNOMEDCT_US_2020_09_01 appears in Bioregistry under [`snomedct`](https://bioregistry.io/snomedct). Consider chaining your converter with the Bioregistry using [`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html).\n" + "- NCI Suggestion.x7 - ncit\n", + "- MESH Suggestion.x7 - mesh\n", + "- ICD9CM Suggestion.x7 - icd9cm\n", + "- SNOMEDCT_US_2021_09_01 Suggestion.x7 - snomedct\n", + "- UMLS_CUI Suggestion.x7 - umls\n", + "- ICD10CM Suggestion.x7 - icd10cm\n", + "- ORDO Suggestion.x7 - orphanet.ordo\n", + "- GARD Suggestion.x7 - gard\n", + "- OMIM Suggestion.x7 - omim\n", + "- ICDO Suggestion.x7 - icdo\n", + "- EFO Suggestion.x7 - efo\n", + "- MEDDRA Suggestion.x7 - meddra\n", + "- KEGG Suggestion.x7 - kegg\n", + "- SNOMEDCT_US_2021_07_31 Suggestion.x7 - snomedct\n", + "- SNOMEDCT_US_2020_03_01 Suggestion.x7 - snomedct\n", + "- SNOMEDCT_US_2020_09_01 Suggestion.x7 - snomedct\n" ], "text/plain": [ - "Report(converter=, column='object_id', nones=0, stayed=2, updated=0)" + "Report(converter=, column='object_id', nones=0, stayed=2, updated=0)" ] }, "execution_count": 5, @@ -164,7 +164,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -188,7 +188,7 @@ "Standardization was successfully applied to all 36,730 CURIEs in column `object_id`." 
], "text/plain": [ - "Report(converter=, column='object_id', nones=0, stayed=0, updated=36730)" + "Report(converter=, column='object_id', nones=0, stayed=0, updated=36730)" ] }, "execution_count": 7, @@ -228,11 +228,11 @@ "\n", "## Suggestions\n", "\n", - "- http entries are not CURIEs, try and compressing your data first.\n", - "- not_a_curie is not a valid CURIE\n" + "- http Suggestion.x2\n", + "- not_a_curie Suggestion.x3\n" ], "text/plain": [ - "Report(converter=, column=0, nones=1, stayed=1, updated=1)" + "Report(converter=, column=0, nones=1, stayed=1, updated=1)" ] }, "execution_count": 8, diff --git a/src/curies/__init__.py b/src/curies/__init__.py index 1145ae36..d5b918fc 100644 --- a/src/curies/__init__.py +++ b/src/curies/__init__.py @@ -16,6 +16,7 @@ load_prefix_map, ) from .reconciliation import remap_curie_prefixes, remap_uri_prefixes, rewire +from .report import Report from .sources import ( get_bioregistry_converter, get_go_converter, @@ -28,6 +29,7 @@ __all__ = [ "Converter", "Record", + "Report", "ReferenceTuple", "Reference", "DuplicateValueError", diff --git a/src/curies/api.py b/src/curies/api.py index ad3b6dff..38683016 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -3,11 +3,8 @@ """Data structures and algorithms for :mod:`curies`.""" import csv -import dataclasses import itertools as itt import json -import random -import typing from collections import Counter, defaultdict from pathlib import Path from typing import ( @@ -22,7 +19,6 @@ Mapping, NamedTuple, Optional, - Sequence, Set, Tuple, TypeVar, @@ -38,6 +34,9 @@ import pandas import rdflib + import curies + + __all__ = [ "Converter", "Reference", @@ -381,147 +380,6 @@ def _prepare(data: LocationOr[X]) -> X: return data -@dataclasses.dataclass -class Report: - """A report on CURIEs standardization.""" - - converter: "Converter" - column: str | int - nones: int - stayed: int - updated: int - failures: Mapping[str, typing.Counter[str]] = dataclasses.field(repr=False) - - def count_prefixes(self) -> typing.Counter[str]: - """Count the frequency of each failing prefix.""" - return Counter({prefix: len(counter) for prefix, counter in self.failures.items()}) - - def get_df(self) -> "pandas.DataFrame": - """Summarize standardization issues in a dataframe.""" - import pandas as pd - - rows = [ - ( - prefix, - sum(counter.values()), - ", ".join(sorted(set(random.choices(list(counter), k=5)))), # noqa:S311 - ) - for prefix, counter in sorted(self.failures.items(), key=lambda p: p[0].casefold()) - ] - return pd.DataFrame(rows, columns=["prefix", "count", "examples"]) - - def get_suggestions(self) -> Dict[str, str]: - """Get a mapping from missing prefix to suggestion text.""" - try: - import bioregistry - except ImportError: - bioregistry = None - - norm_to_prefix = defaultdict(set) - - def _norm(s: str) -> str: - for x in "_.- ": - s = s.replace(x, "") - return s.casefold() - - for record in self.converter.records: - for p in record._all_prefixes: - norm_to_prefix[_norm(p)].add(p) - - rv = {} - for prefix, c in self.failures.items(): - if prefix in {"url", "uri", "iri"}: - rv[prefix] = "is an incorrect way of encoding a URI" - continue - if prefix in {"urn"}: - rv[ - prefix - ] = "means data is encoded using URNs, which isn't explicitly handled by this package." - continue - if prefix in {"http", "https", "ftp"}: - rv[prefix] = "entries are not CURIEs, try and compressing your data first." 
- continue - if len(c) == 1: - first = list(c)[0] - if first == prefix: - rv[prefix] = "is not a valid CURIE" - continue - elif first.lower() == f"{prefix.lower()}:{prefix.lower()}": - rv[prefix] = f"has a double prefix annotation: {first}" - continue - correct = sorted(norm_to_prefix.get(_norm(prefix), [])) - if correct: - rv[prefix] = f"is a case/punctuation variant. Try using {_list(correct)}" - continue - - if bioregistry is not None: - norm_prefix = bioregistry.normalize_prefix(prefix) - if norm_prefix: - rv[prefix] = ( - f"appears in Bioregistry under [`{norm_prefix}`](https://bioregistry." - f"io/{norm_prefix}). Consider chaining your converter with the Bioregistry using " - "[`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html)." - ) - continue - - # TODO check for bananas? - rv[prefix] = ( - "can either be added to the converter if it is local to the project, " - "or if it is globally useful, contributed to the Bioregistry" - ) - return rv - - def get_markdown(self) -> str: - """Get markdown text.""" - try: - import bioregistry - except ImportError: - bioregistry = None - - failures = sum(len(c) for c in self.failures.values()) - total = self.nones + self.stayed + self.updated + failures - df = self.get_df() - - # TODO write # CURIEs, # unique CURIEs, and # unique prefixes - text = "## Summary\n\n" - if 0 == len(df.index): - if not self.stayed: - return f"Standardization was successfully applied to all {self.updated:,} CURIEs in column `{self.column}`." - return ( - f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}) CURIEs " - f"and resulted in updates for {self.updated:,} ({self.updated/total:.1%}) CURIEs in column `{self.column}`" - ) - - if bioregistry is None: - text += "\nInstall the Bioregistry with `pip install bioregistry` for more detailed suggestions\n\n" - text += ( - f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}), " - f"resulted in {self.updated:,} updates ({self.updated/total:.1%}), and {failures:,} failures " - f"({failures/total:.1%}) in column `{self.column}`. Here's a breakdown of the prefixes that " - f"weren't possible to standardize:\n\n" - ) - text += df.to_markdown(index=False) - - suggestions = self.get_suggestions() - if suggestions: - text += "\n\n## Suggestions\n\n" - for prefix, suggestion in suggestions.items(): - text += f"- {prefix} {suggestion}\n" - return text - - def _repr_markdown_(self) -> str: - return self.get_markdown() - - -def _list(correct: Sequence[str]) -> str: - if len(correct) == 1: - return f"`{correct[0]}`" - if len(correct) == 2: - return f"`{correct[0]}` or `{correct[1]}`" - x = ", ".join(f"`{v}`" for v in correct[:-1]) - return f"{x}, or `{correct[-1]}`" - - class Converter: """A cached prefix map data structure. @@ -1292,6 +1150,7 @@ def pd_compress( :param df: A pandas DataFrame :param column: The column in the dataframe containing URIs to convert to CURIEs. :param target_column: The column to put the results in. Defaults to input column. + :param strict: Should errors be thrown if any IRIs are not compressable? """ func = self.compress_strict if strict else self.compress df[column if target_column is None else target_column] = df[column].map(func) @@ -1309,6 +1168,7 @@ def pd_expand( :param df: A pandas DataFrame :param column: The column in the dataframe containing CURIEs to convert to URIs. :param target_column: The column to put the results in. Defaults to input column. 
+ :param strict: Should errors be thrown if any CURIEs are not expandable? """ func = self.expand_strict if strict else self.expand df[column if target_column is None else target_column] = df[column].map(func) @@ -1319,7 +1179,6 @@ def pd_standardize_prefix( *, column: Union[str, int], target_column: Union[None, str, int] = None, - strict: bool = False, ) -> None: """Standardize all prefixes in the given column. @@ -1338,12 +1197,15 @@ def pd_standardize_curie( column: Union[str, int], target_column: Union[None, str, int] = None, strict: bool = False, - ) -> Report: + ) -> "curies.Report": r"""Standardize all CURIEs in the given column. :param df: A pandas DataFrame :param column: The column in the dataframe containing CURIEs to standardize. :param target_column: The column to put the results in. Defaults to input column. + :param strict: Should errors be thrown if CURIEs are not standardizable? + :return: A report object + :raises ValueError: If strict is enabled and the column contains CURIEs that aren't standardizable The Disease Ontology curates mappings to other semantic spaces and distributes them in the tabular SSSOM format. However, they use a wide variety of non-standard prefixes for referring @@ -1362,6 +1224,8 @@ def pd_standardize_curie( """ import pandas as pd + from .report import Report + norm_curies: List[Optional[str]] = [] failures: DefaultDict[str, Counter[str]] = defaultdict(Counter) stayed = 0 @@ -1396,7 +1260,9 @@ def pd_standardize_curie( ) if strict and failures: raise ValueError( - f"Some CURIEs couldn't be standardized and strict mode is enabled. Either set `strict=False`, and entries that can't be parsed will be given `None`, or try and improve your context to better cover your data. Here's the report:\n\n{report.get_markdown()}" + f"Some CURIEs couldn't be standardized and strict mode is enabled. Either set " + f"`strict=False`, and entries that can't be parsed will be given `None`, or try " + f"and improve your context to better cover your data. Here's the report:\n\n{report.get_markdown()}" ) df[column if target_column is None else target_column] = norm_curies return report @@ -1407,7 +1273,6 @@ def pd_standardize_uri( *, column: Union[str, int], target_column: Union[None, str, int] = None, - strict: bool = False, ) -> None: """Standardize all URIs in the given column. diff --git a/src/curies/report.py b/src/curies/report.py new file mode 100644 index 00000000..897ffd22 --- /dev/null +++ b/src/curies/report.py @@ -0,0 +1,174 @@ +"""Report.""" + +import dataclasses +import random +import typing +from collections import Counter, defaultdict +from typing import TYPE_CHECKING, Dict, Mapping, Optional, Tuple +import enum +from .api import Converter + +if TYPE_CHECKING: + import pandas + +__all__ = [ + "Report", +] + + +def _list(correct: typing.Sequence[str]) -> str: + if len(correct) == 1: + return f"`{correct[0]}`" + if len(correct) == 2: + return f"`{correct[0]}` or `{correct[1]}`" + x = ", ".join(f"`{v}`" for v in correct[:-1]) + return f"{x}, or `{correct[-1]}`" + + +class Suggestion(enum.Enum): + """""" + + x1 = "means data is encoded using URNs, which isn't explicitly handled by this package." + x2 = "entries are not CURIEs, try and compressing your data first." + x3 = "is not a valid CURIE" + x4 = "has a double prefix annotation" + x5 = "is a case/punctuation variant" + x6 = "is an incorrect way of encoding a URI" + x7 = ( + f"appears in Bioregistry under. 
Consider chaining your converter with the Bioregistry using " + "[`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html)." + ) + xx = ( + "can either be added to the converter if it is local to the project, " + "or if it is globally useful, contributed to the Bioregistry" + ) + + +@dataclasses.dataclass +class Report: + """A report on CURIEs standardization.""" + + converter: "Converter" + column: str | int + nones: int + stayed: int + updated: int + failures: Mapping[str, typing.Counter[str]] = dataclasses.field(repr=False) + + def count_prefixes(self) -> typing.Counter[str]: + """Count the frequency of each failing prefix.""" + return Counter({prefix: len(counter) for prefix, counter in self.failures.items()}) + + def get_df(self) -> "pandas.DataFrame": + """Summarize standardization issues in a dataframe.""" + import pandas as pd + + rows = [ + ( + prefix, + sum(counter.values()), + ", ".join(sorted(set(random.choices(list(counter), k=5)))), # noqa:S311 + ) + for prefix, counter in sorted(self.failures.items(), key=lambda p: p[0].casefold()) + ] + return pd.DataFrame(rows, columns=["prefix", "count", "examples"]) + + def get_suggestions(self) -> Dict[str, Tuple[Suggestion, Optional[str]]]: + """Get a mapping from missing prefix to suggestion text.""" + try: + import bioregistry + except ImportError: + bioregistry = None + + norm_to_prefix = defaultdict(set) + + def _norm(s: str) -> str: + for x in "_.- ": + s = s.replace(x, "") + return s.casefold() + + for record in self.converter.records: + for p in record._all_prefixes: + norm_to_prefix[_norm(p)].add(p) + + rv: dict[str, tuple[Suggestion, str | None]] = {} + for prefix, c in self.failures.items(): + if prefix in {"url", "uri", "iri"}: + rv[prefix] = Suggestion.x6, None + continue + if prefix in {"urn"}: + rv[prefix] = Suggestion.x1, None + continue + if prefix in {"http", "https", "ftp"}: + rv[prefix] = Suggestion.x2, None + continue + if len(c) == 1: + first = list(c)[0] + if first == prefix: + rv[prefix] = Suggestion.x3, None + continue + elif first.lower() == f"{prefix.lower()}:{prefix.lower()}": + rv[prefix] = Suggestion.x4, prefix.lower() + continue + correct = sorted(norm_to_prefix.get(_norm(prefix), [])) + if correct: + rv[prefix] = Suggestion.x5, _list(correct) + continue + + if bioregistry is not None: + norm_prefix = bioregistry.normalize_prefix(prefix) + if norm_prefix: + rv[prefix] = Suggestion.x7, norm_prefix + continue + + # TODO check for bananas? + rv[prefix] = Suggestion.xx, None + return rv + + def get_markdown(self) -> str: + """Get markdown text.""" + try: + import bioregistry + except ImportError: + bioregistry = None + + failures = sum(len(c) for c in self.failures.values()) + total = self.nones + self.stayed + self.updated + failures + df = self.get_df() + + # TODO write # CURIEs, # unique CURIEs, and # unique prefixes + text = "## Summary\n\n" + if 0 == len(df.index): + if not self.stayed: + return ( + f"Standardization was successfully applied to all " + f"{self.updated:,} CURIEs in column `{self.column}`." 
+ ) + return ( + f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}) CURIEs " + f"and resulted in updates for {self.updated:,} ({self.updated/total:.1%}) CURIEs " + f"in column `{self.column}`" + ) + + if bioregistry is None: + text += "\nInstall the Bioregistry with `pip install bioregistry` for more detailed suggestions\n\n" + text += ( + f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}), " + f"resulted in {self.updated:,} updates ({self.updated/total:.1%}), and {failures:,} failures " + f"({failures/total:.1%}) in column `{self.column}`. Here's a breakdown of the prefixes that " + f"weren't possible to standardize:\n\n" + ) + text += df.to_markdown(index=False) + + suggestions = self.get_suggestions() + if suggestions: + text += "\n\n## Suggestions\n\n" + for prefix, (suggestion, extra) in suggestions.items(): + text += f"- {prefix} {suggestion}" + if extra: + text += f" - {extra}" + text += "\n" + return text + + def _repr_markdown_(self) -> str: + return self.get_markdown() diff --git a/tests/test_data_science.py b/tests/test_data_science.py index 9fddb0a3..73887faa 100644 --- a/tests/test_data_science.py +++ b/tests/test_data_science.py @@ -1,3 +1,5 @@ +"""Tests for data science utilities.""" + import unittest import pandas as pd @@ -6,9 +8,10 @@ class TestDataScience(unittest.TestCase): - """""" + """Test case for data science utilities.""" def test_case_mismatch(self): + """Test case mismatch on CURIE standardizations.""" data = ["EFO:1", "nope:nope"] df = pd.DataFrame([(row,) for row in data], columns=["curie"]) @@ -18,3 +21,6 @@ def test_case_mismatch(self): results = converter.pd_standardize_curie(df, column="curie") suggestions = results.get_suggestions() + self.assertIsInstance(suggestions, dict) + self.assertIn("", suggestions) + # FIXME add more detailed tests From 28870778dd0297af07e9caffe2a27d5f2242555b Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 14 Oct 2023 12:58:17 +0200 Subject: [PATCH 7/8] Update report.py --- src/curies/report.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/curies/report.py b/src/curies/report.py index 897ffd22..d78cba34 100644 --- a/src/curies/report.py +++ b/src/curies/report.py @@ -1,11 +1,12 @@ """Report.""" import dataclasses +import enum import random import typing from collections import Counter, defaultdict from typing import TYPE_CHECKING, Dict, Mapping, Optional, Tuple -import enum + from .api import Converter if TYPE_CHECKING: From ddfdb5edbef084c57c18f0821496a626566f6e7c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 14 Oct 2023 12:58:42 +0200 Subject: [PATCH 8/8] Update api.py --- src/curies/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/curies/api.py b/src/curies/api.py index 8374f137..c2a7b671 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -1381,7 +1381,7 @@ def pd_standardize_curie( column: Union[str, int], target_column: Union[None, str, int] = None, strict: bool = False, - passthrough: bool = (False,), + passthrough: bool = False, ) -> "curies.Report": r"""Standardize all CURIEs in the given column.
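Taken together, the later patches sketch a feedback loop for cleaning identifier columns: standardize with a project converter, read the Report, then follow its suggestion to chain in the Bioregistry. A rough end-to-end sketch of how the pieces might combine (illustrative only: the example CURIEs are invented, and get_obo_converter()/get_bioregistry_converter() fetch remote prefix maps):

import curies
import pandas as pd

# Hypothetical mapping table mixing OBO and non-OBO prefixes.
df = pd.DataFrame({"object_id": ["DOID:8717", "MESH:D003668", "NCI:C50706"]})

obo_converter = curies.get_obo_converter()

# Non-strict standardization never raises; it returns a Report for inspection.
report = obo_converter.pd_standardize_curie(df.copy(), column="object_id")
print(report.get_df())           # per-prefix failure counts with sampled example CURIEs
print(report.get_suggestions())  # e.g., MESH and NCI should point to the Bioregistry

# Acting on the suggestion: chain the project converter with the Bioregistry,
# which (per the notebook output above) covers the remaining prefixes.
chained = curies.chain([obo_converter, curies.get_bioregistry_converter()])
chained.pd_standardize_curie(df, column="object_id", strict=True)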