diff --git a/Makefile b/Makefile index bc97e827..7191ebde 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ $(DOCDIR): gendoc: $(DOCDIR) cp -rf $(SRC)/docs/* $(DOCDIR) ; \ - $(RUN) gen-doc -d $(DOCDIR) $(SOURCE_SCHEMA_PATH) --template-directory $(TEMPLATE_DIR) + $(RUN) gen-doc -d $(DOCDIR) $(SOURCE_SCHEMA_PATH) --template-directory $(TEMPLATE_DIR) --index-name linkml-index testdoc: gendoc serve diff --git a/mkdocs.yml b/mkdocs.yml index dad89695..90d6f29e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,16 +25,20 @@ markdown_extensions: emoji_index: !!python/name:materialx.emoji.twemoji emoji_generator: !!python/name:materialx.emoji.to_svg nav: - - Home: home.md - - About SSSOM: about.md - - Specification: index.md - - Overview: spec.md + - Home: index.md + - Introduction: introduction.md + - Specification: + - Introduction: spec-intro.md + - Data model: + - Introduction: spec-model.md + - LinkML documentation: linkml-index.md + - Serialisations: + - Introduction: spec-formats.md + - SSSOM/TSV serialisation: spec-formats-tsv.md + - OWL/RDF serialisation: spec-formats-owl.md + - JSON serialisation: spec-formats-json.md - FAQ: faq.md - - Credits: credits.md - - Contact us: contact.md - - Resources for contributors: - - How to contribute?: contributing.md - - Code of Conduct: code_of_conduct.md + - Resources for contributors: contributing.md - Resources for users: - Use cases: - Overview: usecases.md diff --git a/poetry.lock b/poetry.lock index bc4781e5..76c318f6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -689,13 +689,13 @@ referencing = ">=0.31.0" [[package]] name = "linkml" -version = "1.6.2" +version = "1.7.10" description = "Linked Open Data Modeling Language" optional = false -python-versions = ">=3.8,<4.0" +python-versions = "<4.0.0,>=3.8.1" files = [ - {file = "linkml-1.6.2-py3-none-any.whl", hash = "sha256:0e11b085ada080e0ebe9eee469ad55970b0cc333e7c39be956740dbc3a9e50b0"}, - {file = "linkml-1.6.2.tar.gz", hash = "sha256:b1560a67de8c7de074c8be2ef5b810425f058e0874076e49c17a2dc4112f9da2"}, + {file = "linkml-1.7.10-py3-none-any.whl", hash = "sha256:bf21cce814e9d1509489f1e6e15a7e86e4f11d949490d9a7a5c3f6b5b412ec62"}, + {file = "linkml-1.7.10.tar.gz", hash = "sha256:1c38601c3cd495e34490b8cf7277fd3674ec68dcbe9f5efcec2658093801ce91"}, ] [package.dependencies] @@ -708,11 +708,11 @@ jinja2 = ">=3.1.0" jsonasobj2 = ">=1.0.3,<2.0.0" jsonschema = {version = ">=4.0.0", extras = ["format"]} linkml-dataops = "*" -linkml-runtime = ">=1.6.0" +linkml-runtime = ">=1.7.4" openpyxl = "*" parse = "*" prefixcommons = ">=0.1.7" -prefixmaps = ">=0.1.3" +prefixmaps = ">=0.2.2" pydantic = ">=1.0.0,<3.0.0" pyjsg = ">=0.11.6" pyshex = ">=0.7.20" @@ -722,8 +722,14 @@ pyyaml = "*" rdflib = ">=6.0.0" requests = ">=2.22" sqlalchemy = ">=1.4.31" +typing-extensions = {version = ">=4.4.0", markers = "python_version < \"3.9\""} watchdog = ">=0.9.0" +[package.extras] +black = ["black (>=24.0.0)"] +shacl = ["pyshacl (>=0.25.0,<0.26.0)"] +tests = ["black (>=24.0.0)", "pyshacl (>=0.25.0,<0.26.0)"] + [[package]] name = "linkml-dataops" version = "0.1.0" @@ -2132,5 +2138,5 @@ docs = [] [metadata] lock-version = "2.0" -python-versions = "^3.8" -content-hash = "ebd8569b0aed5963769685c890e8f3eea55c708fdbf82ea561ea5877c8f13eb7" +python-versions = "^3.8.1" +content-hash = "87753b722242bfa05bc6825a8ced6e1755cb1d0da5f9a253497ea8fcfcd4962e" diff --git a/pyproject.toml b/pyproject.toml index a1a044d5..1e6d04d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,11 
+10,11 @@ authors = [ license = "MIT" [tool.poetry.dependencies] -python = "^3.8" +python = "^3.8.1" linkml-runtime = "*" [tool.poetry.dev-dependencies] -linkml = "^1.6.2" +linkml = "^1.7.0" mkdocs-material = "^8.2.8" mkdocs-mermaid2-plugin = "^1.1.1" diff --git a/src/docs/about.md b/src/docs/about.md deleted file mode 100644 index 1b21f856..00000000 --- a/src/docs/about.md +++ /dev/null @@ -1,18 +0,0 @@ -# About SSSOM, A Simple Standard for Sharing Ontological Mappings - -SSSOM is a simple metadata standard for describing semantic mappings: - -1. Introducing a machine-readable and extensible vocabulary to describe metadata of mappings. -2. Defining an easy to use table-based format that can be integrated into existing data science pipelines without the need to parse or query ontologies, and that integrates seamlessly with Linked Data standards. -3. Implementing open and community-driven collaborative workflows designed to evolve the standard continuously to address changing requirements and mapping practices. -4. Providing reference tools and software libraries for working with the standard. - -A SSSOM mapping comprises three major components: - -1. The **mapping** itself, that is, a triple `` that reflects a correspondence of a `subject` entity, for example a class in an ontology, to an `object` entity, for example an identifier in some database, via a semantic mapping `predicate`, such as `skos:exactMatch`. -2. A **mapping justification**, which the process or activity that led us to consider the mapping to be correct or reasonable (typical examples: labels match exactly; two classes are logically equivalent; a domain expert determined that two terms reflect the same real world concept). -3. **Provenance metadata**, including information about `author` and `mapping_tool`. - -For a detailed overview see [here](spec.md). 
- - diff --git a/src/docs/code_of_conduct.md b/src/docs/code_of_conduct.md deleted file mode 100644 index a7c1eaaf..00000000 --- a/src/docs/code_of_conduct.md +++ /dev/null @@ -1,4 +0,0 @@ -# Code of Conduct - -- [Contribution guidelines](https://github.com/mapping-commons/sssom/blob/master/CONTRIBUTING.md) -- [Code of Conduct](https://github.com/mapping-commons/sssom/blob/master/CODE_OF_CONDUCT.md) \ No newline at end of file diff --git a/src/docs/contact.md b/src/docs/contact.md deleted file mode 100644 index f55ce1fb..00000000 --- a/src/docs/contact.md +++ /dev/null @@ -1,11 +0,0 @@ -# Contact - -The preferred way to contact the SSSOM team is through the [issue tracker](https://github.com/mapping-commons/sssom/issues) (for problems with SSSOM) or the [GitHub discussions](https://github.com/mapping-commons/sssom/discussions) (for general questions). - -You can find any of the members of the SSSOM core team on GitHub: - -https://github.com/orgs/mapping-commons/teams/sssom-core - -Their GitHub profiles usually also provide email addresses. - -You can also reach us in the [OBO Foundry Slack](https://obo-communitygroup.slack.com/archives/C01DP18L5GW), in the `#sssom` channel. 
\ No newline at end of file diff --git a/src/docs/contributing.md b/src/docs/contributing.md index f3e53251..43625b05 100644 --- a/src/docs/contributing.md +++ b/src/docs/contributing.md @@ -1,4 +1,4 @@ # Contributing to SSSOM - [Contribution guidelines](https://github.com/mapping-commons/sssom/blob/master/CONTRIBUTING.md) -- [Code of Conduct](https://github.com/mapping-commons/sssom/blob/master/CODE_OF_CONDUCT.md) \ No newline at end of file +- [Code of Conduct](https://github.com/mapping-commons/sssom/blob/master/CODE_OF_CONDUCT.md) diff --git a/src/docs/credits.md b/src/docs/credits.md deleted file mode 100644 index ce2e5c85..00000000 --- a/src/docs/credits.md +++ /dev/null @@ -1,23 +0,0 @@ -# Credits - -## Phenomics First - -Parts of this work were funded by Phenomics First (NIH / NHGRI #1RM1HG010860-01): - -- Design of the specification -- Integration of sssom-py into the Mondo integration -- Development of the sssom-py CLI (command-line client) - -## Bosch Gift to LBNL - -Parts of this work were funded by a gift from [Bosch](https://www.bosch.com/) to the Lawrence Berkely National Laboratories (LBNL): - -- sssom-py IO functionality and deployment on pypi -- Testing framework -- File format converters in sssom-py -- Documentation pages of SSSOM (https://mapping-commons.github.io/sssom) - -## Significant contributions - -- This project was made using the [Link Model Language (LinkML) framework](https://github.com/linkml) -- Harvard Medical School (through significant contributions of Charlie Hoyt, including refactoring of sssom-py, release and quality control infrastructure) diff --git a/src/docs/home.md b/src/docs/home.md deleted file mode 100644 index 093e3f96..00000000 --- a/src/docs/home.md +++ /dev/null @@ -1,19 +0,0 @@ -## Introduction - -SSSOM is a Simple Standard for Sharing Ontological Mappings: - -1. Introducing a machine-readable and extensible vocabulary to describe metadata of mappings. -2. 
Defining an easy to use table-based format that can be integrated into existing data science pipelines without the need to parse or query ontologies, and that integrates seamlessly with Linked Data standards. -3. Implementing open and community-driven collaborative workflows designed to evolve the standard continuously to address changing requirements and mapping practices. -4. Providing reference tools and software libraries for working with the standard. - -**Quick links:** - -- [GitHub page](https://github.com/mapping-commons/sssom) -- [Detailed description](spec.md) -- [A Simple Standard for Sharing Ontological Mappings (SSSOM)](https://doi.org/10.1093/database/baac035) (publication in the Database Journal) -- [A Simple Standard for Ontological Mappings 2022: Updates of data model and outlook](https://zenodo.org/record/7672104) (Paper and presentation at the Ontology Matching Workshop 2022). -- [A Simple Standard for Ontological Mappings 2023: Updates on data model, collaborations and tooling](https://zenodo.org/record/8202395) (Paper and presentation at the Ontology Matching Workshop 2023). -- [Specification](index.md) -- [Presentations](presentations.md) -- [SSSOM Toolkit](https://mapping-commons.github.io/sssom-py/) diff --git a/src/docs/index.md b/src/docs/index.md new file mode 100644 index 00000000..63c8b0d4 --- /dev/null +++ b/src/docs/index.md @@ -0,0 +1,156 @@ +# Simple Standard for Sharing Ontological Mappings (SSSOM) + +![SSSOM banner](images/sssom-banner.png) + +**SSSOM** is the Simple Standard for Sharing Ontological Mappings. It comprises three distinct components that are intended to be used together to facilitate the exchange of semantic mappings: + +1. a machine-readable and extensible vocabulary to describe metadata of mappings; +2. a data model to represent mappings and their associated metadata; +3. several file formats to represent sets of mappings on disk and on the network. 
+ +Beyond defining the standard itself, the **SSSOM Core Team** also aims to implement open and community-driven collaborative workflows designed to evolve the standard continuously to address changing requirements and mapping practices, and to provide reference tools and software libraries for working with the standard. + +## SSSOM at a glance + +### Basic concepts + +The [data model](spec-model.md) of SSSOM is centered around two fundamental concepts: mappings and mapping sets. + +A **SSSOM mapping** is a statement that there is a correspondence of some sort between two semantic entities. It comprises two components: + +1. The **core mapping** (or **raw mapping**), which is a triple `<subject, predicate, object>` that represents the correspondence itself between a subject entity, for example a class in an ontology, and an object entity, for example an identifier in some database, via a semantic mapping predicate, for example `skos:exactMatch`. +2. **Metadata** that provide supplementary pieces of information about the core mapping. This notably includes information pertaining to the *provenance* of the statement (for example, who emitted the statement, in other words who decided that the subject and the object should be mapped) and its *justification* (why should the subject and the object be mapped). + +A **SSSOM mapping set** is a collection of SSSOM mappings, with its own metadata. + +### The SSSOM/TSV format + +The main format proposed by the SSSOM standard to exchange mapping sets is the [SSSOM/TSV format](spec-formats-tsv.md). Here is a basic example of a file in that format: + +``` +#curie_map: +# FOODON: http://purl.obolibrary.org/obo/FOODON_ +# KF_FOOD: https://kewl-foodie.inc/food/ +# orcid: https://orcid.org/ +#mapping_set_id: https://w3id.org/sssom/tutorial/example1.sssom.tsv +#mapping_set_description: Manually curated alignment of KEWL FOODIE INC internal food and nutrition database with Food Ontology (FOODON).
Intended to be used for ontological analysis and grouping of KEWL FOODIE INC related data. +#license: https://creativecommons.org/licenses/by/4.0/ +#mapping_date: 2022-05-02 +subject_id subject_label predicate_id object_id object_label mapping_justification author_id confidence comment +KF_FOOD:F001 apple skos:exactMatch FOODON:00002473 apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 0.95 "We could map to FOODON:03310788 instead to cover sliced apples, but only ""whole"" apple types exist." +KF_FOOD:F002 gala skos:exactMatch FOODON:00003348 Gala apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 1 +KF_FOOD:F003 pink skos:exactMatch FOODON:00004186 Pink apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 0.9 "We could map to FOODON:00004187 instead which more specifically refers to ""raw"" Pink apples. Decided against to be consistent with other mapping choices." +KF_FOOD:F004 braeburn skos:broadMatch FOODON:00002473 apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 1 +``` + +### Quick links + +**General** + +- [GitHub page](https://github.com/mapping-commons/sssom) +- [Detailed description](introduction.md) +- [Formal specification](spec-intro.md) + +**Publications** + +- [A Simple Standard for Sharing Ontological Mappings (SSSOM)](https://doi.org/10.1093/database/baac035) (initial publication in _Database_) +- [A Simple Standard for Ontological Mappings 2022: Updates of data model and outlook](https://zenodo.org/record/7672104) (paper and presentation at the Ontology Matching Workshop 2022) +- [A Simple Standard for Ontological Mappings 2023: Updates on data model, collaborations and tooling](https://zenodo.org/record/8202395) (paper and presentation at the Ontology Matching Workshop 2023) +- [Other presentations](presentations.md) + +**Related software** + +- [SSSOM Toolkit](https://mapping-commons.github.io/sssom-py/) (reference implementation of the standard, in Python) + +## The SSSOM 
Core Team + +### Contact + +The preferred way to contact the SSSOM team is through the [issue tracker](https://github.com/mapping-commons/sssom/issues) (for problems with SSSOM) or the [GitHub discussion forums](https://github.com/mapping-commons/sssom/discussions) (for general questions). + +You can find any of the members of the SSSOM core team [on GitHub](https://github.com/orgs/mapping-commons/teams/sssom-core). Their GitHub profiles usually also provide email addresses. + +You can also reach us in the [OBO Foundry Slack](https://obo-communitygroup.slack.com/archives/C01DP18L5GW), in the `#sssom` channel. + +### Documentation/specification editors + +* [Nicolas Matentzoglu](https://orcid.org/0000-0002-7356-1779) (Semanticly Ltd; [@matentzn](https://github.com/matentzn)) +* [Chris Mungall](https://orcid.org/0000-0002-6601-2165) (LBL) +* [Ernesto Jimenez-Ruiz](https://orcid.org/0000-0002-9083-4599) (City, University of London) +* [John Graybeal](https://orcid.org/0000-0001-6875-5360) (Stanford) +* [William Duncan](https://orcid.org/0000-0001-9625-1899) (LBL) +* [David Osumi-Sutherland](https://orcid.org/0000-0002-7073-9172) (EMBL-EBI) +* [Simon Jupp](https://orcid.org/0000-0002-0643-3144) (SciBite) +* [James McLaughlin](https://orcid.org/0000-0002-8361-2795) (EMBL-EBI) +* [Henriette Harmse](https://orcid.org/0000-0001-7251-9504) (EMBL-EBI) +* [Tiffany Callahan](https://orcid.org/0000-0002-8169-9049) ([@callahantiff](https://github.com/callahantiff)) +* [Charlie Hoyt](https://orcid.org/0000-0003-4423-4370) (Harvard Medical School; [@cthoyt](https://github.com/cthoyt)) +* [Thomas Liener](https://orcid.org/0000-0003-3257-9937) (Pistoia Alliance) +* [Harshad Hegde](https://orcid.org/0000-0002-2411-565X) (LBL) + +### Contributors + +* [Alasdair Gray](https://orcid.org/0000-0002-5711-4872) +* [Alex Wagner](https://orcid.org/0000-0002-2502-8961) +* [Amelia L.
Hoyt](https://orcid.org/0000-0003-1307-2508) +* [Andrew Williams](https://orcid.org/0000-0002-0692-412X) +* [Anne Thessen](https://orcid.org/0000-0002-2908-3327) +* [Benjamin M. Gyori](https://orcid.org/0000-0001-9439-5346) +* [Bill Baumgartner](https://orcid.org/0000-0001-6717-5313) +* [Christopher Chute](https://orcid.org/0000-0001-5437-2545) +* [Chris T. Evelo](https://orcid.org/0000-0002-5301-3142) +* [Damion Dooley](https://orcid.org/0000-0002-8844-9165) +* [Davera Gabriel](https://orcid.org/0000-0001-9041-4597) +* [Harold Solbrig](https://www.wikidata.org/wiki/Q44607574) +* [HyeongSik Kim](https://orcid.org/0000-0002-3002-9838) +* [Ian Harrow](https://orcid.org/0000-0003-0109-0522) +* [James Malone](https://orcid.org/0000-0002-1615-2899) +* [James Overton](https://orcid.org/0000-0001-5139-5557) +* [James P. Balhoff](https://orcid.org/0000-0002-8688-6599) +* [James Stevenson](https://orcid.org/0000-0002-2568-6163) +* [Jiao Dahzi](https://orcid.org/0000-0001-5052-3836) +* [Joe Flack](https://orcid.org/0000-0002-2906-7319) +* [Jooho Lee](https://orcid.org/0000-0002-2955-3405) +* [Julie McMurry](https://orcid.org/0000-0002-9353-5498) +* [Kori Kuzma](https://orcid.org/0000-0002-9954-7449) +* [Kristin Kostka](https://orcid.org/0000-0003-2595-8736) +* [Lauren Chan](https://orcid.org/0000-0002-7463-6306) +* [Melissa Haendel](https://orcid.org/0000-0001-9114-8737) +* [Monica Munoz-Torres](https://orcid.org/0000-0001-8430-6039) +* [Nicole Vasilevsky](https://orcid.org/0000-0001-5208-3432) +* [Nomi Harris](https://orcid.org/0000-0001-6315-3707) +* [Núria Queralt-Rosinach](https://orcid.org/0000-0003-0169-8159) +* [Sabrina Toro](https://orcid.org/0000-0002-4142-7153) +* [Sebastian Koehler](https://orcid.org/0000-0002-5316-1399) +* [Shahim Essaid](https://orcid.org/0000-0003-2338-2550) +* [Sierra Moxon](https://orcid.org/0000-0002-8719-7760) +* [Sue Bello](https://orcid.org/0000-0003-4606-0597) +* [Tim Putman](https://orcid.org/0000-0002-4291-0737) + +## Acknowledgements 
+ +### Funding + +#### Phenomics First + +Parts of this work were funded by Phenomics First (NIH / NHGRI #1RM1HG010860-01): + +- design of the specification; +- integration of `sssom-py` into Mondo; +- development of the `sssom-py` command-line interface. + +#### Bosch Gift to LBNL + +Parts of this work were funded by a gift from [Bosch](https://www.bosch.com/) to the Lawrence Berkeley National Laboratory (LBNL): + +- `sssom-py`: + - IO functionality, + - file format converters, + - testing framework, + - deployment on the [Python Package Index](https://pypi.org/); +- SSSOM documentation (this document). + +### Significant contributions + +- The [Linked Data Modeling Language (LinkML) project](https://github.com/linkml) (used to define the data model). +- Harvard Medical School (through significant contributions of Charlie Hoyt, including refactoring of sssom-py, release and quality control infrastructure). diff --git a/src/docs/introduction.md b/src/docs/introduction.md new file mode 100644 index 00000000..4280b440 --- /dev/null +++ b/src/docs/introduction.md @@ -0,0 +1,66 @@ +# Introduction + +## Abstract + +Mappings, or cross-references, are used to link terms across different ontologies. However, there is currently little to no standardisation in how such mappings are represented. While properties such as hasDbXref are widely used in ontologies such as GO and MONDO, the meaning of such mappings is unclear, and cannot be further described with additional metadata or provenance. + +The Simple Standard for Sharing Ontological Mappings (SSSOM) is an initiative to provide a minimal and standard set of elements for the dissemination of mappings between ontology terms, to ensure a reliable interpretation of generated mappings and to enable sharing and data integration between people and applications.
+ +This document introduces the SSSOM catalog of metadata elements, which can be used to attach meta- and provenance data to both mappings and sets of mappings; a controlled vocabulary for the description of match types (SSSOM CV); a definition of both RDF and TSV serialisations of ontology mappings; and a (non-exhaustive) selection of recommended mapping predicates. + +## Introduction + +Currently, there are three methods typically used to express mappings in OWL: direct logical axioms using owl:equivalentClass; the oboInOwl hasDbXref property; and the SKOS vocabulary for mapping properties. The first, owl:equivalentClass, is a strong logical equivalence assertion which is not appropriate for more nuanced mappings such as close matches. The second, hasDbXref, does not assert formal logical equivalence but also has no clearly defined meaning. Finally, the SKOS vocabulary provides a hierarchy of mapping properties which allow the unambiguous specification of exact, close, broad, and narrow matches, but does not provide the means for mappings to be annotated with additional metadata such as confidence scores and provenance. + +The Simple Standard for Sharing Ontological Mappings (SSSOM) addresses these problems by defining a catalog of metadata terms to describe mappings. Both individual mappings and **_sets of_** mappings can be described, enabling provenance and metadata to be captured on multiple levels. SSSOM interoperates with existing methods for the specification of mappings, allowing any predicate to be used to describe the nature of each mapping including those from OWL and SKOS. + +The provenance of mappings - such as whether the mapping was created as the result of a human-curated equivalence match, or a semantic similarity match - is specified using a controlled vocabulary (CV), SSSOM CV.
Combined with the metadata properties provided by SSSOM such as confidence and semantic_similarity_score, this provenance information can be used to capture mapping descriptions in a manner that is explicit and amenable to curation. + +Two serialisations for SSSOM mappings are provided in this document, aimed at different communities: an RDF/OWL serialisation using IRIs that is aimed at the Knowledge Graph/Semantic Web community, and a TSV serialisation using [CURIE](https://www.w3.org/TR/curie/) syntax which is aimed at the wider bioinformatics community. An unambiguous translation between these serialisations is provided. + +## Challenges for exchange and use of mappings +Despite their importance for data integration, term mappings are typically neglected as data artefacts (57). A mapping in this context is a correspondence between two terms, referred to here as "subject" and "object" terms. A "predicate" defines the type of relationship between the subject and the object, such as skos:exactMatch, or owl:equivalentClass. A mapping, or "match", does not have to be exact: it can be broad, e.g. between a conceptually narrow term such as "Red Delicious" and a conceptually broader term such as "Apple". To our knowledge, no formal review has been published that analyzes the representation and formats used for collections of term mappings (mapping sets, or alignments), but in our experience, most mapping sets are represented as tables using an ad-hoc "schema", often merely a simple two-column format that lists matching terms in two naming schemes. An example of such a table can be seen in the following Table. + + +Subject | Object +--- | --- +UBERON:0002101 | FMA:24875 +UBERON:0000019 | FMA:54448 + +``` +Table 1: An example of a typical mapping table one might find on the web. +``` + +This type of table lacks clear semantics and is therefore very difficult to use and re-use either by humans or by machines. 
We will discuss a few of the most critical problems in the sections that follow. + +_Non-transparent imprecision_. Mapping precision describes, usually qualitatively, whether a mapping between a subject and an object is exact, broad, narrow, close or related. An exact mapping means that the subject term can be replaced with the object term and vice versa, i.e. they refer to the exact same real-world entity. A broad mapping links a subject term to a more general term, for example, the term "leg" to the term "hindlimb" (if the ontology defines leg as the parts of the hindlimb that exclude the foot). A narrow mapping links a subject term to a more specific term. For example, "long QT syndrome" in the Mondo Disease Ontology is a narrow match to "Romano-Ward long QT syndrome" in Orphanet. A close mapping relates two terms that are neither exact, broad or narrow, but belong to the same category of things and are semantically similar, such as "apple" to "pear", or "paw" to "hand". Due to its subjective nature ("what is close?"), this is a problematic category of mapping, but it is widely used, for example for relating similar anatomical terms across species. Related mappings are mappings across categories of things, such as the mapping between a phenotype "enlarged liver" and the anatomical entity "liver". In practice, it is rare that mapping tables such as the one presented in Table 1 constitute a set of purely "exact" matches. + +Different use cases may require different levels of mapping precision. For example, for entity merging (defined as the process of merging two entities from different sources into one) or data translation (defined as the process of moving annotations from using one ontology to another), exact mappings may be required, while for data grouping broad matches are often sufficient (ensuring that the subject is classified under the object term). 
For many machine learning use cases, close and related matches will be extremely useful regardless of their lack of semantic precision (though semantic precision is likely to improve predictive power). In practice, many mappings are to varying degrees imprecise but do not specify the mapping precision. This makes it impossible to reliably apply them to use cases such as entity merging or data translation. + +_Non-transparent accuracy, confidence, and provenance_. To scale to real-world use cases, automated tools are critical for matching terms across databases, terminologies and ontologies. Such tools typically implement mapping rules that determine whether a given pair of terms constitutes a match. For example, label matching rules might include "match if subject and object labels match", "match if subject label matches with an exact synonym of the object" and "match if subject and object exhibit a very high degree of semantic similarity". Depending on the rules, tools will have more or less confidence that a match constitutes a mapping. Even human curators often have different levels of confidence about the accuracy of any given mapping, especially if the process of determining whether a mapping is accurate involves the review of (often complex) descriptions and term definitions. + +Different use cases will profit from different degrees of accuracy. For example, if we seek to integrate data from various medical terminologies to inform medical diagnosis, we may require not only a very high degree of confidence about the mapping but also ensure that the mapping is "explainable" to users. To ensure that diagnostic decisions that require bridging of data silos through mappings are explainable, we furthermore need provenance (documentation of where a piece of data comes from and how it was produced), such as an explicit statement of the mapping rules by which the match was originally determined (for example, the labels of both terms could have been the same). 
Thanks to efforts by initiatives such as the Ontology Alignment Evaluation Initiative (OAEI), many mapping tables on the web include at least a confidence score. However, in our experience, mapping rules are rarely stated explicitly as part of the mappings or mapping set metadata. Many mappings in the wild are to varying degrees inaccurate, but without a confidence score and explicit mapping rules, this inaccuracy will not be transparent. + +_Non-transparent incompleteness_. Mapping sets can be incomplete for (at least) three major reasons: (1) they are out of date, i.e. a term in one ontology was removed (deprecated) in a later version of the ontology or a term with a more precise mapping was introduced; (2) they are deliberately partial, i.e. covering only a subset of terms, which were mapped for a specific purpose (for example a manual effort to map all COVID-19-relevant phenotypes from the Human Phenotype Ontology to the Mammalian Phenotype Ontology); or (3) they accidentally omit certain correct mappings, as the automated approaches that were used did not detect them (false negatives). We cannot determine whether a mapping set such as the one given in Table 1 is up-to-date, deliberately partial or accidentally incomplete without sufficient metadata about the purpose of the mappings, the tools used and the version of the source data used for the matching process. + +_UnFAIRness_. The FAIR principles are a set of community-developed guidelines to ensure that data or any digital object are Findable, Accessible, Interoperable and Reusable. Unlike many of the widely used controlled vocabularies, ontologies and data schemas, mappings are rarely published using standard formats and metadata vocabularies and can therefore be considered second class citizens in the world of FAIR semantics.
+While some tools exist to browse mappings (the F and A in FAIR, findable and accessible), such as OxO and BioPortal, they lack access to at least some of the metadata required to determine their applicability for a use case: Are mappings likely to be correct? Are they precise enough? Have they been updated recently? Can I trust the authority that generated the mappings? While some minimum level of interoperability (the I in FAIR) is achieved simply by publishing the mappings as RDF triples (which rarely happens in practice), most mappings are best captured in the form of simple tables (in our experience the preferred format for both mapping curators and data engineers). Furthermore, the predicates or relations used in the mappings are far from standardized. Different relations have different semantics, ranging from strong logical relations such as owl:sameAs or owl:equivalentClass to predicates with no formally specified semantics such as oboInOwl:hasDbXref. + +In our experience, reusability (the R in FAIR) is a significant obstacle to FAIRness. It is infeasible to simply reuse existing mappings without the metadata required to make imprecision, inaccuracy and incompleteness explicit. Repositories such as OxO and BioPortal cannot make mappings more accessible, because the metadata required to do so simply does not exist. In order to gradually improve our mappings and make them FAIRer, we need to be able to share, review, fix and maintain our mappings in much the same way as our ontologies themselves - using standard formats and rich metadata. +FAIRifying data is an effort that aims to supply practical solutions for the use of the FAIR guiding principles throughout the research data life cycle. It recommends technologies that support semantic interoperability in a sustainable way, and practices that support FAIRness. The FAIRSemantics effort is currently discussing how to incorporate semantic mappings, and we reached out to them to consider SSSOM for this purpose. 
+ +## Background about mappings + +A mapping can be defined as a triple _s, p, o_, where s is the subject of the mapping, p is the mapping predicate (or relation) and o is the object. There are many different mapping predicates used in practice, but they are not always standardized. The Semantic Web community uses a number of standard mapping predicates, such as owl:sameAs or owl:equivalentClass (logical mapping predicates) and skos:exactMatch or skos:broadMatch (terminological mapping predicates). We refer to mapping subjects and objects as "terms", which we will loosely define here as a set of symbols that define some entity in the real world. Usually, a term can be referred to by an identifier that uniquely identifies some entity in a certain context. For example, UBERON:0002101 is the identifier for a term that refers to the anatomical entity "limb". +Putting it all together, the mapping describes a correspondence in which the term with the id UBERON:0002101 constitutes a terminological exact match to the term with the identifier FMA:24875. Mappings between data model elements, databases and other representations can be described similarly. Note that we generally use the terms "matching" and "mapping" interchangeably. Occasionally we refer to "matching" as the process to determine a mapping candidate (lexical matching, logical matching etc), a "match" as the result of the matching process, and a "mapping" the process and result of the process that deduces a true correspondence from a (set of) matches. For SSSOM, this distinction is a bit academic, but useful to keep in mind when talking about the interplay of automated approaches (which result in "matches") and manual approaches (which typically result in the final mappings). Ontology alignment is the task of determining corresponding terms shared between two or more ontologies, i.e. mappings. Sometimes "ontology alignment" refers to the output of the alignment process. + +Mapping sets can be "partial", i.e. 
covering only a subset of terms in the subject or object source (ontology, database, etc), "derived", i.e. one mapping set can be obtained from one or more others (for example, a XAO to MeSH mapping can be obtained by combining a XAO-Uberon mapping with a Uberon-MeSH mapping), or "complete". We refer to a "complete" mapping, i.e. the set of all correspondences between two resources (ontologies, databases), as an "alignment". + +The identifier of a term has three parts: a namespace that describes in which database or ontology the identifier is defined, a local identifier that unambiguously identifies an entity within that namespace, and optionally a separator that can be used to separate the namespace from the local identifier to make them easier to process. UBERON:0002101, for example, comprises the namespace "UBERON", the separator ":" and the local identifier "0002101". There are various syntaxes for denoting identifiers; the UBERON:0002101 notation is called compact URI (CURIE) syntax, which is used widely across the database and ontology worlds. The problem with this syntax is that UBERON may not be a globally unique prefix, so files making use of such CURIEs must come with a prefix map that ensures that UBERON (in the CURIE syntax referred to as "prefix") is globally unique by mapping it to the persistent Internationalized Resource Identifier (IRI) prefix http://purl.obolibrary.org/obo/UBERON_. This may not be a major problem for a fairly unique prefix such as "UBERON", but it is for prefixes such as "ICD", which can refer to many different namespaces, such as ICD9, ICD10, ICD11 and more, all of which correspond to entirely different terminologies. + +_Approaches to mapping_. There are many different techniques that can be employed to generate term mappings. Automated matching techniques include ontology matching, entity resolution (the task of determining whether two database records correspond to the same entity), semantic similarity or automated reasoning. 
Recent approaches based on machine learning and graph embeddings show promise for working with messier inputs. No single tool will perform equally well on all inputs: some of the semantics-aware tools like LogMap and Agreement Maker Light (AML) can exploit the ontology structure to determine high-quality matches but will have problems with the large-scale data linking tasks required by modern big-data applications. + +Purely automated approaches to mapping are often insufficient for real world use cases that require a high degree of accuracy, such as medical diagnostics. They often need to be refined by hand or using sophisticated mapping reconciliation approaches independent of the actual matching. Determining a mapping is often complex, due to the high degree of terminological variability: different communities may use very different names for the same real world entities. For example, the condition referred to in the Human Phenotype Ontology (HPO) as "Hyperchloriduria" is called "increased urine chloride ion level" in the Mammalian Phenotype Ontology (MP), which is used by the model organism community. + +_Mapping rules - capturing the conditions under which a match is established_. +Mapping rules define the conditions under which we determine a match between two terms. For example, the condition for a mapping rule could be "if the subject label and object label match exactly". In practice, mapping rules can be very simple (e.g., "exact match of term labels"), more complex ("exact match between label of subject and exact synonym of object after they are pre-processed using stemming"), or even more exacting ("complex match determined by a human curator that carefully reviewed the descriptions and definitions of both terms and concluded they mean the same thing"). One problem for both manually curated mappings and automated approaches is that these mapping rules are often hidden deeply in the code or are not documented at all. 
Exposing mapping rules along with confidence scores would be very valuable for reviewing mappings and explaining them to users. Our reference implementation for SSSOM is rdf-matcher, which makes these mapping rules explicit, but other approaches such as OMOP2OBO also capture mapping rules as part of the mapping metadata. diff --git a/src/docs/spec-formats-json.md b/src/docs/spec-formats-json.md new file mode 100644 index 00000000..91cc1407 --- /dev/null +++ b/src/docs/spec-formats-json.md @@ -0,0 +1,5 @@ +# The JSON serialisation format + +The JSON serialisation format is currently unspecified. + +It is intended as a more-or-less direct serialisation of the `MappingSet` class into the JSON format as specified by [RFC 8259](https://datatracker.ietf.org/doc/html/rfc8259), but many details of the serialisation are left unspecified for now. diff --git a/src/docs/spec-formats-owl.md b/src/docs/spec-formats-owl.md new file mode 100644 index 00000000..c4a5d8b1 --- /dev/null +++ b/src/docs/spec-formats-owl.md @@ -0,0 +1,99 @@ +# The OWL/RDF serialisation format + +This section defines a way to serialise SSSOM mappings as _reified OWL axioms_. This has the advantage that any mapping set can be simply merged with an ontology in the usual way, for example using [ROBOT merge](https://robot.obolibrary.org/merge). + +The OWL/RDF serialisation rules deal with three types of reified OWL axioms, and a few sub-types: + +1. Predicate is an annotation property +2. Predicate is an object property and + 1. Object/Subject are classes + 2. Object/Subject are individuals +3. Predicate is a language relational construct of RDFS or OWL (`rdfs:subClassOf`, `owl:equivalentClass`) + +## Predicate is an annotation property + +If the predicate corresponds to an annotation property, the mapping `<S,P,O>` gets converted to an OWLAnnotationAssertion axiom: `OWLAnnotationAssertion(P,S,O)`. 
All mapping-level metadata (`meta`) gets converted into OWLAnnotation objects which are materialised as axiom annotations on the mapping annotation assertion, see [OWL 2 Structural Specification](https://www.w3.org/TR/owl2-syntax/#Annotations): + +``` +AnnotationAssertion(meta, P, S, O) +``` + +Where `meta` is a sequence of OWL Annotation objects like: + +``` +Annotation(Q1,V1) Annotation(Q2,V2) ... Annotation(Qn,Vn) +``` + +where `Qi` is a SSSOM metadata slot and `Vi` is an annotation value. + +Note that if a SSSOM metadata element value is a list `L` (i.e. can have multiple elements, such as creator and others), individual annotations are created for each of them: + +``` +Annotation(Q,V) for all V in L. +``` + +Example: + +``` +AnnotationAssertion(Annotation(sssom:creator_id ) Annotation(sssom:mapping_justification semapv:LexicalMatching) skos:exactMatch ) +``` + +Mapping set level annotations are manifested as ontology annotations in the usual way, according to the [OWL 2 Structural Specification](https://www.w3.org/TR/owl2-syntax/#Annotations). + +## Predicate is an object property + +### Case 1: Object and Subject are classes 
+ +The mapping `<S,P,O>` gets translated into an existential restriction: + +``` +SubClassOf(S, P some O) +``` + +All metadata slots are added as OWLAnnotation objects and added to the SubClassOf axiom as axiom annotations: + +``` +SubClassOf(meta, S, P some O) +``` + +Example: + +``` +SubClassOf(Annotation(sssom:creator_id ) Annotation(sssom:mapping_justification semapv:LexicalMatching) ObjectSomeValuesFrom( )) +``` + +### Case 2: Object and Subject are individuals + +The mapping `<S,P,O>` gets translated into an object property assertion: + +``` +ObjectPropertyAssertion(P, S, O) +``` + +All metadata slots are added as OWLAnnotation objects and added to the ObjectPropertyAssertion axiom as axiom annotations: + +``` +ObjectPropertyAssertion(meta, P, S, O) +``` + +Example: + +``` +ObjectPropertyAssertion(Annotation(sssom:creator_id ) Annotation(sssom:mapping_justification semapv:LexicalMatching) ) +``` + + +### Predicate is a language relational construct of RDFS or OWL + +The mapping `<S,P,O>` gets translated into an annotated axiom using the following table: + +| Mapping predicate | Generated axiom | +| ------------------- | --------------------------- | +| owl:equivalentClass | EquivalentClasses(meta, S, O) | +| rdfs:subClassOf | SubClassOf(meta, S, O) | + +Example: + +``` +SubClassOf(Annotation(sssom:creator_id ) Annotation(sssom:mapping_justification semapv:LexicalMatching) ) +``` diff --git a/src/docs/spec-formats-tsv.md b/src/docs/spec-formats-tsv.md new file mode 100644 index 00000000..6a12dc7f --- /dev/null +++ b/src/docs/spec-formats-tsv.md @@ -0,0 +1,292 @@ +# The SSSOM/TSV serialisation format + +The SSSOM/TSV format is intended as the main format for exchanging SSSOM mapping set objects. + +The RECOMMENDED filename extension for a SSSOM/TSV file is `.sssom.tsv`, but SSSOM/TSV parsers MUST accept SSSOM/TSV files regardless of their extension. + + +## Structure + +A SSSOM/TSV file contains one, and only one, mapping set object. 
It is made of two different parts: + +* the _metadata block_, which contains essentially all the slots of the [`MappingSet` class](MappingSet.md) except the `mappings` slot; +* the _mappings block_ (also called the _TSV section_), which contains the individual mappings. + +A SSSOM/TSV file MUST NOT contain anything other than those two blocks. + + +### Metadata block + +The metadata block is written as the [YAML 1.2](https://yaml.org/spec/1.2.2/) serialisation of the `MappingSet` object, except that the `mappings` slot is _not_ included (since it contains the mappings, which are serialised in the mappings block instead). + +The metadata block MUST appear at the beginning of the file. Every line of the block MUST be preceded by a `#` character; the `#` character MAY be followed by one or several space characters (U+0020) before the YAML content – if so, every line MUST have the same number of space characters. + +The metadata block ends with the first line that does not begin with a `#` character, which marks the beginning of the mappings block. + +The metadata block SHOULD only contain the slots that do have a value. SSSOM/TSV writers SHOULD skip slots with no value when serialising the mapping set object. + +#### Multi-valued slots with a single value + +As an exception to the standard YAML rules regarding the serialisation of sequences, a multi-valued slot that happens to contain a single value MAY be serialised as a scalar value rather than as a sequence containing only one item. 
+ +For example, a `creator_id` slot with the single value `ORCID:1111-2222-3333-4444` MAY be serialised as + +```yaml +creator_id: "ORCID:1111-2222-3333-4444" +``` + +This is, strictly speaking, invalid according to the YAML specification; the correct serialisation would be either + +```yaml +creator_id: [ "ORCID:1111-2222-3333-4444" ] +``` + +or + +```yaml +creator_id: + - "ORCID:1111-2222-3333-4444" +``` + +but the scalar form is frequently found in existing SSSOM/TSV files, so SSSOM/TSV parsers SHOULD accept it. SSSOM/TSV writers SHOULD favour one of the correct YAML serialisations, however. + +#### Forbidden YAML features + +The following features of the YAML 1.2 specification MUST NOT be used within the metadata block: + +* YAML directives ([YAML 1.2 §6.8.1](https://yaml.org/spec/1.2.2/#681-yaml-directives)); +* TAG directives ([YAML 1.2 §6.8.2](https://yaml.org/spec/1.2.2/#682-tag-directives)); +* Node tags ([YAML 1.2 §6.9.1](https://yaml.org/spec/1.2.2/#691-node-tags)); +* Node anchors ([YAML 1.2 §6.9.2](https://yaml.org/spec/1.2.2/#692-node-anchors)); +* Alias nodes ([YAML 1.2 §7.1](https://yaml.org/spec/1.2.2/#71-alias-nodes)). + +SSSOM/TSV writers MUST NOT generate any of those when writing the metadata block. The expected behaviour of SSSOM/TSV parsers upon encountering them is unspecified. + + +### Mappings block + +The mappings block contains the mappings, serialised as a matrix where each line represents an individual mapping and each column (separated by tab characters, U+0009) represents one of the slots of the [`Mapping` class](Mapping.md). + +The mappings block MUST immediately follow the metadata block within a SSSOM/TSV file. It starts with a header line containing the column names, which are the names of the slots in the `Mapping` class. + +There SHOULD be no empty columns. If none of the mappings in a set has a value for a given slot, that slot SHOULD be skipped when writing the header line and the individual mappings. 
+ +Multi-valued slots MUST be serialised as a list of values separated by `|` characters. + +#### Quoting + +Within the mappings block, the following quoting rules, adapted from [RFC 4180](https://datatracker.ietf.org/doc/html/rfc4180), apply: + +1. Any value MAY be enclosed in double quotes (`"`). +2. Values containing line breaks, double quotes, or tabs (U+0009) MUST be enclosed in double quotes. +3. When a value is enclosed in double quotes, a double quote appearing within the value MUST be escaped by preceding it with another double quote. + +SSSOM/TSV parsers MUST strip any enclosing double quotes and escaping double quotes before passing the parsed objects to the application code. + + +## External metadata mode + +The metadata block MAY be stored in a separate file from the TSV section, instead of preceding it in the same file as described above. This is called the _external metadata mode_ (by contrast, when the two blocks are in the same file, this is called the _embedded metadata mode_). + +In external mode, the metadata block follows the same rules as described in the [Metadata block](#metadata-block) section above, except that lines MUST NOT start with a `#` character. + +It is RECOMMENDED that the file containing the metadata block has the same basename as the file containing the TSV section, with a `.sssom.yml` extension. + +When an external metadata file is used, the file containing the TSV section MUST NOT contain anything else than the TSV section. That is, the first line of that file MUST be the header line containing the column names. + +Implementations SHOULD support reading SSSOM/TSV files in external metadata mode; they MAY support writing SSSOM/TSV files in that mode. + + +## Encoding + +SSSOM/TSV files MUST be encoded in UTF-8 ([RFC 3629](https://datatracker.ietf.org/doc/html/rfc3629#section-13)). They MUST NOT start with a byte order mark (U+FEFF). 
This applies to external metadata files as well, when the [external metadata mode](#external-metadata-mode) is used. + + +## Identifiers + +All identifiers in a SSSOM/TSV file, that is, all the values of slots typed as [EntityReference](EntityReference.md), MUST be serialised in [CURIE syntax](https://www.w3.org/TR/curie/). SSSOM/TSV parsers SHOULD reject files containing identifiers serialised as IRIs. + +To allow unambiguous resolution of all CURIEs present in a SSSOM/TSV file, the metadata block MUST contain an additional `curie_map` field, which is a map of prefix names to IRI prefixes. The `curie_map` field SHOULD appear at the beginning of the metadata block. + +Any prefix name used in a SSSOM/TSV file MUST be declared with a corresponding entry in the CURIE map. SSSOM/TSV parsers MUST reject a file with undeclared prefix names. + +Prefix names listed in the table found in the [IRI prefixes](spec-intro.md#iri-prefixes) section are considered “built-in”. As such, they MAY be omitted from the CURIE map. If they are not omitted, they MUST point to the same IRI prefixes as in the aforementioned table. + + +## Propagatable slots + +As [explained in another section](spec-model.md#propagation-of-mapping-set-slots), some slots in the `MappingSet` class are intended, not to describe the mapping set itself, but to store values that are shared by all mappings in the set. These slots are called the “propagatable slots”, because their values should be “propagated” from the mapping set down to the individual mappings. + +### Propagation + +“Propagation” is the operation of assigning to individual mappings in a set the values from the propagatable slots of the set. That operation SHOULD be performed by a SSSOM/TSV parser before passing the parsed objects to the application code. + +For any given propagatable slot, propagation is only allowed if none of the individual mappings already have their own value in that slot. 
If any mapping (even only one mapping) has a value in that slot, then the slot MUST be considered as non-propagatable. Otherwise, a propagating SSSOM/TSV parser MUST (1) copy over the value of the propagatable slot on the `MappingSet` object to the corresponding slot of every individual `Mapping` object, and (2) remove the propagated value from the `MappingSet` object. + +Implementations that support propagation MUST also support condensation. + +### Condensation + +“Condensation” is the opposite of “propagation”. It is the operation of assigning common values to the propagatable slots of the set, based on the values of these slots on individual mappings. That operation SHOULD be performed by a SSSOM/TSV writer prior to writing a set into a SSSOM/TSV file, but that behaviour, if available, MUST be deactivatable. + +For any given propagatable slot, condensation is only allowed if (1) all mappings in the set have the same value, and (2) the mapping set does not already have a value in the slot, unless that value happens to be the same as the value in all mappings. If those two conditions are met, then a condensing SSSOM/TSV writer MUST (1) set the value of the slot on the `MappingSet` object to the common value of the slot in all mappings, and (2) remove the condensed value from every individual `Mapping` object. + +Implementations that support condensation MUST also support propagation. + + +## Compatibility with previous versions of the specification + +Implementations MUST support the current version of the specification. However, SSSOM/TSV parsers MAY additionally accept files that were compliant with a previous version. This section provides advice for implementations willing to support older versions. + +### Compatibility with pre-1.0 versions + +#### `match_type` slot + +Initial versions of this specification defined a `match_type` slot on the `Mapping` class. 
The slot was intended to describe the kind of match that led to the mapping, and accepted values from a specific enumeration. In SSSOM 0.9.1, this slot was replaced by the `mapping_justification` slot, and the enumeration was replaced by terms from the [SEMAPV vocabulary](https://mapping-commons.github.io/semantic-mapping-vocabulary/). + +Upon encountering a `match_type` slot, implementations supporting pre-1.0 versions MUST silently transform it into a `mapping_justification` slot and convert the enumeration values using the following table: + +| `match_type` value | `mapping_justification` value | +| ------------------ | ----------------------------- | +| Lexical | semapv:LexicalMatching | +| Logical | semapv:LogicalMatching | +| HumanCurated | semapv:ManualMappingCuration | +| Complex | semapv:CompositeMatching | +| Unspecified | semapv:UnspecifiedMatching | +| SemanticSimilarity | semapv:SemanticSimilarityThresholdMatching | + +Any other value in the `match_type` slot MUST be treated as an error. + +If the set contains both `match_type` and `mapping_justification` slots, it is advised to simply ignore the former. + + +#### `match_term_type` slot + +Initial versions of this specification defined a `match_term_type` slot on the `Mapping` class. The slot was intended to describe what was being matched. In SSSOM 0.9.1, this slot was replaced by two distinct slots called `subject_type` and `object_type` (this notably allowed for the case where the subject and the object are of a different type, something the `match_term_type` slot did not support). 
+ +Upon encountering a `match_term_type` slot, implementations supporting pre-1.0 versions MUST silently transform it into a pair of `subject_type` and `object_type` slots, both slots having the same value derived from the original value using the following table: + +| `match_term_type` value | `subject_type` and `object_type` value | +| ----------------------- | -------------------------------------- | +| ConceptMatch | skos concept | +| ClassMatch | owl class | +| ObjectPropertyMatch | owl object property | +| IndividualMatch | owl named individual | +| DataPropertyMatch | owl data property | +| TermMatch | rdfs literal | + +Any other value in the `match_term_type` slot MUST be treated as an error. + +If the set already contains `subject_type` and `object_type` slots, any `match_term_type` slot can be silently ignored. + + +## Canonical SSSOM/TSV format + +This section defines a “canonical” variant of the SSSOM/TSV format, which has stricter serialisation rules. The purpose of the canonical SSSOM/TSV format is to minimise differences across SSSOM/TSV files that would be induced by small diverging behaviours between different SSSOM/TSV implementations. + +The rules in this section apply to SSSOM/TSV writers only. SSSOM/TSV writers SHOULD write files in the canonical format, but SSSOM/TSV readers MUST NOT reject a file solely because it does not follow the canonical rules. + +### General rules + +A canonical SSSOM/TSV writer: + +* MUST use line breaks made of only the U+000A character (no U+000D, and no U+000D + U+000A sequences); +* MUST condense the slots whenever possible, as described in the [Condensation](#condensation) section. 
+ + +### Rules for the metadata block + +When writing the metadata block, a canonical SSSOM/TSV writer: + +* MUST embed the metadata block in the same file as the TSV section (no external metadata); +* MUST NOT insert additional space characters between the initial `#` character and the YAML content; +* MUST serialise multi-valued slots as YAML “block sequences” ([YAML Specification §8.2.1](https://yaml.org/spec/1.2.2/#821-block-sequences)) – even when the list of values contains only one item; +* MUST serialise scalar values in YAML “plain style” ([YAML Specification §7.3.3](https://yaml.org/spec/1.2.2/#733-plain-style)) whenever possible, otherwise in “double-quoted style” ([YAML Specification §7.3.1](https://yaml.org/spec/1.2.2/#731-double-quoted-style)); +* MUST serialise the slots in the order they appear in the [“Slots” table](MappingSet.md#slots), in the documentation for the `MappingSet` class; +* MUST write the `curie_map` at the beginning of the block, before any other slots; +* MUST NOT include in the CURIE map the prefix names that are considered “built-in”; +* MUST NOT include in the CURIE map any prefix name that is not used anywhere in the set; +* MUST sort the prefix names in the CURIE map in lexicographical order. + + +### Rules for the mappings block + +When writing the mappings block, a canonical SSSOM/TSV writer: + +* MUST quote values only when needed, as per the rules in the [Quoting](#quoting) section; +* MUST serialise floating point values with up to three digits as needed after the decimal point, rounding the last digit to the nearest neighbour (rounding up if both neighbours are equidistant); +* MUST write the columns in the order the slots appear in the [“Slots” table](Mapping.md#slots), in the documentation for the `Mapping` class; +* MUST sort the mappings in lexicographical order on all their slots, in the order the slots appear in the [“Slots” table](Mapping.md#slots). + + +## Examples + +This section is _non-normative_. 
+ +A SSSOM/TSV file in embedded metadata mode: + +``` +#curie_map: +# FOODON: http://purl.obolibrary.org/obo/FOODON_ +# KF_FOOD: https://kewl-foodie.inc/food/ +# orcid: https://orcid.org/ +#mapping_set_id: https://w3id.org/sssom/tutorial/example1.sssom.tsv +#mapping_set_description: Manually curated alignment of KEWL FOODIE INC internal food and nutrition database with Food Ontology (FOODON). Intended to be used for ontological analysis and grouping of KEWL FOODIE INC related data. +#license: https://creativecommons.org/licenses/by/4.0/ +#mapping_date: 2022-05-02 +subject_id subject_label predicate_id object_id object_label mapping_justification author_id confidence comment +KF_FOOD:F001 apple skos:exactMatch FOODON:00002473 apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 0.95 "We could map to FOODON:03310788 instead to cover sliced apples, but only ""whole"" apple types exist." +KF_FOOD:F002 gala skos:exactMatch FOODON:00003348 Gala apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 1 +KF_FOOD:F003 pink skos:exactMatch FOODON:00004186 Pink apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 0.9 "We could map to FOODON:00004187 instead which more specifically refers to ""raw"" Pink apples. Decided against to be consistent with other mapping choices." +KF_FOOD:F004 braeburn skos:broadMatch FOODON:00002473 apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 1 +``` + +The same set in external metadata mode: first the file containing the metadata block: + +```yaml +curie_map: + FOODON: http://purl.obolibrary.org/obo/FOODON_ + KF_FOOD: https://kewl-foodie.inc/food/ + orcid: https://orcid.org/ +mapping_set_id: https://w3id.org/sssom/tutorial/example1.sssom.tsv +mapping_set_description: Manually curated alignment of KEWL FOODIE INC internal food and nutrition database with Food Ontology (FOODON). Intended to be used for ontological analysis and grouping of KEWL FOODIE INC related data. 
+license: https://creativecommons.org/licenses/by/4.0/ +mapping_date: 2022-05-02 +``` + +then the file containing the mappings block: + +``` +subject_id subject_label predicate_id object_id object_label mapping_justification author_id confidence comment +KF_FOOD:F001 apple skos:exactMatch FOODON:00002473 apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 0.95 "We could map to FOODON:03310788 instead to cover sliced apples, but only ""whole"" apple types exist." +KF_FOOD:F002 gala skos:exactMatch FOODON:00003348 Gala apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 1 +KF_FOOD:F003 pink skos:exactMatch FOODON:00004186 Pink apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 0.9 "We could map to FOODON:00004187 instead which more specifically refers to ""raw"" Pink apples. Decided against to be consistent with other mapping choices." +KF_FOOD:F004 braeburn skos:broadMatch FOODON:00002473 apple (whole) semapv:ManualMappingCuration orcid:0000-0002-7356-1779 1 +``` + +### Invalid examples + +Illegal case 1: the metadata block cannot contains comments that are not part of the metadata. + +``` +# This is a comment that does not belong here. +#curie_map: +# HP: "http://purl.obolibrary.org/obo/HP_" +# MP: "http://purl.obolibrary.org/obo/MP_" +# orcid: "https://orcid.org/" +# This is another comment that also does not belong here. +#creator_id: +# - "orcid:0000-0002-7356-1779" +``` + +Illegal case 2: there should be no empty lines. 
+ +``` +#curie_map: +# HP: "http://purl.obolibrary.org/obo/HP_" +# MP: "http://purl.obolibrary.org/obo/MP_" +# orcid: "https://orcid.org/" + +#creator_id: +# - "orcid:0000-0002-7356-1779" +``` diff --git a/src/docs/spec-formats.md b/src/docs/spec-formats.md new file mode 100644 index 00000000..dec5f98c --- /dev/null +++ b/src/docs/spec-formats.md @@ -0,0 +1,9 @@ +# SSSOM serialisation formats + +The SSSOM standard defines the following serialisation formats for storing and exchanging mapping sets: + +* the [SSSOM/TSV](spec-formats-tsv.md) format; +* the [SSSOM JSON](spec-formats-json.md) format; +* and the [OWL/RDF](spec-formats-owl.md) format. + +Implementations MUST support the SSSOM/TSV format. They MAY support the other formats. diff --git a/src/docs/spec-intro.md b/src/docs/spec-intro.md new file mode 100644 index 00000000..1dacae7a --- /dev/null +++ b/src/docs/spec-intro.md @@ -0,0 +1,29 @@ +# Specification of the SSSOM standard + +This document is the official specification for the SSSOM standard. + +It is divided in two sections covering the two different components of the standard: + +* the specification for the [data model](spec-model.md), to manipulate SSSOM mappings and mapping sets in your programs; +* the specification for the [serialisation formats](spec-formats.md), to read, write, and exchange SSSOM mapping sets. + +Both sections are _normative_. + +## Conventions used in this document + +### Key words + +Throughout the specification, the key words “MUST”, “MUST NOT”, “REQUIRED”, “SHALL”, “SHALL NOT”, “SHOULD”, “SHOULD NOT”, “RECOMMENDED”, “NOT RECOMMENDED”, “MAY”, and “OPTIONAL” are to be interpreted as described in [BCP 14](https://datatracker.ietf.org/doc/html/bcp14) when, and only when, they appear in all capitals, as shown here. 
+ +### IRI prefixes + +Throughout the specification, the following IRI prefix names are used: + +| Prefix name | IRI prefix | +| ----------- | ---------- | +| owl | http://www.w3.org/2002/07/owl# | +| rdf | http://www.w3.org/1999/02/22-rdf-syntax-ns# | +| rdfs | http://www.w3.org/2000/01/rdf-schema# | +| semapv | https://w3id.org/semapv/vocab/ | +| skos | http://www.w3.org/2004/02/skos/core# | +| sssom | https://w3id.org/sssom/ | diff --git a/src/docs/spec-model.md b/src/docs/spec-model.md new file mode 100644 index 00000000..bf33644e --- /dev/null +++ b/src/docs/spec-model.md @@ -0,0 +1,83 @@ +# The SSSOM data model + +The SSSOM data model (hereafter “the model”) defines the data structure to represent and manipulate SSSOM concepts. The model is formally described as a [LinkML](https://linkml.io/) schema, from which the [documentation](linkml-index.md) is derived. + +This section provides an overview of the model and supplementary information that may not be found in the schema (and its derived documentation) itself. Of note, the schema, not this section, is always the authoritative source of truth for all questions pertaining to the model. + +## Overview + +The model consists of a handful of classes, the most important of them being the [`Mapping` class](Mapping.md) and the [`MappingSet` class](MappingSet.md). Any SSSOM implementation MUST support those two classes and all their slots; support for the other classes is OPTIONAL. + +The `Mapping` class represents an individual mapping. Fundamental slots in that class are: + +* `subject_id` and `object_id`, referring to the entities being mapped to each other; +* `predicate_id`, referring to the relationship between the mapped entities; +* `mapping_justification`, which should provide the justification for the mapping. 
+ +Those slots are mandatory (including the `mapping_justification` slot: the SSSOM standard posits that there can be no mapping without some form of justification) and an implementation MUST NOT allow the creation of a mapping object that is missing a value for one or more of them. + +Other slots are intended to provide further details about a mapping. Those “further details” are sometimes referred to as “mapping metadata”, though the SSSOM standard makes no formal distinction between “data” and “metadata” – there are only “data about a mapping”. + +The `MappingSet` class represents, well, a set of individual mappings, which are contained in the `mappings` slot (a list of `Mapping` instances). Other slots in that class are intended either to provide further details about the set itself (sometimes referred to as “mapping set metadata”, with the same caveat as above regarding the data/metadata distinction), or to provide common details for all the mappings in the set (see the [Propagation of mapping set slots](#propagation-of-mapping-set-slots) section further below for details). + +Of note, within a set, a mapping may not necessarily be uniquely identified by the combination of its four mandatory slots (`subject_id`, `predicate_id`, `object_id`, and `mapping_justification`). A set may very well contain several mappings with the same subject, predicate, object, and justification, but that differ on some of the other, complementary slots. + + +## Propagation of mapping set slots + +As mentioned briefly above, there are two different types of slots in the `MappingSet` class: + +* slots that provide information about the set itself; +* slots that provide information about all the mappings in the set. + +The latter are called “propagatable slots”. 
The propagatable slots are: + +* `mapping_date`, +* `mapping_provider`, +* `mapping_tool`, +* `mapping_tool_version`, +* `object_match_field`, +* `object_preprocessing`, +* `object_source`, +* `object_source_version`, +* `object_type`, +* `subject_match_field`, +* `subject_preprocessing`, +* `subject_source`, +* `subject_source_version`, +* `subject_type`. + +(In a future version of this specification, this information will be formally specified directly within the LinkML schema.) + +When a mapping set object has a value in one of its propagatable slots, this MUST be interpreted as if all mappings within the set had that same value in their corresponding slot. For example, if a set has the value _foo_ in its `mapping_tool` slot, all the mappings in that set MUST be treated as if they had the value _foo_ in their `mapping_tool` slot. + +This mechanism is intended as a convenience, so that a slot which has the same value for all mappings in a set can be specified only once at the level of the set rather than for each individual mapping. + +Slots that are not in the above list (“non-propagatable slots”) describe the mapping set itself, not the mappings it contains, even if the slot also exists on the `Mapping` class. For example, the `creator_id` slot, when used in the `MappingSet` class, is intended to refer to the creators of the set, _not_ the creators of the individual mappings (which may be different, and which are listed in the `creator_id` slot of every mapping). + + +## Allowed and common mapping predicates + +Implementations MUST accept any arbitrary predicate in the `predicate_id` slot. + +The following mapping predicates are considered common, and implementations MAY encourage users to use them: + +| Predicate | Description | +| --------- | ----------- | +| owl:sameAs | The subject and the object are instances (OWL individuals), and the two instances are the same. 
| +| owl:equivalentClass | The subject and the object are OWL classes, and the two classes are the same. | +| owl:equivalentProperty | The subject and the object are OWL object, data, or annotation properties, and the two properties are the same. | +| rdfs:subClassOf | The subject and the object are OWL classes, and the subject is a subclass of the object. | +| rdfs:subPropertyOf | The subject and the object are OWL object, data, or annotation properties, and the subject is a subproperty of the object. | +| skos:relatedMatch | The subject and the object are associated in some unspecified way. | +| skos:closeMatch | The subject and the object are sufficiently similar that they can be used interchangeably in some information retrieval applications. | +| skos:exactMatch | The subject and the object can, with a high degree of confidence, be used interchangeably across a wide range of information retrieval applications. | +| skos:narrowMatch | The object is a narrower concept than the subject. | +| skos:broadMatch | The object is a broader concept than the subject. | +| oboInOwl:hasDbXref | Two terms are related in some way. The meaning is frequently consistent across a single set of mappings. Note this property is often overloaded even where the terms are of a different nature (e.g. interpro2go). | +| rdfs:seeAlso | The subject and the object are associated in some unspecified way. The object IRI often resolves to a resource on the web that provides additional information. | + +In addition, predicates from the following sources MAY also be encouraged: + +* any relation from the [Relation Ontology (RO)](https://obofoundry.org/ontology/ro.html); +* any relation under [skos:mappingRelation](http://www.w3.org/2004/02/skos/core#mappingRelation) in the [Semantic Mapping Vocabulary](https://mapping-commons.github.io/semantic-mapping-vocabulary/). 
diff --git a/src/docs/spec.md b/src/docs/spec.md deleted file mode 100644 index d57c15e4..00000000 --- a/src/docs/spec.md +++ /dev/null @@ -1,492 +0,0 @@ -# Simple Standard for Sharing Ontological Mappings (SSSOM) - -![SSSOM banner](images/sssom-banner.png) - -Development Draft (under construction: some metadata fields may be subject to change) - -*Editors:* - -* [Nicolas Matentzoglu](https://orcid.org/0000-0002-7356-1779) (Semanticly Ltd; [@matentzn](https://github.com/matentzn)) -* [Chris Mungall](https://orcid.org/0000-0002-6601-2165) (LBL) -* [Ernesto Jimenez-Ruiz](https://orcid.org/0000-0002-9083-4599) (City, University of London) -* [John Graybeal](https://orcid.org/0000-0001-6875-5360) (Stanford) -* [William Duncan](https://orcid.org/0000-0001-9625-1899) (LBL) -* [David Osumi-Sutherland](https://orcid.org/0000-0002-7073-9172) (EMBL-EBI) -* [Simon Jupp](https://orcid.org/0000-0002-0643-3144) (SciBite) -* [James McLaughlin](https://orcid.org/0000-0002-8361-2795) (EMBL-EBI) -* [Henriette Harmse](https://orcid.org/0000-0001-7251-9504) (EMBL-EBI) -* [Tiffany Callahan](https://orcid.org/0000-0002-8169-9049) ([@callahantiff](https://github.com/callahantiff)) -* [Charlie Hoyt](https://orcid.org/0000-0003-4423-4370) (Harvard Medical School; [@cthoyt](https://github.com/cthoyt)) -* [Thomas Liener](https://orcid.org/0000-0003-3257-9937) (Pistoia Alliance) -* [Harshad Hegde](https://orcid.org/0000-0002-2411-565X) (LBL) - -*Contributors:* - -* [Alasdair Gray](https://orcid.org/0000-0002-5711-4872) -* [Alex Wagner](https://orcid.org/0000-0002-2502-8961) -* [Amelia L. Hoyt](https://orcid.org/0000-0003-1307-2508) -* [Andrew Williams](https://orcid.org/0000-0002-0692-412X) -* [Anne Thessen](https://orcid.org/0000-0002-2908-3327) -* [Benjamin M. Gyori](https://orcid.org/0000-0001-9439-5346) -* [Bill Baumgartner](https://orcid.org/0000-0001-6717-5313) -* [Christopher Chute](https://orcid.org/0000-0001-5437-2545) -* [Chris T. 
Evelo](https://orcid.org/0000-0002-5301-3142) -* [Damion Dooley](https://orcid.org/0000-0002-8844-9165) -* [Davera Gabriel](https://orcid.org/0000-0001-9041-4597) -* [Harold Solbrig](https://www.wikidata.org/wiki/Q44607574) -* [HyeongSik Kim](https://orcid.org/0000-0002-3002-9838) -* [Ian Harrow](https://orcid.org/0000-0003-0109-0522) -* [James Malone](https://orcid.org/0000-0002-1615-2899) -* [James Overton](https://orcid.org/0000-0001-5139-5557) -* [James P. Balhoff](https://orcid.org/0000-0002-8688-6599) -* [James Stevenson](https://orcid.org/0000-0002-2568-6163) -* [Jiao Dahzi](https://orcid.org/0000-0001-5052-3836) -* [Joe Flack](https://orcid.org/0000-0002-2906-7319) -* [Jooho Lee](https://orcid.org/0000-0002-2955-3405) -* [Julie McMurry](https://orcid.org/0000-0002-9353-5498) -* [Kori Kuzma](https://orcid.org/0000-0002-9954-7449) -* [Kristin Kostka](https://orcid.org/0000-0003-2595-8736) -* [Lauren Chan](https://orcid.org/0000-0002-7463-6306) -* [Melissa Haendel](https://orcid.org/0000-0001-9114-8737) -* [Monica Munoz-Torres](https://orcid.org/0000-0001-8430-6039) -* [Nicole Vasilevsky](https://orcid.org/0000-0001-5208-3432) -* [Nomi Harris](https://orcid.org/0000-0001-6315-3707) -* [Núria Queralt-Rosinach](https://orcid.org/0000-0003-0169-8159) -* [Sabrina Toro](https://orcid.org/0000-0002-4142-7153) -* [Sebastian Koehler](https://orcid.org/0000-0002-5316-1399) -* [Shahim Essaid](https://orcid.org/0000-0003-2338-2550) -* [Sierra Moxon](https://orcid.org/0000-0002-8719-7760) -* [Sue Bello](https://orcid.org/0000-0003-4606-0597) -* [Tim Putman](https://orcid.org/0000-0002-4291-0737) - -*Quick links*: - -- [SSSOM on Wikidata](https://www.wikidata.org/wiki/Q108394480) -- [SSSOM Python toolkit on Wikidata](https://www.wikidata.org/wiki/Q108394654) -- [SSSOM Python toolkit: Official Documentation](https://mapping-commons.github.io/sssom-py) -- [Presentations](presentations.md) - -## Abstract - -Mappings, or cross-references, are used to link terms across 
different ontologies. However, there is currently little to no standardisation in how such mappings are represented. While properties such as hasDbXref property are widely used in ontologies such as GO and MONDO, the meaning of such mappings is unclear, and cannot be further described with additional metadata or provenance. - -The Simple Standard for Sharing Ontology Mappings (SSSOM) is an initiative to provide a minimal and standard set of elements for the dissemination of mappings between ontology terms, to ensure a reliable interpretation of generated mappings and to enable sharing and data integration between people and applications. - -This document introduces the SSSOM catalog of metadata elements, which can be used to attach meta- and provenance data to both mappings and sets of mappings; a controlled vocabulary for the description of match types (SSSOM CV); a definition of both RDF and TSV serialisations of ontology mappings; and a (non-exhaustive) selection of recommended mapping predicates. - -## Table of Contents - -* [Introduction](#intro) -* [SSSOM Metadata Elements](#meta) -* [SSSOM Common Predicates](#predicates) -* [SSSOM Serialisation](#serialisation) -* [SSSOM Use Cases](#usecase) - - - -## Introduction - -Currently, there are three methods typically used to express mappings in OWL: direct logical axioms using owl:equivalentClass; the oboInOwl hasDbXref property; and the SKOS vocabulary for mapping properties. The first, owl:equivalentClass, is a strong logical equivalence assertion which is not appropriate for more nuanced mappings such as close matches. The second, hasDbXref, does not assert formal logical equivalence but also has no clearly defined meaning. Finally, the SKOS vocabulary provides a hierarchy of mapping properties which allow the unambigous specification of exact, close, broad, and narrow matches, but does not provide the means for mappings to be annotated with additional metadata such as confidence scores and provenance. 
- -The Simple Standard for Sharing Ontology Mappings (SSSOM) addresses these problems by defining a catalog of metadata terms to describe mappings. Both individual mappings and **_sets of_** mappings can be described, enabling provenance and metadata to be captured on multiple levels. SSSOM interoperates with existing methods for the specification of mappings, allowing any predicate to be used to describe the nature of each mapping including those from OWL and SKOS. - -The provenance of mappings - such as whether the mapping was created as the result of a human-curated equivalence match, or a semantic similarity match - is specified using a controlled vocabulary (CV), SSSOM CV. Combined with the metadata properties provided by SSSOM such as confidence and semantic_similarity_score, this provenance information can be used to capture mapping descriptions in a manner that is explicit and amenable to curation. - -Two serialisations for SSSOM mappings are provided in this document, aimed at different communities: an RDF/OWL serialisation using IRIs that is aimed at the Knowledge Graph/Semantic Web community, and a TSV serialisation using [CURIE](https://www.w3.org/TR/curie/) syntax which is aimed at the wider bioinformatics community. An unambiguous translation between these serialisations is provided. - -### Challenges for exchange and use of mappings -Despite their importance for data integration, term mappings are typically neglected as data artefacts (57). A mapping in this context is a correspondence between two terms, referred to here as "subject" and "object" terms. A "predicate" defines the type of relationship between the subject and the object, such as skos:exactMatch, or owl:equivalentClass. A mapping, or "match", does not have to be exact: it can be broad, e.g. between a conceptually narrow term such as "Red Delicious" and a conceptually broader term such as "Apple". 
To our knowledge, no formal review has been published that analyzes the representation and formats used for collections of term mappings (mapping sets, or alignments), but in our experience, most mapping sets are represented as tables using an ad-hoc "schema", often merely a simple two-column format that lists matching terms in two naming schemes. An example of such a table can be seen in the following Table. - - -Subject | Object ---- | --- -UBERON:0002101 | FMA:24875 -UBERON:0000019 | FMA:54448 - -``` -Table 1: An example of a typical mapping table one might find on the web. -``` - -This type of table lacks clear semantics and is therefore very difficult to use and re-use either by humans or by machines. We will discuss a few of the most critical problems in the sections that follow. - -_Non-transparent imprecision_. Mapping precision describes, usually qualitatively, whether a mapping between a subject and an object is exact, broad, narrow, close or related. An exact mapping means that the subject term can be replaced with the object term and vice versa, i.e. they refer to the exact same real-world entity. A broad mapping links a subject term to a more general term, for example, the term "leg" to the term "hindlimb" (if the ontology defines leg as the parts of the hindlimb that exclude the foot). A narrow mapping links a subject term to a more specific term. For example, "long QT syndrome" in the Mondo Disease Ontology is a narrow match to "Romano-Ward long QT syndrome" in Orphanet. A close mapping relates two terms that are neither exact, broad or narrow, but belong to the same category of things and are semantically similar, such as "apple" to "pear", or "paw" to "hand". Due to its subjective nature ("what is close?"), this is a problematic category of mapping, but it is widely used, for example for relating similar anatomical terms across species. 
Related mappings are mappings across categories of things, such as the mapping between a phenotype "enlarged liver" and the anatomical entity "liver". In practice, it is rare that mapping tables such as the one presented in Table 1 constitute a set of purely "exact" matches. - -Different use cases may require different levels of mapping precision. For example, for entity merging (defined as the process of merging two entities from different sources into one) or data translation (defined as the process of moving annotations from using one ontology to another), exact mappings may be required, while for data grouping broad matches are often sufficient (ensuring that the subject is classified under the object term). For many machine learning use cases, close and related matches will be extremely useful regardless of their lack of semantic precision (though semantic precision is likely to improve predictive power). In practice, many mappings are to varying degrees imprecise but do not specify the mapping precision. This makes it impossible to reliably apply them to use cases such as entity merging or data translation. - -_Non-transparent accuracy, confidence, and provenance_. To scale to real-world use cases, automated tools are critical for matching terms across databases, terminologies and ontologies. Such tools typically implement mapping rules that determine whether a given pair of terms constitutes a match. For example, label matching rules might include "match if subject and object labels match", "match if subject label matches with an exact synonym of the object" and "match if subject and object exhibit a very high degree of semantic similarity". Depending on the rules, tools will have more or less confidence that a match constitutes a mapping. 
Even human curators often have different levels of confidence about the accuracy of any given mapping, especially if the process of determining whether a mapping is accurate involves the review of (often complex) descriptions and term definitions. - -Different use cases will profit from different degrees of accuracy. For example, if we seek to integrate data from various medical terminologies to inform medical diagnosis, we may require not only a very high degree of confidence about the mapping but also ensure that the mapping is "explainable" to users. To ensure that diagnostic decisions that require bridging of data silos through mappings are explainable, we furthermore need provenance (documentation of where a piece of data comes from and how it was produced), such as an explicit statement of the mapping rules by which the match was originally determined (for example, the labels of both terms could have been the same). Thanks to efforts by initiatives such as the Ontology Alignment Evaluation Initiative (OAEI), many mapping tables on the web include at least a confidence score. However, in our experience, mapping rules are rarely stated explicitly as part of the mappings or mapping set metadata. Many mappings in the wild are to varying degrees inaccurate, but without a confidence score and explicit mapping rules, this inaccuracy will not be transparent. - -_Non-transparent incompleteness_. Mapping sets can be incomplete for (at least) three major reasons: (1) they are out of date, i.e. a term in one ontology was removed (deprecated) in a later version of the ontology or a term with a more precise mapping was introduced; (2) they are deliberately partial, i.e. 
covering only a subset of terms, which were mapped for a specific purpose (for example a manual effort to map all COVID-19-relevant phenotypes from the Human Phenotype Ontology to the Mammalian Phenotype Ontology); or (3) they accidentally omit certain correct mappings, as the automated approaches that were used did not detect them (false negatives). We cannot determine whether a mapping set such as the one given in Table X1 is up-to-date, deliberately partial or accidentally incomplete without sufficient metadata about the purpose of the mappings, the tools used and the version of the source data used for the matching process. - -_UnFAIRness_. The FAIR principles are a set of community-developed guidelines to ensure that data or any digital object are Findable, Accessible, Interoperable and Reusable. Unlike many of the widely used controlled vocabularies, ontologies and data schemas, mappings are rarely published using standard formats and metadata vocabularies and can therefore be considered second class citizens in the world of FAIR semantics. -While some tools exist to browse mappings (the F and A in FAIR, findable and accessible), such as OxO and BioPortal, they lack access to at least some of the metadata required to determine their applicability for a use case: Are mappings likely to be correct? Are they precise enough? Have they been updated recently? Can I trust the authority that generated the mappings? While some minimum level of interoperability (the I in FAIR) is achieved simply by publishing the mappings as RDF triples (which rarely happens in practice), most mappings are best captured in the form of simple tables (in our experience the preferred format for both mapping curators and data engineers). Furthermore, the predicates or relations used in the mappings are far from standardized. 
Different relations have different semantics, ranging from strong logical relations such as owl:sameAs or owl:equivalentClass to predicates with no formally specified semantics such as oboInOwl:hasDbXref. - -In our experience, reusability (the R in FAIR) is a significant obstacle to FAIRness. It is infeasible to simply reuse existing mappings without the metadata required to make imprecision, inaccuracy and incompleteness explicit. Repositories such as OxO and BioPortal cannot make mappings more accessible, because the metadata required to do so simply does not exist. In order to gradually improve our mappings and make them FAIRer, we need to be able to share, review, fix and maintain our mappings in much the same way as our ontologies themselves - using standard formats and rich metadata. -FAIRifying data is an effort that aims to supply practical solutions for the use of the FAIR guiding principles throughout the research data life cycle. It recommends technologies that support semantic interoperability in a sustainable way, and practices that support FAIRness. The FAIRSemantics effort is currently discussing how to incorporate semantic mappings, and we reached out to them to consider SSSOM for this purpose. - -### Background about mappings - -A mapping can be defined as a triple , where s is the subject of the mapping, p is the mapping predicate (or relation) and o is the object. There are many different mapping predicates used in practice, but they are not always standardized. The Semantic Web community uses a number of standard mapping predicates, such as owl:sameAs or owl:equivalentClass (logical mapping predicates) and skos:exactMatch or skos:broadMatch (terminological mapping predicates). We refer to mapping subjects and objects as "terms", which we will loosely define here as a set of symbols that define some entity in the real world. Usually, a term can be referred to by an identifier that uniquely identifies some entity in a certain context. 
For example, UBERON:0002101 is the identifier for a term that refers to the anatomical entity "limb". -Putting it all together, the mapping describes a correspondence in which the term with the id UBERON:0002101 constitutes a terminological exact match to the term with the identifier FMA:24875. Mappings between data model elements, databases and other representations can be described similarly. Note that we generally use the terms "matching" and "mapping" interchangeably. Occasionally we refer to "matching" as the process to determine a mapping candidate (lexical matching, logical matching etc), a "match" as the result of the matching process, and a "mapping" the process and result of the process that deduces a true correspondence from a (set of) matches. For SSSOM, this distinction is a bit academic, but useful to keep in mind when talking about the interplay of automated approaches (which result in "matches") and manual approaches (which typically result in the final mappings). Ontology alignment is the task of determining corresponding terms shared between two or more ontologies, i.e. mappings. Sometimes "ontology alignment" refers to the output of the alignment process. - -Mapping sets can be "partial", i.e. covering only a subset of terms in the subject or object source (ontology, database, etc), "derived", i.e. one mapping set can be obtained from one or more others (for example, a XAO to MeSH mapping can be obtained by combining a XAO-Uberon mapping with a Uberon-MeSH mapping), or "complete". We refer to a "complete" mapping, i.e. the set of all correspondences between two resources (ontologies, databases), as an "alignment". - -The identifier of a term has three parts: a namespace that describes in which database or ontology the identifier is defined, a local identifier that unambiguously identifies an entity within that namespace, and optionally a separator that can be used to separate the namespace from the local identifier to make them easier to process. 
UBERON:0002101, for example, comprises the namespace "UBERON", the separator ":" and the local identifier "0002101". There are various syntaxes for denoting identifiers; the UBERON:0002101 notation is called compact URI (CURIE) syntax, which is used widely across the database and ontology worlds. The problem with this syntax is that UBERON may not be a globally unique prefix, so files making use of such CURIEs must come with a prefix map that ensures that UBERON (in the CURIE syntax referred to as "prefix") is globally unique by mapping it to the persistent International Resource Identifier (IRI) prefix http://purl.obolibrary.org/obo/UBERON_. This may not be a major problem for a fairly unique prefix such as "UBERON", but it is for prefixes such as "ICD", which can refer to many different name spaces, such as ICD9, ICD10, ICD11 and more, all of which correspond to entirely different terminologies. - -_Approaches to mapping_. There are many different techniques that can be employed to generate term mappings. Automated matching techniques include ontology matching, entity resolution (the task of determining whether two database records correspond to the same entity), semantic similarity or automated reasoning. Recent approaches based on machine learning and graph embeddings show promise for working with messier inputs. No single tool will perform equally well on all inputs: some of the semantics-aware tools like LogMap and Agreement Maker Light (AML) can exploit the ontology structure to determine high-quality matches but will have problems with the large-scale data linking tasks required by modern big-data applications. - -Purely automated approaches to mapping are often insufficient for real world use cases that require a high degree of accuracy, such as medical diagnostics. They often need to be refined by hand or using sophisticated mapping reconciliation approaches independent of the actual matching. 
Determining a mapping is often complex, due to the high degree of terminological variability: different communities may use very different names for the same real world entities . For example, for example, the condition referred to in the Human Phenotype Ontology (HPO) as "Hyperchloriduria" is called "increased urine chloride ion level" in the Mammalian Phenotype Ontology (MP), which is used by the model organism community. - -_Mapping rules - capturing the conditions under which a match is established_. -Mapping rules define the conditions under which we determine a match between two terms. For example, the condition for a mapping rule could be "if the subject label and object label match exactly". In practice, mapping rules can be very simple (e.g., "exact match of term labels"), more complex ("exact match between label of subject and exact synonym of object after they are pre-processed using stemming"), or even more exacting ("complex match determined by a human curator that carefully reviewed the descriptions and definitions of both terms and concluded they mean the same thing"). One problem for both manually curated mappings and automated approaches is that these mapping rules are often hidden deeply in the code or are not documented at all. Exposing mapping rules along with confidence scores would be very valuable for reviewing mappings and explaining them to users. Our reference implementation for SSSOM is rdf-matcher, which makes these mapping rules explicit, but other approaches such as OMOP2OBO also capture mapping rules as part of the mapping metadata. - - -### Some notes on the standardisation process: - -Note this is a public copy of the editors’ draft. It is provided for discussion only and may change at any moment. Do not cite this document other than as work in progress. SSSOM is community-driven, so all feedback is welcome. 
- - - -## SSSOM Metadata Elements - -A "term" is defined in a controlled vocabulary / ontology, and usually corresponds to a class, an individual or a property (entity in OWL, concept in SKOS, resource in RDF). The "subject" is the term on the left side of the mapping, and the "object" is the term on the right side of the mapping. A "predicate" relates the subject with the object and is typically an annotation or object property. A "mapping set" is a set of mappings that can be shared using the SSSOM standard. - -The conceptual model of SSSOM has two main elements: - -- a [Mapping](https://mapping-commons.github.io/sssom/Mapping/) and -- a [MappingSet](https://mapping-commons.github.io/sssom/MappingSet/). - -Some SSSOM metadata elements apply only to one element or the other, but many can be applied to both. - -Note that some SSSOM metadata elements have known equivalent properties which will be used in the RDF serialisation, for example `see_also` is mapped to `rdfs:seeAlso`. - -All metadata elements and their mappings are declared and managed in the [SSSOM schema](https://github.com/mapping-commons/sssom/blob/master/src/sssom_schema/schema/sssom_schema.yaml). - -### Metadata Elements - -The latest version of the metadata elements are: - -- [Mapping](https://mapping-commons.github.io/sssom/Mapping/) -- [MappingSet](https://mapping-commons.github.io/sssom/MappingSet/) - - - -## Common Mapping Predicates - -The use of predicates is not restricted by SSSOM, but for maximum re-use, the following predicates are strongly encouraged. - -*Sources:* - -* [https://www.bioontology.org/wiki/BioPortal_Mappings](https://www.bioontology.org/wiki/BioPortal_Mappings) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PredicateDescription
owl:sameAsThe subject and the object are instances (owl individuals), and the two instances are the same.
owl:equivalentClassThe subject and the object are classes (owl class), and the two classes are the same.
owl:equivalentPropertyThe subject and the object are properties (owl object, data, annotation properties), and the two properties are the same.
rdfs:subClassOfThe subject and the object are classes (owl class), and the subject is a subclass of the object.
rdfs:subPropertyOfThe subject and the object are properties (owl object, data, annotation properties), and the subject is a subproperty of the object.
skos:relatedMatchThe subject and the object are associated in some unspecified way.
skos:closeMatchThe subject and the object are sufficiently similar that they can be used interchangeably in some information retrieval applications.
skos:exactMatchThe subject and the object can, with a high degree of confidence, be used interchangeably across a wide range of information retrieval applications.
skos:narrowMatchFrom the SKOS primer: A triple skos:narrower (and skos:narrowMatch) asserts that , the object of the triple, is a narrower concept than , the subject of the triple.
skos:broadMatchFrom the SKOS primer: A triple skos:broader (and skos:broadMatch) asserts that , the object of the triple, is a broader concept than , the subject of the triple.
oboInOwl:database_cross_referenceTwo terms are related in some way. The meaning is frequently consistent across a single set of mappings. Note this property is often overloaded even where the terms are of a different nature (e.g. interpro2go)
rdfs:seeAlsoThe subject and the object are associated in some unspecified way. The object IRI often resolves to a resource on the web that provides additional information.
RO:?Any Relation in the Relation Ontology (RO).
- - - -## Serialisation - -### RDF/XML serialised re-ified OWL axioms: - -The default RDFXML serialisation of the mappings will be realised as *reified OWL axioms*. This has the advantage that any mapping set can be simply merged with an ontology in the usual way, for example using [ROBOT merge](http://robot.obolibrary.org/merge). We will deal with three types of reified OWL-axioms, and a few sub-types: - -1. Predicate is an annotation property -2. Predicate is an object property and - 1. Object/Subject are classes - 2. Object/Subject are individuals -3. Predicate is language relational construct of RDFS or OWL (rdfs:subClassOf, owl:equivalentClass) - -#### Predicate is an annotation property: - -If the predicate corresponds to an annotation property, the mapping gets converted to an OWLAnnotationAssertion axiom: `OWLAnnotationAssertion(P,S,O)`. All mapping level metadata (`sssomMetadata`) gets converted into OWLAnnotation objects which are materialised as axiom annotations on the mapping annotation assertion, see [OWL 2 Structural Specification](https://www.w3.org/TR/owl2-syntax/#Annotations): - -``` -AnnotationAssertion(sssomMetadata P, S, O) -``` - -Where `sssomMetadata` is a sequence of OWL Annotations objects like: - -``` -Annotation(Q1,V1) Annotation(Q2,V2) ... Annotation(Qn,Vn) -``` - -where Qi is a SSSOM metadata element and Vi is an annotation value. - -Note that if a SSSOM metadata element value is a list L (i.e. can have multiple elements, such as creator and others), individual annotations are created for each of them: - -``` -Annotation(Q,V) for all V in L. -``` - -Example: - -``` -AnnotationAssertion(Annotation(sssom:creator_id ) Annotation(sssom:mapping_justification semapv:LexicalMatching) skos:exactMatch ) -``` - -Mapping set level annotations are manifested as Ontology annotation in the usual way, according to the [OWL 2 Structural Specification](https://www.w3.org/TR/owl2-syntax/#Annotations). 
- -#### Predicate is an object property - -##### Case 1: Object and Subject are classes. - -The Mapping gets translated into an existential restriction: - -``` -SubclassOf(A, P some O) -``` - -All metadata elements are added as OWLAnnotation objects and added to SubclassOf axiom as axiom annotations: - -``` -SubclassOf(sssomMetadata, A, P some O) -``` - -Example: - -``` -SubClassOf(Annotation(sssom:creator_id ) Annotation(sssom:mapping_justification semapv:LexicalMatching) ObjectSomeValuesFrom( )) -``` - -##### Case 2: Object and Subject are individuals - -The Mapping gets translated into an object property assertion: - -``` -ObjectPropertyAssertion(P, A, O) -``` - -All metadata elements are added as OWLAnnotation objects and added to ObjectPropertyAssertion axiom as axiom annotations: - -``` -ObjectPropertyAssertion(sssomMetadata, P, A, O) -``` - -Example: - -``` -ObjectPropertyAssertion(Annotation(sssom:creator_id ) Annotation(sssom:mapping_justification semapv:LexicalMatching) ) -``` - -#### Predicate is language relational construct of RDFS or OWL - -The mapping gets translated into an annotated axiom that corresponds to the construct used. By default, SSSOM will support: - - - - - - - - - - -
owl:EquivalentClassEquivalentClass(sssomMetadata,A,O)
rdfs:subClassOfSubClassOf(sssomMetadata, A,O)
- -Example: - -``` -SubClassOf(Annotation(sssom:creator_id ) Annotation(sssom:mapping_justification semapv:LexicalMatching) ) -``` - -### TSV: - -All SSSOM metadata elements labelled with L in the metadata table are permissible as column names in the TSV. List elements (such as creator) are "|"-separated. The columns SHOULD be sorted in the order in which they appear in the [SSSOM metadata](https://mapping-commons.github.io/sssom/Mapping/). For example, the first columns of a mapping set TSV should always be, in that order: subject_id, predicate_id, object_id, mapping_justification, if labels are not included; if they are included, the order should be: subject_id, subject_label, predicate_id, predicate_label, object_id, object_label, mapping_justification. For easier review of diffs, for example git diff or unix diff, we recommend serialising the TSV with a fixed row order, sorted column by column from left to right. - -Metadata about a set of mappings can be supplied as part of the mappings (embedded mode) or as a simple YAML file alongside the primary mapping file (external mode). The YAML metadata block MUST contain a curie map that allows the unambiguous interpretation of CURIEs. A curie map is supplied after a `curie_map:` parameter in the YAML file. The value is a dictionary of CURIE->URLPREFIX pairs. -Note that the following prefixes are built-in and (1) MUST NOT be changed from their [SSSOM default interpretation](https://github.com/mapping-commons/sssom/blob/master/project/jsonld/sssom_schema.context.jsonld) and (2) MAY be omitted from the curie map: "`sssom`", "`owl`", "`rdf`", "`rdfs`", "`skos`", "`semapv`". - -Note that *all* identifiers in a SSSOM/TSV file (all metadata elements with a range of `EntityReference`), whether they are part of a mapping record or of the set's metadata, MUST be in CURIE form *only*. The use of full-length identifiers is not officially supported. - -**Canonical ordering of columns**. 
Apart from the elements themselves, some example usage and a description, **_[the SSSOM spec](https://mapping-commons.github.io/sssom/Mapping/) defines the canonical order for the metadata_** in which the elements SHOULD appear when serialised. -(The "canonical order" corresponds to the exact order of elements as seen in the specification.) -This precludes spurious diffs in a git setting, which is an important concern for the continuous reviewing of mappings by curators and users. - -**Canonical ordering of keys in the YAML metadata block**. For the same reason (avoiding spurious diffs), the keys in the YAML metadata block SHOULD be sorted in the same order as they are listed in the [MappingSet specification](https://mapping-commons.github.io/sssom/MappingSet/); keys in the `curie_map` dictionary SHOULD be sorted by alphabetical order. Those recommendations apply whether the metadata block is embedded in the TSV file or kept in a separate file. - -Note that only metadata elements permissible in a global context (G, or L/G) can be used in the metadata file. - -We recommend using the following *filename conventions* for SSSOM files: - -- TSV files should have the extension `.sssom.tsv`, for example: `mp-hp-exact-0.0.1.sssom.tsv`. 
-- External YAML metadata files should have the extension `.sssom.yml`, for example: `mp-hp-exact-0.0.1.sssom.yml`. - -Example ([download](https://raw.githubusercontent.com/mapping-commons/sssom/master/examples/external/mp-hp-exact-0.0.1.sssom.yml)): - -``` -curie_map: - HP: "http://purl.obolibrary.org/obo/HP_" - MP: "http://purl.obolibrary.org/obo/MP_" - orcid: "https://orcid.org/" -creator_id: - - "orcid:0000-0002-7356-1779" -license: "https://creativecommons.org/publicdomain/zero/1.0/" -mapping_provider: "http://purl.obolibrary.org/obo/upheno.owl" -``` - -#### External mode - -In external mode, the mapping set metadata MUST be supplied in a separate YAML file; that file SHOULD have the same base name as the mapping file, with the extension `.sssom.yml`. - -Example ([download](https://raw.githubusercontent.com/mapping-commons/sssom/master/examples/external/mp-hp-exact-0.0.1.sssom.tsv)): - -``` -subject_id subject_label predicate_id object_id object_label mapping_justification -HP:0009124 Abnormal adipose tissue morphology skos:exactMatch MP:0000003 abnormal adipose tissue morphology semapv:LexicalMatching -HP:0008551 Microtia skos:exactMatch MP:0000018 small ears semapv:LexicalMatching -HP:0000411 Protruding ears skos:exactMatch MP:0000021 prominent ears semapv:LexicalMatching -``` - -#### Embedded mode (default) - -In the embedded mode, we allow the integration of mapping set level metadata as **_commented YAML_**. Apart from being commented, the YAML follows the exact same spec as the *YAML specified by the external mode*. Heavily used tools in bioinformatics such as pandas allow users to [specify comment characters](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) when reading CSV files, which makes this option the most user friendly for this community. Additionally, it is a simple unix-level or language-level operation to filter these as a pre-processing step in a robust fashion. 
- -Note: the mapping set level metadata _must be included as a continuous block at the beginning of the file_. This means in particular: -- No comments can be included that are not part of the metadata. For example, this is not allowed: - -Illegal case 1: -``` -#curie_map: -# HP: "http://purl.obolibrary.org/obo/HP_" -# MP: "http://purl.obolibrary.org/obo/MP_" -# orcid: "https://orcid.org/" -# This is a comment that does not belong here -#creator_id: -# - "orcid:0000-0002-7356-1779" -``` - -Illegal case 2: -``` -# This is a comment that does not belong here -#curie_map: -# HP: "http://purl.obolibrary.org/obo/HP_" -# MP: "http://purl.obolibrary.org/obo/MP_" -# orcid: "https://orcid.org/" -#creator_id: -# - "orcid:0000-0002-7356-1779" -``` - -- There should be no empty rows: the commented YAML block _must_ be directly followed by the column headers. For example, this is not allowed: - -Illegal case 3: -``` - -#curie_map: -# HP: "http://purl.obolibrary.org/obo/HP_" -# MP: "http://purl.obolibrary.org/obo/MP_" -# orcid: "https://orcid.org/" - -#creator_id: -# - "orcid:0000-0002-7356-1779" -``` - -- There can be only a single # at the beginning of each row, followed immediately by the YAML. -- When the leading hash-symbol is stripped from the header block, the resulting string: - 1. is a valid YAML file - 2. conforms to the SSSOM mapping set specification (only `curie_map` or metadata elements that are allowed on `mapping_set` level, i.e. `global`). -- After the table header, no further row should be commented out. 
- -Example ([download](https://raw.githubusercontent.com/mapping-commons/sssom/master/examples/embedded/mp-hp-exact-0.0.1.sssom.tsv)): - -``` -#curie_map: -# HP: "http://purl.obolibrary.org/obo/HP_" -# MP: "http://purl.obolibrary.org/obo/MP_" -# orcid: "https://orcid.org/" -#creator_id: -# - "orcid:0000-0002-7356-1779" -#license: "https://creativecommons.org/publicdomain/zero/1.0/" -#mapping_provider: "http://purl.obolibrary.org/obo/upheno.owl" -subject_id subject_label predicate_id object_id object_label mapping_justification -HP:0009124 Abnormal adipose tissue morphology skos:exactMatch MP:0000003 abnormal adipose tissue morphology semapv:LexicalMatching -HP:0008551 Microtia skos:exactMatch MP:0000018 small ears semapv:LexicalMatching -HP:0000411 Protruding ears skos:exactMatch MP:0000021 prominent ears semapv:LexicalMatching -``` - -*Notes:* - -* ROBOT implementation: [https://github.com/ontodev/robot/issues/312](https://github.com/ontodev/robot/issues/312) - -### JSON: - -JSON translation is fully managed by [LinkML dumper classes](https://linkml.io/linkml/code.html#loaders-and-dumpers). - - - -## Use Cases: - -* Consumers: - * OxO - * Analysis in R/Python using dataframes/pandas - * Visual inspection by curators to spot-check errors - * Machine Learning (e.g. predict predicate based on SSSOM columns) - -* Maintainers: - * Maintain mappings in google sheets - * Is the format optimized for google refine? - * Maintain mappings in github/tsvs - * Rendering - * [Drive-by Curation](https://doi.org/10.32388/KBX9VO) PRs - -* Providers - * Autogenerate pages like - * [http://geneontology.org/docs/download-mappings/](http://geneontology.org/docs/download-mappings/) - * [http://uberon.github.io/downloads.html#bridge](http://uberon.github.io/downloads.html#bridge) - * OxO