diff --git a/docs/glossary.rst b/docs/glossary.rst index 07e75e5c7..7edbb16e9 100644 --- a/docs/glossary.rst +++ b/docs/glossary.rst @@ -244,8 +244,9 @@ For a deeper dive into some of these concepts, see the :ref:`guide`. The most common projection is the :term:`RDF` mapping, but this results in a structure that is not well suited to graph operations due to the use of :term:`Blank Nodes` to represent OWL expressions. - OAK makes use of a simple projection where OWL existential axioms are mapped to :term:`Edges`, - similar to :term:`Relation Graph`. + OAK makes use of a simple projection where common constructs such as OWL existential axioms are mapped + to :term:`Edges`, similar to :term:`Relation Graph`. OAK also projects some axiom types that are + not yet projected in relation graph, such as those between individuals. - See also :ref:`relationships_and_graphs` in the Guide. diff --git a/docs/guide/relationships-and-graphs.rst b/docs/guide/relationships-and-graphs.rst index 37e4f59f3..d365c2b28 100644 --- a/docs/guide/relationships-and-graphs.rst +++ b/docs/guide/relationships-and-graphs.rst @@ -70,7 +70,8 @@ We can make this more human readable: if you are used to working with OWL and the underlying RDF/OWL representation the presentation as simple triads above can be confusing, as these are not actually modeled as triples in the ontology, but rather as more complex axioms involving - constructs like existential restriction. We will return to this later. + constructs like existential restriction. These axioms are *projected* onto a graph + representation. We will return to this topic later. Graph Traversal and Relation Graph Reasoning -------------------------------------------- @@ -442,3 +443,40 @@ and in principle it *may* be the case that this is not asserted However, OAK is designed for working with *released* versions of ontologies, which should be *pre-classified*. 
This means that all edges that are both :term:`Direct` and :term:`Entailed` should also be :term:`Asserted`. + +Further notes on OWL and Graph Projection +----------------------------------------- + +Many ontologies use the OWL language to express relationships between entities. The OWL +representation is not directly a graph, but different kinds of OWL axioms can be *projected* +onto graph edges. This kind of projection is common, but lacks standardization. + +Two of the most common patterns in OWL ontologies are: + +- SubClassOf between two named classes (e.g. Finger SubClassOf Digit) +- SubClassOf between a named class and a simple existential restriction (e.g. Digit SubClassOf part-of some Hand) + +It's a de-facto standard that these are both projected to graph edges (the former to an is-a or SubClassOf edge, +the latter to a part-of edge). This can be seen e.g. in ontology browsers such as the OLS. These two kinds of +axioms are very common in bio-ontologies. However, there is a lack of standardization in how more complex axioms +should be mapped to edges, or whether they should be. + +The following table outlines some common patterns and whether these are projected to edges in common tools: + +.. csv-table:: OWL to Graph Projections + :header: OWL Axiom, Graph Projection, SQL Adapter, Relation Graph + :widths: 20, 20, 20, 20 + + A SubClassOf B, A rdfs:subClassOf B, Yes, Yes + A SubClassOf R some B, A R B, Yes, Yes + A SubClassOf R value B, A R B, Yes, No + A SubClassOf R only B, A R B, No, No + A SubClassOf R max 0 B, A R B, No, No + I type A, I rdf:type A, Yes, No + I type R some A, I R A, Yes, No + I type R value A, I R A, Yes, No + I Facts: R J, I R J, Yes, No + +See `OWLStar <https://github.com/cmungall/owlstar>`_ for progress towards a standardization of OWL +graph projections for property graphs - this also includes proposals for labeling edges with axiom +types as well as additional semantic information (e.g. cardinality) and annotations. 
\ No newline at end of file diff --git a/notebooks/OBO/CHEBI-Slimmer.ipynb b/notebooks/OBO/CHEBI-Slimmer.ipynb index 6161b3b4c..16331cc30 100644 --- a/notebooks/OBO/CHEBI-Slimmer.ipynb +++ b/notebooks/OBO/CHEBI-Slimmer.ipynb @@ -24,8 +24,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-08-20T18:49:39.288308Z", - "start_time": "2024-08-20T18:49:39.270787Z" + "end_time": "2024-08-24T00:14:25.092197Z", + "start_time": "2024-08-24T00:14:23.249848Z" } }, "cell_type": "code", @@ -45,26 +45,26 @@ ], "id": "initial_id", "outputs": [], - "execution_count": 103 + "execution_count": 1 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:49:39.364397Z", - "start_time": "2024-08-20T18:49:39.360986Z" + "end_time": "2024-08-24T00:14:25.101519Z", + "start_time": "2024-08-24T00:14:25.096408Z" } }, "cell_type": "code", "source": [ - "from oaklib.datamodels.vocabulary import IS_A\n", + "from oaklib.datamodels.vocabulary import IS_A, HAS_PART\n", "from oaklib.interfaces import OboGraphInterface\n", "\n", "assert isinstance(chebi, OboGraphInterface)\n" ], "id": "237c5d0906ac560c", "outputs": [], - "execution_count": 104 + "execution_count": 2 }, { "metadata": {}, @@ -75,8 +75,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:49:40.515475Z", - "start_time": "2024-08-20T18:49:40.512240Z" + "end_time": "2024-08-24T00:14:25.149991Z", + "start_time": "2024-08-24T00:14:25.147563Z" } }, "cell_type": "code", @@ -84,18 +84,20 @@ "# Relations\n", "CBO = \"obo:chebi#is_conjugate_base_of\"\n", "CAO = \"obo:chebi#is_conjugate_acid_of\"\n", - "TAUTOMER_OF = \"obo:chebi#is_tautomer_of\"" + "TAUTOMER_OF = \"obo:chebi#is_tautomer_of\"\n", + "ENANTIOMER_OF = \"obo:chebi#is_enantiomer_of\"\n", + "HAS_ROLE = \"RO:0000087\"" ], "id": "703ad334757e40df", "outputs": [], - "execution_count": 105 + "execution_count": 3 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:49:40.530940Z", - "start_time": 
"2024-08-20T18:49:40.528328Z" + "end_time": "2024-08-24T00:14:25.159993Z", + "start_time": "2024-08-24T00:14:25.157729Z" } }, "cell_type": "code", @@ -105,14 +107,14 @@ ], "id": "4538a10374439f3b", "outputs": [], - "execution_count": 106 + "execution_count": 4 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:49:41.401110Z", - "start_time": "2024-08-20T18:49:41.398242Z" + "end_time": "2024-08-24T00:14:25.169516Z", + "start_time": "2024-08-24T00:14:25.167446Z" } }, "cell_type": "code", @@ -123,14 +125,14 @@ ], "id": "d9a11b7b7e44919a", "outputs": [], - "execution_count": 107 + "execution_count": 5 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:49:41.426154Z", - "start_time": "2024-08-20T18:49:41.422566Z" + "end_time": "2024-08-24T00:14:25.179682Z", + "start_time": "2024-08-24T00:14:25.176430Z" } }, "cell_type": "code", @@ -147,11 +149,23 @@ "CYSTEINIUM = \"CHEBI:32458\"\n", "CORD_E = \"CHEBI:213754\"\n", "AAAE = \"CHEBI:46874\"\n", - "CITRIC_ACID = \"CHEBI:30769\"\n" + "CITRIC_ACID = \"CHEBI:30769\"\n", + "\n", + "AMMONIA=\"CHEBI:16134\"\n", + "AMMONIUM=\"CHEBI:28938\"\n", + "AZANIDE=\"CHEBI:29337\"\n", + "HYRDRIDONITRATE_2M = \"CHEBI:29340\"\n", + "PECTIN = \"CHEBI:17309\"\n", + "WATER = \"CHEBI:15377\"\n", + "\n", + "CONJ_EXCLUDES = {\n", + " AMMONIA, AMMONIUM, AZANIDE, HYRDRIDONITRATE_2M, PECTIN\n", + "}\n", + "\n" ], "id": "6b59749828d83578", "outputs": [], - "execution_count": 108 + "execution_count": 6 }, { "metadata": {}, @@ -163,8 +177,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:50:01.773393Z", - "start_time": "2024-08-20T18:49:41.785756Z" + "end_time": "2024-08-24T00:14:42.329935Z", + "start_time": "2024-08-24T00:14:25.194589Z" } }, "cell_type": "code", @@ -180,12 +194,12 @@ "200879" ] }, - "execution_count": 109, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 109 + "execution_count": 7 }, { 
"metadata": {}, @@ -196,8 +210,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:15.416210Z", - "start_time": "2024-08-20T18:50:02.431525Z" + "end_time": "2024-08-24T00:14:52.997092Z", + "start_time": "2024-08-24T00:14:42.377312Z" } }, "cell_type": "code", @@ -218,18 +232,18 @@ "161106" ] }, - "execution_count": 110, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 110 + "execution_count": 8 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:17.607713Z", - "start_time": "2024-08-20T18:50:15.833094Z" + "end_time": "2024-08-24T00:14:54.898433Z", + "start_time": "2024-08-24T00:14:53.110918Z" } }, "cell_type": "code", @@ -246,18 +260,18 @@ "177462" ] }, - "execution_count": 111, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 111 + "execution_count": 9 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:17.665229Z", - "start_time": "2024-08-20T18:50:17.662032Z" + "end_time": "2024-08-24T00:14:55.019693Z", + "start_time": "2024-08-24T00:14:55.016909Z" } }, "cell_type": "code", @@ -273,12 +287,39 @@ "'InChI=1S/C4H8O3.Na/c1-3(5)2-4(6)7;/h3,5H,2H2,1H3,(H,6,7);/q;+1/p-1'" ] }, - "execution_count": 112, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 112 + "execution_count": 10 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Various Relationships\n", + "id": "a56da86f0437f78f" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:14:57.712081Z", + "start_time": "2024-08-24T00:14:55.137919Z" + } + }, + "cell_type": "code", + "source": [ + "PMAP = {ENANTIOMER_OF: \"RO:0018039\"}\n", + "preserved_rels = list(chebi.relationships(predicates=[HAS_PART, HAS_ROLE, ENANTIOMER_OF]))\n", + "preserved_rels_by_subject = defaultdict(list)\n", + "for s, p, o in preserved_rels:\n", + " p_mapped = PMAP.get(p, p)\n", + " 
preserved_rels_by_subject[s].append((p_mapped, o))\n", + "assert len(preserved_rels_by_subject) > 1000" + ], + "id": "1b818c23743b2dd8", + "outputs": [], + "execution_count": 11 }, { "metadata": {}, @@ -295,8 +336,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:17.994829Z", - "start_time": "2024-08-20T18:50:17.992869Z" + "end_time": "2024-08-24T00:14:57.859240Z", + "start_time": "2024-08-24T00:14:57.857353Z" } }, "cell_type": "code", @@ -308,8 +349,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:20.236870Z", - "start_time": "2024-08-20T18:50:18.333154Z" + "end_time": "2024-08-24T00:15:00.772525Z", + "start_time": "2024-08-24T00:14:58.010630Z" } }, "cell_type": "code", @@ -322,13 +363,13 @@ ], "id": "481dce997828261a", "outputs": [], - "execution_count": 113 + "execution_count": 12 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:20.567488Z", - "start_time": "2024-08-20T18:50:20.563444Z" + "end_time": "2024-08-24T00:15:00.929475Z", + "start_time": "2024-08-24T00:15:00.926015Z" } }, "cell_type": "code", @@ -341,70 +382,70 @@ "189057" ] }, - "execution_count": 114, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 114 + "execution_count": 13 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:20.895644Z", - "start_time": "2024-08-20T18:50:20.893113Z" + "end_time": "2024-08-24T00:15:01.084503Z", + "start_time": "2024-08-24T00:15:01.082231Z" } }, "cell_type": "code", "source": "assert charges[L_CYSTEINE_ZWITTERION] == 0", "id": "6c387597c60290a2", "outputs": [], - "execution_count": 115 + "execution_count": 14 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:21.231594Z", - "start_time": "2024-08-20T18:50:21.229170Z" + "end_time": "2024-08-24T00:15:01.240865Z", + "start_time": "2024-08-24T00:15:01.238205Z" } }, "cell_type": "code", "source": "assert charges[CYSTEINATE_1_MINUS] == -1", "id": "58b92eac4c3aea65", "outputs": [], - 
"execution_count": 116 + "execution_count": 15 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:21.557913Z", - "start_time": "2024-08-20T18:50:21.555625Z" + "end_time": "2024-08-24T00:15:01.416184Z", + "start_time": "2024-08-24T00:15:01.413800Z" } }, "cell_type": "code", "source": "assert charges[CITRIC_ACID] == 0", "id": "c55e6cb12a37ea60", "outputs": [], - "execution_count": 117 + "execution_count": 16 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:21.891293Z", - "start_time": "2024-08-20T18:50:21.888851Z" + "end_time": "2024-08-24T00:15:01.567022Z", + "start_time": "2024-08-24T00:15:01.564552Z" } }, "cell_type": "code", "source": "assert AMINO_ACID_ANION not in charges, \"X anion terms are agnostic to a SPECIFIC charge\"", "id": "e7d7762146973a0e", "outputs": [], - "execution_count": 118 + "execution_count": 17 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:22.910361Z", - "start_time": "2024-08-20T18:50:22.552993Z" + "end_time": "2024-08-24T00:15:02.089201Z", + "start_time": "2024-08-24T00:15:01.723140Z" } }, "cell_type": "code", @@ -463,18 +504,18 @@ "10687" ] }, - "execution_count": 119, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 119 + "execution_count": 18 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:22.969774Z", - "start_time": "2024-08-20T18:50:22.959082Z" + "end_time": "2024-08-24T00:15:02.260039Z", + "start_time": "2024-08-24T00:15:02.246482Z" } }, "cell_type": "code", @@ -525,25 +566,25 @@ "" ] }, - "execution_count": 120, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 120 + "execution_count": 19 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:23.332540Z", - "start_time": "2024-08-20T18:50:23.330433Z" + "end_time": "2024-08-24T00:15:02.447896Z", + "start_time": "2024-08-24T00:15:02.445523Z" } }, "cell_type": "code", "source": "# 
charges_by_inchi[L_CYSTEINE_ZWITTERION]", "id": "4b25096456c81bdf", "outputs": [], - "execution_count": 121 + "execution_count": 20 }, { "metadata": {}, @@ -554,57 +595,59 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:25.315969Z", - "start_time": "2024-08-20T18:50:23.352768Z" + "end_time": "2024-08-24T00:15:04.043465Z", + "start_time": "2024-08-24T00:15:02.471704Z" } }, "cell_type": "code", "source": [ "conjrels = list(chebi.relationships(predicates=[CBO, CAO, TAUTOMER_OF]))\n", "\n", + "conjrels = [(s, p, o) for s, p, o in conjrels if not s in CONJ_EXCLUDES and not o in CONJ_EXCLUDES]\n", + "\n", "assert len(conjrels) > 15000\n", "assert len([r for r in conjrels if r[1] == CBO]) > 8000\n", "assert len([r for r in conjrels if r[1] == TAUTOMER_OF]) > 1500" ], "id": "3cb9284c0f7ba8b1", "outputs": [], - "execution_count": 122 + "execution_count": 21 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:26.014858Z", - "start_time": "2024-08-20T18:50:26.012780Z" + "end_time": "2024-08-24T00:15:04.218543Z", + "start_time": "2024-08-24T00:15:04.207737Z" } }, "cell_type": "code", - "source": "", - "id": "12ccc4842a4a2204", + "source": [ + "conjrels_by_subject = defaultdict(list)\n", + "for s, p, o in conjrels:\n", + " conjrels_by_subject[s].append((p, o))" + ], + "id": "73446b9fa82786fd", "outputs": [], - "execution_count": null + "execution_count": 22 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:26.035169Z", - "start_time": "2024-08-20T18:50:26.024989Z" + "end_time": "2024-08-24T00:15:04.392333Z", + "start_time": "2024-08-24T00:15:04.390440Z" } }, "cell_type": "code", - "source": [ - "conjrels_by_subject = defaultdict(list)\n", - "for s, p, o in conjrels:\n", - " conjrels_by_subject[s].append((p, o))" - ], - "id": "73446b9fa82786fd", + "source": "", + "id": "12ccc4842a4a2204", "outputs": [], - "execution_count": 123 + "execution_count": null }, { "metadata": { "ExecuteTime": { - "end_time": 
"2024-08-20T18:50:26.379666Z", - "start_time": "2024-08-20T18:50:26.376Z" + "end_time": "2024-08-24T00:15:04.567047Z", + "start_time": "2024-08-24T00:15:04.563816Z" } }, "cell_type": "code", @@ -618,12 +661,12 @@ " ('obo:chebi#is_tautomer_of', 'CHEBI:142853')]" ] }, - "execution_count": 124, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 124 + "execution_count": 23 }, { "metadata": {}, @@ -634,8 +677,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:26.813512Z", - "start_time": "2024-08-20T18:50:26.711296Z" + "end_time": "2024-08-24T00:15:04.958411Z", + "start_time": "2024-08-24T00:15:04.740412Z" } }, "cell_type": "code", @@ -653,18 +696,43 @@ "\n", "conj_graph = calculate_conj_graph(conjrels)\n", "sccs = list(nx.strongly_connected_components(conj_graph))\n", + "asserted_sccs = sccs\n", "assert len(sccs) > 8000\n", "\n" ], "id": "5d1bc9a29166f020", "outputs": [], - "execution_count": 125 + "execution_count": 24 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:05.135196Z", + "start_time": "2024-08-24T00:15:05.132019Z" + } + }, + "cell_type": "code", + "source": "len(asserted_sccs)", + "id": "a0fb3be95eeb9eec", + "outputs": [ + { + "data": { + "text/plain": [ + "8553" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 25 }, { "metadata": {}, "cell_type": "markdown", "source": [ - "## Label analysis\n", + "## Lexical analysis\n", "\n", "The CHEBI conjugate relationships are incomplete - here we aim to complete them doing a lexical analysis of the labels.\n", "\n", @@ -681,8 +749,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:27.151161Z", - "start_time": "2024-08-20T18:50:27.145953Z" + "end_time": "2024-08-24T00:15:05.313966Z", + "start_time": "2024-08-24T00:15:05.308375Z" } }, "cell_type": "code", @@ -738,47 +806,47 @@ " '(9-)': -9}" ] }, - "execution_count": 126, + "execution_count": 
26, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 126 + "execution_count": 26 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:30.995745Z", - "start_time": "2024-08-20T18:50:27.501511Z" + "end_time": "2024-08-24T00:15:08.625746Z", + "start_time": "2024-08-24T00:15:05.487934Z" } }, "cell_type": "code", "source": "roles = list(chebi.descendants(\"CHEBI:50906\", [IS_A]))", "id": "c10c64db4580a7ee", "outputs": [], - "execution_count": 127 + "execution_count": 27 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:31.390691Z", - "start_time": "2024-08-20T18:50:31.388205Z" + "end_time": "2024-08-24T00:15:08.804133Z", + "start_time": "2024-08-24T00:15:08.801675Z" } }, "cell_type": "code", "source": [ "# https://github.com/ebi-chebi/ChEBI/issues/4528\n", - "EXCLUDE_STEMS = [\"disulfide\", \"tartr\", \"tartar\"]" + "EXCLUDE_STEMS = [\"disulfide\", \"tartr\", \"tartar\", \"oxide\", \"oxo\"]" ], "id": "8400b3213240d61b", "outputs": [], - "execution_count": 128 + "execution_count": 28 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:37.180635Z", - "start_time": "2024-08-20T18:50:31.730095Z" + "end_time": "2024-08-24T00:15:14.655659Z", + "start_time": "2024-08-24T00:15:08.982851Z" } }, "cell_type": "code", @@ -833,13 +901,13 @@ ], "id": "191d0cca5fcbbcaf", "outputs": [], - "execution_count": 129 + "execution_count": 29 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:37.518592Z", - "start_time": "2024-08-20T18:50:37.515037Z" + "end_time": "2024-08-24T00:15:14.865774Z", + "start_time": "2024-08-24T00:15:14.862663Z" } }, "cell_type": "code", @@ -855,18 +923,42 @@ " 'anion': 'CHEBI:37022'}" ] }, - "execution_count": 130, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 130 + "execution_count": 30 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:37.860090Z", - "start_time": "2024-08-20T18:50:37.856322Z" + 
"end_time": "2024-08-24T00:15:15.082133Z", + "start_time": "2024-08-24T00:15:15.078829Z" + } + }, + "cell_type": "code", + "source": "stem_to_chem[\"oxo\"]", + "id": "18ad460573de1981", + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 31 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:15.354713Z", + "start_time": "2024-08-24T00:15:15.351237Z" } }, "cell_type": "code", @@ -884,31 +976,31 @@ " '(2-)': 'CHEBI:35808'}" ] }, - "execution_count": 131, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 131 + "execution_count": 32 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:38.194184Z", - "start_time": "2024-08-20T18:50:38.191684Z" + "end_time": "2024-08-24T00:15:15.570930Z", + "start_time": "2024-08-24T00:15:15.568562Z" } }, "cell_type": "code", "source": "assert not stem_to_chem[\"citrate\"]", "id": "218adb893354594a", "outputs": [], - "execution_count": 132 + "execution_count": 33 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:38.532569Z", - "start_time": "2024-08-20T18:50:38.529049Z" + "end_time": "2024-08-24T00:15:15.782029Z", + "start_time": "2024-08-24T00:15:15.778272Z" } }, "cell_type": "code", @@ -923,18 +1015,18 @@ " 'zwitterion(1-)': 'CHEBI:142858'}" ] }, - "execution_count": 133, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 133 + "execution_count": 34 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:38.871513Z", - "start_time": "2024-08-20T18:50:38.869077Z" + "end_time": "2024-08-24T00:15:15.987654Z", + "start_time": "2024-08-24T00:15:15.985166Z" } }, "cell_type": "code", @@ -944,7 +1036,44 @@ ], "id": "4c64ccc7e2241c4c", "outputs": [], - "execution_count": 134 + "execution_count": 35 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": 
"2024-08-24T00:15:16.221660Z", + "start_time": "2024-08-24T00:15:16.205696Z" + } + }, + "cell_type": "code", + "source": "", + "id": "91405e702f57aac8", + "outputs": [], + "execution_count": 36 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:16.425579Z", + "start_time": "2024-08-24T00:15:16.422306Z" + } + }, + "cell_type": "code", + "source": "", + "id": "8c0413a76dcfd32b", + "outputs": [ + { + "data": { + "text/plain": [ + "31415" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 37 }, { "metadata": {}, @@ -961,8 +1090,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:39.228166Z", - "start_time": "2024-08-20T18:50:39.211098Z" + "end_time": "2024-08-24T00:15:16.648206Z", + "start_time": "2024-08-24T00:15:16.632610Z" } }, "cell_type": "code", @@ -1170,18 +1299,18 @@ " 'stem': None}]" ] }, - "execution_count": 135, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 135 + "execution_count": 38 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:39.945773Z", - "start_time": "2024-08-20T18:50:39.941374Z" + "end_time": "2024-08-24T00:15:16.855047Z", + "start_time": "2024-08-24T00:15:16.850831Z" } }, "cell_type": "code", @@ -1241,68 +1370,188 @@ " 'stem': None}]" ] }, - "execution_count": 136, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 136 + "execution_count": 39 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:40.059494Z", - "start_time": "2024-08-20T18:50:39.975188Z" + "end_time": "2024-08-24T00:15:17.145232Z", + "start_time": "2024-08-24T00:15:17.060979Z" } }, "cell_type": "code", "source": [ "charge_problems = [] ## warning - global\n", - "rows = []\n", + "lexical_conj_pairs = []\n", "for stem, clique in stem_to_chem.items():\n", - " rows += make_conjrefs(clique, stem)\n", + " lexical_conj_pairs += make_conjrefs(clique, 
stem)\n", "\n", "# Reported here: https://github.com/ebi-chebi/ChEBI/issues/4525\n", "pd.DataFrame(charge_problems).to_csv(\"tmp/charge_problems.csv\", index=False)\n", "\n", - "len(rows)" + "\n", + "len(lexical_conj_pairs)" ], "id": "c5ef02e4ed64a4d3", "outputs": [ { "data": { "text/plain": [ - "17170" + "17166" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 40 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:17.362431Z", + "start_time": "2024-08-24T00:15:17.347876Z" + } + }, + "cell_type": "code", + "source": [ + "chem_to_stem: Dict[str, str] = {}\n", + "for row in lexical_conj_pairs:\n", + " def _assign(chem: str, stem: str):\n", + " if chem in chem_to_stem:\n", + " if chem_to_stem[chem] != stem:\n", + " raise ValueError(f\"Conflicting stems for {chem}: {chem_to_stem[chem]} vs {stem}\")\n", + " else:\n", + " chem_to_stem[chem] = stem\n", + " stem = row[\"stem\"]\n", + " _assign(row[\"chem1\"], stem)\n", + " _assign(row[\"chem2\"], stem)" + ], + "id": "3088a79bf38863f5", + "outputs": [], + "execution_count": 41 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:19:49.836705Z", + "start_time": "2024-08-24T00:19:49.729430Z" + } + }, + "cell_type": "code", + "source": [ + "g = calculate_conj_graph([(row[\"chem1\"], \"?\", row[\"chem2\"]) for row in lexical_conj_pairs])\n", + "lexical_sccs = list(nx.strongly_connected_components(g))\n", + "len(lexical_sccs)" + ], + "id": "b0612535c952ce23", + "outputs": [ + { + "data": { + "text/plain": [ + "7517" ] }, - "execution_count": 137, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 137 + "execution_count": 117 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "fd3af467efa5e894" }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:40.412838Z", - "start_time": 
"2024-08-20T18:50:40.395220Z" + "end_time": "2024-08-24T00:25:07.736974Z", + "start_time": "2024-08-24T00:25:07.639766Z" + } + }, + "cell_type": "code", + "source": [ + "# venn diagram of overlaps between\n", + "# - lexical_sccs\n", + "# - asserted_sccs\n", + "# - rhea_sccs\n", + "# - full_sccs\n", + "\n", + "from matplotlib_venn import venn3\n", + "from matplotlib_venn import venn3_unweighted\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def hashable_scc(scc):\n", + " return set([tuple(sorted(list(x))) for x in scc])\n", + "\n", + "def my_venn3(sccs, *args, **kwargs):\n", + " scc_sets = [hashable_scc(scc) for scc in sccs]\n", + " venn3_unweighted(scc_sets, *args, **kwargs)\n", + "\n", + "#venn3([set(lexical_sccs), set(asserted_sccs), set(rhea_sccs)], (\"Lexical\", \"Asserted\", \"Rhea\"))\n", + "#venn3([{1}, {1,2}, {1,2,tuple(\"a\" \"b\")}])\n", + "my_venn3([lexical_sccs, asserted_sccs, rhea_sccs], (\"Lexical\", \"Asserted\", \"Rhea\"))\n", + "plt.show()" + ], + "id": "2305e056b7c20879", + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAGFCAYAAADXUXXZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAA9hAAAPYQGoP6dpAABayUlEQVR4nO3dd5ycdbn//9d9Ty/b+6Zs2m7KpickhJCQkAiRICBgwwQVbEg5tgMqSrV/5YiIoCIcfoBHUIoGENAQCBBaCJAeUjZlk93N9jp97vv3x5gNS9qWmbnnnrme57GPyOzszDV7Zu/3fLqi67qOEEIIMUiq0QUIIYQwNwkSIYQQQyJBIoQQYkgkSIQQQgyJBIkQQoghkSARQggxJBIkQgghhkSCRAghxJBIkAghhBgSCRIhhBBDIkEihBBiSCRIhBBCDIkEiRBCiCGRIBFCCDEkEiRCCCGGRIJECCHEkEiQCCGEGBIJEiGEEEMiQSKEEGJIJEiEEEIMiQSJEEKIIZEgEUIIMSQSJEIIIYZEgkQIIQyi67rRJcSFKYNk5cqVrFy5MuHPM378eH7729/G9THPPvtsvve978X1MYUQx/ed73yH8ePH88ADDxhdSh+dnZ1cf/31vPPOO3F5vERcqwbClEGSLI899hif+tSnjC5DCDEIXV1drF69mqqqKh577LGU+vS/fft2/vGPf6BpmtGlxIUEyUlMnz6d0tJSo8sQQgzCM888A8CNN97Ivn37ePPNNw2uKH2lbZC88847rFixgmnTpjFnzhxuuOEGWltbAYhGo1x66aXMnTu39zaA733ve0yfPp2amhrg2OZiY2MjN9xwA/PmzWPGjBmsWLGC9957r/f7ra2t3HrrrSxevJjJkyczZ84crr76ag4ePJikVy2EOOKJJ55g3rx5nH766VRUVPDoo4/2+f6BAwf4+te/zty5c5k2bRqf+cxnWLt2be/3A4EAt9xyCwsXLmTy5MksW7aM+++/v89jtLe3c9NNN3HGGWcwZcoUPv3pT/PGG2/0uc/48eO5++67ufjii5k6dSp33303l19+OQCXX355n2761atXc/HFFzNlyhTmz5/Pj3/8Y3w+X5/He/vtt/nMZz7DtGnTOPfcc3n99dfj8vsairQMkvXr1/PFL34Rp9PJnXfeyQ9+8APefvttLr/8cgKBABaLhZ///Of4fD5+8YtfALH/Bz711FNcf/31jBkz5pjH7Onp4XOf+xxvvfUW//3f/83dd9+Nw+HgiiuuYN++fei6zte+9jXWrVvHd7/7Xe6//36uueYa3njjDW6++eZk/wqEyGi7du1i8+bNXHTRRQBcdNFFvPjiizQ3NwOgaRpf+9rX8Pv9/PKXv+See+4hNzeXq666iv379wPw05/+lFdeeYUbbriB+++/nyVLlvDLX/6SJ554AoBgMMgXvvAFXnzxRb71rW9x9913U1paype//OVjwuT3v/89n/jEJ7jrrrtYunQpN910EwA33XRT7/Xh6aef5uqrr2bMmDH87ne/45prrmHVqlV84xvf6O2W27p1K1dccQVZWVncddddXH755Xz7299O+O/zlHQTWrFihb5ixYoTfv8zn/mMfv755+uRSKT3tpqaGn3ixIn6I4880nvbH/7wB72qqkr/17/+pZ9xxhn6V7/61T6PU1VVpd911126ruv6ww8/rI8fP17ftm1b7/d9Pp9+zjnn6H/961/1hoYGfeXKlfr69ev7PMbtt9+uT548ufe/Fy9erN9www2De+FCiH752c9+ps+ZM0cPBoO6rut6XV2dPmHCBP3ee+/VdV3XGxsb9aqqKn3VqlW9P9PZ2an/9Kc/1Xfu3Knruq6fe+65+g9/+MM+j3v33XfrL730kq7ruv7YY4/pVVVV+vvvv9/7fU3T9M9//vP6xRdf3HtbVVWV/oUvfKHP47z55pt6VVWV/uabb/b+3MKFC/Urr7yyz/1ef/11v
aqqqvc5r732Wn3hwoV6KBTqvc+zzz7b51plhLRrkfj9fjZu3MhZZ52FrutEIhEikQgjRoxg7NixrFu3rve+V155JdOmTeO6665D13V++tOfnvBxN2zYwPDhw5k4cWLvbS6XixdeeIFPfepTlJSU8NBDDzFr1iwOHjzIunXrePjhh3n33XcJhUIJfc1CiKPC4TCrVq1i6dKlBAIBOjs78Xg8zJo1i7/+9a9omkZhYSHjxo3jRz/6ETfccANPP/00mqbx/e9/n8rKSgDmzp3LX//6V77yla/wyCOPUFtby9VXX82iRYsAeOONNygqKqK6urr3OhONRlm8eDFbtmyho6Ojt6YPXzeOp6amhoaGBs4+++zex4pEIpx22ml4vd7e69aGDRtYsGABNput92fPOeccLBZLnH+LA2M19NkToLOzE03TuO+++7jvvvuO+b7D4ej93xaLhQsuuICNGzcydepUCgoKTvi47e3tJ/0+wKpVq/if//kf6uvryc3NZeLEiTidzsG/GCHEgL388su0tLTw+OOP8/jjjx/z/VdffZWzzjqLBx54gHvvvZd///vf/P3vf8dms7F06VJuvfVWcnJyuPHGGyktLWXVqlXcfvvt3H777cyYMYNbbrmFCRMm0N7eTlNTE9XV1ceto6mpiZycHADcbvdJa25vbwfg1ltv5dZbbz3m+42NjQB0dHSQl5fX53tWq/WY25It7YLE4/GgKApf/OIXWb58+THfd7lcvf+7qamJ3/72t0ycOJGXXnqJ559/nmXLlh33cbOyso47aP7uu++Sk5NDW1sbN9xwAytXruTKK6+kpKQEgF/+8pds2LAhTq9OCHEqTzzxBCNGjOAnP/lJn9t1Xeeaa67h0Ucf5ayzzqKkpIRbbrmFm2++mR07dvD8889z3333kZeXx80334zdbueqq67iqquuoq6ujpdeeol77rmH73znOzz77LNkZWUxatQofvWrXx23juHDh/e75uzsbACuv/565syZc8z3jwRSbm5u7zjPh1/Xh1s/Rki7ri2v18ukSZOoqalhypQpvV+VlZX89re/5a233uq970033YTFYuHBBx9kyZIl3HrrrX1mcX3Y7Nmzqa2tZdeuXb23BYNBrr32Wh5//HHee+89NE3j2muv7Q2RaDTaO6MiXeaLC5HKmpqaePXVV1m+fDlz587t83X66aezbNky1q5dy4YNGzjjjDPYtGkTiqIwceJEvvWtb1FVVUVdXR2BQIBzzz23dyFjeXk5n//851m+fDl1dXUAzJkzh/r6egoKCvpca9atW8ef/vSnk3Y3ffR7Y8aMoaCggIMHD/Z5rJKSEu644w62bdsGwLx583jllVfw+/29P/vqq68SDofj/ascENO2SBoaGnjwwQePub2qqopvf/vbfPWrX+U73/kOF1xwAdFolAceeICNGzfyjW98A4C///3vrFmzhjvuuIPc3FxuuukmzjvvPG655RbuuuuuYx734osv5uGHH+aqq67iuuuuIy8vj4ceeohwOMxll13W++a67bbbuOSSS+jo6ODPf/4zO3bsAMDn8+H1ehP3CxFC8Pe//51IJHLc3giIzd7629/+xksvvYTT6eT666/n2muvpbCwkNdff53t27dz+eWX43Q6qa6u5u6778ZmszF+/Hj27t3LU089xbnnngvErgmPPPIIX/rSl/j6179OWVkZr7/+Ovfddx8rVqzoM47xUVlZWUCsGy4nJ4cJEybwrW99q/fD7eLFi+ns7OSee+7h8OHDvd1nV199NatXr+bKK6/ky1/+Mq2trdx5550nfa6kMGyYfwhWrFihV1VVHffrBz/4ga7rsdkOl112mT516lR91qxZ+uWXX947o6qhoUGfPXu2/pWvfKXP4z700EN6VVWV/vTTT+u6rh8zE6KhoUH/9re/rc+ePVufOXOmfsUVV+jbt2/v/f4jjzyiL1myRJ88ebK+aNEi/YYbbtD//e9/61VVVfrLL7+s63oGz
NoKh3Xd59P1zk5db23V9cZGXa+r0/Xa2ti/TU263tERu084bHS1aSMSjej+sF/vDHTqLb4Wvb6rXq/tqNUPdR7SD3cf1lt8LXpHoEPvCfXowUhQ1zTN6JITYtmyZfry5ctP+H1N0/Szzz5bX7Bggb5nzx79mmuu0efNm6dXV1fry5cv1x999NHe+3Z1dem33367vmjRIr26ulpfuHCh/vOf/1z3+/2992lubta///3v6/PmzdMnT56sn3vuufp9992nR6PR3vscb0ZVNBrVv/3tb+tTpkzpU++zzz6rf/KTn9QnT56sz5kzR//617+u79ixo8/PbtmyRV+xYoU+depUffHixfqqVav0M844w9BZW4qup9C+ASI1aRp0d0NnJ3R19f03EIBI5OjXQCkK2Gx9v1wuyM6GnJzYV3Y2ZGXF7pthekI9dAY76Qh2xP4NdNAT7iEcDRPWwr3/avrAu05VRcWm2rCqVhxWB167l2xHNtmObLLsWbF/HVlYVdN2XIgkkSARR0Ui0NgY++rsPBoY3d1g9NtEVWNhciRgsrMhNxeKiuBDM/HMKKpFafI10R5opyPQ0RsancFOItogwjnO3DZ3b7BkO7IpcBdQ6i3FaZUZiSJGgiSTdXdDQwMcPhz7amkxPjAGIz8fSkuPfqX4WFQwEuRwz2Hqu+pp6G6gydc0qBaF0bId2ZR6SynxlFDiLSHflW90ScIgEiSZQtOgufloaBw+DD09RleVGF7v0VApKwOD59h3h7pp6G7o/Wr1H39moNnZLfbeUDnyr3SLZQYJknQWDkNtLezfDwcOQDBodEXGcDhgxAgYMyb2bxJWATd0N7CndQ/7O/bTHepO+POlIotiYVj2MCpyKqjIrcBtO/miPGFeEiTpJhiEvXtjX3V1EI0aXVFqsdth5EgYOxaGD49rqDR0N1DTVsPetr30hNO0tTcExZ5iRuWOYmzeWLIcWUaXI+JIgiQdhMOwbx/s2QMHD8a6scSp2e1QUXG0paIOfH3u4e7D1LTVUNNWI+ExAMWeYsbmjWVs/lhpqaQBCRIzO3wYtmyJhYi0PIbGbodRo6CqCsrLT3rXNn8bH7R8QE1bTcZ2W8WLgkJZVhnVRdWMyh2FkoFTvNOBBInZaFqs5bFlCzQ1GV1NesrLg+pqqKyMrWshtp/RvvZ9bG3aSl1XncEFpiev3Ut1UTUTCifgsJp7SnemkSAxC78ftm2D7dvhIyemiQSx24lOHE/NCDfrW7ZK6yNJrKqVyvxKJhdPJs9l7Iw70T8SJKmuuRk2b461QmTsI2nCLitdRTa6bQF0dDpdTmrsOoeVDJ35ZpBhWcOYUjKFkTkjjS5FnIQESao6cADeey82DiKSJuy20VFspcfiP+73/Q4Hu51QJ4GSVNmObKaWTGVC4QRUJe02LTc9CZJU09QEb74J9fVGV5JRIk4rHSU2uq3HD5CP6nE62enUaEROv0ymHEcOpw07jTF5Y4wuRXyIBEmq6OyEt9+GmhqjK8koUbuFjlIH3TYfg/lD6HQ5+cARpRVjz4PINMWeYk4ffjql3lKjSxFIkBgvEIB3340NpMsYSNJoVpWOMidddj/6oCKkr3a3i+32MJ0Yv8liJqnIqWDu8LnkOnONLiWjSZAYJRKJDaJv3Agh6R5Jpu5iF+1ZIaLEd+2Nrig0eF1sVf1EFfmzShYFhQmFE5hVPksWNxpEgsQIO3fC+vXpu2liigp5bLSVqAQSPFAetlnZ6bZyUAkk9HlEX1bVyrSSaUwvnY5FTfx+auIoCZJk6umBV16JbaQokkZTFTqGueiyD24cZLA6XE4228P0KLLrQDLlOfNYNGoRRZ4io0vJGBIkybJzJ7z+unRjJZmvwElbboSIQWMXuqpy0Otku+JDl90/kkZBYVrpNGaXz5bpwkkgQZJoPl+sFXLggNGVZBTNqtI6zE6PNTW6l/wOB++7ojIYn2T5rnwWjVpEobvQ6FLSmgRJIu3aFWuFZOo5IAYJ5
DhoKYwa1go5EV1V2et1sEvt31oVER+qojK9dDozy2ZK6yRBJEgSweeDV1+NHSglkkZXoH24m057au9F1uly8p4jRACZ7p1M0jpJHAmSeNu3D9aulVZIkoXdNprLFEImWWketVjY7rVxSGZ2JZWqqMwsm8nMsplGl5JWJEji6Z13YosLRVL1FLloyQ7EZWFhsh32utlokYH4ZBuVO4rFoxZjs9iMLiUtSJDEQzgMa9ZIV1aS6UD7iNTvyjqVHqeT9Y4QQUW6upIpz5nHuePOJduRbXQppidBMlQdHfCvf0Fbm9GVZJSoTaV5hJ1AmnQNhW1W3nMrtCmyZ1cyOSwOloxZwvDs4UaXYmoSJENx8CCsXi1rQ5Is5LXRVKKn3KysodJVlQ+y7OxPk3A0CwWFucPnMrVkqtGlmJYEyWBt3BjbrVd+fUnVU+iiJcec4yH9VZ/lZpPF3N11ZlSZX8nCioWyvcogSJAMVCQSW2C4e7fRlWScznI3ba7MuMC2u12st/tlgnCSFbmLOGfsOXjsHqNLMRUJkoEIheCf/4TGRqMryThtw110OjJrIV+308lbziCRNG59pSK3zc3yyuVyXvwASJD0VyAQC5HmZqMrySg60Frh6vfJhenG73DwljMsM7qSzGl1srxyOQXuAqNLMQUJkv4IBOCZZ6C11ehKMoquKjRXOPCpmT34HLLZeNutyS7CSeawOFhetVxWwveDBMmp+Hzw7LMyvTfJNKtK00hbws8OMYuI1co7HoUOmR6cVHaLnfMqz6PYU2x0KSlNguRkenpiLZGODqMrySiaRaGxwkZQkWnVHxa1WFjvVSVMksym2vh45cflfPiTkCA5ke7uWIh0dhpdSUbRVIWmUXZpiZxAxGLlbS90Kem1hibVWVUry8Ytozyr3OhSUpLsqXw8nZ2wapWESJLpCjRVOCRETsIajXCaD9zIWodkimgRnt/9PAc7DxpdSkqSIPmozk54+ulYi0Qkja5A8ygngQwfWO8PWzjC3B4Vp/z5JlVEi/DC7hckTI5D3okfFgzC88/HxkZE0uhAS4Ur42dnDYQ9HOZ0nxWHLn/CyRTVo/x7z79p8bUYXUpKkXfhEZoG//43tLcbXUnGaa1w0WPJzHUiQ+EIhTg9YMWK7EGfTGEtzPO7n8cXzoxdFvpDguSIV16Bujqjq8g4neXujF1sGA/OYIjTgg6jy8g4PeEent/9PBFNJj2ABEnMe+/Bzp1GV5FxfAXOjNk7K5Gy/QGmRdxGl5Fxmn3NvFjzIjLxVYIEampg/Xqjq8g4Ia+dllxZJxIvpd0+xmouo8vIOPs79vPmwTeNLsNwmR0khw/DSy8ZXUXGidotNJVoaLK3bVyN7QpQoks3V7JtbtzMtqZtRpdhqMwNkq4ueOEFiMr+RcmkqwpNwy1pdyhVKlB0nSndEbxYjS4l46w7sI7ajlqjyzBMZgZJKATPPRfbjFEkVctIp2x9kkCWaJTZPlVmciWZjs7qmtW0+TNzT77MDJJ162SarwG6S2SabzI4QiFmhJxGl5FxwlqY1TWriWqZ18uReUGyZw/s2mV0FRkn5LXR6pUWYLLk+/yM02XwPdnaAm28degto8tIuswKku5uePVVo6vIOJqq0FxCWp+znorGdAXIxWZ0GRlnS+OWjNtGJXOCRNdjM7RC0j+fbG0jnISRrc+TTdF0pvuUDPojTx0v73uZQCRzWuCZ8x7buBHq642uIuP4Cp2yct1AjlBIFisawBf28cr+V4wuI2kyI0iam+Gdd4yuIuNEHRZacqQFaLTibh/lsr4k6fa172NH8w6jy0iK9A+SSATWrIltyiiSqrXcJosOU8QEX1SmBBvg9drX6Qik/wmr6R8kb74pU30N4CtwyrbwKcQWjjA5IrO4ki2iRVizdw2ant4fqNI7SA4ehG2ZvXWBETSrSmuuDK6nmpJuH4XYjS4j4zT5mniv/j2jy0io9N1LIRqF114zuoqM1F7uIErqD7CHwhH+8
MTLvLR+B1arhWVnTOZLF5yJoijsrj3Mb/6ymn2HmqkoK+C6yz5G1ciS3p995d2d/O+q12hu76J6zDC+9flzKCnINvDV9M9kH6x1xU6kFMnzfsP7VBZUku1I/ffIYKRvi2TjRjlz3QCBHAddttQPEYB7//YS7+44wE+vvYTvf+k8nlu3mWdf24Q/GOaHv3uKKWOH8bvvrWDSmHJ+dM+T+IOxVtbWPYf42QPPcsmSWdzz/ZXYrBZ++sAzBr+a/nGEQkzUZRZXskX1KK/Xvm50GQmTnkHS3Q3vv290FRlHV6C10Bx9wZ09fp5/fQvfuuxjTBhVxowJFVyyZBYf7Ktn7YYd2G1WvnLxWYwsK+CqTy3G5bDz6rsfAPD46ndYMmci5y+YxoiSfL7x6bNp7eiho9scZ6sM7/aTpadvZ0SqOtBxgH3t+4wuIyHSM0hefz02W0skVVeZ2zQLD7fuOYTHZWdq1Yje2z577ly+s3IZ2/fWM3nsMBQl1v+jKArVY4exbW9sHdKmXQeZP72y9+fKCnN4+MdfIcdrjk/6iqYzOSRBYoQ3at9Iy1MV0+/ddOgQ7NtndBUZJ2pT6XCZZ5ZWfXMHJQU5/PvNrfzlhbeIRDTOmVfNZctOp7Wjh1HlBX3un5flZl9dM92+AF2+AFFN4/u/fZyaQ01MGFXGtZ9dQmFulkGvZuCy/QFKHQ4aCBpdSkbpCnWx6fAmZpbNNLqUuEqvFomux6b7iqTrLHWYas2IPxjmUGMbz762ie+uXMZXLz6Lf7z8Hk+u2UAwHMFmtfS5v81qIRyJ9o6T3PPXNSyZM4nbvv5JwpEoP7rnKTTNXHuJVZljKCvtbGzYiD+cXr/89AqSnTuhpcXoKjJOxGWly26uPwyLquALhPj+l5YzaUw5Z86o5HPL5vLsa5uw/yc0PiwcieKwW7GosT+Zj8+fwtK5kxg/qpTvffE89tY1s31vnREvZdBcwSCjdNluPtnCWph36tJrp430CZJIRM5eN0h7ic10+/rm53ix26x9puwOL8mnqa2LglwvrZ19B85bO3vIz/GS43VhtaiMKMnv/V6210W2x0VTW1fS6o+XMb5oGl0EzGNH8460OgQrfd5DmzaBzxyzZtJJKMtuysOqJo4uIxSOcPBwa+9tBxpaKMnPZuLoMrbV1KHrsXjUdZ1tNXVMHF2GxaJSObKEmkNNvT/X0e2js9tPSUFO0l/HUNnCYSo1WfGebDo6bx5Mn2749AiScBg2bza6iozUVmTOlW0jSvKZO3kMv3roefYcbOSdbft47IW3OX/hNBbMqKLbF+Dev73E/voW7v3bSwSCYRbOHA/AJUtm8/eX3+OVdz/gQH0Lv3roBcYML2LCqFKDX9XgjOgJYZMViklX21lLs6/Z6DLiQtGPfOwys82b4Y03jK4i4wRyHBwuNO+snx5/kN89toZ1G3fhsNu44KzpfP7jp6MoCjv21XPXX1ZzoKGV0cMK+a/PLWXciKMr2//52ib+8vxbtHf5mFo1gm9e9jGK8swza+ujarPdbFOlRZ9sY/LGsHTMUqPLGDLzB4mmwaOPxhYhiqRqHO3ELxszpoWoxcKarKiJ5t2lBwWFz0z+jOm3TjF/19aePRIiBgh5bBIiacQSjTJWxkqSTkdn82Hzd8ubP0g2bjS6gozUWZh+a1kz3XB/+q24NoMPWj4w/bG85g6S2lpobT31/URcRRwWfHJ8btqxh8NUyLqSpItoEbY2bjW6jCEx98fKJLVG6js7ueXFF1l/8CC5TieXz5zJF2fNAuCDpiZuWb2arY2NVOTmcuPixZw+ciQAvnCYn770Ev/etQtN11lWVcX3Fi3CYzf3mRBdxQ50ZGA2HVUEdPanQQ/XrvW7WPXrVX1uq5xTyQXfvIDtr23njSffoKuli6JRRSxeuZiycWUA3HHZHcd9vGVfX0b1wuqE1bu1aSvTSqdhVc15STZn1QBNTVCXnJXE33zmGcqzs3lyxQp2t7Tw3WefZVh2N
qePHMkVjz/O2WPH8vNly/jHtm1cs2oVL1xxBQVuNz996SW2NDRw/6WXogA/eOEFfv7yy9x+zjlJqTsRNKtKt93czXBxYq5gkFKX+ffgajnUwpiZYzjny0f/1iw2Cwd3HOSF+17gnK+cQ3llOe+vfp8nf/kkX7nrK9iddr5+z9f7PM6G5zbwwRsfMG72uITWG4gE+KD5A6qLExdWiWTerq0ktUY6AgHer6/nqtNPZ1ReHkvHjWPB6NG8ceAAT23dittu55alS6nIy+O6+fOpyM1lS0MDADZV5UdLljC5pITqkhIumTyZDYcOJaXuROkpNNeeWmLgKkLmvSwc0XqolcLhhXhyPb1fTo+TnvYeTv/k6Uw6cxK5JbnM++Q8At0BWg7Gtlb68P0joQjvPf8e53zlHBxuR8Jr3nR4E2adRGvOd0x3N+zdm5SnclqtuKxWntyyhXA0Sk1rK+8eOsTE4mLerq1lydixvfsvATyxYgVnjRkDwM1LlzJr2DAADnZ08MyOHcwZMeK4z2MW3Z7oqe8kTC3HH8Bu0kvDES2HWsgryzvm9vGnj+f0i04HIBwKs+G5Dbiz3RQMLzjmvuseX8fIySOpmFKR8HohtjPw3vbkXNfizZxdW7t2xXb6TQKH1cpNS5Zw+5o1PPTuu0R1nYurq/nUlCk88t57TC0t5Uf/+hdr9uxhWE4ON5x1Vm94HHHDc8/x923bGJadzdXz5iWl7kQIZdkJETK6DJFgiq4zRnOyQzXnhApd12mtb2Xfpn28/Y+30TSNqrlVzP/UfCz/2dV5/5b9PPGzJ9DROe/q87A7+45bdjZ3smPdDj536+eSWvuull2MyRuT1OeMB3N+7Ni9O6lPt6e1lcVjxvDYZZfxs3PP5fmdO1m1fTu+cJg/rl9PkcfDfRdfzGnDh3Pl449T/5Ejfr8yZw6Pfe5zDMvO5itPPIFm0uZrd57l1HcSaaE0YN6WZ1dzF5FgBKvNyvnXnc9Znz+L7eu2s/bPa3vvUziikBU/WcH8S+fzwu9foG5X3/HWzS9vpmRMSe8gfLLUdtYSjJhvfMp8LZKWFmhL3q6Zb+zfz+ObN7P2q1/FabMxpbSUw93d3Pvmm1gUhYnFxVw3fz4Ak0pKWLdvH//Yvp2vz53b+xjjCmLN5l+ffz4L/vAH1h88yFyTdXHpqkKPzXxvcDE4jlCIQredZhO2QLOLsvnGH7+B0+NEURSKRxWj6zrP/e45Fq1chKqqeHI8eHI8FI8qpn53PZte3ER5ZXnvY+x6axdTl05Neu2arlHTVsPEoolJf+6hMF+LJMmtkS2HD1ORl4fTZuu9bVJxMXWdnRR5vYzJz+9z/1F5edR3dRGKRnlh5066g0cvvoUeD7lOJ21+83UZ+AqcMsieYSrC5m2Buryu3qOSAQrKC4iEIzTubeTw3sN97ps/LB9/19G/yc6WTloOtTBuVmJnap3I7tbkXuPiwXxBsmdPUp+u2Otlf1sboejRpn5NayvDc3KYXlbGB42Nfe5f09bGsOxsVEXhe88/z8s1Nb3fq+vspM3vZ+xHwscMurMkRDJNvj+IxYS7Au/buI/fffV3hP9zmiVA4/5GnF4nm1/ezKuPvtrn/o17G8kvP/o32bC7gayCLLILjdn/qr67nu6QubZ9MleQ1NcnfV+ts8eOxWax8MN//Yu9ra2s2bOH37/1FitnzOCz06bxQXMzv339dfa3tfGbdeuobW/nwokTsaoqn5k6lf957TXeOXiQLYcP861nnmHJuHFUFhYm9TUMVdRuIaBIt1amUTWNYSR+2mu8lVeVY7Vb+dd9/6K1rpW97+/llf97hdM+cRpTz55K7bZa3n3uXdrq21j3+Drq99Qz8+NHz1Bvrm2mYNixs7iSyWytEnPt/vvqq7B9e9KfdndLCz9Zs4ZNDQ3ku918fvp0vjBzJoqisOHQIX6yZg27WloYm5/PjWefzWnDhwMQikT49Wuv8fSOHfjCYc6prOSHixfjdZjrj
7OrxEmrVxYhZqJWt5P1JlyA2nywmZceeon63fXYnXamLpnKvIvnoSgKe97dw2uPvUZ7QzsFIwpYfPlihlUdnWm5+v7VBHwBzr/2fMPqz3flc+mkSw17/oEyT5BoGjz8MATlk3GyNY524Ffl956JohYLL3qjmLCHy/QunXQp+S5zdIObZ9ZWba2EiAE0i0JANd/MnZNT0BQ7GnY03UoUG7puJYoVTbeADjqxK+eRf5Ujp9LrOooKKlFUJYpFjaDoEVQljIUwih5CwbxTZz/KEo1SioN6k2+ZYka7W3czZ9gco8voF/MESZIH2UWMP8+Bjvm6NnTshBU3Ed1NWLMTjtgIRSxoUYgmeN6AqoDFAjarhs0awaYGsSoBbIoPVTffjL3SiEq97dT3E/G1p3WPBEncmXyPKrPye1O9T0MhrOQQ1ryENDuhiJVQWCVqYKNA00GLQDiiAvb/fMWO4VVVsFt17LYINksIm+LDTntKt2Lyg2GQIEm6rlAXncFOU5yeaI4gaWsDE669MDtdAb8ltbo0dCyElRyCmpdA2EkgoKKZY5QPiA31BUIKgZCN2NXZAxThtGs4HSGclh7segdKCi0EtEYiFOg2WpTwqe8s4qquq06CJG6StF286CuY40AzuG9cRyWk5BOMeAiEHQSCKibKjX4LhFQCISfgBApw2HVc9hAOqw+H3mZ4sJRqVlosEiTJVtdVx4TCCUaXcUoSJOKEAh5jlhnpWAhSgC+cRU/AhpaBayGDIYVgyAE4UMjD5YritnfjogXVgFDJDWtg3oXuplXXZY5rnzmCpL7e6AoyUtCRvCu4jpWgUkBPKAuf32qq7qpE0wGf34LPnwPk4HZqsVBRW1D15LQYPaEwigOZBpxkvrCP9kA7uc5co0s5qdQPkpYWCJhv1pDZ6QoElcR+8tVRCSpFdAez8AckPPrLF1DxBbKBbFxODa/dh0tpTGj3l6JpFGKnKYXGbjJFXVedBMmQSbeWIULZDvQEjY9EFRc9kWI6/S5DZ1elA39AxR/woipevJ4wXkszNjoS8lxFupWmBH+4EMeq66pjUtEko8s4KQkScVwBT/w7xENKAV2hfLp90tkeb5oOnd02OinD6Sgl29WJU2+M67Ti3LCGCbfeMr36rtTv2k/tINF1GR8xSLzGRzRs+PQSOv1ewjLpJykCQYVAMAeLmkO2J4BHbcSCb8iP6wmGJEgM4I/4afO3kec69ujgVJHaQdLSAiFpShshOMRtUTTFQVe0nM5uh4x9GCSqQVuXkzZG4nVHyLEfxqp3DfrxVE0jHxutyCeCZKvrqpMgGbTmZqMryEhhpwVtkF0iOna6tHI6up0SICmk22el2zeMLE+EHFs9Fr1nUI+Tq1tplYWJSdfsS+1rYWoHSXu70RVkpLDHBgMMEk2x0aOV097tysh1H2bR1WOlixFke8JkW+sH3OWVHVVS/aqRltoD7UaXcFKp/ZZI4tns4qiws/8LEXWs9OjltHe5E74Zooifzh4bXYwk2xsi21rf780kPVEt1a8aaaktkNrXwtR+S0iLxBBh26n7pHTAxzDaurIkQExKBzq67XRSQU5WgGy19pSzvJzhiAy4GyAUDeEL+3Db3EaXclype9RuJJL0Y3VFTMgSOen3w0oOjf7xNHdIiKQDHWjvclLXU0mA4pPe1xqJYE/hy0Y6S+XurdR9R3R0xKb/iqTSFYicYFaOjpWO6Gjq2ssIhGSvjHQTicDhjnyaw5VEOfEn33zZU94QqRwkqdu1JeMjhgh77OjH2QYjoJTQ0p1H5OSNFZEGenwW/MpI8rJ8eJVa+Mh+yzmaSkPqfgRNWxIkgyHjI4YIO/uuOo8qHlqD5fj8sho9k2g6tHS66bZVke9pwq639n7Poymp3JeRttr8qfvhWoJE9BG1H+2y6tGH09rplfUgGSwYVqhvLybHm0uOZT8KURwyc8sQ0iIZDAkSQ0QsOprioCU4UloholdHtx2/bRyFngbsUTmt1Ag94
R7C0TA2S+qNUaVuA7Wz0+gKMlKXkkNd52gJEXGMUFihvr2MYLhMDiYxSGcwNa+LqRkkkQgyqptcumKh1TWLw4FSmdIrTkgH2ttdOANnYNFdRpeTcYJRY4++PpHUDBLZqDGpwtZsGmwL6PIXEkECXPRD0AXd83Ho5UZXklGCEQmS/gum5i8rHfmdw2nQ5hIKxZYrRzU5aUqcmkPRiEYtBDum4I5WG11OxpAWyUBIkCRFp2sCjYFqNC32NtCtOjoyRUucmk0/2v/p6xqOK3QaqXo5SSfSIhkICZKE0lFpcc2mzV/R5/aoTVojon9sH9mTy+/Lx+E/ExWnQRVlBmmRDIQEScJEVQeNjvl0+wuO+Z60RkR/He/CEQy6UHvOwKan7gFMZheKpub4sQRJBgnbcjhsmU8gePx9lHSLBInoH+UE++BFwjaiXbNx6sOTXFFmCEQCRpdwXBIkGSJoL6IhOodwOPUWMwnzUU7SetU0lUBHNS59XBIrygzSIhkImf4bVwFHKY2R6b2D6ieUmu8GkYLUfnSD+jvG4tLGJ6GazCGD7QMhLZK48TvKaQxNPXWIALoiXVuin/q5AZu/cxRubVKCi8kcMtg+EHLod1z4nCNpCk1G7+d2FhIkor/UAbxXfJ0jcEenJrCazKHpqXltTM0gscg+T0PV4xpNc2Biv0MEJEjEAAzwreLrKsMdmZ6QUjKJRUnNa2NqBomammWZRY9rDM3+qgFP5pUgEf11ssH2E/F1l+COzExANZnDokqQ9J+0SAbN7xxOi7/S6DJEmjvR9N9T8XUX4YpMjnM1mUNVUvOSnaJVpWZZqS7oKKEpOGnwywqlQSL6SVcGv428v3uYzOYaJOnaGghpkQxYyJZPY3jqgMZEPkrVU/PtIFLPUIIEYrO5XPqo+BSTQaRFMhDSIhmQiDWLRm1mv6b4noyiyWFFon+0OFzQ/J3jZRv6AZIxkoGQFkm/RVUnjcppRKND/52pQwwikTmiQ2yRAKBDuGsydr1o6I+VIaRFMhDSIukXHZUmWxy3PUnNKeoiBWlxunRomkK0exoW3ROXx0t3MkYyENIi6ZdW10yCwfgdd6pGU/PtIFKPRvy6QaNRC5bAbBRd/u5PRVokAyFBckpdrqrjbgU/FDJGIvorGucLWijoxBmZFdfHTEdW1Wp0CceVmkHikWbuyQQcpbT5R8f9cZWoBInon2gcWyRH+HvycGsT4v646cRtO/4REEZLzSDxeo2uIGVFLB6aI5MTsuRDWiSivxIRJAC+rgqZyXUSXntqXhtTM0ikRXJcumKh2TI7LjO0TiRVm84itQQTteZIh0j3JKyk5gXTaBIkA+FwgE0OYPqodudUgqHEnoltU+T3Lk5BUQmQuA8z0agFi38mDGFxbbqSIBko6d7qw+8op9NfnPDnsekSJOLkdGviW63BoAu3Vp3w5zEbCZKBkiDpFVUdtEQnJuW5rJp0bYmT05LUW+DrHiaLFT/EbrFjs6TmBz0JEhNodcwgGknOBd4alSARJxdRk3Qx00H3TUZB3pOQuq0RSOUgkQF3ALpdY/H5c5L2fJawrOERJxdO4jqvcNiOMzw9ac+XyiRIBkNaJESsXtqCY5L6nJaQBIk4uWCSWwj+ngKc+oikPmcqkiAZDAkSWqzTh7yj70BZQpaU3YZBpIagAfs9hbvHo2JP+vOmEgmSwchJXndOKupxjSYQMKZ7z6pIn7Q4MX8Cp/6eSDRqwZnhJytmO7KNLuGEUjdIPB5wxW9DQjPRFBttobGGPb8Dh2HPLVKbbrESNugANF93EXY9vvvLmUmhu9DoEk4otT96FhTAwYNGV5F0Ha4pRH3GjVXYI/ZU/ogxaO9ufZd7H7m3z20zJ8+ku6ebnXt3HnP/+bPm88VLv9jntoeefIjc7FwuWHoBAOs2rOPBxx885mcVReGPP/1j3GpPFVFHYhfEnooSqAbXK4bWYAS7xZ7SLZLUDpLCwowLkpC9gE6fsXPn7SE7GHu9S
Ij6xnqmTZzGyk+u7L3NZrWh6zqRaKT3tr21e/nD//2BRacv6vPzz699nlfXv8onlnyi97bTpp7G5KqjXS7RaJQ7/nQHUydMTdwLMVDQamxrNRh04XJU4ld3GVpHsqVyawTMECQZplUxfjWvzWdL2yApLyknJ+vE42+apvHkC09y7sJzGTV8FAD+gJ8Hn3iQHXt2kJ+T3+f+dpsdu+3oIPA/X/4nuq5z8bKLE/IajNatGD/gHeqpQM2qRSNgdClJk+pBktodGEWZtaq121UZ14OqBkvRFBxq+o2T1DfWU1JYctL7rNuwDp/fx7KzlvXe1tzWTDgc5kfX/ojC/BP/Qff4enh+7fNcsuwSbNbUXIE8VF0Y/7qiUQuO8BSjy0iqVA+S1G6RZGWB2w0+n9GVJJyuWGgPjzS6jF4O3UGQoNFlxI2u6zQ0NbB151b++VKs1TBryiwuXHoh1v/sHaXrOs+vfZ4l85fg/NBYwIiyEVz3xetO+Rwvv/Uyudm5zJqSngc06TYHkQRtHz9Q/p58bLm5hGk3upSkKPGc/AOQ0VK7RQJQnPiNClNBl6sqadug9Icjml4tktb2VkLhEFarla9d9jUuPe9S3nr/LR5/7vHe+3xQ8wHtne0sPG3hgB9f13VeXf8qZ887O55lp5SII7XeE9ZQZhyC5ba5yXJkGV3GSaXOletESkth3z6jq0goTbHRGRxmdBl92AI2SM3D2AalIK+AO390J26XG0VRGFk+El3Xuf+x+/n08k+jqiobtmxgctVkPO6Br9/Zd3Af7R3tnDbttARUnxr8Kdbd6fflYLcXEqLZ6FISKtVbI2CGFklJ6v8Sh6rLNSGhh1UNhs1vS7sV7h63B0U52jVTVlRGOBKmx9cDwNadW5k+afqgHnvrzq1Ujq7E40rfPeJ6krVZ4wAogSqjS0i4Em/qXwNT/0pRVARJ3CQu2aKqg85AqdFlHEsHl2L8wH+8bNm5hW/e9k2CoaPjPrX1tXjdXrK8WXT1dNHU2sS4UeMG9fg1tTWMqxjcz5qCaqEzBc+qCQaycJCCfz9xVOpN/deX+kGiqjAstbp94qnTMSHp+2n1lyuSPkEyrmIcNpuNh558iIamBjZ/sJnHn3uccxeeC0Dd4TpsVhuFeYObHVN3uI6y4rJ4lpxSIi43utFFnIg/fVslTquTInfqz15NzSvYR40aZXQFCRFVnXQFUrfZ6uxJn8UkToeTb17xTbp6uvjx3T/m/3vi/2PBaQt6g6Szu7N3/GQwjvx8uuq2pu6HimDQhUMvN7qMhKjIqRj0ezKZFF3XU/aDRi+fDx55xOgq4q7dPZUOX2p/im0obiCopc80YDE4NTkjCaXI1N/jcTp7CDhfM7qMuDtn7DmMyh1ldBmnZI4WiduddtOANcVKdzD1X5NbT99P2aJ/NKcrpUMEIBDwYCP/1Hc0EatqZXj2cKPL6BdzBAmkXfdWj3Nsys3UOh6nP326t8Tg+O3m+DBhDVUaXUJcDcsahlVN/RUaIEFimK6IOSYQ2HvsWAw4yEikjjYltdaPnIjfn4sljRY/maFL6wjzBElubtocduV3DiccTr2plMelg1tJnz9OMUBWG90psL9Wv+jgiKbHFGwFhYrcCqPL6DfzBAmkTaukS0+dPbX6wxVO3Rk7IrGCTnMtsAz6ijHbZe14SrwlOK3m6VY212+8wjwJfSJhazb+YGrvm/NRzg5n2q1yF/3TlsLTfo8nGrXg0kcZXcaQmalbC8wWJCUlpj9+t8c+yugSBkzRFLyK1+gyRJLpNjvtuvHnjwyUFkztKfX9UZFjrg/N5goSRYFx5u4D7Qml9rkCJ+L1SZBkGp/LXC3nI4IBr6kH3Ys9xeQ4zTUebK4gAZg0yegKBi3oKCESMcnA5UfYum3YUnDTPpE4jap5L8YObZTRJQxadZHxp6QOlPmCJCfHtHtvdavmWFx0ItlattEliCSJur0EdfNdHo6IBFJ/se/xOK1OxuSNMbqMATPnO
6XafImtKxZ8QXOvvHV1mnt8SvRfu81cs7U+KhRymHKl+/iC8VhU863bMmeQVFSAx1xvdL9jRMru8ttflpAFj2qu37sYBIuFZswz9fREbBFzDVgrKEwsmmh0GYNiziubosBEc/3CezD/TBIAb1AG3dNdwJ2VulvGD0DQX2B0CQMyPHs42Q5zdh+bM0gAJkyInVViArpiMd3akRNxdjpNs/+PGJwmS3p8WIhGLdgxz1jJpCLzTiQyx5X4eNxu06x0DzjK0PXU3j2133TIiZpraqLov4jbS49uvj76E7FGU/90QQCv3cvIHHPtePFh5g0SMM1UYL+SuodXDYanzSMbOaapJrs5u1ZOJGKSCS6TiiaZ4gCrEzF3kJSXQ16e0VWckj+ca3QJcaVoCjm6tErSTdTppiMFz2UfilDIkfKLEy2KhfEF440uY0jMHSQAM2YYXcFJhW25RCLpN6bgbffK/ltppsVkq6n7y66l9rqziUUTcdnMPbXe/FeCceOgIHVnZ/ht6XmWtBJRyCa9ukEymeZw0WrCfbX6Qw+n7rZEVtXKjNLU/jDcH+YPEoDTTjO6ghPyR1M35IbK2+FFSfEjWEX/tKVpawQgFMwiVS91U4qnmL41Aqn62x2okSNjOwOnGB2VYMj8b5ITsYQsZKvSKjE73e6gCXOcgjgYmqZgJ/VaJQ6Lg2ml04wuIy7SI0gA5swxuoJjhOyF6TPt9wSyOrKkVWJy7e5co0tIOGsK9gxMK52G3ZIe3YnpEyRlZTA8tTZFDFpT780bb5aghRwlfbtF0p1ud3JYN/92KKeip9jaJ5fVxeTiyUaXETfpEySQcmMlwQyZIpvdli3rSkyq0W2OdRZDFQ6m1h5xM8pmpNUOEekVJEVFMHq00VX0CoZT682bKEpEIVfLNboMMUARt5e2NFs3ciKRqBWLnhp/j1n2LFNvh3I86RUkEGuVpMAK0YjFQzQN14+ciKfVg11Nj/7ezKBQb881uoikslNkdAkAzCqflXZrsNLr1QDk5qbEzsBBe2q8aZNF0RXyTbIdhYBAVl5a7anVH0rU+PdnobuQyvxKo8uIu/QLEojN4DL4vJKQkmvo8xvB0eGQ80rMwGqlVk2PHX4HIho29jWrispZFWeZek+tE0nPILHbYcECQ0sIa5l5Qc3rzJPpwCmu1VNANAP/fxQOG7tWZmrJVArc6TmTMz2DBGKLFMeONezpw5H0XeB1MpaAhVxyjS5DnIDmctOYBqcfDoamqagGvfZcZy6zymYZ8tzJkN6jwfPnw6FDEAgk9Wl1VCIR42fDHGo7xN0v3s3Wuq1kObO4cMaFfPq0TwPwzr53+NMrf+Jg20GG5w3nigVXMGf0sYs6X9z+Is9teo5ffeZX/X7erJYsfEU+glowbq9FxIGqUuvI51THH0bCIR75+dUs+cw1jKiKrbzubG1k9V9+Q+3OTXhzCzjzgi8xftZZAOi6zjurH+f9V1YR9HUzbtp8zv701didsV0dfF1tvPjo3ezbvgGr3UH13I9x5gVfQrUkf4zGRi5BGpL+vAsrFpryLPb+St8WCYDTCfPmJf1pwzbj149ousaPnvoROe4c7l15L/+19L/4vzf/jzXb13Co7RC3/uNWPlb9Me77wn0snbSUW/9xKw0dff/A3j/wPnf+684BP7eiKxR0F0gXV4rpyCrAf4oB9kg4xLMP/IyW+v29t2nRKE/d80NUi5WVP7iH2Usv5Z8P/oLmur0AbHrtWd7458OcecEVfPY7v6a7vZln//dnvT//7P/+nKC/h8v++zd84ss/ZMc7L7H+339NzIs8BVVL/kmlk4omUeo1xwFbg5XeLRKAykrYvRtqa5P2lBFLDoST9nTH1dbTxtiisVy39DrcdjfD8oYxY+QMthzaQoG3gPOmnsclsy4B4NLZl/KXt/7CBw0fUJoTe8M//PrDPPr2owzLG9wW3DafjXxXPi1KS9xekxi8qNtL/SnO5Wip3x8LgI+0WGq2vk1XW
xOf/c6vcbg85JeMYO/W9dTVbKOwfDTvvfwPZi25lImnLQZg2Rf+mz/84DJaD9eSnV+CJyuPectXklccey9VzVjAoT1bEvI6T0XRvEn9+Oy1e5k7bG7yntAg6d0iOWLBArAlr6spnAIzYgq8Bdz4iRtx293ous7WQ1vZfHAz00ZMY9qIaVy1+CoAItEIz21+jlAkxPjSo4frvLv/XX52yc84s/LMQdfgafHgUtN300rTsFg4YD/1AXC1uzYxomoan/vvO/vcfnDnRkaOn4HDdXQCyUVfv5WpZy4HoKO5gbJRE3q/580pwO3Noa5mO1abnfO+9L3eEGmu28eezW8yvNKYzQq1aHIPuTpz5JnYLMZ3cyda+rdIALze2JTgdeuS8nThFFlBe8TK+1bS2NXI3DFz+wTDobZDXPm/V6LpGlcuuLK3NQLw68/9GoD3a98f9PMqKBR0FlCXVYema4N+HDE0rVmFBPVTf2acvvATx729vbmenIJSXvn7/Wx/azUubw7zlq+kcvp8ANzZuXS3N/fePxz0E+jpwt/d0edxHvuf73Bw92ZKRlYy46zjP1eiRSIOkrXR8bj8caY+h30gMqNFAlBdDSNGJOWpolpqrfC+6YKbuO2i29jTuIffv/z73ttz3bnc/fm7uWbJNTz0+kO8uvPVuD+3JWChIJKeUx7NIOzOonGImzKGgwG2vvkvgr4uLrrqNibNXcrTf7qdhv07ARg/axFvv/AoLfUHiIRDvPzEHwDQopE+j7P409/g09/8f0QjYZ554GfHPE8yJGu3iSx7FvNHzE/Kc6WCzGiRHHH22fDkk9DVldCniWqp9WutKq0CIBQN8Yt//oKvnvVVbBYbHoeHcSXjGFcyjgMtB/jHe/9gQVX819+429x4ij30aD1xf2xxElYr+22n7tI6FdViwenJZulnr0NRVUpGVnJw92Y2vfYspRVVzPv4ZXQ01/Pgj7+CxWJl6pnnUTR8LHZn326k4uGx6fjnrvwuf/7FNXS0NJBTkNxBaF1XULGjEUrYc1gUCx8b+zEc1sxZApBaV7xEczjgnHPgH/+ASOTU9x+kaNT4X2tbTxvb6rYxv/Lop6KKggrC0TDb67ajKApThk/p/d7IgpFsrN2YsHry2/IJ5gWJaIn7vYsPU2j0FhGJw3k4nuzY1iKKerQDI79kBE2HagCwOVx84ss/JOiPfVBwuDzcc/2nyC4oIejvYe/W9YyfubD35wvKYt09/u7OpAcJgEV3oSmJC5IFFQsodKfeQVqJlDldW0cUFMCZgx9APhUdFU0z/tfa0NHAbatuo7nraN/1rsO7yHHlsK1uG7/+16/Rdb3P90YWJK4/Vw2rFPcUy5TgJOnMLojbGexloyfQXLcPTYv23tbScKA3BNY+eR9b3/wXDpcHh8tDw74PCAV8lI+ZRCQU5NkHfkr9vh29P3v4wC4UVe0dgE82VUncBJCJhROpKqhK2OOnKuOveEaoqoJJidnGOaqmxqrhqtIqKksqueOFO9jfsp+3a97mvrX3cdnpl7Fk0hJae1q5/9X7OdR2iFXvrWLN9jV8ds5nE1qTrcdGYTSzPqkZIeTNoU6J34SPCbMXg67z4qO/pa3xEO+vXcW+reuZMv/jAHhzC3jj2Udo2PcBhw/s5J8P/oJpC87H5cnGk5NP5fQzefGxuzlcu5uDuzfzr0d+zYxFF/aZBZZMipaYv9FiTzHzR2bOuMiHKfqHP5ZmEk2DVaugsTGuDxuyF1IfSo2tEFq6W7j7xbt578B7OG1OLpxxIZ+d81kURWF73Xbuffle9jbtpSS7hCsXXMm8cccu3nzo9YfYVLtpQCvbT6WjqIN2vT1ujyeO0pwudjmKT7V4/ZTu+MY5fPqb/693ZXtL/X5W/+Uu6vftIDu/hAUXXkHljFjLXtOirH3ij2x7+0UURWHS3KUsvOjLvSvXg/4eXnr89+zZ9AYA1XOXsuCiK7FYjZkW68reh1/9IK6P6bQ6uWTiJXjsq
TVjM1kyN0gAenpig+9+f9we0u8cTmOgOm6Pl5YUaCpuwhf1GV1JerFa2est69dU30zmzqrHZ9kUt8dTUFhetZzyrPK4PabZZPY7zuOBJUviehBWVMmcmRqDpkNhS6EchBVPikq9t1hCpB/0OE/PnzNsTkaHCGR6kACUl8d5Py75lfaHElEo6ipKu5PijNKeXUhHhhybO2RxmMl2xLj8cUwrNWaVfiqRv2KAyZNh5sy4PJQuv9J+s/qtFIeKjS7D9PxZeTQgW9H0lx6nIBmZM5JFoxbF5bHMTq56R8yeHQuUIdLlE/aAODocFEclTAYr6M1lv5ptdBmmosThb7TMW8bSMUulRf0f8lv4sDPOgPHjT32/k9Clj3rAXG0uirTMOuM+HsLebPZajD+ywGyG2iIpchexbNwyrKrxC49ThVz1PmrhQhgzZvA/L59QBsXd6qZAlz25+ivizmKPJdfoMkxpKEGS78rnvMrzMmJH34GQq95HKUpsT65BbvCoy8rtQfO2eMkn3+gyUl7U7WGPLR/kvTYoyiAve9mObM6rPC+j9tDqLwmS41FV+NjHoKxswD8ar4G8TJXVnEWukmt0GSlLc7qpsRUMecFhJtMG8TfqsXlYXrkcty2555mYhQTJiVitcO65UDTAvvs4rknJVDlNOeSo0vf/UZrDxR5HIVFpiQzJQH97TquT5VXLyXIk/5hes5AgORm7Hc47D0pK+v0jKtFT30mcUk5jjrRMPkRzuqlxFkmIxIGi9P+QNY/Nw/lV55PrzE1cQWlAguRUHA5YvhxGjerX3RUJkrhQUMhpyqEAGYCPuLPY7SgkIiESF4rav7/RPGceF064kHyXjNudigRJf1itsTGT6lPvoaXoct5GPHmbvRRHM3f7+WBWLrtteWgZ+voToh8tkvKsci6ccCFeuzcJBZmfBEl/KQrMnx87+/1kd5MWSdy52lyUhEoybvFXd1Y+e9UcZHZWvJ08SMblj+O8yvOwW2QvuP7KrL/MeJg+PTY9WD3+r06VFklCODodlPpKsSgWo0tJAoX2nGIOqjK4mwiKcuK/0Wkl0zh79NkZ96FlqOS3NRjjxsUG4e3HfmJRkCBJFFuPjbKusvTeNVhRac4tkb2zEuk4XVsKCmeMOIO5w+caUJD5SZAMVnk5XHBBbCv6D1F06dpKJEvAQklbCW5LGs7nt9mpyymlWZcFbwn1kRaJRbHwsbEfY3Lx0Pfay1QSJEORnw8XXdRnerBFi98hWeL41LBK4eFC8pX0mU0TdXvZ4ymhU7aCTzhdCfX+b4/NwyfGf4JRuaOMKygNZPYJifGiafD227BpE5pio1Y/2+iKMkYwO0iTo4moiVuCPdkF1CoyOyhZnDlbCCiHGJE9gsWjF+O0JuYM90wiQRJP+/bByy9zILxItkpJoqgjSnNOMwEtYHQpA2O1cthbTJu0QpLKkbeBaeVjmF463ehS0oYESbx1dbHv2U6UThl0TyZd0eko6qBD6zC6lH7RnG72OwvkaNwks2Lh3FkOhuXKsQXxJO/ieMvKYltWOfUu2SsqmRRdIbcxl+JIccpP3fRn5bHTUSghkmROzY2lrkxCJAHknZwAHq/CJn8uGx0lRKxy+E0yudpdDOsYhkf1nPrOSabbHDTklv/nREPp+kwWFRVXdwGBvUW4HJmwDin55CqXAFn/WUfWEHTSrJQx1d1Bka/T2KIyiBpSKWwsxJProcXWkgID8Qr+7DwOKFlIR3JyOTUXkfp8/IHYpS5L1ngmhARJAnz4zRrRVd715VFkd1NNK45Q6MQ/KOLK1e6i3FZOe147XVqXITXoDid1rgK6dPlTSyYLFuwdefib+7ZMJUgSQ7q2EuB4b9amkINXQqUcdOWiy5klSaOGVfIb8ykNlSb3jG1FpTu7kA+cxRIiSeaKeNH3lx8TIgBemWWdEPIOT4C8vOPfrqGw1Z9DrdXNVGsrnoDJpquamKPTQXlPOR0FiZ/ZFXW5OWTPx4f0xyeTD
RtqSz7+9hOvC8lPnzWsKUWCJAEcjlirpOsEvSmdERuvRUoY4fAxTmvHHg4nt8AMpURjM7u8Li8dWR10a91xfXzd7qDZnU+LnsZ7gaUgCxbsPTn4D3vhFOu3BnrgqegfCZIEKSo6cZAcURt0cxAX41w9VITasUSNHhTODFa/lQJ/AdnebNrcbfiHuq2N1UabJ4/DOE95IRPxo6DgDGQTaMjGHz11L73HAy7ZCzMhJEgSpKgIampOfT8dhV1+L3sVNxPdXZQFOlG0/h8FKgbP1m2juLuYQE6ANkcbIW2AEyFUC93ePOoUtxw8lWSuiJdQQy7+YP+7D6U1kjgSJAlSWDiw+0d0lc2+HHZbvEx0d1Do70aRuaJJ4exwUqaU0ZPbQ7utnYh2il0JFBW/N4dDapYcf5tkTs1NtDEHf8/Auw8H+jcp+k+CJEEG++nHH7Xwri8fp5pDlauLkkAXqrRQEk8HT5sHt+rGl+ujw9pBWPvI2JVqIeDJod7ikVXpSaSg4Ix4iDRlE/ANfl8yaZEkjgRJgtjtkJ0NnYNchxjQLGzy5WJVshnn7mZYqAtrRPbvSjRFU/C0evAoHgI5ATrsHQTUCN3uXBoUd6wFIg3FpFBRcQS9hJqyB9SFdSISJIkjQZJARUWDD5IjIrrKDl82O8litKuHEdEuWdSYDDrgy6EzOJx6p4Lu6iGi+IyuKiNYsWD3ZxNo9OKPxKfl5/WCU3aLTxgJkgQqKoI9e+LzWBoKe/xe9uCl0B5ilKWbvGCPdHvFma4otDvd7Itm0Rj6z0mFfqDNjdUVwV7YRdDeQxSZYRdfCk7NBR1eAq3OuI89SWsksSRIEihRb97mkJ1m8rEquVS4fJRrPbiDsrhxKLodThpUD/sDbiL+438KjvitRGrzgFyc+QGU7B4CFh+69HUNmg0bNr+XQJOHQDhxCzglSBJLgiSBCgtBVWMHKCZCRFd7WynZ1jCj7D0UhH2ywLGfgnYbTVYv+4JueoID+VNQCLS6oNWFYtFwFfrQPD0EFQnz/rBgwR52E233EOp0kIx3a3FxEp4kg0mQJJDNBmVlcOhQ4p+rM2JjUyQXyCXHFma4zU9B1IcrGEz8k5tIwO6g1eqiLuKiJWSHIQ436VE1tqIaL1ZnBHueD83pJ6gGpaXyITZs2EIuIh1uQp0OhrgEdEDsdigtTeITZiAJkgSrqEhOkHxYR9hGR9gGZOO2RBlu91Gk+/EEAxm3NkVTVbocTppxcTDoIhCyDDk8TiQSsBKpzwayUVQNZ14AxesnbPUTybgxFQWHbscScBFudxP22ZLS8jieESNiPQMiceSo3QTr7IRHHzW6ihgVnUJ7iCJrkFwtgDsUTLvBek1V8dvtdKoOmqJODgcdKbHq3OYNYcsOoNuDhC3BtAsWBQWbZscScqL7HAQ7HOhaaly9zz4bxo0zuor0Ji2SBMvOju0G3NZmdCWxmV+NIcd/ZiNlo6CTbwtTZAuQq4dwRULYwxHMtFAiZLPRbXXQjoPmiJ22sB1ScKgi3G0n3H10NbbVGcGWFURxBYlYQ4SUEGb6vVuxYtVsqCEH0R4HoU4HIc34wP4oRYm1SERiSZAkwciRqREkH6Wj0BK20xI+eoFT0cm1hcm1hslWQni0MM5I2NDFkLqiELJZCVhs+BQb3ZqVzqiNjoiNcFjFsD6TIYgErEQCVuA/Z2YoOjZ3GIsrguoIo9vCRNUwESWChnGtRgsWrLoNS8SGHrQT9dsI99iIRFXMsDy2tDS2G7dILAmSJKiogI0bja6ifzQUWsN2WsN2ei9yxALGbYmSZY3gViM4FQ0HUex6FKsexaLpKGiomo6i6VjQj7v5pI6CriroioJOLCSiqoWwqhJWLIQUC0FUArqFoGahJ2qhK2JFD6Xep9240hXCPXbCx9lDyuKIYnVGUKxRFFsULFGwaOiWKLoSRVO03rDRYr/V/7Rt+rZwFBRUVJQj/6fHblF0FSVqQYla0CMW9JCFSNBKNGAhq
qmm7oSrqDC6gswgQZIEJSWxVbVmPsdKQ6E7aqU7OrC3jEXRQdfRUNCPjFWk17BMwkWDFqKD2iJER1Fj/+qaig6mDoXBGDnS6AoyQ2qMhqW5TO6njeoKUdSjISKSSEHXlJQZ9E62nBzIzTW6isyQme8wA0gTW4jkktZI8kiQJMnw4WCRI7yFSJpRo4yuIHNIkCSJ3Q6jRxtdhRCZITs7tquESA4JkiSaNMnoCoTIDBMnGl1BZpEgSaLS0tjiRCFE4qgqjB9vdBWZRYIkyeSTkhCJNXq0HGKVbBIkSVZZKYPuQiSSfFhLPgmSJHM4YOxYo6sQIj3l5kJ5udFVZB4JEgPIJyYhEmPCBKMryEwSJAYoKYH8fKOrECK9WCxQVWV0FZlJgsQg0ioRIr5kkN04EiQGqayMLVIUQsRHdbXRFWQuCRKD2O0wZYrRVQiRHoYPj3UZC2NIkBhoyhQ5dEeIeJg92+gKMpsEiYHsdpg2zegqhDC3igooLja6iswmQWKwyZNlgFCIoZDWiPEkSAxmtcL06UZXIYQ5jR4NBQVGVyEkSFLApEngdhtdhRDmoijSGkkVEiQpQFolQgzc2LGym3aqkCBJERMngtdrdBVCmIOiwKxZRlchjpAgSREWC8yYYXQVQphDZSXk5BhdhThCgiSFTJgAhYVGVyFEarPbYc4co6sQHyZBkkIUBRYujP0rhDi+uXNlckqqkSBJMYWFsnWKECdSWiobnqYiCZIUNHs2ZGUZXYUQqUVVYy12kXokSFKQ1Qpnnml0FUKklunTYycgitQjQZKiRoyAceOMrkKI1JCbK7MaU5kESQqbN092BxYCYMGC2BR5kZokSFKYywWnn250FUIYa8IEKCszugpxMhIkKW78eCgvN7oKIYzhdsuHKTOQIDGBxYtlq3mReRQl9t6XI6lTnwSJCXg8sGSJLFQUmWX2bBg2zOgqRH9IkJjEsGGySZ3IHCNHyiwtM5EgMZGZM2PTgoVIZ1lZsS4tYR4SJCZz9tmy3bxIXxYLfOxjMu3dbCRITMbhiP2hqfL/OZGG5s2THbDNSC5HJlRUBGecYXQVQsRXZWXs2GlhPhIkJjVpkmyhItJHfn5s9bowJwkSE1u4MNY6EcLMnM5Yd63VanQlYrAUXdd1o4sQgxcIwKpV0N5udCXm09a2nxdfvI26undxOnOYMWMFp532ZQAOH97Ciy/eTnPzTgoLK1m06AeUl0/v/dlt2/7Om2/eS09PEyNHzmPp0lvweCTVB8puh/PPl3ERs5MWick5nbB8uczkGihd13jqqa/iduexcuVTLF16K2++eS/btz+Nz9fC3/72RQoLq/j85x9n/PjzePzxL9HZWQfAvn2v8vzzP2DGjJVcdtnfsNncPPHEV9B1zeBXZS4WC5x7roRIOpAgSQMeTyxMXC6jKzGPnp5mioomsnTpLeTljWLMmLMYOXIehw5tYOvWv+N05rJ06S0UFIxl1qwvMmzYLDZu/AsA7733CBMnfoIZM1ZQUDCWj33sdrq66tm/f53Br8o8VBWWLpXNGNOFBEmayMmBj39c9iXqL6+3mE984k7sdi+6rnPo0AYOHlzPiBFz6OiopaSkGlU9um95UdF46ureB6C9vZaysqm937PZnOTmjuz9vji1s86CigqjqxDxIkGSRgoLY10Fcm7DwNx339k8+uhllJfPoLLyXNzuQrq7D/e5T1dXA35/GwAeTwHd3Y2939N1je7uw73fFyd3xhmxqb4ifUiQpJmysliXgSxY7L8LLriLiy76PY2N23n55Z9RVXUO9fWb2LTpr2hahH37XmX37hfRtDAA48efx8aNf6Gu7j2i0TBvvfV7fL4WotGwwa8k9c2cCZMnG12FiDeZtZWmdu2Cl14yugpz2bnzef75z+9y7bXvsn3706xZ82MikQBFRRMZOXIutbVvsWLFk2halDVrfsymTY8BUFV1LpFIgNzcUSxadIPBryJ1VVfD/PlGVyESQWZup6nKS
ohG4dVXQT4qHKunp5m6uveprFzae1tBwTii0TDBYDeTJ1/CpEkX4fO14PUWs3btL8nOHg6AqlpYuvRmzjrreiKRIC5XLn/+86VUVMhV8kQmTZIQSWfSAZLGJkyQbq4T6eg4yKpV19DVdXQs5PDhLbhc+TQ37+SZZ76FqlrweovRdZ19+15lxIi5AGzY8CBvvfVHbDYXLlcu3d2NNDZuZ8SIOUa9nJQ2YwaceabRVYhEkktMmhs9Ojaby2YzupLUUlo6hZKSal544Qe0tOympmYta9f+P04//evk5Y1mz56XeP/9/6O9vZYXX7yVQKCD6uqLAMjOHs769fdx4MCbNDfv4umnr2P06LMoLKwy9kWloHnz4LTTjK5CJJqMkWSIxkZ4/vnYSngR0919mBdfvJ0DB97AZnMxY8YK5sz5GoqiUFPzMmvX/oLOznrKy6dx9tk3UVAwtvdn33rrD7z33sNEIkHGjVvK2WffiN0uq0KPUNXYFj5Vkq0ZQYIkg3R0wHPPQWen0ZWIdGazxbpU5RC2zCFBkmECgVjLpLHx1PcVYqDc7lhXakGB0ZWIZJIgyUCRSGxq8N69Rlci0kl+PixbJvu+ZSIJkgyl6/DOO/Dee0ZXItLBqFGwaJFs0ZOpJEgy3MGDsdaJ3290JcKMVBVOP11Wq2c6CRKBzwdr1kBdndGVCDPJzo4Nqss28EKCRACxrq533419yTtCnMqYMbHpvdKVJUCCRHxEXV2sdeLzGV2JSEUWS2yR4aRJRlciUokEiThGIBALk4MHja5EpJLc3FhXVn6+0ZWIVCNBIk7o/fdjM7s0OUE241VVxTZdlK12xPFIkIiTam+Hdevg0CGjKxFGyMmJbbg4bJjRlYhUJkEi+mX3bnjzTRk7yRRWK0yfHvuS3aPFqUiQiH4LhWJdXVu3ysyudDZyZKwbKyvL6EqEWUiQiAFrbobXXpP9utKN1xs7T33UKKMrEWYjQSIGbft2ePttCAaNrkQMharClCkwa1asS0uIgZIgEUMSCMRmd23bFtsMUpiHosQOPps9Oza1V4jBkiARcREIwMaNsUAJh42uRpyMosDYsTBzpgSIiA8JEhFXgQBs3hwbkA+FjK5GfJiqwrhxsTPUc3KMrkakEwkSkRDBIGzZEgsVCRRjqWpsQeH06bGNFoWINwkSkVChUCxQtmyR8+KTTVVh/PhYC0QOmxKJJEEikiISgX37YMcO2a4+0fLzYwFSWQlOp9HViEwgQSKSrqsLPvgAdu6E7m6jq0kPdnts/GP8eCgqMroakWkkSIRhdD22h9cHH8RaK9Go0RWZT3k5TJgQW0Qoa0CEUSRIREoIBmHPnlgrpalJtmA5mZyc2PTd8eNlGxORGiRIRMoJBGJnodTWxv7N9PPkrdbY7rsjRsDw4TLzSqQeCRKR8pqbY6FSWxvb3ysTzkfJz4+FxogRUFYmO/CK1CZBIkwlFIqNqxw8GOsCa2sz/9iKosS6qwoKjrY8PB6jqxKi/yRIhKlpWuzwrZaWvl+pumbFZou1NgoKjn7l58tAuTA3CRKRlny+WJdYayt0dMT+2++P/RsIJLZ7zOkEtxtcrthXdvbR8JCtSUQ6kiARGSkQ6Bsufn/sKxKJzRjTtNiXrse+VDX2pShH//eRoDgSGm53LERkPENkGgkSIYQQQyKfnYQQQgyJBIkQQoghkSARQggxJBIkQgghhkSCRAghxJBIkAghhBgSCRIhhBBDIkEihBBiSCRIhBBCDIkEiRBCiCGRIBFCCDEksnm1SFsrV67k7bff7nOboii43W5GjRrFF77wBS688EIAxo8fzzXXXMO1115rRKlCmJoEiUhrkyZN4uabb+7972g0SkNDAw8++CDXX389ubm5nHXWWQZWKIT5SZCItOb1epk+ffoxty9cuJB58+bx5JNPSpAIMUQyRiIyksPhwG63oyhK723d3d3ceOONzJkzhxkzZnDdddfR3Nzc5+dWr17NxRdfzJQpU5g/f
z4//vGP8fl8x9znsssuY8aMGUyePJlly5bx5z//OSmvSwgjSJCItKbrOpFIpPcrGAxSU1PD97//fXp6enrHSAAeeughwuEwv/nNb/jOd77DmjVruO2223q///TTT3P11VczZswYfve733HNNdewatUqvvGNb3DkWJ+XX36Zq6++murqau655x5++9vfMmLECG677TY2btyY9NcvRDJI15ZIa+vXr6e6urrPbYqiUFVVxW9+8xsWL17ce/uUKVP45S9/CcC8efPYuHEja9euBWKB9Ktf/YoFCxbwq1/9qvdnRo0axRe/+EXWrl3LokWL2L17N5/85Ce58cYbe+8zY8YM5s6dy1tvvcW0adMS+XKFMIQEiUhr1dXV3HrrrQA0NjZy5513Eg6HufPOOxkzZkyf+86aNavPfw8fPpzOzk4AampqaGho4Gtf+xqRSKT3Pqeddhper5d169axaNEivvzlLwPQ09PD3r17OXDgAJs3bwYgFAol7HUKYSQJEpHWPB4PU6ZM6f3vadOmccEFF3DFFVfw5JNPkp+f3/s9t9vd52dVVe3tsmpvbwfg1ltv7Q2mD2tsbASgtbWVm2++mdWrV6MoChUVFcyePRsAOdVapCsJEpFRCgsLuemmm/iv//ovfvKTn3DHHXf06+eys7MBuP7665kzZ84x38/JyQHgu9/9LjU1NTz44IPMmDEDu92O3+/nr3/9a/xehBApRgbbRcZZtmwZCxYs4JlnnjlmweKJjBkzhoKCAg4ePMiUKVN6v0pKSrjjjjvYtm0bABs2bOCcc85h7ty52O12AF555RUANE1LzAsSwmDSIhEZ6Qc/+AEXXHABP/7xj3nqqadOeX+LxcK3vvUtbrrpJiwWC4sXL6azs5N77rmHw4cP9w7oT506laeffprq6mpKS0t59913+eMf/4iiKPj9/kS/LCEMIUEiMtKYMWNYuXIlDzzwAH/5y1/69TOf+tSn8Hg8/OlPf+Kxxx7D7XYzc+ZMfvWrXzFixAgAfv7zn3P77bdz++23A7FZXbfeeiurVq3inXfeSdjrEcJIii4jgEIIIYZAxkiEEEIMiQSJEEKIIZEgEUIIMSQSJEIIIYZEgkQIIcSQSJAIIYQYEgkSIYQQQyJBIoQQYkgkSIQQQgyJBIkQQoghkSARQggxJBIkQgghhuT/B0YnqHu4DwgtAAAAAElFTkSuQmCC" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 123 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "# same as Euler diagram\n", + "from matplotlib_venn import venn3_unweighted\n" + ], + "id": "96f038a31ac729b0" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:17.583624Z", + "start_time": "2024-08-24T00:15:17.567225Z" } }, "cell_type": "code", "source": [ "import pandas as pd\n", - "df = pd.DataFrame(rows)" + "df = pd.DataFrame(lexical_conj_pairs)" ], "id": "ce0321a0c59abbe8", "outputs": [], - "execution_count": 138 + "execution_count": 42 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:40.747808Z", - "start_time": 
"2024-08-20T18:50:40.739794Z" + "end_time": "2024-08-24T00:15:17.802521Z", + "start_time": "2024-08-24T00:15:17.794851Z" } }, "cell_type": "code", @@ -1319,11 +1568,11 @@ "3 NO_SUFFIX acid NO_REL CHEBI:10045 \n", "4 acid ate obo:chebi#is_conjugate_acid_of CHEBI:10072 \n", "... ... ... ... ... \n", - "17165 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:130073 \n", - "17166 (1-) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:9162 \n", - "17167 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:79317 \n", - "17168 ate acid obo:chebi#is_conjugate_base_of CHEBI:994 \n", - "17169 acid ate obo:chebi#is_conjugate_acid_of CHEBI:995 \n", + "17161 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:130073 \n", + "17162 (1-) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:9162 \n", + "17163 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:79317 \n", + "17164 ate acid obo:chebi#is_conjugate_base_of CHEBI:994 \n", + "17165 acid ate obo:chebi#is_conjugate_acid_of CHEBI:995 \n", "\n", " chem2 charge_diff charge_diff_sign \\\n", "0 CHEBI:62070 NaN NaN \n", @@ -1332,11 +1581,11 @@ "3 CHEBI:10046 NaN NaN \n", "4 CHEBI:71201 NaN NaN \n", "... ... ... ... \n", - "17165 CHEBI:91301 NaN NaN \n", - "17166 CHEBI:79317 NaN NaN \n", - "17167 CHEBI:9162 NaN NaN \n", - "17168 CHEBI:995 NaN NaN \n", - "17169 CHEBI:994 NaN NaN \n", + "17161 CHEBI:91301 NaN NaN \n", + "17162 CHEBI:79317 NaN NaN \n", + "17163 CHEBI:9162 NaN NaN \n", + "17164 CHEBI:995 NaN NaN \n", + "17165 CHEBI:994 NaN NaN \n", "\n", " stem \n", "0 nalidix \n", @@ -1345,13 +1594,13 @@ "3 Wyerone \n", "4 xanthuren \n", "... ... 
\n", - "17165 5,20-diHEPE \n", - "17166 sinigrin \n", - "17167 sinigrin \n", - "17168 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n", - "17169 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n", + "17161 5,20-diHEPE \n", + "17162 sinigrin \n", + "17163 sinigrin \n", + "17164 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n", + "17165 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n", "\n", - "[17170 rows x 8 columns]" + "[17166 rows x 8 columns]" ], "text/html": [ "
\n", @@ -1450,7 +1699,7 @@ " ...\n", " \n", " \n", - " 17165\n", + " 17161\n", " NO_SUFFIX\n", " (1-)\n", " obo:chebi#is_conjugate_acid_of\n", @@ -1461,7 +1710,7 @@ " 5,20-diHEPE\n", " \n", " \n", - " 17166\n", + " 17162\n", " (1-)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -1472,7 +1721,7 @@ " sinigrin\n", " \n", " \n", - " 17167\n", + " 17163\n", " NO_SUFFIX\n", " (1-)\n", " obo:chebi#is_conjugate_acid_of\n", @@ -1483,7 +1732,7 @@ " sinigrin\n", " \n", " \n", - " 17168\n", + " 17164\n", " ate\n", " acid\n", " obo:chebi#is_conjugate_base_of\n", @@ -1494,7 +1743,7 @@ " cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio\n", " \n", " \n", - " 17169\n", + " 17165\n", " acid\n", " ate\n", " obo:chebi#is_conjugate_acid_of\n", @@ -1506,22 +1755,22 @@ " \n", " \n", "\n", - "

17170 rows × 8 columns

\n", + "

17166 rows × 8 columns

\n", "
" ] }, - "execution_count": 139, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 139 + "execution_count": 43 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:41.120173Z", - "start_time": "2024-08-20T18:50:41.111065Z" + "end_time": "2024-08-24T00:15:18.050305Z", + "start_time": "2024-08-24T00:15:18.042190Z" } }, "cell_type": "code", @@ -1532,13 +1781,13 @@ "data": { "text/plain": [ " suffix1 suffix2 predicate chem1 \\\n", - "15589 (1+) (5+) obo:chebi#is_conjugate_acid_of CHEBI:83553 \n", + "15585 (1+) (5+) obo:chebi#is_conjugate_acid_of CHEBI:83553 \n", "\n", " chem2 charge_diff charge_diff_sign \\\n", - "15589 CHEBI:82771 -4.0 -1.0 \n", + "15585 CHEBI:82771 -4.0 -1.0 \n", "\n", " stem \n", - "15589 N(4)-bis(aminopropyl)spermidine " + "15585 N(4)-bis(aminopropyl)spermidine " ], "text/html": [ "
\n", @@ -1571,7 +1820,7 @@ " \n", " \n", " \n", - " 15589\n", + " 15585\n", " (1+)\n", " (5+)\n", " obo:chebi#is_conjugate_acid_of\n", @@ -1586,18 +1835,18 @@ "
" ] }, - "execution_count": 140, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 140 + "execution_count": 44 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:41.198931Z", - "start_time": "2024-08-20T18:50:41.189397Z" + "end_time": "2024-08-24T00:15:18.091557Z", + "start_time": "2024-08-24T00:15:18.083784Z" } }, "cell_type": "code", @@ -1608,13 +1857,13 @@ "data": { "text/plain": [ " suffix1 suffix2 predicate chem1 \\\n", - "15588 (5+) (1+) obo:chebi#is_conjugate_base_of CHEBI:82771 \n", + "15584 (5+) (1+) obo:chebi#is_conjugate_base_of CHEBI:82771 \n", "\n", " chem2 charge_diff charge_diff_sign \\\n", - "15588 CHEBI:83553 4.0 1.0 \n", + "15584 CHEBI:83553 4.0 1.0 \n", "\n", " stem \n", - "15588 N(4)-bis(aminopropyl)spermidine " + "15584 N(4)-bis(aminopropyl)spermidine " ], "text/html": [ "
\n", @@ -1647,7 +1896,7 @@ " \n", " \n", " \n", - " 15588\n", + " 15584\n", " (5+)\n", " (1+)\n", " obo:chebi#is_conjugate_base_of\n", @@ -1662,18 +1911,18 @@ "
" ] }, - "execution_count": 141, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 141 + "execution_count": 45 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:41.313133Z", - "start_time": "2024-08-20T18:50:41.306184Z" + "end_time": "2024-08-24T00:15:18.269064Z", + "start_time": "2024-08-24T00:15:18.262903Z" } }, "cell_type": "code", @@ -1684,38 +1933,38 @@ "data": { "text/plain": [ "predicate\n", - "NO_REL 3260\n", + "NO_REL 3256\n", "obo:chebi#is_conjugate_acid_of 6378\n", "obo:chebi#is_conjugate_base_of 6378\n", "obo:chebi#is_tautomer_of 1154\n", "dtype: int64" ] }, - "execution_count": 142, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 142 + "execution_count": 46 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:41.476203Z", - "start_time": "2024-08-20T18:50:41.422225Z" + "end_time": "2024-08-24T00:15:18.419745Z", + "start_time": "2024-08-24T00:15:18.368478Z" } }, "cell_type": "code", "source": "df.to_csv(\"tmp/conjrels.csv\", index=False)\n", "id": "5ee58eef65ddbc9d", "outputs": [], - "execution_count": 143 + "execution_count": 47 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:41.558828Z", - "start_time": "2024-08-20T18:50:41.548557Z" + "end_time": "2024-08-24T00:15:18.505585Z", + "start_time": "2024-08-24T00:15:18.496412Z" } }, "cell_type": "code", @@ -1744,18 +1993,18 @@ "Length: 424, dtype: int64" ] }, - "execution_count": 144, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 144 + "execution_count": 48 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:42.149304Z", - "start_time": "2024-08-20T18:50:41.621761Z" + "end_time": "2024-08-24T00:15:19.199737Z", + "start_time": "2024-08-24T00:15:18.593731Z" } }, "cell_type": "code", @@ -1798,13 +2047,13 @@ "output_type": "display_data" } ], - "execution_count": 145 + "execution_count": 49 }, 
{ "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:42.195926Z", - "start_time": "2024-08-20T18:50:42.180216Z" + "end_time": "2024-08-24T00:15:19.269514Z", + "start_time": "2024-08-24T00:15:19.254225Z" } }, "cell_type": "code", @@ -1979,18 +2228,18 @@ "" ] }, - "execution_count": 146, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 146 + "execution_count": 50 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:42.282446Z", - "start_time": "2024-08-20T18:50:42.269431Z" + "end_time": "2024-08-24T00:15:19.346191Z", + "start_time": "2024-08-24T00:15:19.334380Z" } }, "cell_type": "code", @@ -2002,42 +2251,42 @@ "text/plain": [ " suffix1 suffix2 predicate chem1 \\\n", "3026 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:141055 \n", - "10940 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:58644 \n", - "12234 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:64003 \n", - "12334 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:64364 \n", - "13410 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:72567 \n", - "14212 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:75297 \n", - "14386 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:76278 \n", - "14676 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:76819 \n", - "14702 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:76922 \n", - "16430 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:86083 \n", - "16456 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:86380 \n", + "10936 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:58644 \n", + "12230 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:64003 \n", + "12330 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:64364 \n", + "13406 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:72567 \n", + "14208 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:75297 \n", + "14382 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:76278 \n", + "14672 (1+) NO_SUFFIX 
obo:chebi#is_conjugate_base_of CHEBI:76819 \n", + "14698 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:76922 \n", + "16426 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:86083 \n", + "16452 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:86380 \n", "\n", " chem2 charge_diff charge_diff_sign \\\n", "3026 CHEBI:141057 NaN NaN \n", - "10940 CHEBI:32818 NaN NaN \n", - "12234 CHEBI:64004 NaN NaN \n", - "12334 CHEBI:10650 NaN NaN \n", - "13410 CHEBI:6438 NaN NaN \n", - "14212 CHEBI:31057 NaN NaN \n", - "14386 CHEBI:16299 NaN NaN \n", - "14676 CHEBI:15906 NaN NaN \n", - "14702 CHEBI:77055 NaN NaN \n", - "16430 CHEBI:86085 NaN NaN \n", - "16456 CHEBI:599440 NaN NaN \n", + "10936 CHEBI:32818 NaN NaN \n", + "12230 CHEBI:64004 NaN NaN \n", + "12330 CHEBI:10650 NaN NaN \n", + "13406 CHEBI:6438 NaN NaN \n", + "14208 CHEBI:31057 NaN NaN \n", + "14382 CHEBI:16299 NaN NaN \n", + "14672 CHEBI:15906 NaN NaN \n", + "14698 CHEBI:77055 NaN NaN \n", + "16426 CHEBI:86085 NaN NaN \n", + "16452 CHEBI:599440 NaN NaN \n", "\n", " stem \n", "3026 validoxylamine B \n", - "10940 p-coumaroylagmatine \n", - "12234 N-allyl-6-chloro-1-(3-methylphenyl)-2,3,4,5-te... \n", - "12334 sumatriptan \n", - "13410 levobunolol \n", - "14212 13-deoxydaunorubicin \n", - "14386 dehydrocoformycin \n", - "14676 demethylmacrocin \n", - "14702 argemonine \n", - "16430 (Z)-p-coumaroylagmatine \n", - "16456 amorolfine " + "10936 p-coumaroylagmatine \n", + "12230 N-allyl-6-chloro-1-(3-methylphenyl)-2,3,4,5-te... \n", + "12330 sumatriptan \n", + "13406 levobunolol \n", + "14208 13-deoxydaunorubicin \n", + "14382 dehydrocoformycin \n", + "14672 demethylmacrocin \n", + "14698 argemonine \n", + "16426 (Z)-p-coumaroylagmatine \n", + "16452 amorolfine " ], "text/html": [ "
\n", @@ -2081,7 +2330,7 @@ " validoxylamine B\n", " \n", " \n", - " 10940\n", + " 10936\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2092,7 +2341,7 @@ " p-coumaroylagmatine\n", " \n", " \n", - " 12234\n", + " 12230\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2103,7 +2352,7 @@ " N-allyl-6-chloro-1-(3-methylphenyl)-2,3,4,5-te...\n", " \n", " \n", - " 12334\n", + " 12330\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2114,7 +2363,7 @@ " sumatriptan\n", " \n", " \n", - " 13410\n", + " 13406\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2125,7 +2374,7 @@ " levobunolol\n", " \n", " \n", - " 14212\n", + " 14208\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2136,7 +2385,7 @@ " 13-deoxydaunorubicin\n", " \n", " \n", - " 14386\n", + " 14382\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2147,7 +2396,7 @@ " dehydrocoformycin\n", " \n", " \n", - " 14676\n", + " 14672\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2158,7 +2407,7 @@ " demethylmacrocin\n", " \n", " \n", - " 14702\n", + " 14698\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2169,7 +2418,7 @@ " argemonine\n", " \n", " \n", - " 16430\n", + " 16426\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2180,7 +2429,7 @@ " (Z)-p-coumaroylagmatine\n", " \n", " \n", - " 16456\n", + " 16452\n", " (1+)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2195,18 +2444,18 @@ "
" ] }, - "execution_count": 147, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 147 + "execution_count": 51 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:50:42.368542Z", - "start_time": "2024-08-20T18:50:42.360114Z" + "end_time": "2024-08-24T00:15:19.593644Z", + "start_time": "2024-08-24T00:15:19.586411Z" } }, "cell_type": "code", @@ -2223,11 +2472,11 @@ "3 NO_SUFFIX acid NO_REL CHEBI:10045 \n", "4 acid ate obo:chebi#is_conjugate_acid_of CHEBI:10072 \n", "... ... ... ... ... \n", - "17165 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:130073 \n", - "17166 (1-) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:9162 \n", - "17167 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:79317 \n", - "17168 ate acid obo:chebi#is_conjugate_base_of CHEBI:994 \n", - "17169 acid ate obo:chebi#is_conjugate_acid_of CHEBI:995 \n", + "17161 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:130073 \n", + "17162 (1-) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:9162 \n", + "17163 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:79317 \n", + "17164 ate acid obo:chebi#is_conjugate_base_of CHEBI:994 \n", + "17165 acid ate obo:chebi#is_conjugate_acid_of CHEBI:995 \n", "\n", " chem2 charge_diff charge_diff_sign \\\n", "0 CHEBI:62070 NaN NaN \n", @@ -2236,11 +2485,11 @@ "3 CHEBI:10046 NaN NaN \n", "4 CHEBI:71201 NaN NaN \n", "... ... ... ... \n", - "17165 CHEBI:91301 NaN NaN \n", - "17166 CHEBI:79317 NaN NaN \n", - "17167 CHEBI:9162 NaN NaN \n", - "17168 CHEBI:995 NaN NaN \n", - "17169 CHEBI:994 NaN NaN \n", + "17161 CHEBI:91301 NaN NaN \n", + "17162 CHEBI:79317 NaN NaN \n", + "17163 CHEBI:9162 NaN NaN \n", + "17164 CHEBI:995 NaN NaN \n", + "17165 CHEBI:994 NaN NaN \n", "\n", " stem \n", "0 nalidix \n", @@ -2249,13 +2498,13 @@ "3 Wyerone \n", "4 xanthuren \n", "... ... 
\n", - "17165 5,20-diHEPE \n", - "17166 sinigrin \n", - "17167 sinigrin \n", - "17168 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n", - "17169 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n", + "17161 5,20-diHEPE \n", + "17162 sinigrin \n", + "17163 sinigrin \n", + "17164 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n", + "17165 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n", "\n", - "[17170 rows x 8 columns]" + "[17166 rows x 8 columns]" ], "text/html": [ "
\n", @@ -2354,7 +2603,7 @@ " ...\n", " \n", " \n", - " 17165\n", + " 17161\n", " NO_SUFFIX\n", " (1-)\n", " obo:chebi#is_conjugate_acid_of\n", @@ -2365,7 +2614,7 @@ " 5,20-diHEPE\n", " \n", " \n", - " 17166\n", + " 17162\n", " (1-)\n", " NO_SUFFIX\n", " obo:chebi#is_conjugate_base_of\n", @@ -2376,7 +2625,7 @@ " sinigrin\n", " \n", " \n", - " 17167\n", + " 17163\n", " NO_SUFFIX\n", " (1-)\n", " obo:chebi#is_conjugate_acid_of\n", @@ -2387,7 +2636,7 @@ " sinigrin\n", " \n", " \n", - " 17168\n", + " 17164\n", " ate\n", " acid\n", " obo:chebi#is_conjugate_base_of\n", @@ -2398,7 +2647,7 @@ " cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio\n", " \n", " \n", - " 17169\n", + " 17165\n", " acid\n", " ate\n", " obo:chebi#is_conjugate_acid_of\n", @@ -2410,16 +2659,16 @@ " \n", " \n", "\n", - "

17170 rows × 8 columns

\n", + "

17166 rows × 8 columns

\n", "
" ] }, - "execution_count": 148, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 148 + "execution_count": 52 }, { "metadata": {}, @@ -2431,22 +2680,22 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:50:49.574648Z", - "start_time": "2024-08-20T18:50:42.471703Z" + "end_time": "2024-08-24T00:15:25.218850Z", + "start_time": "2024-08-24T00:15:19.687885Z" } }, "cell_type": "code", "source": "is_as = list(chebi.relationships(predicates=[IS_A]))", "id": "5a77f734fba5ae24", "outputs": [], - "execution_count": 149 + "execution_count": 53 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:50:49.714569Z", - "start_time": "2024-08-20T18:50:49.607491Z" + "end_time": "2024-08-24T00:15:25.362363Z", + "start_time": "2024-08-24T00:15:25.255263Z" } }, "cell_type": "code", @@ -2457,7 +2706,7 @@ ], "id": "7a7a9eb7c9110c39", "outputs": [], - "execution_count": 150 + "execution_count": 54 }, { "metadata": {}, @@ -2473,8 +2722,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:06.708351Z", - "start_time": "2024-08-20T18:50:49.719652Z" + "end_time": "2024-08-24T00:15:38.864654Z", + "start_time": "2024-08-24T00:15:25.397779Z" } }, "cell_type": "code", @@ -2486,14 +2735,14 @@ ], "id": "6ac6247a4be899da", "outputs": [], - "execution_count": 151 + "execution_count": 55 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:07.045609Z", - "start_time": "2024-08-20T18:51:07.042271Z" + "end_time": "2024-08-24T00:15:38.874220Z", + "start_time": "2024-08-24T00:15:38.870996Z" } }, "cell_type": "code", @@ -2506,33 +2755,33 @@ "716075" ] }, - "execution_count": 152, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 152 + "execution_count": 56 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:07.758052Z", - "start_time": 
"2024-08-20T18:51:07.386204Z" + "end_time": "2024-08-24T00:15:39.558412Z", + "start_time": "2024-08-24T00:15:39.176170Z" } }, "cell_type": "code", "source": "up_axiom_anns = [row for row in axiom_anns if row.annotation_predicate == \"oio:hasDbXref\" and row.annotation_value == \"UniProt\"]", "id": "6132886244ebdbe", "outputs": [], - "execution_count": 153 + "execution_count": 57 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:08.470347Z", - "start_time": "2024-08-20T18:51:08.095353Z" + "end_time": "2024-08-24T00:15:40.260839Z", + "start_time": "2024-08-24T00:15:39.867494Z" } }, "cell_type": "code", @@ -2548,19 +2797,19 @@ "16393" ] }, - "execution_count": 154, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 154 + "execution_count": 58 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:08.808603Z", - "start_time": "2024-08-20T18:51:08.805412Z" + "end_time": "2024-08-24T00:15:40.583435Z", + "start_time": "2024-08-24T00:15:40.580153Z" } }, "cell_type": "code", @@ -2573,12 +2822,12 @@ "16393" ] }, - "execution_count": 155, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 155 + "execution_count": 59 }, { "metadata": {}, @@ -2603,22 +2852,22 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:09.141745Z", - "start_time": "2024-08-20T18:51:09.139985Z" + "end_time": "2024-08-24T00:15:40.905262Z", + "start_time": "2024-08-24T00:15:40.903477Z" } }, "cell_type": "code", "source": "\n", "id": "9ab37d9db461f843", "outputs": [], - "execution_count": 155 + "execution_count": 59 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:09.472235Z", - "start_time": "2024-08-20T18:51:09.470545Z" + "end_time": "2024-08-24T00:15:41.219539Z", + "start_time": "2024-08-24T00:15:41.217559Z" } }, "cell_type": "code", @@ -2631,8 +2880,8 @@ "metadata": { 
"collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:09.906415Z", - "start_time": "2024-08-20T18:51:09.812191Z" + "end_time": "2024-08-24T00:15:41.956470Z", + "start_time": "2024-08-24T00:15:41.855434Z" } }, "cell_type": "code", @@ -2645,14 +2894,14 @@ ], "id": "e5792b2c75c4a675", "outputs": [], - "execution_count": 156 + "execution_count": 60 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:10.244098Z", - "start_time": "2024-08-20T18:51:10.237797Z" + "end_time": "2024-08-24T00:15:41.969773Z", + "start_time": "2024-08-24T00:15:41.963911Z" } }, "cell_type": "code", @@ -2774,33 +3023,33 @@ "" ] }, - "execution_count": 157, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 157 + "execution_count": 61 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:10.660646Z", - "start_time": "2024-08-20T18:51:10.628401Z" + "end_time": "2024-08-24T00:15:42.342755Z", + "start_time": "2024-08-24T00:15:42.311545Z" } }, "cell_type": "code", "source": "ph_mapping = dict(zip(ph_mapping_df['CHEBI'], ph_mapping_df['CHEBI_PH7_3']))", "id": "afd7468af0f9a848", "outputs": [], - "execution_count": 158 + "execution_count": 62 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:10.694812Z", - "start_time": "2024-08-20T18:51:10.691763Z" + "end_time": "2024-08-24T00:15:42.386948Z", + "start_time": "2024-08-24T00:15:42.383741Z" } }, "cell_type": "code", @@ -2813,18 +3062,18 @@ "119807" ] }, - "execution_count": 159, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 159 + "execution_count": 63 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:10.711784Z", - "start_time": "2024-08-20T18:51:10.709198Z" + "end_time": "2024-08-24T00:15:42.438818Z", + "start_time": "2024-08-24T00:15:42.436633Z" } }, "cell_type": "code", @@ -2834,26 +3083,26 @@ ], "id": 
"f960fab3cbd8c968", "outputs": [], - "execution_count": 160 + "execution_count": 64 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:11.058399Z", - "start_time": "2024-08-20T18:51:11.055872Z" + "end_time": "2024-08-24T00:15:42.756643Z", + "start_time": "2024-08-24T00:15:42.754150Z" } }, "cell_type": "code", "source": "assert ph_mapping[CITRIC_ACID] != CITRIC_ACID", "id": "5d984c637fc464a6", "outputs": [], - "execution_count": 161 + "execution_count": 65 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:11.722648Z", - "start_time": "2024-08-20T18:51:11.720017Z" + "end_time": "2024-08-24T00:15:43.083666Z", + "start_time": "2024-08-24T00:15:43.078525Z" } }, "cell_type": "code", @@ -2864,13 +3113,26 @@ ], "id": "6215936dbaf1e9f7", "outputs": [], - "execution_count": 162 + "execution_count": 66 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:43.404535Z", + "start_time": "2024-08-24T00:15:43.399428Z" + } + }, + "cell_type": "code", + "source": "assert ph_mapping[WATER] == WATER", + "id": "3f295b2fce03ea7f", + "outputs": [], + "execution_count": 67 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:11.733231Z", - "start_time": "2024-08-20T18:51:11.730857Z" + "end_time": "2024-08-24T00:15:43.721454Z", + "start_time": "2024-08-24T00:15:43.717362Z" } }, "cell_type": "code", @@ -2880,14 +3142,14 @@ ], "id": "41959f4eb1f532e7", "outputs": [], - "execution_count": 163 + "execution_count": 68 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:12.111452Z", - "start_time": "2024-08-20T18:51:12.068035Z" + "end_time": "2024-08-24T00:15:44.150952Z", + "start_time": "2024-08-24T00:15:44.072155Z" } }, "cell_type": "code", @@ -2908,19 +3170,19 @@ "111077" ] }, - "execution_count": 164, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 164 + "execution_count": 69 }, { "metadata": { "collapsed": false, "ExecuteTime": { - 
"end_time": "2024-08-20T18:51:12.447556Z", - "start_time": "2024-08-20T18:51:12.445087Z" + "end_time": "2024-08-24T00:15:45.017037Z", + "start_time": "2024-08-24T00:15:45.014725Z" } }, "cell_type": "code", @@ -2931,7 +3193,41 @@ ], "id": "9120365808e9bc04", "outputs": [], - "execution_count": 165 + "execution_count": 70 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:45.050409Z", + "start_time": "2024-08-24T00:15:45.024002Z" + } + }, + "cell_type": "code", + "source": [ + "rhea_sccs = []\n", + "rhea_singletons = []\n", + "for _, clique in rev_ph_mapping.items():\n", + " if len(clique) > 1:\n", + " rhea_sccs.append(set(clique))\n", + " else:\n", + " rhea_singletons.append(clique[0])\n", + "\n", + "len(rhea_sccs), len(rhea_singletons)" + ], + "id": "7c88440d52718d0a", + "outputs": [ + { + "data": { + "text/plain": [ + "(8140, 102937)" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 71 }, { "metadata": {}, @@ -2946,8 +3242,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:12.789084Z", - "start_time": "2024-08-20T18:51:12.783392Z" + "end_time": "2024-08-24T00:15:45.417866Z", + "start_time": "2024-08-24T00:15:45.411105Z" } }, "cell_type": "code", @@ -2999,13 +3295,13 @@ ], "id": "e878bc530b346f26", "outputs": [], - "execution_count": 166 + "execution_count": 72 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:13.133279Z", - "start_time": "2024-08-20T18:51:13.129885Z" + "end_time": "2024-08-24T00:15:45.779732Z", + "start_time": "2024-08-24T00:15:45.775801Z" } }, "cell_type": "code", @@ -3018,18 +3314,18 @@ "'CHEBI:78608'" ] }, - "execution_count": 167, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 167 + "execution_count": 73 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:13.470030Z", - "start_time": "2024-08-20T18:51:13.468041Z" + "end_time": 
"2024-08-24T00:15:46.522219Z", + "start_time": "2024-08-24T00:15:46.520120Z" } }, "cell_type": "code", @@ -3041,8 +3337,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:13.821214Z", - "start_time": "2024-08-20T18:51:13.807687Z" + "end_time": "2024-08-24T00:15:46.540187Z", + "start_time": "2024-08-24T00:15:46.528398Z" } }, "cell_type": "code", @@ -3070,13 +3366,13 @@ ], "id": "5ac60d2c2ca3e36d", "outputs": [], - "execution_count": 168 + "execution_count": 74 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:14.161399Z", - "start_time": "2024-08-20T18:51:14.159592Z" + "end_time": "2024-08-24T00:15:46.953391Z", + "start_time": "2024-08-24T00:15:46.951638Z" } }, "cell_type": "code", @@ -3088,46 +3384,47 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:14.509576Z", - "start_time": "2024-08-20T18:51:14.507457Z" + "end_time": "2024-08-24T00:15:47.314950Z", + "start_time": "2024-08-24T00:15:47.312693Z" } }, "cell_type": "code", "source": "# assert CITRIC_ACID in members_to_canonical\n", "id": "b922566d4a6977f1", "outputs": [], - "execution_count": 169 + "execution_count": 75 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:14.855823Z", - "start_time": "2024-08-20T18:51:14.840532Z" + "end_time": "2024-08-24T00:15:47.676199Z", + "start_time": "2024-08-24T00:15:47.673888Z" } }, "cell_type": "code", - "source": [ - "chem_to_stem: Dict[str, str] = {}\n", - "for row in rows:\n", - " def _assign(chem: str, stem: str):\n", - " if chem in chem_to_stem:\n", - " if chem_to_stem[chem] != stem:\n", - " raise ValueError(f\"Conflicting stems for {chem}: {chem_to_stem[chem]} vs {stem}\")\n", - " else:\n", - " chem_to_stem[chem] = stem\n", - " stem = row[\"stem\"]\n", - " _assign(row[\"chem1\"], stem)\n", - " _assign(row[\"chem2\"], stem)" - ], + "source": "##", "id": "986db0b5676b022", "outputs": [], - "execution_count": 170 + "execution_count": 76 }, { "metadata": { "ExecuteTime": { - "end_time": 
"2024-08-20T18:51:15.204220Z", - "start_time": "2024-08-20T18:51:15.195715Z" + "end_time": "2024-08-24T00:15:48.036517Z", + "start_time": "2024-08-24T00:15:48.034616Z" + } + }, + "cell_type": "code", + "source": "", + "id": "144827585302bdb6", + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:48.400035Z", + "start_time": "2024-08-24T00:15:48.390743Z" } }, "cell_type": "code", @@ -3145,21 +3442,21 @@ { "data": { "text/plain": [ - "31416" + "31415" ] }, - "execution_count": 171, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 171 + "execution_count": 77 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:15.550153Z", - "start_time": "2024-08-20T18:51:15.545126Z" + "end_time": "2024-08-24T00:15:48.752801Z", + "start_time": "2024-08-24T00:15:48.747771Z" } }, "cell_type": "code", @@ -3176,21 +3473,21 @@ { "data": { "text/plain": [ - "1929" + "1927" ] }, - "execution_count": 172, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 172 + "execution_count": 78 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:21.437440Z", - "start_time": "2024-08-20T18:51:15.893723Z" + "end_time": "2024-08-24T00:15:54.975376Z", + "start_time": "2024-08-24T00:15:49.110226Z" } }, "cell_type": "code", @@ -3213,21 +3510,40 @@ { "data": { "text/plain": [ - "9325" + "9321" ] }, - "execution_count": 173, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 173 + "execution_count": 79 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "This number is the total number of cliques we will use", + "id": "ddcaccb58acaefc1" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:55.327907Z", + "start_time": "2024-08-24T00:15:55.326335Z" + } + }, + "cell_type": "code", + "source": "", + "id": "a426f8f57483a9e9", + "outputs": 
[], + "execution_count": null }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:21.793968Z", - "start_time": "2024-08-20T18:51:21.782209Z" + "end_time": "2024-08-24T00:15:55.688891Z", + "start_time": "2024-08-24T00:15:55.677465Z" } }, "cell_type": "code", @@ -3237,13 +3553,13 @@ ], "id": "7e12327ea167d1f7", "outputs": [], - "execution_count": 174 + "execution_count": 80 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:22.145787Z", - "start_time": "2024-08-20T18:51:22.142195Z" + "end_time": "2024-08-24T00:15:56.037218Z", + "start_time": "2024-08-24T00:15:56.034231Z" } }, "cell_type": "code", @@ -3256,25 +3572,49 @@ "'amino acid'" ] }, - "execution_count": 175, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 175 + "execution_count": 81 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:22.483277Z", - "start_time": "2024-08-20T18:51:22.480509Z" + "end_time": "2024-08-24T00:15:56.384195Z", + "start_time": "2024-08-24T00:15:56.382058Z" } }, "cell_type": "code", "source": "assert members_to_canonical[AMINO_ACID_ANION] == AMINO_ACID", "id": "8ae582d277664a82", "outputs": [], - "execution_count": 176 + "execution_count": 82 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:15:56.734865Z", + "start_time": "2024-08-24T00:15:56.732138Z" + } + }, + "cell_type": "code", + "source": "len(members_to_canonical)", + "id": "69daa1cf3e2af9a4", + "outputs": [ + { + "data": { + "text/plain": [ + "19624" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 83 }, { "metadata": {}, @@ -3291,13 +3631,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:27.039620Z", - "start_time": "2024-08-20T18:51:22.815342Z" + "end_time": "2024-08-24T00:16:00.126305Z", + "start_time": "2024-08-24T00:15:57.086873Z" } }, "id": "6f9c14d520e38499", "outputs": [], - "execution_count": 
177 + "execution_count": 84 }, { "cell_type": "code", @@ -3307,8 +3647,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:27.387061Z", - "start_time": "2024-08-20T18:51:27.384197Z" + "end_time": "2024-08-24T00:16:00.495569Z", + "start_time": "2024-08-24T00:16:00.492363Z" } }, "id": "707d750e6f27b0d", @@ -3319,19 +3659,19 @@ "6762" ] }, - "execution_count": 178, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 178 + "execution_count": 85 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:27.718351Z", - "start_time": "2024-08-20T18:51:27.715451Z" + "end_time": "2024-08-24T00:16:00.878734Z", + "start_time": "2024-08-24T00:16:00.868151Z" } }, "cell_type": "code", @@ -3342,14 +3682,14 @@ ], "id": "d4012fde318a42cb", "outputs": [], - "execution_count": 179 + "execution_count": 86 }, { "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:28.052294Z", - "start_time": "2024-08-20T18:51:28.049520Z" + "end_time": "2024-08-24T00:16:01.236665Z", + "start_time": "2024-08-24T00:16:01.233771Z" } }, "cell_type": "code", @@ -3367,13 +3707,13 @@ ], "id": "eb60026fd7282828", "outputs": [], - "execution_count": 180 + "execution_count": 87 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:28.389188Z", - "start_time": "2024-08-20T18:51:28.385350Z" + "end_time": "2024-08-24T00:16:01.591510Z", + "start_time": "2024-08-24T00:16:01.588085Z" } }, "cell_type": "code", @@ -3386,18 +3726,18 @@ "'CHEBI:83410'" ] }, - "execution_count": 181, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 181 + "execution_count": 88 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:28.730553Z", - "start_time": "2024-08-20T18:51:28.726958Z" + "end_time": "2024-08-24T00:16:01.944113Z", + "start_time": "2024-08-24T00:16:01.940983Z" } }, "cell_type": "code", @@ -3407,7 +3747,7 @@ 
], "id": "458724079e756895", "outputs": [], - "execution_count": 182 + "execution_count": 89 }, { "metadata": {}, @@ -3419,8 +3759,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:29.071951Z", - "start_time": "2024-08-20T18:51:29.070078Z" + "end_time": "2024-08-24T00:16:02.296984Z", + "start_time": "2024-08-24T00:16:02.295171Z" } }, "cell_type": "code", @@ -3448,6 +3788,7 @@ " relationships: List[Tuple[str, str]] = []\n", " inchi: Optional[str] = None\n", " physiologically_stable_form: Optional[str] = None\n", + " comments: List[str] = []\n", " \n", " def as_obo(self) -> str:\n", " name = self.label.replace('{', r'\\{')\n", @@ -3460,7 +3801,8 @@ " lines += [f\"alt_id: {alt_id}\" for alt_id in self.alt_ids or []]\n", " lines += [f\"is_a: {is_a}\" for is_a in self.parents or []]\n", " lines += [f\"xref: {xref}\" for xref in self.xrefs or []]\n", - " lines += [f\"property_value: {p} {v}\" for p, v in self.relationships or []]\n", + " lines += [f\"relationship: {p} {v}\" for p, v in self.relationships or []]\n", + " lines += [f\"comment: {'; '.join(self.comments)}\"] if self.comments else []\n", " lines += [f\"property_value: chemrof:inchi_string \\\"{self.inchi}\\\" xsd:string\"] if self.inchi else []\n", " lines += [f\"property_value: chemrof:has_physiologically_stable_form {self.physiologically_stable_form}\"] if self.physiologically_stable_form else []\n", " lines += [\"\"]\n", @@ -3483,13 +3825,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:29.909837Z", - "start_time": "2024-08-20T18:51:29.894997Z" + "end_time": "2024-08-24T00:16:02.662044Z", + "start_time": "2024-08-24T00:16:02.649433Z" } }, "id": "e1d93ec81dd98d00", "outputs": [], - "execution_count": 183 + "execution_count": 90 }, { "cell_type": "code", @@ -3528,11 +3870,18 @@ " else:\n", " alt_ids = []\n", " equiv_set = [id] + alt_ids\n", + " comments = []\n", " for alt_id in equiv_set:\n", " for parent in is_a_map.get(alt_id, 
[]):\n", " rewired_parent = rewire(parent)\n", " if rewired_parent and rewired_parent not in term.parents:\n", " term.parents.append(rewired_parent)\n", + " if rewired_parent != parent or alt_id != id:\n", + " comments.append(f\"Parent {rewired_parent} was rewired from {alt_id} to {parent}\")\n", + " for (p, o) in preserved_rels_by_subject.get(alt_id, []):\n", + " rewired_o = rewire(o)\n", + " if rewired_o and (p, rewired_o) not in term.relationships:\n", + " term.relationships.append((p, rewired_o))\n", " # TODO: xrefs\n", " for xref in xrefs.get(alt_id, []):\n", " if xref.startswith(\"PMID:\"):\n", @@ -3541,6 +3890,7 @@ " if alt_id in inchis:\n", " if not term.inchi:\n", " term.inchi = inchis[alt_id]\n", + " term.comments = comments\n", " return term\n", "\n", "\n", @@ -3552,8 +3902,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:29.925855Z", - "start_time": "2024-08-20T18:51:29.916298Z" + "end_time": "2024-08-24T00:16:03.035098Z", + "start_time": "2024-08-24T00:16:03.023772Z" } }, "id": "b196288c3d4bc41c", @@ -3565,18 +3915,18 @@ "[Term]\n", "id: CHEBI:35235\n", "name: L-cysteine\n", - "alt_id: CHEBI:32443\n", - "alt_id: CHEBI:17561\n", "alt_id: CHEBI:32442\n", + "alt_id: CHEBI:17561\n", "alt_id: CHEBI:32445\n", + "alt_id: CHEBI:32443\n", "is_a: CHEBI:35237\n", "is_a: CHEBI:59869\n", "is_a: CHEBI:26650\n", "is_a: CHEBI:83813\n", "xref: Gmelin:49993\n", - "xref: Reaxys:5921923\n", - "xref: Gmelin:325856\n", - "xref: Beilstein:5921923\n", + "xref: Reaxys:4128886\n", + "xref: Gmelin:325857\n", + "xref: Beilstein:4128886\n", "xref: YMDB:YMDB00046\n", "xref: Wikipedia:Cysteine\n", "xref: Reaxys:1721408\n", @@ -3592,36 +3942,42 @@ "xref: DrugBank:DB00151\n", "xref: CAS:52-90-4\n", "xref: Beilstein:1721408\n", - "xref: Reaxys:4128886\n", - "xref: Gmelin:325857\n", - "xref: Beilstein:4128886\n", "xref: Gmelin:325860\n", + "xref: Reaxys:5921923\n", + "xref: Gmelin:325856\n", + "xref: Beilstein:5921923\n", + "relationship: 
RO:0018039 CHEBI:35236\n", + "relationship: RO:0000087 CHEBI:78675\n", + "relationship: RO:0000087 CHEBI:64577\n", + "relationship: RO:0000087 CHEBI:77703\n", + "relationship: RO:0000087 CHEBI:77746\n", + "comment: Parent CHEBI:59869 was rewired from CHEBI:32442 to CHEBI:59814; Parent CHEBI:26650 was rewired from CHEBI:17561 to CHEBI:26650; Parent CHEBI:83813 was rewired from CHEBI:17561 to CHEBI:83813\n", "property_value: chemrof:inchi_string \"InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)/t2-/m0/s1\" xsd:string\n", "property_value: chemrof:has_physiologically_stable_form CHEBI:35235\n", "\n" ] } ], - "execution_count": 184 + "execution_count": 91 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:30.284374Z", - "start_time": "2024-08-20T18:51:30.281353Z" + "end_time": "2024-08-24T00:16:03.397248Z", + "start_time": "2024-08-24T00:16:03.394309Z" } }, "cell_type": "code", "source": "assert rewire(is_a_map[CORD_E][0]) not in exclusion_list", "id": "2a3ed58115eabd33", "outputs": [], - "execution_count": 185 + "execution_count": 92 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:30.689229Z", - "start_time": "2024-08-20T18:51:30.684618Z" + "end_time": "2024-08-24T00:16:03.748084Z", + "start_time": "2024-08-24T00:16:03.744976Z" } }, "cell_type": "code", @@ -3639,13 +3995,14 @@ "id: CHEBI:213754\n", "name: Cordycepamide E\n", "is_a: CHEBI:83410\n", + "comment: Parent CHEBI:83410 was rewired from CHEBI:213754 to CHEBI:46874\n", "property_value: chemrof:inchi_string \"InChI=1S/C15H19NO4/c1-9(2)13-14(18)16(3)12(15(19)20-13)8-10-4-6-11(17)7-5-10/h4-7,9,12-13,17H,8H2,1-3H3/t12-,13+/m0/s1\" xsd:string\n", "property_value: chemrof:has_physiologically_stable_form CHEBI:213754\n", "\n" ] } ], - "execution_count": 186 + "execution_count": 93 }, { "cell_type": "code", @@ -3660,8 +4017,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:31.392675Z", - "start_time": "2024-08-20T18:51:31.387984Z" + 
"end_time": "2024-08-24T00:16:04.101826Z", + "start_time": "2024-08-24T00:16:04.096810Z" } }, "id": "bc403819eabb21f5", @@ -3674,15 +4031,18 @@ "id: CHEBI:35237\n", "name: cysteine\n", "alt_id: CHEBI:32458\n", + "alt_id: CHEBI:32456\n", "alt_id: CHEBI:32457\n", "alt_id: CHEBI:15356\n", - "alt_id: CHEBI:32456\n", "is_a: CHEBI:33709\n", "is_a: CHEBI:78608\n", - "is_a: CHEBI:62031\n", "is_a: CHEBI:26834\n", + "is_a: CHEBI:62031\n", "xref: Gmelin:49992\n", "xref: Gmelin:325859\n", + "xref: Reaxys:4128885\n", + "xref: Gmelin:363235\n", + "xref: Beilstein:4128885\n", "xref: Gmelin:49990\n", "xref: Wikipedia:Cysteine\n", "xref: Reaxys:1721406\n", @@ -3692,16 +4052,16 @@ "xref: Gmelin:2933\n", "xref: CAS:3374-22-9\n", "xref: Beilstein:1721406\n", - "xref: Reaxys:4128885\n", - "xref: Gmelin:363235\n", - "xref: Beilstein:4128885\n", + "relationship: BFO:0000051 CHEBI:50326\n", + "relationship: RO:0000087 CHEBI:78675\n", + "comment: Parent CHEBI:33709 was rewired from CHEBI:35237 to CHEBI:35238; Parent CHEBI:78608 was rewired from CHEBI:32458 to CHEBI:33719; Parent CHEBI:26834 was rewired from CHEBI:32456 to CHEBI:63470; Parent CHEBI:62031 was rewired from CHEBI:15356 to CHEBI:26167\n", "property_value: chemrof:inchi_string \"InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)\" xsd:string\n", "property_value: chemrof:has_physiologically_stable_form CHEBI:35237\n", "\n" ] } ], - "execution_count": 187 + "execution_count": 94 }, { "cell_type": "code", @@ -3712,8 +4072,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:31.419886Z", - "start_time": "2024-08-20T18:51:31.417135Z" + "end_time": "2024-08-24T00:16:04.461658Z", + "start_time": "2024-08-24T00:16:04.458964Z" } }, "id": "79d094c451d6a3d9", @@ -3724,18 +4084,18 @@ "text": [ "CHEBI:33709 amino acid\n", "CHEBI:78608 alpha-amino acid zwitterion\n", - "CHEBI:62031 polar amino acid zwitterion\n", - "CHEBI:26834 sulfur-containing amino acid\n" + "CHEBI:26834 sulfur-containing amino acid\n", 
+ "CHEBI:62031 polar amino acid zwitterion\n" ] } ], - "execution_count": 188 + "execution_count": 95 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:31.768983Z", - "start_time": "2024-08-20T18:51:31.765335Z" + "end_time": "2024-08-24T00:16:04.819737Z", + "start_time": "2024-08-24T00:16:04.815776Z" } }, "cell_type": "code", @@ -3749,25 +4109,106 @@ "[Term]\n", "id: CHEBI:78608\n", "name: an alpha-amino acid\n", - "alt_id: CHEBI:33704\n", "alt_id: CHEBI:33558\n", + "alt_id: CHEBI:33704\n", "alt_id: CHEBI:33719\n", "is_a: CHEBI:33709\n", "xref: MetaCyc:Alpha-Amino-Acids\n", "xref: KEGG:C05167\n", "xref: KEGG:C00045\n", + "comment: Parent CHEBI:33709 was rewired from CHEBI:78608 to CHEBI:35238\n", "property_value: chemrof:has_physiologically_stable_form CHEBI:78608\n", "\n" ] } ], - "execution_count": 189 + "execution_count": 96 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:16:05.176457Z", + "start_time": "2024-08-24T00:16:05.173476Z" + } + }, + "cell_type": "code", + "source": [ + "t = make_term(\"CHEBI:25944\")\n", + "t.comments\n", + "print(t.as_obo())\n" + ], + "id": "7cf212252b470eac", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Term]\n", + "id: CHEBI:25944\n", + "name: pesticide\n", + "is_a: CHEBI:33232\n", + "xref: Wikipedia:Pesticide\n", + "\n" + ] + } + ], + "execution_count": 97 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:32.103845Z", - "start_time": "2024-08-20T18:51:32.100101Z" + "end_time": "2024-08-24T00:16:05.536414Z", + "start_time": "2024-08-24T00:16:05.531782Z" + } + }, + "cell_type": "code", + "source": [ + "GLU_1M = \"CHEBI:14321\"\n", + "assert preserved_rels_by_subject[GLU_1M]\n", + "print(make_term(GLU_1M).as_obo())" + ], + "id": "4fde5945f3dbd863", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Term]\n", + "id: CHEBI:14321\n", + "name: glutamate\n", + "alt_id: CHEBI:18237\n", + "alt_id: CHEBI:29987\n", 
+ "is_a: CHEBI:78608\n", + "is_a: CHEBI:62031\n", + "xref: Gmelin:327908\n", + "xref: Wikipedia:Glutamic_acid\n", + "xref: Reaxys:1723799\n", + "xref: KNApSAcK:C00019577\n", + "xref: KNApSAcK:C00001358\n", + "xref: KEGG:D04341\n", + "xref: KEGG:C00302\n", + "xref: Gmelin:101971\n", + "xref: CAS:617-65-2\n", + "xref: Beilstein:1723799\n", + "xref: Reaxys:4134100\n", + "xref: Gmelin:327903\n", + "xref: Beilstein:4134100\n", + "relationship: RO:0000087 CHEBI:78675\n", + "relationship: BFO:0000051 CHEBI:50329\n", + "comment: Parent CHEBI:78608 was rewired from CHEBI:14321 to CHEBI:33558; Parent CHEBI:62031 was rewired from CHEBI:18237 to CHEBI:26167\n", + "property_value: chemrof:inchi_string \"InChI=1S/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H,7,8)(H,9,10)/p-1\" xsd:string\n", + "property_value: chemrof:has_physiologically_stable_form CHEBI:14321\n", + "\n" + ] + } + ], + "execution_count": 98 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:16:05.895884Z", + "start_time": "2024-08-24T00:16:05.892222Z" } }, "cell_type": "code", @@ -3780,12 +4221,12 @@ "'an alpha-amino acid'" ] }, - "execution_count": 190, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 190 + "execution_count": 99 }, { "cell_type": "code", @@ -3796,8 +4237,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:32.438839Z", - "start_time": "2024-08-20T18:51:32.436347Z" + "end_time": "2024-08-24T00:16:06.253758Z", + "start_time": "2024-08-24T00:16:06.251105Z" } }, "id": "f8fda8f3ae80f063", @@ -3810,37 +4251,15 @@ "idspace: chemrof https://w3id.org/chemrof/\n", "\n", "[Term]\n", - "id: CHEBI:35237\n", - "name: cysteine\n", - "alt_id: CHEBI:32458\n", - "alt_id: CHEBI:32457\n", - "alt_id: CHEBI:15356\n", - "alt_id: CHEBI:32456\n", - "is_a: CHEBI:33709\n", - "is_a: CHEBI:78608\n", - "is_a: CHEBI:62031\n", - "is_a: CHEBI:26834\n", - "xref: Gmelin:49992\n", - "xref: Gmelin:325859\n", - "xref: 
Gmelin:49990\n", - "xref: Wikipedia:Cysteine\n", - "xref: Reaxys:1721406\n", - "xref: KNApSAcK:C00007323\n", - "xref: KNApSAcK:C00001351\n", - "xref: KEGG:C00736\n", - "xref: Gmelin:2933\n", - "xref: CAS:3374-22-9\n", - "xref: Beilstein:1721406\n", - "xref: Reaxys:4128885\n", - "xref: Gmelin:363235\n", - "xref: Beilstein:4128885\n", - "property_value: chemrof:inchi_string \"InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)\" xsd:string\n", - "property_value: chemrof:has_physiologically_stable_form CHEBI:35237\n", + "id: CHEBI:25944\n", + "name: pesticide\n", + "is_a: CHEBI:33232\n", + "xref: Wikipedia:Pesticide\n", "\n" ] } ], - "execution_count": 191 + "execution_count": 100 }, { "cell_type": "code", @@ -3851,13 +4270,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:32.774643Z", - "start_time": "2024-08-20T18:51:32.771644Z" + "end_time": "2024-08-24T00:16:06.609616Z", + "start_time": "2024-08-24T00:16:06.606612Z" } }, "id": "b0e8b052fec4776f", "outputs": [], - "execution_count": 192 + "execution_count": 101 }, { "cell_type": "code", @@ -3868,19 +4287,19 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:33.164815Z", - "start_time": "2024-08-20T18:51:33.162304Z" + "end_time": "2024-08-24T00:16:06.966113Z", + "start_time": "2024-08-24T00:16:06.963704Z" } }, "id": "2aecf0e555b32a2a", "outputs": [], - "execution_count": 193 + "execution_count": 102 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:34.008812Z", - "start_time": "2024-08-20T18:51:33.892002Z" + "end_time": "2024-08-24T00:16:07.314829Z", + "start_time": "2024-08-24T00:16:07.313073Z" } }, "cell_type": "code", @@ -3915,13 +4334,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:34.608896Z", - "start_time": "2024-08-20T18:51:34.605479Z" + "end_time": "2024-08-24T00:16:07.671947Z", + "start_time": "2024-08-24T00:16:07.668198Z" } }, "id": "6f93d120489cc7ac", "outputs": [], - 
"execution_count": 194 + "execution_count": 103 }, { "cell_type": "code", @@ -3929,8 +4348,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:34.950432Z", - "start_time": "2024-08-20T18:51:34.948702Z" + "end_time": "2024-08-24T00:16:08.026176Z", + "start_time": "2024-08-24T00:16:08.024489Z" } }, "id": "fb1e972e983ab1ba", @@ -3957,19 +4376,19 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:51:35.309869Z", - "start_time": "2024-08-20T18:51:35.306968Z" + "end_time": "2024-08-24T00:16:08.384392Z", + "start_time": "2024-08-24T00:16:08.381337Z" } }, "id": "373457deffad71d3", "outputs": [], - "execution_count": 195 + "execution_count": 104 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:36.027660Z", - "start_time": "2024-08-20T18:51:36.024778Z" + "end_time": "2024-08-24T00:16:08.738282Z", + "start_time": "2024-08-24T00:16:08.735221Z" } }, "cell_type": "code", @@ -3988,13 +4407,13 @@ ], "id": "4c511c4c5ff0977f", "outputs": [], - "execution_count": 196 + "execution_count": 105 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:39.591460Z", - "start_time": "2024-08-20T18:51:36.367370Z" + "end_time": "2024-08-24T00:16:12.151482Z", + "start_time": "2024-08-24T00:16:09.088855Z" } }, "cell_type": "code", @@ -4005,13 +4424,13 @@ ], "id": "9a96b67935019540", "outputs": [], - "execution_count": 197 + "execution_count": 106 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:43.668233Z", - "start_time": "2024-08-20T18:51:39.951740Z" + "end_time": "2024-08-24T00:16:17.573504Z", + "start_time": "2024-08-24T00:16:12.508101Z" } }, "cell_type": "code", @@ -4026,13 +4445,13 @@ ] } ], - "execution_count": 198 + "execution_count": 107 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:51:44.008139Z", - "start_time": "2024-08-20T18:51:44.003222Z" + "end_time": "2024-08-24T00:16:17.929169Z", + "start_time": "2024-08-24T00:16:17.924748Z" } }, "cell_type": 
"code", @@ -4049,18 +4468,18 @@ "[Term]\n", "id: CHEBI:35235\n", "name: L-cysteine\n", - "alt_id: CHEBI:32443\n", - "alt_id: CHEBI:17561\n", "alt_id: CHEBI:32442\n", + "alt_id: CHEBI:17561\n", "alt_id: CHEBI:32445\n", + "alt_id: CHEBI:32443\n", "is_a: CHEBI:35237\n", "is_a: CHEBI:59869\n", "is_a: CHEBI:26650\n", "is_a: CHEBI:83813\n", "xref: Gmelin:49993\n", - "xref: Reaxys:5921923\n", - "xref: Gmelin:325856\n", - "xref: Beilstein:5921923\n", + "xref: Reaxys:4128886\n", + "xref: Gmelin:325857\n", + "xref: Beilstein:4128886\n", "xref: YMDB:YMDB00046\n", "xref: Wikipedia:Cysteine\n", "xref: Reaxys:1721408\n", @@ -4076,28 +4495,37 @@ "xref: DrugBank:DB00151\n", "xref: CAS:52-90-4\n", "xref: Beilstein:1721408\n", - "xref: Reaxys:4128886\n", - "xref: Gmelin:325857\n", - "xref: Beilstein:4128886\n", "xref: Gmelin:325860\n", + "xref: Reaxys:5921923\n", + "xref: Gmelin:325856\n", + "xref: Beilstein:5921923\n", + "relationship: RO:0018039 CHEBI:35236\n", + "relationship: RO:0000087 CHEBI:78675\n", + "relationship: RO:0000087 CHEBI:64577\n", + "relationship: RO:0000087 CHEBI:77703\n", + "relationship: RO:0000087 CHEBI:77746\n", + "comment: Parent CHEBI:59869 was rewired from CHEBI:32442 to CHEBI:59814; Parent CHEBI:26650 was rewired from CHEBI:17561 to CHEBI:26650; Parent CHEBI:83813 was rewired from CHEBI:17561 to CHEBI:83813\n", "property_value: chemrof:inchi_string \"InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)/t2-/m0/s1\" xsd:string\n", "property_value: chemrof:has_physiologically_stable_form CHEBI:35235\n", "\n" ] } ], - "execution_count": 199 + "execution_count": 108 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-20T18:55:07.947246Z", - "start_time": "2024-08-20T18:51:44.353485Z" + "end_time": "2024-08-24T00:17:38.010791Z", + "start_time": "2024-08-24T00:16:18.302490Z" } }, "cell_type": "code", "source": [ - "all_ids = list(chebi.descendants(ROOT))\n", + "from oaklib.datamodels.vocabulary import OWL_CLASS\n", + "\n", + "# all_ids = 
list(chebi.descendants(ROOT))\n", + "all_ids = list(chebi.entities(filter_obsoletes=True, owl_type=OWL_CLASS))\n", "terms = generate_write_all(all_ids, \"tmp/all.obo\")\n" ], "id": "7fec1427e96baea2", @@ -4106,68 +4534,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processed 10000 IDs, made 9192 terms\n", - "Processed 20000 IDs, made 18382 terms\n", - "Processed 30000 IDs, made 27595 terms\n", - "Processed 40000 IDs, made 36834 terms\n", - "Processed 50000 IDs, made 46076 terms\n", - "Processed 60000 IDs, made 55286 terms\n", - "Processed 70000 IDs, made 64495 terms\n", - "Processed 80000 IDs, made 73689 terms\n", - "Processed 90000 IDs, made 82886 terms\n", - "Processed 100000 IDs, made 92106 terms\n", - "Processed 110000 IDs, made 101323 terms\n", - "Processed 120000 IDs, made 110551 terms\n", - "Processed 130000 IDs, made 119755 terms\n", - "Processed 140000 IDs, made 128917 terms\n", - "Processed 150000 IDs, made 138144 terms\n", - "Processed 160000 IDs, made 147361 terms\n", - "Processed 170000 IDs, made 156616 terms\n", - "Processed 180000 IDs, made 165852 terms\n", - "Processed 190000 IDs, made 175057 terms\n", - "Processed 200000 IDs, made 184107 terms\n", - "Processed 210000 IDs, made 190965 terms\n", - "Processed 220000 IDs, made 198424 terms\n", - "Processed 230000 IDs, made 207624 terms\n", - "Processed 240000 IDs, made 216843 terms\n", - "Processed 250000 IDs, made 226039 terms\n", - "Processed 260000 IDs, made 235245 terms\n", - "Processed 270000 IDs, made 244469 terms\n", - "Processed 280000 IDs, made 253712 terms\n", - "Processed 290000 IDs, made 262933 terms\n", - "Processed 300000 IDs, made 272147 terms\n", - "Processed 310000 IDs, made 281377 terms\n", - "Processed 320000 IDs, made 290585 terms\n", - "Processed 330000 IDs, made 299798 terms\n", - "Processed 340000 IDs, made 308976 terms\n", - "Processed 350000 IDs, made 318202 terms\n", - "Processed 360000 IDs, made 327420 terms\n", - "Processed 370000 IDs, made 336655 terms\n", - 
"Processed 380000 IDs, made 345885 terms\n", - "Processed 390000 IDs, made 355079 terms\n", - "Processed 400000 IDs, made 364320 terms\n", - "Processed 410000 IDs, made 373533 terms\n", - "Processed 420000 IDs, made 382211 terms\n", - "Processed 430000 IDs, made 391514 terms\n", - "Processed 440000 IDs, made 400853 terms\n", - "Processed 450000 IDs, made 409434 terms\n", - "Processed 460000 IDs, made 417875 terms\n", - "Processed 470000 IDs, made 426284 terms\n", - "Processed 480000 IDs, made 434681 terms\n", - "Processed 490000 IDs, made 443094 terms\n", - "Processed 500000 IDs, made 451519 terms\n", - "Processed 510000 IDs, made 460013 terms\n", - "Processed 520000 IDs, made 468955 terms\n", - "Processed 530000 IDs, made 477808 terms\n", - "Processed 540000 IDs, made 486681 terms\n", - "Processed 550000 IDs, made 495617 terms\n", - "Processed 560000 IDs, made 502651 terms\n", - "Processed 570000 IDs, made 509659 terms\n", - "Processed 580000 IDs, made 516618 terms\n" + "Processed 10000 IDs, made 9955 terms\n", + "Processed 20000 IDs, made 19917 terms\n", + "Processed 30000 IDs, made 29878 terms\n", + "Processed 40000 IDs, made 38207 terms\n", + "Processed 50000 IDs, made 47129 terms\n", + "Processed 60000 IDs, made 56582 terms\n", + "Processed 70000 IDs, made 65929 terms\n", + "Processed 80000 IDs, made 75147 terms\n", + "Processed 90000 IDs, made 84597 terms\n", + "Processed 100000 IDs, made 93930 terms\n", + "Processed 110000 IDs, made 103890 terms\n", + "Processed 120000 IDs, made 113825 terms\n", + "Processed 130000 IDs, made 123784 terms\n", + "Processed 140000 IDs, made 132470 terms\n", + "Processed 150000 IDs, made 141078 terms\n", + "Processed 160000 IDs, made 149833 terms\n", + "Processed 170000 IDs, made 158566 terms\n", + "Processed 180000 IDs, made 166455 terms\n", + "Processed 190000 IDs, made 174804 terms\n", + "Processed 200000 IDs, made 184378 terms\n" ] } ], - "execution_count": 200 + "execution_count": 109 }, { "cell_type": "code", @@ -4177,8 
+4567,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:55:08.319197Z", - "start_time": "2024-08-20T18:55:08.315691Z" + "end_time": "2024-08-24T00:17:38.389994Z", + "start_time": "2024-08-24T00:17:38.385274Z" } }, "id": "153be3a8b30c2713", @@ -4186,15 +4576,15 @@ { "data": { "text/plain": [ - "519952" + "185206" ] }, - "execution_count": 201, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 201 + "execution_count": 110 }, { "cell_type": "code", @@ -4205,8 +4595,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:55:08.714983Z", - "start_time": "2024-08-20T18:55:08.664411Z" + "end_time": "2024-08-24T00:17:38.794221Z", + "start_time": "2024-08-24T00:17:38.762055Z" } }, "id": "ac31bc98e2ef7ef7", @@ -4214,15 +4604,15 @@ { "data": { "text/plain": [ - "56" + "16" ] }, - "execution_count": 202, + "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 202 + "execution_count": 111 }, { "cell_type": "code", @@ -4230,13 +4620,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:55:11.185465Z", - "start_time": "2024-08-20T18:55:09.054942Z" + "end_time": "2024-08-24T00:17:39.164648Z", + "start_time": "2024-08-24T00:17:39.162207Z" } }, "id": "858eda4cbfd8fb5d", "outputs": [], - "execution_count": 203 + "execution_count": 112 }, { "cell_type": "code", @@ -4246,13 +4636,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:55:11.891901Z", - "start_time": "2024-08-20T18:55:11.889518Z" + "end_time": "2024-08-24T00:17:39.531493Z", + "start_time": "2024-08-24T00:17:39.529104Z" } }, "id": "dbc4c586e92c3328", "outputs": [], - "execution_count": 204 + "execution_count": 113 }, { "cell_type": "code", @@ -4263,8 +4653,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T19:10:32.972903Z", - "start_time": "2024-08-20T19:10:32.957095Z" + 
"end_time": "2024-08-24T00:17:39.897479Z", + "start_time": "2024-08-24T00:17:39.893737Z" } }, "id": "90336d5cc96554c7", @@ -4285,7 +4675,7 @@ ] } ], - "execution_count": 210 + "execution_count": 114 }, { "cell_type": "code", @@ -4295,8 +4685,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:55:12.256599Z", - "start_time": "2024-08-20T18:55:12.251643Z" + "end_time": "2024-08-24T00:17:40.260764Z", + "start_time": "2024-08-24T00:17:40.255847Z" } }, "id": "e1185be596f3b559", @@ -4307,12 +4697,12 @@ "'oligopeptide'" ] }, - "execution_count": 206, + "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 206 + "execution_count": 115 }, { "cell_type": "code", @@ -4320,8 +4710,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:55:13.596467Z", - "start_time": "2024-08-20T18:55:13.594489Z" + "end_time": "2024-08-24T00:17:40.642267Z", + "start_time": "2024-08-24T00:17:40.640098Z" } }, "id": "3e17588e0084f22d", diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 41e75dd2f..599694f72 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -74,7 +74,6 @@ from oaklib.implementations.obograph.obograph_implementation import ( OboGraphImplementation, ) -from oaklib.implementations.semsimian.semsimian_implementation import SemSimianImplementation from oaklib.implementations.sqldb.sql_implementation import SqlImplementation from oaklib.interfaces import ( BasicOntologyInterface, @@ -178,10 +177,10 @@ trim_graph, ) from oaklib.utilities.publication_utils.pubmed_wrapper import PubmedWrapper -from oaklib.utilities.semsim.similarity_utils import load_information_content_map from oaklib.utilities.subsets.slimmer_utils import ( roll_up_to_named_subset, ) +from oaklib.utilities.subsets.subset_validator import SubsetValidationConfig from oaklib.utilities.table_filler import ColumnDependency, TableFiller, TableMetadata from oaklib.utilities.taxon.taxon_constraint_utils import 
parse_gain_loss_file from oaklib.utilities.validation.lint_utils import lint_ontology @@ -2585,12 +2584,7 @@ def similarity( if not isinstance(impl, SemanticSimilarityInterface): raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}") if information_content_file: - if isinstance(impl, SemSimianImplementation): - impl.custom_ic_map_path = information_content_file - else: - impl.cached_information_content_map = load_information_content_map( - information_content_file - ) + impl.load_information_content_scores(information_content_file) set1it = None set2it = None if not (set1_file or set2_file): @@ -2672,17 +2666,8 @@ def termset_similarity( if not isinstance(impl, SemanticSimilarityInterface): raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}") - # TODO: @cmungall - one possibility in future is to relieve client of the need for - # out of band knowledge about impl details. The generic SemSim interface could have - # a load_ic_map method, with the generic impl being to directly load, and the semsimian - # impl passing the path through. 
if information_content_file: - if isinstance(impl, SemSimianImplementation): - impl.custom_ic_map_path = information_content_file - else: - impl.cached_information_content_map = load_information_content_map( - information_content_file - ) + impl.load_information_content_scores(information_content_file) terms = list(terms) ix = terms.index("@") set1 = list(query_terms_iterator(terms[0:ix], impl)) @@ -3608,17 +3593,15 @@ def roots(output: str, output_type: str, predicates: str, has_prefix: str, annot impl = settings.impl writer = _get_writer(output_type, impl, StreamingCsvWriter) writer.output = output - if isinstance(impl, OboGraphInterface): - actual_predicates = _process_predicates_arg(predicates) - prefixes = list(has_prefix) if has_prefix else None - for curie in impl.roots( - actual_predicates, annotated_roots=annotated_roots, id_prefixes=prefixes - ): - writer.emit(dict(id=curie, label=impl.label(curie))) - # print(f"{curie} ! {impl.label(curie)}") - writer.finish() - else: + if not isinstance(impl, OboGraphInterface): raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}") + actual_predicates = _process_predicates_arg(predicates) + prefixes = list(has_prefix) if has_prefix else None + for curie in impl.roots( + actual_predicates, annotated_roots=annotated_roots, id_prefixes=prefixes + ): + writer.emit(dict(id=curie, label=impl.label(curie))) + writer.finish() @main.command() @@ -5423,6 +5406,130 @@ def validate_synonyms( writer.finish() +@main.command() +@autolabel_option +@output_type_option +@adapter_mapping_option +@predicates_option +@click.option( + "--exclude-query", + "-X", + help="A query to exclude certain terms", +) +@click.option( + "--information-content-file", + help="File containing information content for each term", +) +@click.option( + "--information-content-adapter", + help="Adapter to use for information content scores", +) +@click.option("--config-yaml") +@output_option +@configuration_file_option 
+@click.argument("terms", nargs=-1) +def validate_subset( + terms, + autolabel, + predicates, + adapter_mapping, + information_content_file, + information_content_adapter, + exclude_query, + config_yaml, + output: str, + output_type: str, + configuration_file: str, +): + """ + Validates term subsets. + + The default metrics used for evaluation involve calculating the degree of overlap between members of the + subset. Subsets in general should partition the ontology into sets that overlap as little as possible. + + Different overlap metrics can be plugged in, see the information-content methods for more details. + + The simplest way to run this is to pass in a list of terms via a subset query + + runoak -i po.db validate-subset p i,p .in Tomato + + You can also calculate IC scores for each term and pass them in via a file: + + runoak -i amigo:NCBITaxon:9606 information-content -o human-ic.tsv + + Then + + runoak -i go.db validate-subset p i,p .in goslim_generic --information-content-file human-ic.tsv + + """ + impl = settings.impl + writer = _get_writer(output_type, impl, StreamingYamlWriter) + writer.output = output + writer.autolabel = autolabel + if not isinstance(impl, SemanticSimilarityInterface): + raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}") + if information_content_file: + impl.load_information_content_scores(information_content_file) + configs = [] + if config_yaml: + main_config = yaml.safe_load(open(config_yaml)) + # assume GO schema for now + subsets_objs = main_config.get("subsets") + for subset in subsets_objs: + if subset.get("status") == "obsolete": + continue + if "ExclusionList" in subset.get("role", []): + continue + taxa = subset.get("taxa") + if taxa: + taxa_ids = [x["id"] for x in taxa] + else: + taxa_ids = ["NCBITaxon:1"] + for taxa_id in taxa_ids: + config = SubsetValidationConfig( + subset_name=subset["id"], ic_score_adapter_name=f"amigo:{taxa_id}" + ) + configs.append(config) + logging.info(f"Loaded 
config for {subset['id']} with {taxa_id}") + else: + if terms: + entities = list(query_terms_iterator(terms, impl)) + else: + raise ValueError("No terms provided") + + config = SubsetValidationConfig( + subset_terms=entities, + ) + configs = [config] + for config in configs: + if information_content_adapter: + config.ic_score_adapter_name = information_content_adapter + actual_predicates = _process_predicates_arg(predicates) + if actual_predicates: + config.predicates = actual_predicates + if exclude_query: + config.exclude_terms = list(query_terms_iterator(exclude_query.split(" "), impl)) + import oaklib.utilities.subsets.subset_validator as subset_validator + + try: + result = subset_validator.validate_subset(impl, config) + except Exception as e: + logging.error(e) + continue + if isinstance(writer, StreamingCsvWriter): + # denormalize + obj = result.model_dump() + for k in ["overall", "sibling_pairs", "ancestor_pairs", "leaf_pairs"]: + v = obj.get(k) + del obj[k] + for k2, v2 in v.items(): + obj[f"{k}_{k2}"] = v2 + writer.emit_obj(obj) + else: + writer.emit_obj(result) + writer.finish() + + @main.command() @click.argument("curie_pairs", nargs=-1) @click.option( diff --git a/src/oaklib/implementations/amigo/amigo_implementation.py b/src/oaklib/implementations/amigo/amigo_implementation.py index 03703bd75..1ba432748 100644 --- a/src/oaklib/implementations/amigo/amigo_implementation.py +++ b/src/oaklib/implementations/amigo/amigo_implementation.py @@ -501,7 +501,7 @@ def information_content_scores( if term == "bioentity": n_bioentities = count if n_bioentities is None: - raise ValueError("No bioentities found") + raise ValueError(f"No bioentities found in query {fq}") kwargs = {} # if curies: # kwargs["facet.query"] = [_fq_element(ISA_PARTOF_CLOSURE, curie) for curie in curies] diff --git a/src/oaklib/implementations/semsimian/semsimian_implementation.py b/src/oaklib/implementations/semsimian/semsimian_implementation.py index 377228cb7..21d328dce 100644 --- 
a/src/oaklib/implementations/semsimian/semsimian_implementation.py +++ b/src/oaklib/implementations/semsimian/semsimian_implementation.py @@ -123,6 +123,14 @@ def _get_semsimian_object( return self.semsimian_object_cache[predicates] + def load_information_content_scores(self, source: str) -> None: + """ + Load information content from a source. + + :param source: The source of information content. + """ + self.custom_ic_map_path = source + def pairwise_similarity( self, subject: CURIE, diff --git a/src/oaklib/implementations/sqldb/sql_implementation.py b/src/oaklib/implementations/sqldb/sql_implementation.py index ffb73d838..136943a94 100644 --- a/src/oaklib/implementations/sqldb/sql_implementation.py +++ b/src/oaklib/implementations/sqldb/sql_implementation.py @@ -46,6 +46,7 @@ OwlAxiomAnnotation, OwlDisjointClassStatement, OwlEquivalentClassStatement, + OwlHasValue, OwlSomeValuesFrom, Prefix, RdfFirstStatement, @@ -57,7 +58,7 @@ TermAssociation, TransitivePropertyNode, ) -from sqlalchemy import and_, create_engine, delete, distinct, func, insert, text, update +from sqlalchemy import and_, create_engine, delete, distinct, func, insert, select, text, update from sqlalchemy.orm import aliased, sessionmaker from sssom_schema import Mapping @@ -993,6 +994,10 @@ def relationships( if exclude_blank and (_is_blank(s) or _is_blank(o)): continue yield s, p, o + for s, p, o in self._subclass_of_has_value_relationships(subjects, predicates, objects): + if exclude_blank and (_is_blank(s) or _is_blank(o)): + continue + yield s, p, o for s, p, o in self._object_property_assertion_relationships( subjects, predicates, objects ): @@ -1055,6 +1060,33 @@ def _object_property_assertion_relationships( continue yield row.subject, row.predicate, row.object + def _subclass_of_has_value_relationships( + self, + subjects: List[CURIE] = None, + predicates: List[PRED_CURIE] = None, + objects: List[CURIE] = None, + ) -> Iterator[RELATIONSHIP]: + sc = aliased(RdfsSubclassOfStatement) + hv = 
aliased(OwlHasValue) + stmt = select(sc.subject, hv.on_property, hv.filler).join(hv, sc.object == hv.id) + if subjects: + stmt = stmt.where(sc.subject.in_(tuple(subjects))) + if predicates: + predicates = set(predicates).difference( + {IS_A, RDF_TYPE, SUBPROPERTY_OF, RDFS_DOMAIN, RDFS_RANGE, INVERSE_OF} + ) + if not predicates: + return + stmt = stmt.where(hv.on_property.in_(tuple(predicates))) + if objects: + stmt = stmt.where(hv.filler.in_(tuple(objects))) + + logging.debug(f"Abox HasValue query: {stmt}") + + result = self.session.execute(stmt) + for row in result: + yield row.subject, row.on_property, row.filler + def _rdf_type_relationships( self, subjects: List[CURIE] = None, diff --git a/src/oaklib/interfaces/basic_ontology_interface.py b/src/oaklib/interfaces/basic_ontology_interface.py index fe8b9fabf..5ad2396c6 100644 --- a/src/oaklib/interfaces/basic_ontology_interface.py +++ b/src/oaklib/interfaces/basic_ontology_interface.py @@ -781,7 +781,7 @@ def roots( :param predicates: predicates to be considered (default: all) :param ignore_owl_thing: do not consider artificial/trivial owl:Thing when calculating (default=True) :param filter_obsoletes: do not include obsolete/deprecated nodes in results (default=True) - :param annotated_roots: use nodes explicitly annotated as root + :param annotated_roots: use nodes explicitly annotated as root (IAO:0000700) :param id_prefixes: limit search to specific prefixes :return: """ diff --git a/src/oaklib/interfaces/obograph_interface.py b/src/oaklib/interfaces/obograph_interface.py index bf5b855ae..d223b6501 100644 --- a/src/oaklib/interfaces/obograph_interface.py +++ b/src/oaklib/interfaces/obograph_interface.py @@ -206,7 +206,10 @@ def direct_graph( return g def ancestor_graph( - self, start_curies: Union[CURIE, List[CURIE]], predicates: List[PRED_CURIE] = None + self, + start_curies: Union[CURIE, List[CURIE]], + predicates: List[PRED_CURIE] = None, + **kwargs, ) -> Graph: """ Return a graph object that consists of all 
the nodes specified in the start_curies list, @@ -231,7 +234,7 @@ def ancestor_graph( logging.info( f"Computing ancestor graph for {start_curies} / {predicates} using graph walking" ) - g = self._graph(walk_up(self, start_curies, predicates=predicates)) + g = self._graph(walk_up(self, start_curies, predicates=predicates, **kwargs)) if self.transitive_query_cache is not None: self.transitive_query_cache[key] = g return g diff --git a/src/oaklib/interfaces/semsim_interface.py b/src/oaklib/interfaces/semsim_interface.py index c3e52abe3..4feecec08 100644 --- a/src/oaklib/interfaces/semsim_interface.py +++ b/src/oaklib/interfaces/semsim_interface.py @@ -19,7 +19,10 @@ from oaklib.types import CURIE, PRED_CURIE from oaklib.utilities.iterator_utils import chunk from oaklib.utilities.obograph_utils import as_digraph -from oaklib.utilities.semsim.similarity_utils import setwise_jaccard_similarity +from oaklib.utilities.semsim.similarity_utils import ( + load_information_content_map, + setwise_jaccard_similarity, +) class SemanticSimilarityInterface(BasicOntologyInterface, ABC): @@ -174,6 +177,24 @@ def common_descendants( ) -> Iterable[CURIE]: raise NotImplementedError + def load_information_content_scores(self, source: str) -> None: + """ + Load term information content values from file + + :param source: + :return: + """ + self.cached_information_content_map = load_information_content_map(source) + + def set_information_content_scores(self, scores: Iterable[Tuple[CURIE, float]]) -> None: + """ + Load term information content values from file + + :param source: + :return: + """ + self.cached_information_content_map = dict(scores) + def get_information_content( self, curie: CURIE, predicates: List[PRED_CURIE] = None ) -> Optional[float]: @@ -262,7 +283,7 @@ def information_content_scores( if curie not in self.cached_information_content_map: self.cached_information_content_map[curie] = 0.0 if self.cached_information_content_map is not None: - logging.info("Using cached IC 
map") + logging.debug("Using cached IC map") for curie in curies: if curie in self.cached_information_content_map: yield curie, self.cached_information_content_map[curie] @@ -324,7 +345,7 @@ def pairwise_similarity( ) if OWL_THING in cas: cas.remove(OWL_THING) - logging.info(f"Retrieving IC for {len(cas)} common ancestors") + logging.debug(f"Retrieving IC for {len(cas)} common ancestors") ics = { a: ic for a, ic in self.information_content_scores(cas, object_closure_predicates=predicates) @@ -339,11 +360,13 @@ def pairwise_similarity( if min_ancestor_information_content is not None: if max_ic < min_ancestor_information_content: return None - logging.info(f"MRCA = {anc} with {max_ic}") + logging.debug(f"MRCA = {anc} with {max_ic}") sim = TermPairwiseSimilarity( subject_id=subject, object_id=object, ancestor_id=anc, + subject_information_content=ics.get(subject, self.get_information_content(subject)), + object_information_content=ics.get(object, self.get_information_content(object)), ancestor_information_content=max_ic, jaccard_similarity=jaccard_similarity, ) diff --git a/src/oaklib/query.py b/src/oaklib/query.py index 66c286273..697c03b76 100644 --- a/src/oaklib/query.py +++ b/src/oaklib/query.py @@ -620,7 +620,11 @@ def chain_results(v): logging.debug(f"Roots: {term}") params = _parse_params(term) this_predicates = params.get("predicates", predicates) - roots = adapter.roots(predicates=this_predicates) + id_prefixes = params.get("prefix", []) + annotated_roots = bool(params.get("annotated", False)) + roots = adapter.roots( + predicates=this_predicates, id_prefixes=id_prefixes, annotated_roots=annotated_roots + ) chain_results(roots) elif term.startswith(".leaf"): logging.debug(f"Leafs: {term}") diff --git a/src/oaklib/utilities/caching.py b/src/oaklib/utilities/caching.py index c8010b30e..eee89a6c6 100644 --- a/src/oaklib/utilities/caching.py +++ b/src/oaklib/utilities/caching.py @@ -11,7 +11,7 @@ from oaklib.datamodels.vocabulary import APP_NAME -_durations = 
{'d': 1, 'w': 7, 'm': 30, 'y': 365} +_durations = {"d": 1, "w": 7, "m": 30, "y": 365} _logger = logging.getLogger(__name__) @@ -141,18 +141,18 @@ def from_string(cls, value): """ value = value.lower() - if value == 'refresh': + if value == "refresh": return cls.REFRESH - elif value == 'no-refresh': + elif value == "no-refresh": return cls.NO_REFRESH - elif value in ['reset', 'clear']: + elif value in ["reset", "clear"]: return cls.RESET else: - if m := re.match('^([0-9]+)([sdwmy])?', value): + if m := re.match("^([0-9]+)([sdwmy])?", value): num, qual = m.groups() if not qual: - qual = 'd' - if qual == 's': + qual = "d" + if qual == "s": return cls(int(num)) else: return cls(timedelta(days=int(num) * _durations[qual]).total_seconds()) @@ -204,7 +204,7 @@ def ClickType(cls): from click import ParamType class CachePolicyParamType(ParamType): - name = 'cache-policy' + name = "cache-policy" def convert(self, value, param, ctx): if isinstance(value, cls): @@ -236,7 +236,7 @@ def __init__(self, module): """ self._module = module - self._default_policy = CachePolicy.from_string('1w') + self._default_policy = CachePolicy.from_string("1w") self._forced_policy = None self._policies = [] self._config_file = os.path.join(user_config_dir(APP_NAME), "cache.conf") @@ -380,12 +380,16 @@ def _get_configuration(self, pathname): items = line.split("=", maxsplit=1) pattern = items[0].strip() if len(items) != 2: - _logger.warning(f"{filename}({n}): Ignoring missing caching policy for {pattern}") + _logger.warning( + f"{filename}({n}): Ignoring missing caching policy for {pattern}" + ) continue policy = CachePolicy.from_string(items[1].strip()) if policy is None: - _logger.warning(f"{filename}({n}): Ignoring invalid caching policy for {pattern}") + _logger.warning( + f"{filename}({n}): Ignoring invalid caching policy for {pattern}" + ) continue if pattern in ["default", "*"]: diff --git a/src/oaklib/utilities/kgcl_utilities.py b/src/oaklib/utilities/kgcl_utilities.py index 
bbb4f916a..e509bf5b5 100644 --- a/src/oaklib/utilities/kgcl_utilities.py +++ b/src/oaklib/utilities/kgcl_utilities.py @@ -5,7 +5,7 @@ import uuid from io import TextIOWrapper from pathlib import Path -from typing import Iterator, List, Optional, TextIO, Union +from typing import Callable, Iterator, List, Optional, TextIO, Union import kgcl_schema.datamodel.kgcl as kgcl import kgcl_schema.grammar.parser as kgcl_parser @@ -72,6 +72,53 @@ def parse_kgcl_files( yield change +CURIE_SLOTS = { + "subject": "subject_type", + "object": "object_type", + "predicate": "predicate_type", + "about_node": "about_node_representation", +} + + +def substitute_curies_for_labels( + changes: Union[List[kgcl.Change], kgcl.Change], label_function: Callable +): + if isinstance(changes, list): + for change in changes: + substitute_curies_for_labels(change, label_function) + return + change = changes + for k, v in vars(change).items(): + if k in CURIE_SLOTS: + new_v = label_function(v) + if new_v: + new_v = f"'{new_v}'" + setattr(change, k, new_v) + setattr(change, CURIE_SLOTS[k], "label") + + +def substitute_labels_for_curies( + changes: Union[List[kgcl.Change], kgcl.Change], curie_function: Callable +): + if isinstance(changes, list): + for change in changes: + substitute_labels_for_curies(change, curie_function) + return + change = changes + for k, v in vars(change).items(): + if k in CURIE_SLOTS: + k_type = CURIE_SLOTS[k] + try: + k_type_value = getattr(change, k_type) + except AttributeError: + continue + if k_type_value == "label" or k_type_value == "literal": + new_v = curie_function(v) + if new_v: + setattr(change, k, new_v) + setattr(change, CURIE_SLOTS[k], "curie") + + def write_kgcl( changes: List[kgcl.Change], file: Optional[Union[str, Path, TextIO]], changes_format="json" ): diff --git a/src/oaklib/utilities/subsets/subset_validator.py b/src/oaklib/utilities/subsets/subset_validator.py new file mode 100644 index 000000000..f3eee44ef --- /dev/null +++ 
b/src/oaklib/utilities/subsets/subset_validator.py @@ -0,0 +1,200 @@ +from typing import List, Optional + +import numpy as np +import pandas as pd +from pydantic import BaseModel + +from oaklib import BasicOntologyInterface, get_adapter +from oaklib.datamodels.vocabulary import IS_A, PART_OF +from oaklib.interfaces import OboGraphInterface +from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface +from oaklib.types import CURIE, PRED_CURIE + + +class SubsetValidationConfig(BaseModel): + subset_name: Optional[str] = None + subset_description: Optional[str] = None + subset_terms: Optional[list[CURIE]] = None + predicates: Optional[list[PRED_CURIE]] = None + exclude_terms: Optional[List[CURIE]] = None + ic_score_adapter_name: Optional[str] = None + + +class TermPair(BaseModel): + left_term: CURIE + right_term: CURIE + left_term_label: Optional[str] = None + right_term_label: Optional[str] = None + + +class SubsetValidationResultComponent(BaseModel): + min_ic_dist: float + min_ic_dist_term_pairs: Optional[List[TermPair]] = None + min_j_dist: float + min_j_dist_term_pairs: Optional[List[TermPair]] = None + avg_min_ic_dist_by_term: float + avg_min_j_by_term: float + + +class SubsetValidationResult(BaseModel): + subset_name: Optional[str] = None + configuration: Optional[SubsetValidationConfig] = None + combined_score: Optional[float] = None + terms_used: Optional[list[CURIE]] = None + ignored_terms: Optional[list[CURIE]] = None + overall: Optional[SubsetValidationResultComponent] = None + sibling_pairs: Optional[SubsetValidationResultComponent] = None + ancestor_pairs: Optional[SubsetValidationResultComponent] = None + leaf_pairs: Optional[SubsetValidationResultComponent] = None + + +def set_labels(adapter: BasicOntologyInterface, term_pair: TermPair): + term_pair.left_term_label = adapter.label(term_pair.left_term) + term_pair.right_term_label = adapter.label(term_pair.right_term) + + +def validate_subset( + adapter: SemanticSimilarityInterface, 
configuration: Optional[SubsetValidationConfig] = None +) -> SubsetValidationResult: + """ + Validate a subset of terms and predicates + + :param adapter: + :param configuration: + :return: + """ + graph_adapter = adapter + if not isinstance(graph_adapter, OboGraphInterface): + raise ValueError("Adapter must implement OboGraphInterface") + if configuration is None: + configuration = SubsetValidationConfig() + if not configuration.subset_terms: + if not configuration.subset_name: + raise ValueError("No subset name or terms provided") + configuration.subset_terms = list(adapter.subset_members(configuration.subset_name)) + if not configuration.predicates: + configuration.predicates = [IS_A, PART_OF] + if configuration.ic_score_adapter_name: + ic_adapter = get_adapter(configuration.ic_score_adapter_name) + if not isinstance(ic_adapter, SemanticSimilarityInterface): + raise ValueError( + "Information content adapter must implement SemanticSimilarityInterface" + ) + scores = ic_adapter.information_content_scores(predicates=configuration.predicates) + adapter.set_information_content_scores(scores) + terms = configuration.subset_terms + pairs = [] + t2score = {t: adapter.get_information_content(t) for t in terms} + t2score = {t: s for t, s in t2score.items() if s is not None} + max_score = max(t2score.values()) + ic_factor = 1.0 / (max_score * 2) + terms = [t for t in terms if t in t2score and t2score[t] > 0.0] + if not terms: + raise ValueError("No terms found with information content") + result = SubsetValidationResult( + subset_name=configuration.subset_name, + configuration=configuration, + terms_used=terms, + ignored_terms=[t for t in configuration.subset_terms if t not in terms], + ) + if len(terms) < 2: + result.combined_score = 0.0 + return result + non_leaf_terms = set() + for i, term_i in enumerate(terms): + term_i_ancs = set( + graph_adapter.ancestors(term_i, configuration.predicates, reflexive=False) + ) + non_leaf_terms.update(term_i_ancs) + for j, term_j in 
enumerate(terms): + if i >= j: + continue + term_j_ancs = set(graph_adapter.ancestors(term_j, configuration.predicates)) + sim = adapter.pairwise_similarity(term_i, term_j, configuration.predicates) + # If the term pair are in separate branches or subgraphs, ignore them; + # this is either if the universal root term (which has zero information content) is the ancestor, + # of if the ancestor is in the set of excluded terms (e.g. upper ontology groupings) + if sim.jaccard_similarity == 0.0: + continue + anc_ic = sim.ancestor_information_content + if np.isclose(anc_ic, 0.0): + continue + if configuration.exclude_terms and sim.ancestor_id in configuration.exclude_terms: + continue + left_ic_diff = abs(sim.subject_information_content - anc_ic) + right_ic_diff = abs(sim.object_information_content - anc_ic) + ic_diff = left_ic_diff + right_ic_diff + + if term_i in term_j_ancs: + rel = 1 + elif term_j in term_i_ancs: + rel = -1 + else: + rel = 0 + pair = { + "term_i": term_i, + "term_j": term_j, + "rel": rel, + "j": 1 - sim.jaccard_similarity, + "ic_distance": ic_diff * ic_factor, + } + pairs.append(pair) + from copy import copy + + rev_pair = copy(pair) + rev_pair["term_i"] = term_j + rev_pair["term_j"] = term_i + pairs.append(rev_pair) + + leaf_terms = {t for t in terms if t not in non_leaf_terms} + df = pd.DataFrame(pairs) + df_sibs = df[df["rel"] == 0] + df_ancs = df[df["rel"] != 0] + df_all = df + df_leafs = df[df["term_i"].isin(leaf_terms) & df["term_j"].isin(leaf_terms)] + + overall_score = 0.0 + n = 0 + for cn, df in [ + ("overall", df_all), + ("sibling_pairs", df_sibs), + ("ancestor_pairs", df_ancs), + ("leaf_pairs", df_leafs), + ]: + min_ic_dist = df["ic_distance"].min() + min_j = df["j"].min() + # Get witnesses for min_ic_dist + min_ic_dist_rows = df[np.isclose(df["ic_distance"], min_ic_dist)] + min_ic_dist_pairs = [] + for _i, row in min_ic_dist_rows.iterrows(): + min_ic_dist_pair = TermPair(left_term=row["term_i"], right_term=row["term_j"]) + 
set_labels(adapter, min_ic_dist_pair) + min_ic_dist_pairs.append(min_ic_dist_pair) + + # Get witnesses for min_j + # these are floats so we don't expect exact - get approximate + min_j_rows = df[np.isclose(df["j"], min_j)] + # min_j_rows = df[df['j'] == min_j] + min_j_pairs = [] + for _i, row in min_j_rows.iterrows(): + min_j_pair = TermPair(left_term=row["term_i"], right_term=row["term_j"]) + set_labels(adapter, min_j_pair) + min_j_pairs.append(min_j_pair) + + for metric in ["ic_distance", "j"]: + min_agg_metric = f"min_{metric}" + df[min_agg_metric] = df[[metric]].min(axis=1) + component = SubsetValidationResultComponent( + min_ic_dist=min_ic_dist, + min_ic_dist_term_pairs=min_ic_dist_pairs, + min_j_dist=min_j, + min_j_dist_term_pairs=min_j_pairs, + avg_min_ic_dist_by_term=df["min_ic_distance"].mean(), + avg_min_j_by_term=df["min_j"].mean(), + ) + if not np.isnan(min_ic_dist): + overall_score += component.min_ic_dist + component.avg_min_ic_dist_by_term + n += 2 + setattr(result, cn, component) + result.combined_score = overall_score / n + return result diff --git a/tests/input/go-nucleus-ic.tsv b/tests/input/go-nucleus-ic.tsv new file mode 100644 index 000000000..68630073b --- /dev/null +++ b/tests/input/go-nucleus-ic.tsv @@ -0,0 +1,174 @@ +id information_content +BFO:0000002 1.2531189369687112 +BFO:0000003 1.7150230412855292 +BFO:0000004 1.6880559936852597 +BFO:0000015 1.742503777707636 +BFO:0000017 4.273018494406416 +BFO:0000020 3.1950159824051427 +BFO:0000023 4.442943495848728 +BFO:0000040 1.770518153877233 +CARO:0000000 2.742503777707636 +CARO:0000003 3.442943495848729 +CARO:0000006 3.355480654598389 +CARO:0001010 7.442943495848729 +CARO:0030000 2.6880559936852597 +CHEBI:10545 3.1950159824051427 +CHEBI:23367 3.53605290024021 +CHEBI:24431 2.9193815397917158 +CHEBI:24432 4.857980995127573 +CHEBI:24433 4.635588573791124 +CHEBI:25585 4.635588573791124 +CHEBI:26020 5.857980995127572 +CHEBI:26079 5.442943495848728 +CHEBI:26082 4.442943495848728 +CHEBI:28659 
4.857980995127573 +CHEBI:33233 3.050626073069968 +CHEBI:33241 4.857980995127573 +CHEBI:33250 3.273018494406416 +CHEBI:33252 3.1950159824051427 +CHEBI:33253 3.121015400961366 +CHEBI:33284 5.857980995127572 +CHEBI:33300 4.442943495848728 +CHEBI:33302 4.273018494406416 +CHEBI:33318 3.8579809951275723 +CHEBI:33560 4.121015400961366 +CHEBI:33579 3.983511877211431 +CHEBI:33675 4.121015400961366 +CHEBI:33937 6.442943495848729 +CHEBI:36338 3.121015400961366 +CHEBI:36339 3.050626073069968 +CHEBI:36340 2.8579809951275723 +CHEBI:36342 2.584962500721156 +CHEBI:36343 2.9193815397917158 +CHEBI:36344 2.983511877211431 +CHEBI:36347 3.050626073069968 +CHEBI:36357 4.442943495848728 +CHEBI:36359 5.121015400961366 +CHEBI:36360 4.857980995127573 +CHEBI:37577 4.635588573791124 +CHEBI:50906 4.442943495848728 +CHEBI:52211 5.121015400961366 +CHEBI:78295 5.442943495848728 +CL:0000000 3.53605290024021 +GO:0003674 2.983511877211431 +GO:0003824 3.273018494406416 +GO:0004857 6.442943495848729 +GO:0005575 2.9193815397917158 +GO:0005622 3.742503777707636 +GO:0005634 5.857980995127572 +GO:0005635 6.442943495848729 +GO:0005737 4.857980995127573 +GO:0005773 6.442943495848729 +GO:0005886 5.857980995127572 +GO:0005938 6.442943495848729 +GO:0006793 3.355480654598389 +GO:0006796 3.742503777707636 +GO:0008047 6.442943495848729 +GO:0008150 1.9510903995190536 +GO:0008152 2.7990873060740036 +GO:0009892 4.857980995127573 +GO:0009893 4.635588573791124 +GO:0009987 2.7990873060740036 +GO:0010562 5.121015400961366 +GO:0010563 5.442943495848728 +GO:0012505 5.857980995127572 +GO:0016020 4.121015400961366 +GO:0016301 4.635588573791124 +GO:0016310 4.273018494406416 +GO:0016740 3.8579809951275723 +GO:0016772 4.442943495848728 +GO:0019207 5.857980995127572 +GO:0019209 7.442943495848729 +GO:0019210 7.442943495848729 +GO:0019220 4.121015400961366 +GO:0019222 3.1950159824051427 +GO:0031090 6.442943495848729 +GO:0031323 3.442943495848729 +GO:0031324 5.121015400961366 +GO:0031325 4.857980995127573 +GO:0031965 
7.442943495848729 +GO:0031967 5.857980995127572 +GO:0031975 5.442943495848728 +GO:0033673 7.442943495848729 +GO:0033674 6.442943495848729 +GO:0042325 4.635588573791124 +GO:0042326 6.442943495848729 +GO:0042327 5.857980995127572 +GO:0043085 5.442943495848728 +GO:0043086 5.121015400961366 +GO:0043226 3.983511877211431 +GO:0043227 4.273018494406416 +GO:0043229 4.442943495848728 +GO:0043231 4.857980995127573 +GO:0043549 5.442943495848728 +GO:0044092 4.857980995127573 +GO:0044093 5.121015400961366 +GO:0044237 3.050626073069968 +GO:0045936 5.857980995127572 +GO:0045937 5.442943495848728 +GO:0048518 4.273018494406416 +GO:0048519 4.442943495848728 +GO:0048522 4.635588573791124 +GO:0048523 4.857980995127573 +GO:0050789 2.7990873060740036 +GO:0050790 3.8579809951275723 +GO:0050794 3.1950159824051427 +GO:0051174 3.742503777707636 +GO:0051338 4.635588573791124 +GO:0051347 5.857980995127572 +GO:0051348 6.442943495848729 +GO:0065007 2.2334901302197787 +GO:0065009 3.53605290024021 +GO:0071944 4.857980995127573 +GO:0098590 7.442943495848729 +GO:0099568 6.442943495848729 +GO:0099738 7.442943495848729 +GO:0110165 2.983511877211431 +IAO:0000078 7.442943495848729 +IAO:0000225 7.442943495848729 +IAO:0000409 7.442943495848729 +NCBITaxon:1 1.82823365173352 +NCBITaxon:10239 7.442943495848729 +NCBITaxon:1117 7.442943495848729 +NCBITaxon:131567 1.9193815397917156 +NCBITaxon:142796 5.121015400961366 +NCBITaxon:1783272 5.857980995127572 +NCBITaxon:1798711 6.442943495848729 +NCBITaxon:2 5.442943495848728 +NCBITaxon:2058185 6.442943495848729 +NCBITaxon:2058949 5.857980995127572 +NCBITaxon:2157 7.442943495848729 +NCBITaxon:2605435 4.857980995127573 +NCBITaxon:2611352 6.442943495848729 +NCBITaxon:2759 3.1950159824051427 +NCBITaxon:33083 5.442943495848728 +NCBITaxon:33090 7.442943495848729 +NCBITaxon:33154 6.442943495848729 +NCBITaxon:33682 7.442943495848729 +NCBITaxon:4751 7.442943495848729 +NCBITaxon:554915 4.635588573791124 +NCBITaxon:5782 7.442943495848729 +NCBITaxon:Union_0000000 
4.635588573791124 +NCBITaxon:Union_0000002 5.857980995127572 +NCBITaxon:Union_0000004 4.857980995127573 +NCBITaxon:Union_0000006 4.121015400961366 +NCBITaxon:Union_0000007 4.121015400961366 +NCBITaxon:Union_0000008 5.857980995127572 +NCBITaxon:Union_0000020 4.857980995127573 +NCBITaxon:Union_0000021 3.635588573791124 +NCBITaxon:Union_0000022 5.857980995127572 +NCBITaxon:Union_0000023 4.273018494406416 +NCBITaxon:Union_0000024 3.983511877211431 +NCBITaxon:Union_0000025 3.050626073069968 +NCBITaxon:Union_0000030 1.8579809951275723 +OBI:0100026 1.799087306074004 +PATO:0000001 4.273018494406416 +PATO:0001241 4.442943495848728 +PATO:0001396 4.635588573791124 +PATO:0001404 4.857980995127573 +PATO:0001405 7.442943495848729 +PATO:0001406 7.442943495848729 +PATO:0001407 7.442943495848729 +PATO:0001908 6.442943495848729 +PATO:0002505 5.442943495848728 +oio:Subset 7.442943495848729 diff --git a/tests/input/graph_projection.db b/tests/input/graph_projection.db new file mode 100644 index 000000000..131adf10e Binary files /dev/null and b/tests/input/graph_projection.db differ diff --git a/tests/input/graph_projection.omn b/tests/input/graph_projection.omn new file mode 100644 index 000000000..f415fcbf2 --- /dev/null +++ b/tests/input/graph_projection.omn @@ -0,0 +1,47 @@ +Prefix: ex: +Prefix: : +Ontology: + +## +Individual: I + Types: A + +Class: A + SubClassOf: B + +Class: B + SubClassOf: P some CTestSome + SubClassOf: P only CTestOnly + SubClassOf: P exactly 1 CTestExactly1 + SubClassOf: P exactly 0 CTestExactly0 + SubClassOf: P min 1 CTestExactly0 + SubClassOf: P value ITestValue + EquivalentTo: CTestEquiv + +Individual: J + Facts: P I2 + Types: CTestType, P some CTestTypeSome, P value ITestValue + +Individual: I2 + Types: I2C + +Class: I2C + +Class: CTestSome +Class: CTestOnly +Class: CTestExactly1 +Class: CTestExactly0 +Class: CTestEquiv +Class: CTestType +Class: CTestTypeSome + +Individual: ITestValue + Types: ITestValueC + +Class: ITestValueC + +ObjectProperty: P + 
Characteristics: Transitive + + + diff --git a/tests/test_implementations/__init__.py b/tests/test_implementations/__init__.py index a90060c5c..d3d832123 100644 --- a/tests/test_implementations/__init__.py +++ b/tests/test_implementations/__init__.py @@ -10,13 +10,14 @@ import unittest from dataclasses import dataclass from pathlib import Path -from typing import Callable, List +from typing import Callable, List, Optional import kgcl_schema.grammar.parser as kgcl_parser from kgcl_schema.datamodel import kgcl from kgcl_schema.datamodel.kgcl import Change, NodeObsoletion from kgcl_schema.grammar.render_operations import render from linkml_runtime.dumpers import json_dumper, yaml_dumper +from sssom.constants import OWL_EQUIVALENT_CLASS from oaklib import BasicOntologyInterface, get_adapter from oaklib.datamodels import obograph @@ -47,6 +48,7 @@ OWL_THING, PART_OF, PRECEDED_BY, + RDF_TYPE, RDFS_DOMAIN, RDFS_RANGE, SUBPROPERTY_OF, @@ -716,6 +718,57 @@ def test_equiv_relationships(self, oi: BasicOntologyInterface): rels = oi.outgoing_relationship_map(s1) test.assertCountEqual(rels[EQUIVALENT_CLASS], [s2]) + def test_graph_projections(self, oi: BasicOntologyInterface, supported: Optional[List] = None): + """ + Tests projections of OWL axioms into graph + + :param oi: + :param supported: if not None, only test for these projections + :return: + """ + test = self.test + cases = [ + (True, "B", "P", "CTestSome", "some", "tbox", True), + (True, "B", "P", "ITestValue", "value", "abox", True), + (True, "B", OWL_EQUIVALENT_CLASS, "CTestEquiv", "equiv", "tbox", True), + (True, "I", RDF_TYPE, "A", "type", "abox", True), + (True, "J", "P", "I2", "fact", "abox", True), + # (True, "J", "P", "CTestTypeSome", "fact-some", "abox", True), # TODO + (True, "A", "P", "CTestSome", "some", "tbox", False), + # abox not supported in RG + # (True, "I", "P", "CTestSome", "some", "abox", False), + (False, "B", "P", "CTestExactly0", None, None, True), + ] + all_relations = list(oi.relationships()) 
+ all_tbox_relations = list(oi.relationships(include_tbox=True, include_abox=False)) + all_abox_relations = list(oi.relationships(include_tbox=False, include_abox=True)) + all_entailed_relations = list(oi.relationships(include_entailed=True)) + for expected, s, p, o, projection, typ, direct in cases: + if supported is not None and projection is not None and projection not in supported: + continue + s = "ex:" + s + p = "ex:" + p if ":" not in p else p + o = "ex:" + o + if direct: + if expected: + test.assertIn((s, p, o), all_relations) + else: + test.assertNotIn((s, p, o), all_relations) + if typ == "tbox": + if expected: + test.assertIn((s, p, o), all_tbox_relations) + else: + test.assertNotIn((s, p, o), all_tbox_relations) + if typ == "abox": + if expected: + test.assertIn((s, p, o), all_abox_relations) + else: + test.assertNotIn((s, p, o), all_abox_relations) + if expected: + test.assertIn((s, p, o), all_entailed_relations) + else: + test.assertNotIn((s, p, o), all_entailed_relations) + def test_logical_definitions(self, oi: OboGraphInterface): test = self.test cases = [ diff --git a/tests/test_implementations/test_semsimian_implementation.py b/tests/test_implementations/test_semsimian_implementation.py index acdbd35ca..df28eec17 100644 --- a/tests/test_implementations/test_semsimian_implementation.py +++ b/tests/test_implementations/test_semsimian_implementation.py @@ -170,7 +170,7 @@ def test_all_by_all_similarity_with_custom_ic_map(self): raise AssertionError("SemanticSimilarityInterface not implemented") entities = [VACUOLE, ENDOMEMBRANE_SYSTEM] - sim = (adapter.all_by_all_pairwise_similarity(entities, entities, predicates=self.predicates)) + sim = adapter.all_by_all_pairwise_similarity(entities, entities, predicates=self.predicates) for s in sim: self.assertIsNotNone(s) diff --git a/tests/test_implementations/test_sparql.py b/tests/test_implementations/test_sparql.py index 29e269285..1488ea9aa 100644 --- a/tests/test_implementations/test_sparql.py +++ 
b/tests/test_implementations/test_sparql.py @@ -96,8 +96,6 @@ def test_instance_graph(self): expected.remove(t) self.assertEqual([], expected) rels = list(oi.relationships()) - for rel in rels: - print(rel) self.assertCountEqual( [ ("http://example.org/b", "rdfs:subClassOf", "http://example.org/a"), diff --git a/tests/test_implementations/test_sqldb.py b/tests/test_implementations/test_sqldb.py index de89ae327..b2d2efa39 100644 --- a/tests/test_implementations/test_sqldb.py +++ b/tests/test_implementations/test_sqldb.py @@ -195,6 +195,11 @@ def test_instance_graph(self): rels, ) + def test_graph_projections(self): + path = INPUT_DIR / "graph_projection.db" + oi = SqlImplementation(OntologyResource(slug=f"sqlite:///{str(path)}")) + self.compliance_tester.test_graph_projections(oi) + def test_all_nodes(self): for curie in self.oi.entities(): logging.info(curie) @@ -643,7 +648,7 @@ def test_associations(self): best_score = score if match == gene: found = True - self.assertAlmostEquals(best_score, score) + self.assertAlmostEqual(best_score, score) self.assertTrue(found) def test_association_counts(self): diff --git a/tests/test_utilities/test_caching.py b/tests/test_utilities/test_caching.py index e8dff2467..20e84911d 100644 --- a/tests/test_utilities/test_caching.py +++ b/tests/test_utilities/test_caching.py @@ -18,8 +18,8 @@ def test_refresh_policy(self): now = time.time() self.assertTrue(policy.refresh(now)) - self.assertTrue(policy.refresh(now + 86400)) # 1 day in the future - self.assertTrue(policy.refresh(now - 86400)) # 1 day in the past + self.assertTrue(policy.refresh(now + 86400)) # 1 day in the future + self.assertTrue(policy.refresh(now - 86400)) # 1 day in the past def test_never_refresh_policy(self): policy = CachePolicy.from_string("no-refresh") @@ -54,15 +54,15 @@ def test_reset_policy(self): self.assertTrue(policy.refresh(now - 86400)) def test_refresh_after_1day_policy(self): - policy = CachePolicy.from_string('1d') + policy = 
CachePolicy.from_string("1d") self.assertFalse(policy.always_refresh) self.assertFalse(policy.never_refresh) self.assertFalse(policy.reset) now = time.time() - self.assertTrue(policy.refresh(now - 90000)) # 25 hours in the past - self.assertFalse(policy.refresh(now - 82800)) # 23 hours in the past + self.assertTrue(policy.refresh(now - 90000)) # 25 hours in the past + self.assertFalse(policy.refresh(now - 82800)) # 23 hours in the past def test_refresh_file(self): now = time.time() @@ -76,8 +76,8 @@ def test_refresh_file(self): self.assertTrue(CachePolicy.REFRESH.refresh_file(path)) self.assertTrue(CachePolicy.RESET.refresh_file(path)) self.assertFalse(CachePolicy.NO_REFRESH.refresh_file(path)) - self.assertTrue(CachePolicy.from_string('2d').refresh_file(path)) - self.assertFalse(CachePolicy.from_string('4d').refresh_file(path)) + self.assertTrue(CachePolicy.from_string("2d").refresh_file(path)) + self.assertFalse(CachePolicy.from_string("4d").refresh_file(path)) os.unlink(path) @@ -94,10 +94,11 @@ def test_parsing_durations(self): self.assertIsNone(CachePolicy.from_string("bogus")) + class TestFileCache(unittest.TestCase): def test_parse_cache_configuration(self): - cache = FileCache(None) # we don't need a Pystow module here + cache = FileCache(None) # we don't need a Pystow module here with self.assertLogs() as log: cache._get_configuration("tests/input/cache.conf") diff --git a/tests/test_utilities/test_kgcl_utilities.py b/tests/test_utilities/test_kgcl_utilities.py index 1df7cf273..789e99fe8 100644 --- a/tests/test_utilities/test_kgcl_utilities.py +++ b/tests/test_utilities/test_kgcl_utilities.py @@ -1,15 +1,60 @@ import unittest -from oaklib.utilities.kgcl_utilities import parse_kgcl_files +from kgcl_schema.datamodel.kgcl import Change +from kgcl_schema.grammar.render_operations import render +from linkml_runtime.dumpers import yaml_dumper + +from oaklib.utilities.kgcl_utilities import ( + parse_kgcl_files, + substitute_curies_for_labels, + 
substitute_labels_for_curies, + write_kgcl, +) from tests import INPUT_DIR, OUTPUT_DIR TEST_ONT = INPUT_DIR / "go-nucleus.obo" TEST_OUT = OUTPUT_DIR / "go-nucleus.lexical.yaml" +TEST_OUT_KGCL = OUTPUT_DIR / "test-create.kgcl.txt" +TEST_OUT_KGCL_YAML = OUTPUT_DIR / "test-create.kgcl.yaml" + class TestKgclUtilities(unittest.TestCase): def setUp(self) -> None: pass def test_parse_kgcl_files(self): - list(parse_kgcl_files([str(INPUT_DIR / "test-create.kgcl.txt")], None)) + def w(objs): + for obj in objs: + print(yaml_dumper.dumps(obj)) + + def r(objs): + for obj in objs: + print(render(obj)) + + objs = list(parse_kgcl_files([str(INPUT_DIR / "test-create.kgcl.txt")], None)) + # w(objs) + for obj in objs: + assert isinstance(obj, Change) + + def fake_labeler(curie: str) -> str: + # return "'" + curie.replace(":", "_") + "'" if curie else curie + if curie is None: + return None + return curie.replace(":", "_") + + def fake_unlabeler(label: str) -> str: + if label is None: + return None + return label.replace("_", ":") + + substitute_curies_for_labels(objs, fake_labeler) + write_kgcl(objs, str(TEST_OUT_KGCL), "kgcl") + w(objs) + r(objs) + objs = list(parse_kgcl_files([str(TEST_OUT_KGCL)], None)) + w(objs) + substitute_labels_for_curies(objs, fake_unlabeler) + w(objs) + + # def test_substitution(self): diff --git a/tests/test_utilities/test_validate_subset.py b/tests/test_utilities/test_validate_subset.py new file mode 100644 index 000000000..aeecdd090 --- /dev/null +++ b/tests/test_utilities/test_validate_subset.py @@ -0,0 +1,16 @@ +import yaml + +from oaklib import get_adapter +from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface +from oaklib.utilities.subsets.subset_validator import SubsetValidationConfig, validate_subset +from tests import EXAMPLE_ONTOLOGY_DB, INPUT_DIR + + +def test_validate_subset(): + adapter: SemanticSimilarityInterface = get_adapter(EXAMPLE_ONTOLOGY_DB) + adapter.load_information_content_scores(str(INPUT_DIR / 
"go-nucleus-ic.tsv")) + for subset in adapter.subsets(): + print(f"## Subset: {subset}") + conf = SubsetValidationConfig(subset_name=subset) + result = validate_subset(adapter, conf) + print(yaml.dump(result.model_dump(exclude_unset=True), sort_keys=False))