Skip to content

Commit

Permalink
Merge pull request #587 from monarch-initiative/update-koza
Browse files: browse the repository at this point in the history
Update Koza, imports, and tests. Fix lint/format make targets
  • Loading branch information
kevinschaper authored May 16, 2024
2 parents 0d51e68 + 4544837 commit e137505
Show file tree
Hide file tree
Showing 90 changed files with 1,549 additions and 1,427 deletions.
20 changes: 7 additions & 13 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ WGET = /usr/bin/env wget --timestamping --no-verbose
.DEFAULT_GOAL := all
SHELL := bash

RUN = poetry run

.PHONY: all
all: install format test clean
Expand All @@ -29,12 +30,12 @@ install-full:

.PHONY: test
test: install
poetry run python -m pytest tests
$(RUN) python -m pytest tests


.PHONY: docs
docs: install-full
poetry run typer src/monarch_ingest/main.py utils docs --name ingest --output docs/CLI.md
$(RUN) typer src/monarch_ingest/main.py utils docs --name ingest --output docs/CLI.md


.PHONY: clean
Expand All @@ -47,18 +48,11 @@ clean:

.PHONY: lint
lint: install-full
poetry run flake8 --exit-zero --max-line-length 120 src/monarch_ingest/ tests/
poetry run black --check --diff monarch_ingest tests
poetry run isort --check-only --diff monarch_ingest tests
$(RUN) ruff check --diff --exit-zero src/ tests/
$(RUN) black --check --diff -l 120 src/ tests/


.PHONY: format
format: install-full
poetry run autoflake \
--recursive \
--remove-all-unused-imports \
--remove-unused-variables \
--ignore-init-module-imports \
--in-place monarch_ingest tests
poetry run isort monarch_ingest tests
poetry run black monarch_ingest tests
$(RUN) ruff check --fix --exit-zero src/ tests/
$(RUN) black -l 120 src/ tests/
2 changes: 1 addition & 1 deletion docs/Create-an-Ingest/4. Implement.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Most Koza scripts can run in flat mode, which means that the transform code itse
Start with the imports, and make sure to set the source_name, which will be used for communicating with the reader and writer.

```python
from koza.cli_runner import koza_app
from koza.cli_utils import koza_app
from biolink.pydanticmodel_v2 import Gene

# The source name is used for reading and writing
Expand Down
2 changes: 1 addition & 1 deletion docs/Create-an-Ingest/5. Test.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ First, set up your basic fixtures, taking care to set the correct source name an

```python
import pytest
from koza.cli_runner import get_translation_table
from koza.cli_utils import get_translation_table

@pytest.fixture
def tt():
Expand Down
2 changes: 1 addition & 1 deletion ingest_template/example_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
from koza.cli_runner import get_translation_table
from koza.cli_utils import get_translation_table


@pytest.fixture
Expand Down
2 changes: 1 addition & 1 deletion ingest_template/source-file-template-csv.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import List

from biolink.pydanticmodel_v2 import Gene
from koza.cli_runner import koza_app
from koza.cli_utils import koza_app

# You've got 'NCBI_Gene:' and you want 'NCBIGene:'? clean it up.
curie_cleaner = koza_app.curie_cleaner
Expand Down
2 changes: 1 addition & 1 deletion ingest_template/source-file-template-json.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
GeneToPhenotypicFeatureAssociation,
PhenotypicFeature,
)
from koza.cli_runner import koza_app
from koza.cli_utils import koza_app

# include logging if necessary
from loguru import logger
Expand Down
1,367 changes: 765 additions & 602 deletions poetry.lock

Large diffs are not rendered by default.

34 changes: 18 additions & 16 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,36 @@ packages = [
]

[tool.poetry.dependencies]
python = ">=3.10,<3.12"
# biolink-model = "^4.2.0"
# When 4.2.0 (or any release after 4.1.6) is released, we can remove the git dependency
biolink-model = { git = "https://github.com/biolink/biolink-model", branch = "master" }
python = "^3.10"
biolink-model = "^4.2.0"
bmt = "^1.0.15"
cat-merge = "0.2.1"
closurizer = "0.5.1"
kghub-downloader = "^0.3.2"
kgx = { git = "https://github.com/biolink/kgx", branch = "master" } # ">=2.1"
koza = ">=0.5.2"
linkml = "1.6.3"
linkml-solr = "0.1.5" # "^0.1.3"
kgx = ">=2.4.0"
koza = ">=0.6.0"
linkml = "^1.7.8"
linkml-solr = ">=0.1.5"
multi-indexer = "0.0.5"
# Other Dependencies
botocore = "^1.31"
importlib-metadata = ">=4.6.1"
loguru = "*"
pydantic = "^2.5"
sh = "^1.14.3"
typer = "^0.7"
typer-cli = "^0.0.13"
typer = "^0.12"
yamllint = "^1.35.1"
linkml-runtime = "1.6.3"
linkml-runtime = "^1.7.5"
# Remove this once cat-merge fixes its pandas dependency
pandas = "2.0.3"

[tool.poetry.group.dev]
optional = true

[tool.poetry.group.dev.dependencies]
pytest = "^7.1.1"
mkdocs = "^1.3.0"
mkdocs-material = "^8.2.9"
pytest = "^8.1.1"
mkdocs = "^1.4"
mkdocs-material = ">=9.5"
black = "^24.3"
ruff = "*"

Expand All @@ -57,11 +56,14 @@ requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line_length = 120
line-length = 120
skip-string-normalization = true

[tool.ruff]
line-length = 120
ignore = [
"F541", # f-strings with no placeholders
]

]
[tool.ruff.lint.per-file-ignores]
"tests/**.py" = ["F811"] # redefinition of unused imports (mock_koza)
3 changes: 2 additions & 1 deletion src/monarch_ingest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from importlib import metadata
__version__ = metadata.version("monarch_ingest")

__version__ = metadata.version("monarch_ingest")
15 changes: 4 additions & 11 deletions src/monarch_ingest/cli_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import csv
import gc
import os
import pkgutil
import sys
import tarfile
import yaml
Expand All @@ -18,7 +17,7 @@
from cat_merge.merge import merge
from closurizer.closurizer import add_closure
from kgx.cli.cli_utils import transform as kgx_transform
from koza.cli_runner import transform_source
from koza.cli_utils import transform_source
from koza.model.config.source_config import OutputFormat
from linkml_runtime.utils.formatutils import camelcase

Expand Down Expand Up @@ -192,7 +191,7 @@ def transform_phenio(
"primary_knowledge_source",
"aggregator_knowledge_source",
"knowledge_level",
"agent_type"
"agent_type",
]
),
axis=1,
Expand Down Expand Up @@ -524,15 +523,9 @@ def do_release(dir: str = OUTPUT_DIR, kghub: bool = False):
)
# index files on s3 after upload
sh.multi_indexer(
*f"-v --prefix https://kghub.io/kg-monarch/ -b kg-hub-public-data -r kg-monarch -x".split(
" "
)
)
sh.gsutil(
*f"-q -m cp -a public-read ./index.html s3://kg-hub-public-data/kg-monarch".split(
" "
)
*f"-v --prefix https://kghub.io/kg-monarch/ -b kg-hub-public-data -r kg-monarch -x".split(" ")
)
sh.gsutil(*f"-q -m cp -a public-read ./index.html s3://kg-hub-public-data/kg-monarch".split(" "))

logger.debug("Cleaning up files...")
sh.rm(f"output/{release_ver}")
Expand Down
13 changes: 5 additions & 8 deletions src/monarch_ingest/ingests/alliance/gene.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from koza.cli_runner import get_koza_app
from koza.cli_utils import get_koza_app
from source_translation import source_map

from biolink_model.datamodel.pydanticmodel_v2 import Gene
Expand Down Expand Up @@ -48,21 +48,18 @@
id=gene_id,
symbol=row["symbol"],
name=row["symbol"],
full_name=row["name"].replace("\r",""), # Replacement to remove stray carriage returns in XenBase files
full_name=row["name"].replace("\r", ""), # Replacement to remove stray carriage returns in XenBase files
# No place in the schema for gene type (SO term) right now
# type=row["soTermId"],
in_taxon=[in_taxon],
in_taxon_label=in_taxon_label,
provided_by=[source]
provided_by=[source],
)

if row["basicGeneticEntity"]["crossReferences"]:
gene.xref = [
koza_app.curie_cleaner.clean(xref["id"])
for xref in row["basicGeneticEntity"]["crossReferences"]
]
gene.xref = [koza_app.curie_cleaner.clean(xref["id"]) for xref in row["basicGeneticEntity"]["crossReferences"]]
if "synonyms" in row["basicGeneticEntity"].keys():
# more handling for errant carriage returns
gene.synonym = [synonym.replace("\r","") for synonym in row["basicGeneticEntity"]["synonyms"] ]
gene.synonym = [synonym.replace("\r", "") for synonym in row["basicGeneticEntity"]["synonyms"]]

koza_app.write(gene)
18 changes: 6 additions & 12 deletions src/monarch_ingest/ingests/alliance/gene_to_expression.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@

import uuid

from koza.cli_runner import get_koza_app
from koza.cli_utils import get_koza_app
from source_translation import source_map

from biolink_model.datamodel.pydanticmodel_v2 import GeneToExpressionSiteAssociation, KnowledgeLevelEnum, AgentTypeEnum
Expand Down Expand Up @@ -33,30 +32,27 @@
# but may have an UBERON term that we can use
# stage_term_id = get_data(row, "whenExpressed.stageUberonSlimTerm.uberonTerm")



publication_ids = [get_data(row, "evidence.publicationId")]

xref = get_data(row, "crossReference.id")
if xref:
publication_ids.append(xref)


# Our current ingest policy is to first use a reported Anatomical structure term...
if anatomical_entity_id:
koza_app.write(
GeneToExpressionSiteAssociation(
id="uuid:" + str(uuid.uuid1()),
subject=gene_id,
predicate='biolink:expressed_in',
predicate="biolink:expressed_in",
object=anatomical_entity_id,
stage_qualifier=stage_term_id,
qualifiers=([get_data(row, "assay")] if get_data(row, "assay") else None),
publications=publication_ids,
aggregator_knowledge_source=["infores:monarchinitiative", "infores:alliancegenome"],
primary_knowledge_source=source,
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.manual_agent
agent_type=AgentTypeEnum.manual_agent,
)
)

Expand All @@ -67,15 +63,15 @@
GeneToExpressionSiteAssociation(
id="uuid:" + str(uuid.uuid1()),
subject=gene_id,
predicate='biolink:expressed_in',
predicate="biolink:expressed_in",
object=cellular_component_id,
stage_qualifier=stage_term_id,
qualifiers=([get_data(row, "assay")] if get_data(row, "assay") else None),
publications=publication_ids,
aggregator_knowledge_source=["infores:monarchinitiative", "infores:alliancegenome"],
primary_knowledge_source=source,
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.manual_agent
agent_type=AgentTypeEnum.manual_agent,
)
)
else:
Expand All @@ -85,6 +81,4 @@
)

except Exception as exc:
logger.error(
f"Alliance gene expression ingest parsing exception for data row:\n\t'{str(row)}'\n{str(exc)}"
)
logger.error(f"Alliance gene expression ingest parsing exception for data row:\n\t'{str(row)}'\n{str(exc)}")
13 changes: 8 additions & 5 deletions src/monarch_ingest/ingests/alliance/gene_to_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

import uuid

from koza.cli_runner import get_koza_app
from koza.cli_utils import get_koza_app
from source_translation import source_map

from biolink_model.datamodel.pydanticmodel_v2 import GeneToPhenotypicFeatureAssociation, KnowledgeLevelEnum, AgentTypeEnum
from biolink_model.datamodel.pydanticmodel_v2 import (
GeneToPhenotypicFeatureAssociation,
KnowledgeLevelEnum,
AgentTypeEnum,
)


from loguru import logger
Expand Down Expand Up @@ -42,9 +46,8 @@
publications=[row["evidence"]["publicationId"]],
aggregator_knowledge_source=["infores:monarchinitiative", "infores:alliancegenome"],
primary_knowledge_source=source,
knowledge_level = KnowledgeLevelEnum.knowledge_assertion,
agent_type = AgentTypeEnum.manual_agent

knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.manual_agent,
)

if "conditionRelations" in row.keys() and row["conditionRelations"] is not None:
Expand Down
18 changes: 9 additions & 9 deletions src/monarch_ingest/ingests/alliance/publication.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from koza.cli_runner import get_koza_app
from koza.cli_utils import get_koza_app
from dateutil.parser import parse, ParserError

from biolink_model.datamodel.pydanticmodel_v2 import Publication
Expand All @@ -9,9 +9,7 @@
while (row := koza_app.get_row()) is not None:

# TODO: remove DOI exclusion once curie regex can handle them
xrefs = [
xref["id"] for xref in row["crossReferences"] if not xref["id"].startswith("DOI:")
]
xrefs = [xref["id"] for xref in row["crossReferences"] if not xref["id"].startswith("DOI:")]

# Parse creation date for different time formats
creation_date = row["datePublished"]
Expand All @@ -21,10 +19,12 @@
creation_date = None

source: str
if 'MODReferenceTypes' in row and \
len(row['MODReferenceTypes']) > 0 and \
'source' in row['MODReferenceTypes'][0] and \
row['MODReferenceTypes'][0]['source'] in source_map:
if (
'MODReferenceTypes' in row
and len(row['MODReferenceTypes']) > 0
and 'source' in row['MODReferenceTypes'][0]
and row['MODReferenceTypes'][0]['source'] in source_map
):
source = source_map[row['MODReferenceTypes'][0]['source']]
else: # default source
source = "infores:alliancegenome"
Expand All @@ -36,7 +36,7 @@
xref=xrefs,
type=[koza_app.translation_table.resolve_term("publication")],
creation_date=creation_date,
provided_by=[source]
provided_by=[source],
)

if "authors" in row.keys():
Expand Down
2 changes: 1 addition & 1 deletion src/monarch_ingest/ingests/bgee/gene_to_expression.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from koza.cli_runner import get_koza_app
from koza.cli_utils import get_koza_app
from monarch_ingest.ingests.bgee.gene_to_expression_utils import process_koza_source


Expand Down
Loading

0 comments on commit e137505

Please sign in to comment.