Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert SSSOM TSV => JSON => TSV and confirm the MappingSetDataFrame object remains consistent. #429

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 36 additions & 7 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,21 +212,28 @@ def parse_sssom_table(
logging.info(f"Externally provided metadata {k}:{v} is added to metadata set.")
sssom_metadata[k] = v
meta = sssom_metadata

if "curie_map" in sssom_metadata:
if CURIE_MAP in sssom_metadata:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have rewritten all of this code using curies already and am waiting for #426 (ready) then #401 (needs some more work) to get merged. I think it's better that you reduce effort on re-writing it again, since the curies solution is much more clean and elegant.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I split the code for updating this into #431

if prefix_map:
# Convert sssom_metadata[CURIE_MAP] keys to lowercase for case-insensitive comparison
curie_map_lower = {k.lower(): k for k in sssom_metadata[CURIE_MAP].keys()}

for k, v in prefix_map.items():
if k in sssom_metadata[CURIE_MAP]:
if sssom_metadata[CURIE_MAP][k] != v:
k_lower = k.lower()

if k_lower in curie_map_lower:
original_key = curie_map_lower[k_lower]
if sssom_metadata[CURIE_MAP][original_key] != v:
logging.warning(
f"SSSOM prefix map {k} ({sssom_metadata[CURIE_MAP][k]}) "
f"conflicts with provided ({prefix_map[k]})."
f"SSSOM prefix map {original_key} ({sssom_metadata[CURIE_MAP][original_key]}) "
f"conflicts with provided ({v})."
)
sssom_metadata[CURIE_MAP][original_key] = v
else:
logging.info(
f"Externally provided metadata {k}:{v} is added to metadata set."
)
sssom_metadata[CURIE_MAP][k] = v

prefix_map = sssom_metadata[CURIE_MAP]

meta_all = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
Expand Down Expand Up @@ -264,10 +271,32 @@ def parse_sssom_json(
) -> MappingSetDataFrame:
"""Parse a TSV to a :class:`MappingSetDocument` to a :class`MappingSetDataFrame`."""
raise_for_bad_path(file_path)
metadata = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)

with open(file_path) as json_file:
jsondoc = json.load(json_file)

# Get prefix map from jsondoc and update metadata.
# This takes priority over default prefix_map in case of a tie.
jsondoc_prefix_map = jsondoc["@context"]

# Convert keys in both maps to lower case for comparison
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this wise? I think we should not be doing all of this lowercasing. The prefix maps should be correct as they are. Doing all of these ad-hoc operations makes sssom-py insanely hard to work with (I can tell you this from first hand experience since I have tried to understand the whole history of the package)

if prefix_map:
lowercase_prefix_map = {k.lower(): v for k, v in prefix_map.items()}

# Iterate over jsondoc_prefix_map
for key, value in jsondoc_prefix_map.items():
# If lowercase key exists in lowercase_prefix_map, update the value and key
if key.lower() in lowercase_prefix_map:
# Remove the old key-value pair
if key in prefix_map:
del prefix_map[key]
elif key.lower() in prefix_map:
del prefix_map[key.lower()]
# Add the new key-value pair
prefix_map[key] = value

metadata = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)

msdf = from_sssom_json(jsondoc=jsondoc, prefix_map=metadata.prefix_map, meta=metadata.metadata)
# df: pd.DataFrame = msdf.df
# if mapping_predicates and not df.empty():
Expand Down
8 changes: 7 additions & 1 deletion src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,13 @@ def clean_prefix_map(self, strict: bool = True) -> None:
if self.metadata:
prefixes_in_table.update(get_prefixes_used_in_metadata(self.metadata))

missing_prefixes = prefixes_in_table - self.converter.get_prefixes()
# Convert prefixes_in_table to lowercase
prefixes_in_table_lower = {prefix.lower() for prefix in prefixes_in_table}
# Convert self.converter.get_prefixes() to lowercase
converter_prefixes_lower = {prefix.lower() for prefix in self.converter.get_prefixes()}
missing_prefixes = prefixes_in_table_lower - converter_prefixes_lower
# missing_prefixes = prefixes_in_table - self.converter.get_prefixes()

if missing_prefixes and strict:
raise ValueError(
f"{missing_prefixes} are used in the SSSOM mapping set but it does not exist in the prefix map"
Expand Down
8 changes: 8 additions & 0 deletions tests/data/sample1.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#curie_map:
# FBbt: "http://purl.obolibrary.org/obo/FBbt_"
# ORCID: "https://orcid.org/"
# UBERON: "http://purl.obolibrary.org/obo/UBERON_"
#creator_id:
# - "ORCID:0000-0002-6095-8718"
subject_id subject_label predicate_id object_id mapping_justification
FBbt:00000001 organism semapv:crossSpeciesExactMatch UBERON:0000468 semapv:ManualMappingCuration
20 changes: 19 additions & 1 deletion tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@
from_sssom_dataframe,
from_sssom_json,
from_sssom_rdf,
parse_sssom_json,
parse_sssom_table,
)
from sssom.typehints import Metadata
from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns
from sssom.writers import write_table
from sssom.writers import to_json, write_json, write_table
from tests.test_data import data_dir as test_data_dir
from tests.test_data import test_out_dir

Expand Down Expand Up @@ -245,3 +246,20 @@ def test_parse_obographs_merged(self):
)
msdf = parse_sssom_table(outfile)
self.assertTrue(custom_curie_map.items() <= msdf.prefix_map.items())

def test_tsv_to_json_and_back(self):
"""Test converting SSSOM TSV => JSON => SSSOM TSV such that it is reproducible."""
sample_tsv = f"{test_data_dir}/sample1.sssom.tsv"
json_outfile = f"{test_out_dir}/sample1.json"
msdf1 = parse_sssom_table(sample_tsv)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hrshdhgd other hints: when writing tests, make the things you're testing more explicit. The way this test is written, it doesn't really test anythign specific. Why not explicitly check that ORCID is in the places you expect to to be? Otherwise, someone reading this test learns nothing about what's actually supposed to be happening here.

msdf1.clean_prefix_map()
json_doc = to_json(msdf1)

self.assertEqual(msdf1.prefix_map, json_doc["@context"])

with open(json_outfile, "w") as file:
write_json(msdf1, file)

msdf2 = parse_sssom_json(json_outfile)
msdf2.clean_prefix_map()
self.assertEqual(msdf1.prefix_map, msdf2.prefix_map)