Skip to content

Commit

Permalink
restructure project into subdir, add spacy for NER, work on name parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
phette23 committed Oct 31, 2023
1 parent d0547dd commit f1953cb
Show file tree
Hide file tree
Showing 8 changed files with 1,093 additions and 19 deletions.
56 changes: 56 additions & 0 deletions migrate/maps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Mapping our mods/name/role/roleTerm values to Invenio roles
# ! NOTE cast terms to LOWERCASE before using this map. Our metadata is inconsistent between title case and lowercase.
# MODS (which uses the MARC relator list): https://www.loc.gov/marc/relators/relaterm.html
# Invenio roles: https://github.com/inveniosoftware/invenio-rdm-records/blob/master/invenio_rdm_records/fixtures/data/vocabularies/roles.yaml
# Invenio role ids: contactperson, datacollector, datacurator, datamanager, distributor, editor, hostinginstitution, other, producer, projectleader, projectmanager, projectmember, registrationagency, registrationauthority, relatedperson, researchgroup, researcher, rightsholder, sponsor, supervisor, workpackageleader
#
# Keys are our lowercase role terms; values are Invenio role ids.
# An empty string means no Invenio role is assigned to that term in this map
# (presumably still to be mapped -- TODO confirm how callers should treat "").
role_map = {
    "academic partner": "",
    "architect": "",
    "artist": "",
    "associated name": "",
    "author": "",
    "author of introduction, etc.": "",
    "book designer": "",
    "bookjacket designer": "",
    "calligrapher": "",
    "cinematographer": "",
    "collaborator": "",
    "compiler": "",
    "creator": "",
    "curator": "",
    "curator assistant": "",
    "designer": "",
    "editor": "editor",
    "founder": "",
    "illustrator": "",
    "installation artist": "",
    "instructor assistant": "",
    "instructor/curator": "",
    "interviewee": "",
    "interviewer": "",
    "manufacturer": "",
    "minute taker": "",
    "narrator": "",
    "organizer": "",
    "organizer of meeting": "",
    "painter": "",
    "performance artist": "",
    "performer": "",
    "photographer": "",
    "platemaker": "",
    "poet": "",
    "printer": "",
    "printmaker": "",
    "producer": "",
    "professor": "",
    "publisher": "",
    "recording engineer": "",
    "researcher": "researcher",
    "reviewer": "",
    "sculptor": "",
    "singer songwriter": "",
    "speaker": "",
    "teacher": "",
    "transcriber": "",
    "writer": "",
}
86 changes: 86 additions & 0 deletions migrate/names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import re

import spacy

nlp = spacy.load("en_core_web_lg")


def ner(str):
    """Return the PERSON and ORG named entities spaCy finds in a string.

    Args:
        str: text to analyze. The name shadows the builtin but is kept
            because it is part of the function's call signature.

    Returns:
        A list of ``{"entity": <entity text>, "type": <entity label>}``
        dicts, one per entity in ``doc.ents`` whose label is ``"PERSON"``
        or ``"ORG"``. Empty when there are no such entities.
    """
    # https://spacy.io/usage/linguistic-features#named-entities
    # Temporarily switch off the listed components while processing the
    # text. select_pipes() is the spaCy v3 API; disable_pipes() is
    # deprecated there and emits a DeprecationWarning on every call.
    with nlp.select_pipes(
        disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
    ):
        doc = nlp(str)
    return [
        {"entity": e.text, "type": e.label_}
        for e in doc.ents
        if e.label_ in ("PERSON", "ORG")
    ]


def entity_to_name(entity, namePart):
    """Convert one NER entity into a name dict.

    Args:
        entity: dict with "entity" (the entity text) and "type" keys.
        namePart: the full original name string the entity was found in.

    Returns:
        The result of ``parse_name`` on the entity text when the entity is
        a PERSON; otherwise ``{"name": namePart}``.
    """
    is_person = entity["type"] == "PERSON"
    # anything that is not a PERSON defaults to an organization
    return parse_name(entity["entity"]) if is_person else {"name": namePart}


def parse_name(namePart):
    """Parse a wild variety of name strings into name dict(s).

    Recognized shapes:

    * ``"A; B"``: semicolon-separated list, each item parsed recursively
    * ``"Family, Given"``, optionally followed by ``", 1900-1980"`` dates
    * ``"A B, C D, E F"``: comma-separated list of names (detected via NER)
    * ``"Given Family"`` and three-word names
    * a single word, or a three-word string with no PERSON entity, which is
      treated as an organization

    Args:
        namePart: the raw name string.

    Returns:
        ``{"given_name": ..., "family_name": ...}`` for a person,
        ``{"name": ...}`` for an organization, or a list of such results
        when the string holds several names. Returns ``None`` when no rule
        matches (a comma string NER cannot resolve, or four or more
        space-separated words).

    Raises:
        Exception: a three-word string contains multiple entities that are
            not all PERSON.
    """
    # semi-colon separated list of names
    if ";" in namePart:
        # Split on the bare ";" and strip each piece. Splitting on "; " left
        # a trailing or unspaced semicolon inside an item, and that item then
        # recursed forever.
        pieces = (piece.strip() for piece in namePart.split(";"))
        return [parse_name(piece) for piece in pieces if piece]

    # usually Surname, Givenname but sometimes other things
    if "," in namePart:
        # strip so "Smith, John" yields "John" rather than " John"
        parts = [part.strip() for part in namePart.split(",")]
        # last, first
        if len(parts) == 2:
            return {"given_name": parts[1], "family_name": parts[0]}
        # name with a DOB/death date string after a second comma
        if len(parts) == 3 and re.match(r"\d{4}\-(\d{4})?", parts[2]):
            return {"given_name": parts[1], "family_name": parts[0]}
        # two or more commas, maybe we have a comma-separated list of names?
        entities = ner(namePart)
        if len(entities) == 1:
            # just one entity, easy, assume the NER type inference is correct
            return entity_to_name(entities[0], namePart)
        # more than one PERSON entity: assume we have a list of names
        if sum(1 for e in entities if e["type"] == "PERSON") > 1:
            return [parse_name(part) for part in parts if part]
        # NOTE(review): nothing matched; None is the pre-existing result here
        return None

    # split on whitespace, often "Givenname Surname"; split() without an
    # argument ignores repeated, leading and trailing spaces
    parts = namePart.split()
    if len(parts) <= 1:
        # looks like an organization name
        return {"name": namePart.strip()}
    if len(parts) == 2:
        return {"given_name": parts[0], "family_name": parts[1]}
    if len(parts) == 3:
        # could be "First Second Third" name or an organization
        entities = ner(namePart)
        if not entities:
            # no entities, most likely an organization
            return {"name": namePart.strip()}
        if all(e["type"] == "PERSON" for e in entities):
            # one or more entities, all PERSON: assume a single name
            return {
                "given_name": " ".join(parts[:-1]),
                "family_name": parts[-1],
            }
        if len(entities) == 1:
            # a single non-PERSON (ORG) entity is an organization, not a
            # person with a two-word given name
            return {"name": namePart.strip()}
        # multiple entities of different types, no comma, hard to say what's going on here
        raise Exception(
            f'Found multiple entities in namePart "{namePart}": {entities}'
        )
    # NOTE(review): four or more words are not handled yet; None is the
    # pre-existing result here
    return None
26 changes: 15 additions & 11 deletions migrate.py → migrate/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,9 @@

import xmltodict


def mklist(x):
    """Ensure a value is a list.

    Args:
        x: a list, str, dict (including dict subclasses such as
            OrderedDict), or None.

    Returns:
        ``x`` itself when it is already a list, ``[x]`` for a str or dict,
        ``[]`` for None, and None for any other type.
    """
    # isinstance (not type(x) == ...) so subclasses like OrderedDict are
    # wrapped instead of silently falling through
    if isinstance(x, list):
        return x
    if isinstance(x, (str, dict)):
        return [x]
    if x is None:
        return []
    # ? should we raise a TypeError here?
    # Unsupported types keep returning None, as before.
    return None
from names import parse_name
from maps import role_map
from utils import mklist


def postprocessor(path, key, value):
Expand Down Expand Up @@ -89,6 +82,17 @@ def addl_titles(self):
atitles.append({"title": title, "type": {"id": "other"}})
return atitles

@property
def creators(self):
    """Parsed names for every mods/name element of the record.

    Returns:
        A list with one ``parse_name`` result per name element, built from
        each element's "namePart" value.
    """
    # mods/name
    # https://inveniordm.docs.cern.ch/reference/metadata/#creators-1-n
    mods = self.xml.get("mods", {})
    # TODO
    return [parse_name(entry["namePart"]) for entry in mklist(mods.get("name"))]

@property
def type(self):
# https://127.0.0.1:5000/api/vocabularies/resourcetypes
Expand Down Expand Up @@ -150,7 +154,7 @@ def get(self):
# contributor/creator roles: contactperson, datacollector, datacurator, datamanager, distributor, editor, hostinginstitution, other, producer, projectleader, projectmanager, projectmember, registrationagency, registrationauthority, relatedperson, researchgroup, researcher, rightsholder, sponsor, supervisor, workpackageleader
"contributors": [],
# https://inveniordm.docs.cern.ch/reference/metadata/#creators-1-n
"creators": [],
"creators": self.creators,
# additional NON-PUBLICATION dates
# date types: accepted, available, collected, copyrighted, created, issued, other, submitted, updated, valid, withdrawn
"dates": [],
Expand Down
3 changes: 2 additions & 1 deletion tests.py → migrate/tests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from migrate import *
from record import Record
from utils import mklist


@pytest.mark.parametrize(
Expand Down
9 changes: 9 additions & 0 deletions migrate/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
def mklist(x):
    """Ensure a value is a list.

    Args:
        x: a list, str, dict (including dict subclasses such as
            OrderedDict), or None.

    Returns:
        ``x`` itself when it is already a list, ``[x]`` for a str or dict,
        ``[]`` for None, and None for any other type.
    """
    # isinstance (not type(x) == ...) so subclasses like OrderedDict are
    # wrapped instead of silently falling through
    if isinstance(x, list):
        return x
    if isinstance(x, (str, dict)):
        return [x]
    if x is None:
        return []
    # ? should we raise a TypeError here?
    # Unsupported types keep returning None, as before.
    return None
Loading

0 comments on commit f1953cb

Please sign in to comment.