restructure project into subdir, add spacy for NER, work on name parsing

cca · Oct 31, 2023 · f1953cb · f1953cb
1 parent d0547dd
commit f1953cb
Show file tree

Hide file tree

Showing 8 changed files with 1,093 additions and 19 deletions.
diff --git a/migrate/maps.py b/migrate/maps.py
@@ -0,0 +1,56 @@
+# Mapping our mods/name/role/roleTerms to Invenio roles
+# ! NOTE cast terms to LOWERCASE before using this map. Our metadata is inconsistent between title case and lowercase.
+# MODS (which uses MARC list): https://www.loc.gov/marc/relators/relaterm.html
+# Invenio roles: https://github.com/inveniosoftware/invenio-rdm-records/blob/master/invenio_rdm_records/fixtures/data/vocabularies/roles.yaml
+# contactperson, datacollector, datacurator, datamanager, distributor, editor, hostinginstitution, other, producer, projectleader, projectmanager, projectmember, registrationagency, registrationauthority, relatedperson, researchgroup, researcher, rightsholder, sponsor, supervisor, workpackageleader
+role_map = {
+    "academic partner": "",
+    "architect": "",
+    "artist": "",
+    "associated name": "",
+    "author": "",
+    "author of introduction, etc.": "",
+    "book designer": "",
+    "bookjacket designer": "",
+    "calligrapher": "",
+    "cinematographer": "",
+    "collaborator": "",
+    "compiler": "",
+    "creator": "",
+    "curator": "",
+    "curator assistant": "",
+    "designer": "",
+    "editor": "editor",
+    "founder": "",
+    "illustrator": "",
+    "installation artist": "",
+    "instructor assistant": "",
+    "instructor/curator": "",
+    "interviewee": "",
+    "interviewer": "",
+    "manufacturer": "",
+    "minute taker": "",
+    "narrator": "",
+    "organizer": "",
+    "organizer of meeting": "",
+    "painter": "",
+    "performance artist": "",
+    "performer": "",
+    "photographer": "",
+    "platemaker": "",
+    "poet": "",
+    "printer": "",
+    "printmaker": "",
+    "producer": "",
+    "professor": "",
+    "publisher": "",
+    "recording engineer": "",
+    "researcher": "researcher",
+    "reviewer": "",
+    "sculptor": "",
+    "singer songwriter": "",
+    "speaker": "",
+    "teacher": "",
+    "transcriber": "",
+    "writer": "",
+}
diff --git a/migrate/names.py b/migrate/names.py
@@ -0,0 +1,86 @@
+import re
+
+import spacy
+
+nlp = spacy.load("en_core_web_lg")
+
+
+def ner(str):
+    # return a list of named PERSON or ORG entities from a string
+    # https://spacy.io/usage/linguistic-features#named-entities
+    with nlp.disable_pipes(
+        ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
+    ):
+        doc = nlp(str)
+        return [
+            {"entity": e.text, "type": e.label_}
+            for e in doc.ents
+            if e.label_ in ("PERSON", "ORG")
+        ]
+
+
+def entity_to_name(entity, namePart):
+    if entity["type"] == "PERSON":
+        return parse_name(entity["entity"])
+    else:
+        # default to organization
+        return {"name": namePart}
+
+
+def parse_name(namePart):
+    """parse wild variety of name strings into {given_name, family_name}
+    or, if it looks like an organization name, return only {name}"""
+
+    # semi-colon separated list of names
+    if ";" in namePart:
+        return [parse_name(p) for p in namePart.split("; ")]
+
+    # usually Surname, Givenname but sometimes other things
+    if "," in namePart:
+        # last, first
+        parts = namePart.split(",")
+        if len(parts) == 2:
+            return {"given_name": parts[1], "family_name": parts[0]}
+        # name with a birth/death date string after a second comma
+        if len(parts) == 3 and re.match("\d{4}\-(\d{4})?", parts[2].strip()):
+            return {"given_name": parts[1], "family_name": parts[0]}
+        # two or more commas, maybe we have a comma-separated list of names?
+        if len(parts) > 2:
+            entities = ner(namePart)
+            if len(entities) == 1:
+                # just one entity, easy, assume the NER type inference is correct
+                return entity_to_name(entities[0], namePart)
+            if len(entities) > 1:
+                # if we have more than one PERSON entity, assume we have a list of names
+                if len([e for e in entities if e["type"] == "PERSON"]) > 1:
+                    return [parse_name(p) for p in parts]
+    # split on spaces, often "Givenname Surname", but multiple spaces is where it gets tricky
+    else:
+        parts = namePart.split(" ")
+        if len(parts) == 1:
+            # looks like an organization name
+            return {"name": namePart}
+        if len(parts) == 2:
+            return {"given_name": parts[0], "family_name": parts[1]}
+        if len(parts) == 3:
+            # could be "First Second Third" name or an organization
+            entities = ner(namePart)
+            if len(entities) == 0:
+                # no entities, most likely an organization
+                return {"name": namePart}
+            elif len(entities) == 1:
+                return {"given_name": " ".join(parts[0:2]), "family_name": parts[2]}
+            # more than one entity but they're all PERSON, assume one name
+            elif len(entities) > 1 and len(
+                [e for e in entities if e["type"] == "PERSON"]
+            ) == len(entities):
+                l = len(parts)
+                return {
+                    "given_name": " ".join(parts[0 : (l - 1)]),
+                    "family_name": parts[l - 1],
+                }
+            else:
+                # multiple entities of different types, no comma, hard to say what's going on here
+                raise Exception(
+                    f'Found multiple entities in namePart "{namePart}": {entities}'
+                )
diff --git a/migrate.py → migrate/record.py b/migrate.py → migrate/record.py
@@ -9,16 +9,9 @@
 
 import xmltodict
 
-
-def mklist(x):
-    # ensure value is a list
-    if type(x) == list:
-        return x
-    elif type(x) == str or type(x) == dict:
-        return [x]
-    elif x is None:
-        return []
-    # ? should we raise a TypeError here?
+from names import parse_name
+from maps import role_map
+from utils import mklist
 
 
 def postprocessor(path, key, value):
@@ -89,6 +82,17 @@ def addl_titles(self):
                         atitles.append({"title": title, "type": {"id": "other"}})
         return atitles
 
+    @property
+    def creators(self):
+        # mods/name
+        # https://inveniordm.docs.cern.ch/reference/metadata/#creators-1-n
+        names = mklist(self.xml.get("mods", {}).get("name"))
+        creators = []
+        for name in names:
+            # TODO: parse_name can return a list (several names) or None; flatten/filter before appending
+            creators.append(parse_name(name["namePart"]))
+        return creators
+
     @property
     def type(self):
         # https://127.0.0.1:5000/api/vocabularies/resourcetypes
@@ -150,7 +154,7 @@ def get(self):
                 # contributor/creator roles: contactperson, datacollector, datacurator, datamanager, distributor, editor, hostinginstitution, other, producer, projectleader, projectmanager, projectmember, registrationagency, registrationauthority, relatedperson, researchgroup, researcher, rightsholder, sponsor, supervisor, workpackageleader
                 "contributors": [],
                 # https://inveniordm.docs.cern.ch/reference/metadata/#creators-1-n
-                "creators": [],
+                "creators": self.creators,
                 # additional NON-PUBLICATION dates
                 # date types: accepted, available, collected, copyrighted, created, issued, other, submitted, updated, valid, withdrawn
                 "dates": [],

diff --git a/tests.py → migrate/tests.py b/tests.py → migrate/tests.py
@@ -1,6 +1,7 @@
 import pytest
 
-from migrate import *
+from record import Record
+from utils import mklist
 
 
 @pytest.mark.parametrize(

diff --git a/migrate/utils.py b/migrate/utils.py
@@ -0,0 +1,9 @@
+def mklist(x):
+    # ensure value is a list
+    if type(x) == list:
+        return x
+    elif type(x) == str or type(x) == dict:
+        return [x]
+    elif x is None:
+        return []
+    # ? should we raise a TypeError here?