generated from CatalogueOfLife/data-template-textree
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate-coldp.py
executable file
·81 lines (69 loc) · 3.05 KB
/
generate-coldp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf-8 -*-
import csv, re
from openpyxl import Workbook, load_workbook
from collections import namedtuple
from unidecode import unidecode
# ---------------------------------------------------------------------------
# Input configuration (active: Animalia workbook).
# ---------------------------------------------------------------------------
# Workbook to read and the worksheets inside it that are processed in order.
xlsName = 'ALF_Animalia_K-F_2019 Final_2019xii16 2.xlsx'
sheetNames = ['Animalia-Kingdom to Family']
# Column letter of the free-text notes cell of each row.
notesCol = 'T'
# One rank per spreadsheet column, starting at firstCol: read() maps the
# 1-based column index of a name to ranks[col - 1].
ranks = ['kingdom', 'subkingdom', 'infrakingdom', 'superphylum', 'phylum', 'subphylum', 'infraphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'series', 'subseries', 'superfamily', 'family']

# Alternative configuration for the non-Animalia workbook. To use it, swap
# these four assignments with the four active ones above.
#xlsName = 'ALF_Non-Animalia_K-F_2019 Final_2019xii19.xlsx'
#sheetNames= ['Archaea-Bacteria', 'Protozoa', 'Chromista', 'Fungi', 'Plantae']
#notesCol = 'U'
#ranks = ['kingdom','subkingdom','infrakingdom', 'superphylum','phylum','subphylum','infraphylum','parvphylum', 'superclass','class','subclass','infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'series', 'subseries', 'superfamily', 'family']

# NOTE: firstRow is not referenced by the code visible in this file; read()
# hard-codes the header offset as ``row + 1``.
firstRow = 2
# Letter of the column that holds the first rank.
firstCol = 'A'
# Path of the CSV file that is written.
outFile = 'NameUsage.csv'

# One parsed spreadsheet row:
#   id    - data row number as a string (synonyms get an 's' prefix)
#   col   - 1-based index of the rank column the name was found in
#   name  - cell value of that column, or None for a notes-only row
#   notes - value of the notes cell, or None
Taxon = namedtuple('Taxon', 'id col name notes')

# Capitalised word, optional "et al", then a four-digit number; group 1 is the
# word, group 2 the number. The '.' after "al" is unescaped, so it matches any
# single character.
# Raw strings keep the pattern values unchanged while avoiding invalid
# string-escape warnings such as the one the former '\[' literal triggered.
refMatcher = re.compile(r'\b([A-Z][a-z]+) *(?:et al.?)?[, ]*(\d{4})\b')
# "<name> [= <other name>]"; group 1 is the text before the bracket, group 2
# the text after "[=". Both groups are greedy and may keep adjacent blanks.
synMatcher = re.compile(r'^(.+) *\[= *(.+) *] *')

# Mutable state shared between the helper functions and the main loop.
parents = []    # stack of the taxa above the row currently being processed
sheet = None    # worksheet currently being read
IDprefix = ''   # prefix put in front of every ID written for the current sheet
def read(row):
    """Parse one data row of the current worksheet into a Taxon.

    ``row`` is the data row number; the cells actually read are on worksheet
    row ``row + 1``. The rank columns are scanned left to right, starting at
    ``firstCol``, and the first truthy cell becomes the taxon name.

    Returns:
        * ``Taxon(str(row), col, value, note)`` for the first rank column
          holding a value, where ``col`` is its 1-based index;
        * ``Taxon(str(row), col, None, note)`` when no rank column holds a
          value but the notes cell does (``col`` is then the index of the
          last rank column);
        * ``None`` when both the rank columns and the notes cell are empty.
    """
    sheet_row = str(row + 1)
    remark = sheet[notesCol + sheet_row].value
    for col in range(1, len(ranks) + 1):
        column_letter = chr(ord(firstCol) + col - 1)
        cell_value = sheet[column_letter + sheet_row].value
        if cell_value:
            return Taxon(str(row), col, cell_value, remark)
    # No name anywhere on this row.
    if not remark:
        return None
    return Taxon(str(row), col, None, remark)
def writeUsage(out, row, t, parentID, status):
    """Append one name-usage line for taxon ``t`` to ``out``.

    Args:
        out: object with a ``write`` method that receives the CSV line.
        row: value written to the ``ordinal`` column.
        t: Taxon to write; its ``name`` must be a string.
        parentID: value written verbatim to the ``parentID`` column.
        status: value written verbatim to the ``status`` column.

    The ID column is ``IDprefix + ':' + t.id`` and the rank is looked up from
    ``t.col``. Reference IDs are derived from the notes: the notes are
    transliterated with ``unidecode`` and split on ';', and the first
    ``refMatcher`` hit of each part contributes ``<lower-cased word><number>``.
    All hits are joined with '|'. Double quotes in the name and the remarks
    are doubled.
    """
    ref_ids = []
    if t.notes:
        plain_notes = unidecode(t.notes)
        # might be several references separated by semicolon
        hits = (refMatcher.search(part) for part in plain_notes.split(';'))
        ref_ids = [hit.group(1).lower() + hit.group(2) for hit in hits if hit]
    # Joining an empty list yields '', i.e. an empty referenceID column.
    refID = '|'.join(ref_ids)

    rank = ranks[t.col - 1]
    name = t.name.replace('"', '""')
    remarks = (t.notes or "").replace('"', '""')
    fields = (IDprefix, t.id, parentID, row, status, rank, name, refID, remarks)
    out.write("%s:%s,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\"\n" % fields)
# Main script: flatten every configured worksheet into one CSV file with one
# line per name usage.
wb = load_workbook(filename=xlsName)
# Explicit UTF-8 so non-ASCII names and remarks are written identically on
# every platform instead of depending on the locale's default encoding.
with open(outFile, 'w', newline='', encoding='utf-8') as out:
    out.write("ID,parentID,ordinal,status,rank,scientificName,referenceID,remarks\n")
    for sheetName in sheetNames:
        # sheet and IDprefix are module globals read by read() and writeUsage().
        sheet = wb[sheetName]
        IDprefix = sheetName[0:2]
        # Row IDs restart on every sheet and carry that sheet's prefix, so a
        # parent left over from the previous sheet must never be linked to.
        parents.clear()
        row = 1
        t = read(row)
        # read() returns None for the first row that has neither a name nor a
        # note, which ends the sheet.
        while t:
            # Rows that only carry a note (no name) are skipped.
            if t.name:
                # Keep only the taxa in columns left of the current one; the
                # last remaining entry is the direct parent.
                while parents and parents[-1].col >= t.col:
                    parents.pop()
                pid = IDprefix + ':' + parents[-1].id if parents else ''
                m = synMatcher.search(t.name)
                if m:
                    # "<accepted> [= <synonym>]": the regex groups are greedy
                    # and keep blanks next to the brackets, hence strip().
                    s = Taxon('s' + str(row), t.col, m.group(2).strip(), None)
                    t = Taxon(t.id, t.col, m.group(1).strip(), t.notes)
                    # The synonym must point at the accepted record's full
                    # ID, which writeUsage() writes as "<IDprefix>:<id>".
                    writeUsage(out, row, s, IDprefix + ':' + t.id, 'synonym')
                writeUsage(out, row, t, pid, 'accepted')
                parents.append(t)
            row = row + 1
            t = read(row)