Skip to content

Commit

Permalink
add full entries for forms with ˚, new structure for IPA generation (#9), handle compounds as new entries (#7), and general restructuring of extensions
Browse files Browse the repository at this point in the history
  • Loading branch information
aryamanarora committed Apr 20, 2021
1 parent a775e5b commit 06bd0ad
Show file tree
Hide file tree
Showing 30 changed files with 173,536 additions and 240,143 deletions.
59 changes: 52 additions & 7 deletions cldf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import csv
import unidecode
import re
import glob
from segments.tokenizer import Tokenizer, Profile
import unicodedata

superscript = {
'a': 'ᵃ', 'e': 'ᵉ', 'i': 'ᶦ',
Expand All @@ -11,6 +14,12 @@
'ŕ': 'ʳ́', 'ĕ': 'ᵉ̆', 'n': 'ⁿ'
}

tokenizers = {}

for file in glob.glob("data/ipa/cdial/*.txt"):
lang = file.split('/')[-1].split('.')[0]
tokenizers[lang] = Tokenizer(file)

with open('data/all.json', 'r') as fin:
data = json.load(fin)

Expand All @@ -29,7 +38,7 @@
write2.writerow([row[0], row[2], row[3], row[4]])

a = set()
with open('cldf/forms.csv', 'w') as fout, open('data/extensions.csv', 'r') as fin:
with open('cldf/forms.csv', 'w') as fout, open('errors.txt', 'w') as errors:
num = 0
write = csv.writer(fout)
write.writerow(['ID', 'Language_ID', 'Parameter_ID', 'Form', 'Gloss', 'Native', 'Phonemic', 'Cognateset', 'Description', 'Source'])
Expand All @@ -42,7 +51,8 @@

lang = unidecode.unidecode(lang)
a.add(lang)
for word in form['words']:
reference = ''
for i, word in enumerate(form['words']):
num += 1
if lang == 'Indo-Aryan':
if word == '': continue
Expand All @@ -54,15 +64,50 @@
write.writerow([num, lang, entry, word, desc, '', '', entry, '', 'CDIAL'])
else:
if word[0] == '': continue
word[0] = word[0].strip('.,')
word[0] = word[0].lower()

oldest = unicodedata.normalize('NFD', word[0])
oldest = oldest.replace('̄˘', '̄̆')
oldest = oldest.replace('̆̄', '̄̆')
oldest = oldest.replace('̄̆', '̄̆')
if '̄̆' in oldest:
form['words'].append([oldest.replace('̄̆', '̄'), word[1]])
word[0] = word[0].replace('̄̆', '')
word[0] = unicodedata.normalize('NFC', word[0])

for i in superscript:
word[0] = word[0].replace('ˊ', '́').replace(' --', '-').replace('-- ', '-')
word[0] = word[0].replace(f'<superscript>{i}</superscript>', superscript[i])
write.writerow([num, lang, entry, word[0], word[1], '', '', entry, '', 'CDIAL'])

if '˚' not in word[0]: reference = word[0]
else:
old = word[0]
if word[0] != '˚':
if word[0][-1] == '˚':
word[0] = re.sub(r'^.*?' + word[0][-2], word[0][:-1], reference)
elif word[0][0] == '˚':
word[0] = re.sub(word[0][1] + r'[^' + word[0][1] + r']*?$', word[0][1:], reference)
if reference == word[0]:
word[0] = old

ipa = ''
if lang in tokenizers and '˚' not in word[0]:
ipa = tokenizers[lang](word[0], column='IPA').replace(' ', '').replace('#', ' ')
if '�' in ipa:
if lang == 'S': errors.write(f'{lang} {oldest} {word[0]} {ipa}\n')
ipa = ''

write.writerow([num, lang, entry, word[0], word[1], '', ipa, entry, '', 'CDIAL'])

read = csv.reader(fin)
for i, row in enumerate(read):
if i == 0: continue
write.writerow([f'e{i}', row[0], row[1], row[2], row[3], row[4], row[5], row[1], row[6], row[7]])
i = 0
for file in glob.glob("data/words/*.csv"):
with open(file, 'r') as fin:
read = csv.reader(fin)
for row in read:
if row[1]:
write.writerow([f'e{i}', row[0], row[1], row[2], row[3], row[4], row[5], row[1], row[6], row[7]])
i += 1

print(sorted(list(a)))
b = set()
Expand Down
19 changes: 16 additions & 3 deletions cldf/cognates.csv
Original file line number Diff line number Diff line change
Expand Up @@ -15425,11 +15425,24 @@ e6,Indo-Aryan,saptā-nnādya,'seven grains',patyal2
e7,Indo-Aryan,*kalyā-hāra,'breakfast',patyal5
e8,Indo-Aryan,*citra-karbura,'spotted',patyal5
e9,Indo-Aryan,*catur-akṣa,'four-eyed',patyal5
e10,Indo-Aryana,jánitr̥,,patyal5
e10,Indo-Aryan,jánitr̥,,patyal5
e11,Indo-Aryan,*dēva-sthala,,patyal5
e12,Indo-Aryan,*vādya-tantra,,patyal5
e13,Indo-Aryan,*mr̥taka-sthāna,,patyal5
e14,Indo-Aryan,*locis-kāṣṭhikā,,patyal5
e15,Indo-Aryan,sapakṣa,'easy',patyal5
e16,Indo-Aryan,*ubbāsī?,'yawn',arora
e17,Indo-Aryan,*kakkara?,'cloud',arora
e16,Indo-Aryan,*ubbāsī,'yawn',arora
e17,Indo-Aryan,*kakkara,'cloud',arora
e18,Indo-Aryan,*celha,'waist',arora
e19,Indo-Aryan,*cora,'asparagus beans',arora
e20,Indo-Aryan,*cora,'hut' (perhaps related to cakrá?),arora
e21,Indo-Aryan,*jhañjhaṭ,'problem',arora
e22,Indo-Aryan,*padopānah,"'shoe' [padá1, upānáh]",fritz
e23,Indo-Aryan,*yavanakadala,"'banana' [yavaná, kadala]",fritz
e24,Indo-Aryan,*yavanavālukā,"'sand' [yavaná, vālukā]",fritz
e25,Indo-Aryan,*varṣadhanus,"'rainbow' [varṣá, dhánus]",fritz
e26,Indo-Aryan,*ayasgaṇḍa,"'iron' [áyas, gaṇḍa]",fritz
e27,Indo-Aryan,*pādadīrghamarkaṭa,"'long-legged (?) spider' [pā́da, dīrghá, markaṭa]",fritz
e28,Indo-Aryan,*bhakkhamukha,"'lump-faced' [bhakkha, múkha]",fritz
e29,Indo-Aryan,*rūpāsti,"'has form?' [rūpá, ásti]",fritz
e30,Indo-Aryan,*anyaika,"'another' [anyá, ḗka]",fritz
Expand Down
Loading

0 comments on commit 06bd0ad

Please sign in to comment.