-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathhgwnames.py
90 lines (78 loc) · 2.79 KB
/
hgwnames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Parsing of highway names
"""
import os
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import setup
import compat
# Score that the best fuzzy candidate must exceed for match() to return it
# instead of falling back to the parsed name.
MATCH_THR = 60
def normalize(text):
    """
    Return text in a canonical form suitable for comparisons

    The text is lowercased and stripped of surrounding whitespace, then the
    span going from the first opening parenthesis to the last closing one
    is removed together with the spaces that precede it.
    Args:
        text (str): String to normalize
    Returns:
        str: The normalized string
    """
    # Raw string: '\(' is not a valid escape sequence in a plain literal and
    # triggers a warning in recent Python versions.
    return re.sub(r' *\(.*\)', '', text.lower().strip())
def parse(name):
    """
    Transform the name of a street from Cadastre conventions to OSM ones.

    The first word is replaced by its entry in setup.highway_types when it
    is a known key. Any other word found in setup.lowcase_words (ignoring
    an enclosing parenthesis) is lowercased, words with an inner apostrophe
    get their article part lowercased and the remaining words are
    title-cased.
    Args:
        name (str): Street name to convert
    Returns:
        str: The converted words joined by single spaces
    """
    # Avoids comma without trailing space
    name = re.sub('[,]+', ', ', name).strip()
    result = []
    for (i, word) in enumerate(re.split('[ ]+', name)):
        # Remove enclosing parenthesis (raw string: '\(' is not a valid
        # escape sequence in a plain literal)
        nude_word = re.sub(r'^\(|\)$', '', word)
        if i == 0:
            # Highway type; unknown types are kept verbatim
            try:
                new_word = setup.highway_types[word]
            except KeyError:
                new_word = word
        elif nude_word in setup.lowcase_words:  # Articles
            new_word = word.lower()
        elif "'" in word[1:-1]:  # Articles with apostrophe
            # NOTE: only the first and the last apostrophe-separated parts
            # are used by the two article branches below.
            parts = word.split("'")
            left, right = parts[0], parts[-1]
            if left in ['C', 'D', 'L', 'N', 'S']:
                new_word = left.lower() + "'" + right.title()
            elif right in ['S', 'N', 'L', 'LA', 'LS']:
                new_word = left.title() + "'" + right.lower()
            else:
                new_word = word.title()
        else:
            new_word = word.title()
        # Geminated letter l: title-casing capitalizes the second 'l'
        new_word = new_word.replace(u'·L', u'·l')
        new_word = new_word.replace(u'.L', u'·l')
        result.append(new_word)
    return ' '.join(result).strip()
def match(name, choices):
    """
    Fuzzy search best match for string name in iterable choices, if the result
    is not good enough returns the name parsed
    Args:
        name (str): String to look for
        choices (iterable): Iterable with choices (lists, sets, generators...)
    Returns:
        The first choice whose normalized form is the best fuzzy match for
        the parsed name when its score is greater than MATCH_THR, otherwise
        the parsed name.
    """
    # Parsed once: it is both the search key and the fallback result
    parsed = parse(name)
    if fuzz:
        # Materialize the choices: they are traversed to normalize them and
        # indexed afterwards, which a set or a generator does not support.
        candidates = list(choices)
        normalized = [normalize(c) for c in candidates]
        matching = process.extractOne(normalize(parsed), normalized,
                                      scorer=fuzz.token_sort_ratio)
        if matching and matching[1] > MATCH_THR:
            # Map the normalized winner back to the original choice
            return candidates[normalized.index(matching[0])]
    return parsed
def dsmatch(name, dataset, fn):
    """
    Fuzzy search best matching object for string name in dataset
    Args:
        name (str): String to look for
        dataset (list): List of objects to search for
        fn (function): Function to obtain a string from a element of the dataset
    Returns:
        First element with the maximum fuzzy ratio, or None if the dataset
        is empty or no element scores above zero.
    """
    max_ratio = 0
    matching = None
    # The normalized name does not change between iterations, so compute it
    # once. It is done lazily so an empty dataset never touches name.
    target = None
    for e in dataset:
        if target is None:
            target = normalize(name)
        candidate = normalize(fn(e))
        if fuzz:
            ratio = fuzz.token_sort_ratio(target, candidate)
            # Strict comparison keeps the first element among ties
            if ratio > max_ratio:
                max_ratio = ratio
                matching = e
        elif target == candidate:
            # Without the fuzzy library only exact matches are accepted
            matching = e
            break
    return matching