-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpython_parser.py
96 lines (74 loc) · 3.46 KB
/
python_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""This script parses a JSON file containing contact data and writes the data to an Excel file."""
import json
import re
import pandas as pd
import spacy
# Read the raw contact records from the JSON export.
with open('contact_data1_2.json', 'r', encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

# English spaCy pipeline; split_name() uses its part-of-speech tags.
nlp = spacy.load("en_core_web_sm")
# Honorifics, degrees and certifications stripped from a name before it is
# split. The bare comma is listed so that "Jane Doe, MBA" loses its separator.
# NOTE(review): short entries such as "Ed", "AP" or "BC" still remove a
# standalone token with that exact spelling (e.g. a person called "Ed").
_NAME_TITLES = (
    "Dr.", "Ms.", "Mr.", "PhD", "Mrs.", "Miss", "Mx.", "Prof.", "Dr", "Ms", "Mr",
    "Mrs", "Mx", "Prof", "P.E.", "Ph.D.", "WOSB", "MBE", "MBA", "PMP",
    "PE", "CPA", "CFLE", "PH", "MBS", "MPH", "CFP", "CRC", "CPC", "CPMA", "MD",
    "WBE", "DBE", ",", "RDN", "CDN", "M.P.S.", "PSA", "CRS", "GRI", "AHWD", "GMS",
    "AIA", "LEED", "AP", "MPA", "DPT", "NCARB", "CCIM", "CPM", "CMMC", "SME",
    "M.Ed.", "CBI", "DDS", "CFM", "MSW", "LCSW", "MBBS", "MS", "MIT", "CDIA", "Ed",
    "MCT", "CMP", "BC", "SCP", "CGMA", "CISM", "CASP",
)


def _title_alternative(title):
    """Return a regex fragment that matches *title* only as a standalone token."""
    fragment = re.escape(title)
    # Anchor only on the sides where the title itself is alphanumeric, so that
    # "Dr." still matches in "Dr.Smith" and "," matches right after a word.
    if title[0].isalnum():
        fragment = r'(?<!\w)' + fragment
    if title[-1].isalnum():
        fragment = fragment + r'(?!\w)'
    return fragment


# Longest alternatives first so "Mrs." is preferred over "Mrs", "MSW" over "MS".
_TITLE_RE = re.compile('|'.join(
    _title_alternative(title)
    for title in sorted(dict.fromkeys(_NAME_TITLES), key=len, reverse=True)
))


def split_name(name):
    """Split a display name into ``(first_name, last_name)``.

    Parenthesised fragments are dropped, then titles/credentials from
    ``_NAME_TITLES`` are removed, then tokens that spaCy tags as pronouns are
    removed. The first remaining word is the first name; everything after it
    is the last name.

    Titles are matched case-sensitively and only as whole tokens. The previous
    substring replacement also removed them from inside ordinary names
    (e.g. "Edward" -> "ward", "Drew" -> "ew").

    Args:
        name: Raw name string; ``None`` or an empty string is accepted.

    Returns:
        A ``(first_name, last_name)`` tuple; either element may be ``''``.
    """
    if not name:
        return '', ''
    # Drop parenthesised fragments such as "(she/her)" or "(Bob)".
    without_parens = re.sub(r'\(([^)]+)\)', '', name)
    # Replace with a space rather than nothing so neighbouring words never fuse.
    without_titles = _TITLE_RE.sub(' ', without_parens)
    # Process the name with spaCy and drop tokens tagged as pronouns.
    doc = nlp(without_titles)
    # text_with_ws keeps the original spacing, so punctuation that the
    # tokenizer splits off (e.g. the hyphen in a double-barrelled surname)
    # is not padded with extra spaces when the tokens are put back together.
    kept_text = ''.join(token.text_with_ws for token in doc if token.pos_ != 'PRON')
    parts = kept_text.split()
    if not parts:
        return '', ''
    return parts[0], ' '.join(parts[1:])
def standardize_phone(phone):
    """Format a phone number as ``(XXX) XXX-XXXX``.

    All non-digit characters are discarded. If more than 10 digits remain and
    the first one is ``1``, that leading country code is dropped. The digits
    are then laid out as area code, prefix and line number; any digits beyond
    the sixth go into the last group as-is.

    Args:
        phone: Raw phone value. Non-string values (e.g. a JSON number) are
            converted with ``str``; ``None`` is treated as missing.

    Returns:
        The formatted number, or ``''`` when the input contains no digits
        (instead of the meaningless ``"() -"``).
    """
    if phone is None:
        return ''
    # Remove all non-numeric characters
    digits = re.sub(r'\D', '', str(phone))
    if not digits:
        return ''
    # Remove +1 at the beginning if the number is more than 10 digits long
    if len(digits) > 10 and digits.startswith('1'):
        digits = digits[1:]
    return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
def extract_region(address):
    """Return the text that follows the region label in *address*.

    Looks for 'Primary State/Region of Business Operation' and returns the
    rest of that line with surrounding whitespace removed, or '' when the
    label is absent or nothing follows it.
    """
    found = re.search(r'Primary State/Region of Business Operation(.+)', address)
    if found is None:
        return ''
    region = found.group(1)
    return region.strip()
def extract_graduating_site(address):
    """Return the text between 'Graduating Site' and 'Graduating Cohort'.

    Both labels must appear on the same line of *address*; the text between
    them is returned with surrounding whitespace removed. Returns '' when the
    pair of labels is not found.
    """
    found = re.search(r'Graduating Site(.*?)Graduating Cohort', address)
    if found is None:
        return ''
    site = found.group(1)
    return site.strip()
def _clean_record(record):
    """Turn one raw contact record into a row for the output spreadsheet."""
    given, family = split_name(record.get('name', ''))
    raw_address = record.get('address', '')
    return {
        'First Name': given,
        'Last Name': family,
        'Phone': standardize_phone(record.get('phone', '')),
        # The 'Address' column holds only the region extracted from the address text.
        'Address': extract_region(raw_address),
        'Graduating site': extract_graduating_site(raw_address),
        'Email': record.get('email', ''),
        'Website': record.get('website', ''),
        'Business Name': record.get('businessName', ''),
        'Job Title': record.get('jobTitle', ''),
        'Industry': record.get('industry', ''),
        'Person PageURL': record.get('personPageUrl', ''),
    }


# Clean every record, then write the result to an Excel workbook.
cleaned_data = [_clean_record(record) for record in data]
df = pd.DataFrame(cleaned_data)
df.to_excel('cleaned_contact_data.xlsx', index=False)