# xmlparse_ads.py
# Reads ads.xml with lxml, pulls selected child fields out of every <ad>
# element, and writes them to ads.csv as semicolon-delimited rows.
import csv
from lxml import etree
# Input XML document and the CSV file the extracted rows are written to.
inp_file = "ads.xml"
adsinfo = "ads.csv"

# Names of the child elements whose text is kept for each ad; the same
# names serve as the CSV header.
tag_list = ["ad_id", "category_id", "region_id", "ad_price"]

# Rows to be written out. The first row is the header, stored as a copy of
# tag_list so the two lists stay independent objects.
adsinfo_data = [list(tag_list)]

# Parse the XML with lxml. recover=False asks the parser not to attempt
# recovery from broken XML; remove_blank_text=False leaves blank text
# nodes in place.
parser = etree.XMLParser(remove_blank_text=False, recover=False)
root = etree.parse(inp_file, parser)
# pull info out of each node
def get_info(a, tags=None):
    """Collect the text of selected direct children of an element.

    Args:
        a: element to inspect; only its ``find`` method is used.
        tags: iterable of child tag names to look up, in output order.
            When omitted, the module-level ``tag_list`` is used.

    Returns:
        A list with one string per requested tag: the child's text, or
        ``""`` when the child is missing or has no (or empty) text.
    """
    if tags is None:
        tags = tag_list
    info = []
    for tag in tags:
        node = a.find(tag)
        # Always append something so every row keeps the same column
        # positions, even when a field is absent.
        if node is not None and node.text:
            info.append(node.text)
        else:
            info.append("")
    return info
# write csv file
def write_csv(inp, outfile):
    """Write rows to a semicolon-delimited, UTF-8 encoded CSV file.

    Progress messages are printed to stdout before and after writing.

    Args:
        inp: iterable of rows; each row is an iterable of field values.
        outfile: path of the file to create (an existing file is
            overwritten).
    """
    # newline="" lets the csv writer control line endings itself; without
    # it, platforms that translate "\n" would emit "\r\r\n" row endings.
    # The with-block closes the file even if writing a row raises.
    with open(outfile, "w", encoding="utf-8", newline="") as f:
        # flush so the message is visible while the rows are being written
        print("--> writing", outfile, "...", end="", flush=True)
        csv_writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL, delimiter=";")
        csv_writer.writerows(inp)
    print("[finished]")
## start program
# The document itself was already parsed when `root` was created; this
# step only walks the tree and extracts the wanted fields.
# flush so the message shows up before the "[finished]" that completes it.
print("parsing xml file", inp_file, "...", end="", flush=True)
# get all <ad> elements anywhere below the root
ads = root.findall(".//ad")
for a in ads:
    ads_info = get_info(a)
    # get_info returns one entry per tag, so this only skips rows when
    # there are no tags to extract at all.
    if ads_info:
        adsinfo_data.append(ads_info)
print("[finished]")
print("writing output files...")
write_csv(adsinfo_data, adsinfo)