This repository has been archived by the owner on Sep 25, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfromPAGEtoText.py
131 lines (112 loc) · 5.38 KB
/
fromPAGEtoText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import datetime
from bs4 import BeautifulSoup
from config import textcollectionnames as collection_list
def create_folder(directory):
    """Create a new folder, including any missing parent directories.

    Nothing happens when the directory already exists.

    :param directory: path to new directory
    :type directory: string
    :raises FileExistsError: if the path exists but is not a directory
    """
    # exist_ok=True replaces the separate os.path.exists() test, which left a
    # window in which another process could create the directory first.
    os.makedirs(directory, exist_ok=True)
def initiate_log():
    """Initiate a log file in the __logs__ directory, named after a timestamp.

    Reads the module-level ``collection_list``, ``path_to_logs``,
    ``TIMESTAMP`` and ``now``. The file ``log-text-<TIMESTAMP>.txt`` is
    created (or overwritten) with a header giving the run time and the
    quoted collection names.
    """
    collist = ' '.join("'%s'" % collection for collection in collection_list)
    # The log directory is not created anywhere else; without this, open()
    # below fails on a fresh checkout.
    os.makedirs(path_to_logs, exist_ok=True)
    # Format the file name only: applying % to the whole joined path breaks
    # as soon as the directory part contains a '%' character.
    path_to_file = os.path.join(path_to_logs, "log-text-%s.txt" % TIMESTAMP)
    intro = """
TRANSFORMING XML FILES (PAGE FORMAT) TO TEXT FILES
Script ran at : %s
For collections %s.
---------------------
""" % (now, collist)
    with open(path_to_file, "w") as f:
        f.write(intro)
def create_log(xml_counter, page_counter, document):
    """Append a simple report about one document to the log file.

    The report is appended to ``log-text-<TIMESTAMP>.txt`` in the
    module-level ``path_to_logs`` directory.

    :param xml_counter: number of xml files in document/subcollection directory.
    :param page_counter: number of xml-page files in document/subcollection directory.
    :param document: name of document/subcollection directory.
    :type xml_counter: integer
    :type page_counter: integer
    :type document: string
    """
    if xml_counter == 0:
        log = "No .xml file in '%s' directory.\n\n" % document
    else:
        log = "Found %s .xml file(s) in '%s' directory.\n" % (xml_counter, document)
        # The PAGE-format count is only reported when xml files were found.
        if page_counter == 0:
            log += "\tNo .xml file matched PAGE format (root must be '<PcGts>').\n\n"
        else:
            log += "\tFound %s .xml file(s) matching PAGE format.\n\n" % page_counter
    # Format the file name only, so a '%' in the directory path is harmless.
    path_to_file = os.path.join(path_to_logs, "log-text-%s.txt" % TIMESTAMP)
    with open(path_to_file, "a") as f:
        f.write(log)
# ========================== #
now = datetime.datetime.now()
TIMESTAMP = "%s-%s-%s-%s-%s" % (now.year, now.month, now.day, now.hour, now.minute)
cwd = os.path.dirname(os.path.abspath(__file__))
path_to_logs = os.path.join(cwd, "__logs__")


def _page_sort_key(filename):
    """Return a sort key putting numeric file stems first, in numeric order.

    Non-numeric stems come after all numeric ones, in alphabetical order.
    """
    stem = os.path.splitext(filename)[0]
    try:
        # int() raises ValueError (not TypeError) on a non-numeric string.
        return (0, int(stem), stem)
    except ValueError:
        return (1, 0, stem)


def _export_document(path_to_doc, path_to_textfile):
    """Write the text of every PAGE file in a document folder to one text file.

    :param path_to_doc: path to the document/subcollection directory.
    :param path_to_textfile: path to the text file to (over)write.
    :return: tuple (number of .xml files, number of those with a PcGts root).
    """
    # ORDERING XML FILES
    # The original file names are kept and only the sort key is numeric, so
    # names such as "0001.xml" are still found on disk after sorting.
    xml_files = sorted(
        (name for name in os.listdir(path_to_doc) if name.endswith(".xml")),
        key=_page_sort_key,
    )
    page_counter = 0
    if not xml_files:
        return 0, page_counter

    # Opening in "w" mode erases a previous export; the file is opened once
    # for the whole document instead of once per text region.
    with open(path_to_textfile, "w", encoding="utf-8") as out:
        # GETTING TEXT FROM XML FILES
        for xml_name in xml_files:
            page_nb = os.path.splitext(xml_name)[0]
            # Read bytes so the parser decodes according to the XML
            # declaration rather than the platform default encoding.
            with open(os.path.join(path_to_doc, xml_name), "rb") as f:
                soup = BeautifulSoup(f.read(), "xml")
            if not soup.PcGts:
                # Not a PAGE file: counted as xml, but contributes no text.
                continue
            page_counter += 1
            for textregion in soup.find_all("TextRegion"):
                region_id = textregion['id']
                # Only direct TextEquiv children of the region are exported.
                for textequiv in textregion.find_all("TextEquiv", recursive=False):
                    text = textequiv.Unicode.get_text()
                    # CREATING TEXT FILES
                    # With Zone and Region separators
                    out.write("%s\n\n[.../R fin de la zone %s]\n\n" % (text, region_id))
            out.write("\n[.../... fin de la page %s]\n\n" % page_nb)
    return len(xml_files), page_counter


initiate_log()

for collection in collection_list:
    path = os.path.join(cwd, "data", collection)
    path_to_export_text = os.path.join(path, "__TextExports__")
    # PREPARING FILES
    try:
        # Export folders are outputs of the tooling, not documents.
        col_content = [
            entry for entry in os.listdir(path)
            if entry not in ("__TextExports__", "__AllInOne__")
        ]
        if col_content:
            create_folder(path_to_export_text)
            for document in col_content:
                path_to_doc = os.path.join(path, document)
                # Format the file name only, so a '%' in the path is harmless.
                path_to_textfile = os.path.join(path_to_export_text, "%s.txt" % document)
                try:
                    xml_counter, page_counter = _export_document(path_to_doc, path_to_textfile)
                    create_log(xml_counter, page_counter, document)
                except Exception as e:
                    # Best effort: a failing document is reported and skipped
                    # so the remaining documents are still processed.
                    print(e)
        else:
            print("No file to transform in collection named '%s'." % collection)
    except Exception as e:
        # Best effort: a failing collection does not stop the others.
        print(e)