-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsecgen.py
executable file
·242 lines (217 loc) · 10.6 KB
/
secgen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env python
#
# secgen.py:
# Tweet/Toot the daily schedule of the UN Secretary-General
#
# Copyright (c) 2018 Matthew Somerville.
# http://www.dracos.co.uk/
import re
import html.entities
import arrow
from bs4 import BeautifulSoup
from bot import SchedulerBot, Event
# Matches a time of day such as "10:30", "10.30 a.m.", "3 p.m." or "12 noon",
# optionally wrapped in asterisks.  The digits must be followed by at least
# one minute part or am/pm/noon marker, so a bare number does not match.
# Groups: 1 = the whole matched text (including trailing whitespace),
#         2 = hour digits, 3 = minute digits, 4 = the am/pm/noon marker.
# Groups 3 and 4 are None when that part is absent.
# Written as a raw string so \d, \s, \. and \* reach the regex engine intact
# instead of being (invalid) string-literal escapes.
REGEX_TIME = re.compile(
    r'(\*?(\d+)(?:(?::|\.)\s*(\d+)|\s*(a\.?m\.?|p\.?m\.?|noon))+\.?\s*\*?)')
def remove_changing_bits(s):
    """Drop the leading part of ``s`` up to and including 'view-content'.

    Only the first occurrence of the marker is considered; text that does
    not contain the marker is returned unchanged.
    """
    leading_part = re.compile('(?s)^.*?view-content')
    return leading_part.sub('', s)
class SecgenBot(SchedulerBot):
    """Scheduler bot for the UN Secretary-General's daily appointments page."""

    # Path template for the locally stored schedule page.
    # NOTE(review): presumably filled in and used by SchedulerBot - confirm.
    localfile = '/home/sympl/scheduler/data/%s-schedule'
    # Case-insensitive pattern of phrases found on error/maintenance pages.
    # NOTE(review): presumably used by SchedulerBot to reject bad fetches - confirm.
    not_got = ('(?i)Proxy Error|urgent maintenance|Not Found|Service Temporarily Unavailable' +
               '|Internal server error|HTTP Error 50[17]|SQLState')

    def alert(self, event, now):
        """Return True while ``now`` is within five minutes after ``event.time``."""
        return now >= event.time and now < event.time.shift(minutes=5)

    def fetch_diff(self, a, b):
        """Return True if two page bodies differ after the 'view-content' marker."""
        return remove_changing_bits(a) != remove_changing_bits(b)

    def fetch(self):
        """Download the appointments page and pass it to ``fetch_check_file``."""
        new = self.get_contents('https://www.un.org/sg/en/content/sg/appointments-secretary-general')
        return self.fetch_check_file(new)

    def parse(self, warn=0):
        """Parse the stored schedule page into a list of ``Event`` objects.

        Rows that start with a time become events.  Rows without a time that
        start with '- ', 'Mr' or 'Amb.' are treated as a continuation and are
        appended to the previous event's status; any other row is skipped.
        Returns an empty list when there is no page content or no schedule
        block in it.
        """
        d = self.parse_get_file(warn)
        if not d:
            return []

        soup = BeautifulSoup(d, 'html.parser')
        table = soup.find('div', 'view-schedules')
        if not table:
            return []

        events = []
        pastnoon = False
        date = arrow.get(table.find('span', 'date-display-single')['content'], 'YYYY-MM-DDTHH:mm:ssZZ')
        for row in table('tr'):
            row = parsecell(row.renderContents().decode('utf-8'))
            m = REGEX_TIME.match(row)
            if not m:
                # A continuation row needs an earlier event to attach to;
                # without one it is skipped instead of raising IndexError.
                if events and (row[0:2] in ('- ', 'Mr') or row[0:4] == 'Amb.'):
                    event = parsecell(row, True)
                    last = events[-1]
                    # Rebuild an Event (not a bare tuple) so alert() can still
                    # read .time.  NOTE(review): assumes Event exposes the
                    # .status it was constructed with - confirm in bot.py.
                    events[-1] = Event(time=last.time, status='%s %s' % (last.status, event))
                continue
            time = m.group(1)
            # Strip only the leading time; the same text may legitimately
            # appear again inside the description.
            text = row[m.end():]
            time, pastnoon = parsetime(time, date, pastnoon)
            event = parsecell(text, True)
            event = prettify(event)
            events.append(Event(time=time, status=event))
        return events
def parsetime(time, date, pastnoon):
    """Convert a schedule time string into a moment on ``date``.

    Args:
        time: Text containing a time matched by ``REGEX_TIME`` (or the bare
            word 'noon').
        date: Object providing ``replace(hour=..., minute=...)``; the result
            is built from it.
        pastnoon: True if an earlier entry was already at or after noon, in
            which case a time with no am/pm marker is taken as afternoon.

    Returns:
        A ``(moment, pastnoon)`` tuple; ``pastnoon`` becomes True once a
        'pm' or 'noon' marker has been seen.

    Raises:
        ValueError: if ``time`` contains no recognisable time.
    """
    m = REGEX_TIME.search(time)
    if m:
        _whole, hour, minute, pm = m.groups()
        if minute is None:
            minute = 0
        if len(hour) in (3, 4):
            # Run-together digits such as "930" or "1030": the last two
            # digits are the minutes.
            hour, minute = hour[:-2], hour[-2:]
    elif time == 'noon':
        hour, minute, pm = 12, 0, 'noon'
    else:
        raise ValueError('Unrecognised time: %r' % (time,))

    hour = int(hour)
    minute = int(minute)

    # The regex accepts "am", "a.m", "am." and "a.m." (likewise for pm);
    # dropping the dots lets every spelling compare equal.
    marker = pm.replace('.', '') if pm else None

    # Only shift genuine 12-hour values: "12:30" after noon must stay 12,
    # otherwise the hour would become 24 and be rejected by replace().
    if not marker and pastnoon and hour < 12:
        hour += 12
    if marker == 'pm' and hour < 12:
        hour += 12
    if marker == 'am' and hour == 12:
        hour -= 12
    if marker in ('pm', 'noon'):
        pastnoon = True

    return date.replace(hour=hour, minute=minute), pastnoon
def titlecaseifuppercase(s):
    """Tone down an all-capitals word.

    'OF' becomes 'of'; any other token made up solely of the letters A-Z
    and longer than two characters is title-cased.  Everything else
    (mixed case, short tokens such as 'UN', punctuation) is returned as is.
    """
    if s == 'OF':
        return 'of'
    all_caps_word = len(s) > 2 and re.match('[A-Z]+$', s)
    return s.title() if all_caps_word else s
def prettify(s):
    """Rewrite a schedule entry as a first-person "-ing" status line.

    The entry is cleaned up (leading 'approx. ' removed, shouted words
    title-cased, a known misspelling fixed) and then matched against a
    list of phrasings in order; the first one that applies decides the
    result.  Entries that match none of the specific phrasings are
    prefixed with 'Meeting ', 'Meeting the ' or 'Attending the ' depending
    on who or what they appear to describe.

    All patterns are raw strings so that regex escapes such as ``\\b`` and
    ``\\.`` are seen by the regex engine rather than being interpreted (or
    rejected) as string-literal escapes.
    """
    s = re.sub(r'^approx\. ', '', s)
    # The capturing split keeps the spaces and parentheses as tokens, so the
    # join puts the text back together exactly around the re-cased words.
    s = ''.join(map(titlecaseifuppercase, re.split('([() ])', s)))
    s = s.replace('Secretery', 'Secretary')

    # Already in the desired form, unless a "make remarks" trailer follows.
    if (re.match('Addressing|Meeting (with|on)|Visiting|Visit to|Trilateral Meeting', s)
            and not re.search('Secretary-General (will|to) make remarks', s)):
        return s
    if re.match('Chairing of the ', s):
        return re.sub('Chairing of the ', 'Chairing the ', s)
    if re.match('Joint press encounter by the Secretary-General with: ', s):
        return re.sub('Joint press encounter by the Secretary-General with: ', 'Joint press encounter with ', s)
    if re.match('Joint Declaration on (.*?) by the Secretary-General and ', s):
        return re.sub('Joint (.*?) by the Secretary-General and ', r'Joint \1 with ', s)
    if re.match('(The )?Secretary-General[^a-zA-Z]*(to|will) address ', s):
        return re.sub('(The )?Secretary-General[^a-zA-Z]*(to|will) address ', 'Addressing ', s)
    if re.match('(The )?Secretary-General (to|will) make ', s):
        return re.sub('(The )?Secretary-General (to|will) make ', 'Making ', re.sub(r'\bhis\b', 'my', s))
    if re.match('Secretary-General to attend ', s):
        return re.sub('Secretary-General to attend ', 'Attending ', s)
    if re.match('.*? hosted by the Secretary-General ', s):
        return re.sub('(.*?) hosted by the Secretary-General ', r'Hosting \1 ', s)
    if re.match('Secretary-General to host ', s):
        return re.sub('Secretary-General to host ', 'Hosting ', s)
    if re.match('The Secretary-General departs ', s):
        return re.sub('The Secretary-General departs ', 'Departing ', s)
    if re.match('Secretary-General to brief ', s):
        return re.sub('Secretary-General to brief ', 'Briefing ', s)
    if re.search('to hear a briefing by the Secretary-General', s):
        return 'Briefing a ' + re.sub('to hear a briefing by the Secretary-General ', '', s)
    if re.match('Secretary-General’s briefing to ', s):
        return re.sub('Secretary-General’s briefing to ', 'Briefing to ', s)
    if re.match('Secretary-General to speak at ', s):
        return re.sub('Secretary-General to speak at ', 'Speaking at ', s)
    if re.match('Secretary-General to speak to ', s):
        return re.sub('Secretary-General to speak to ', 'Speaking to ', s)
    if re.match(r"Secretary-General's opening statement at ", s):
        return re.sub(r"Secretary-General's opening statement at his ", 'Making opening statement at my ', s)
    if re.match(r"Secretary-General's closing statement at ", s):
        return re.sub(r"Secretary-General's closing statement at his ", 'Making closing statement at my ', s)
    if re.match('Secretary-General to deliver ', s):
        return re.sub('Secretary-General to deliver ', 'Delivering ', s)
    if re.match('Secretary-General will hold ', s):
        return re.sub('Secretary-General will hold ', 'Holding ', s)
    if re.match('Secretary-General to give ', s):
        return re.sub('Secretary-General to give ', 'Giving ', s)
    if re.match('Drop by at ', s):
        return re.sub('Drop by at ', 'Dropping by ', s)
    if re.match(r"Remarks by the Secretary-General |SG remarks at|"
                r"Secretary(-| )General'?s? (to (make|give) )?remarks |Welcoming Remarks ", s):
        return re.sub((r"Remarks by the Secretary-General |SG remarks |"
                       r"Secretary(-| )General'?s? (to (make|give) )?remarks |Welcoming Remarks "),
                      'Making remarks ', s)

    # "<event> - Secretary-General to make [opening] remarks" trailer.
    # The separator may be an en dash (U+2013), a hyphen or an opening
    # bracket.  The ".\x80\x93" alternative is kept from the earlier
    # byte-oriented pattern so text it used to match still matches.
    trailer = (r' (?:\u2013 |.\x80\x93 |- |\[|{|\()(?:The )?Secretary-General (?:to|will) (?:make|deliver) '
               r'([Oo]pening |closing )?[rR]emarm?ks(\]|}|\))?')
    m = re.search(trailer + r'\.?$', s)
    if m:
        new = 'Making %sremarks at ' % (m.group(1) or '').lower()
        s = re.sub('^Addressing ', '', s)
        if not re.match('(?i)The ', s):
            new += 'the '
        return re.sub('^(.*)' + trailer, new + r'\1', s)
    if re.match(r'\[Remarks at\] ', s):
        return re.sub(r'\[Remarks at\] ', 'Making remarks at ', s)

    if (re.search('(?i)Presentation of credential', s) or re.match('Remarks at', s) or re.match('Election of', s)
            or re.match('Swearing[ -]in Ceremony', s)):
        # These read fine as they are.
        pass
    elif (re.search('(?<!on )Youth$|^Sages Group|Messengers$|^Group of Friends|^Leaders|^Chairmen|'
                    '^Permanent Representatives?|^Executive Secretaries|Board members|Contact Group|Envoys|Team$|'
                    '^Honou?rable|Interns|Order|Board of Trustees|Journalists$|Committee( of the .*Parliament)$|'
                    'Fellows$|^(UN )?Youth Delegates', s)
          # The inline (?i) flag must be at the start of the pattern; placing
          # it at the end is rejected by the re module on Python 3.11+.
          and not re.search('(?i)(president|photo opportunity|concert|luncheon|breakfast|event)', s)
          and not re.match('Meeting of|Joint meeting|Mr', s)):
        s = 'Meeting the %s' % s
    elif re.match(r'- Mr|His (Royal|Serene) Highness|President|Association of|Vuk|Queen|Prince|Major-General|'
                  r'His Excellency|His Eminence|His Holiness|His Majesty|Her Majesty|Their Majesties|Ambassador\b|'
                  r'H\.?R\.?H|H\. ?M\.|H\. ?H\.|H\.? ?E\.?|S\. ?E\.|Rev\.|The Very Rev|Sir|General (?!Assembly)|'
                  r'H\.S\.H|\.?Mr\.?|Mrs\.|Prof\.|Dr\.?\b|Lord|Lady|Justice|Professor|Ms\.?|Amb\.?\b|Mayor|Messrs\.|'
                  r'Senator|(The )?R(igh)?t\.? Hon(ou?rable)?\.?|The Hon\.|Hon\.|U\.S\. House|U\.S\. Senator|'
                  r'US Congressman|Judge|Cardinal|Archbishop|The Honou?rable|Rabbi|Lt\.|Major General|Lieutenant|'
                  r'Excelent|Metropolitan|Psy|Thura|Lang Lang|Bahey|Antti|Bishop|Pastor|Shaykh|Srgjan|Michel|'
                  r'Commissioner', s) and not re.search('(?i)luncheon', s):
        # Every piece above is raw: in a plain string "\b" is a backspace
        # character, which made the Ambassador/Dr/Amb alternatives unmatchable.
        s = re.sub(r'Amb\.', 'Ambassador', s)
        s = re.sub('^Amb ', 'Ambassador ', s)
        if re.match('The ', s):
            s = re.sub('^The', 'the', s)
        s = 'Meeting %s' % s
    elif (re.search('(?i)Delegation|Members', s)
          and not re.search('(?i)(Joint.*Meeting|Group Meeting|concert|luncheon|breakfast)', s)):
        s = 'Meeting the %s' % s
    elif (re.search(r'Elder|High Representative|Chairman\b|Secretary-General of the League|Senior Adviser|'
                    r'Special Adviser|Special Representative|Permanent Representative|Minister of|'
                    r'Secretary of State for|Administrator|CEO|National Adviser|Ambassador|students|Students', s)
          and not re.search('(?i)(concert|conversation|luncheon|breakfast|hosted by|hand-over|meeting|conference)',
                            s)):
        s = 'Meeting %s' % s
    elif re.match('The ', s):
        s = re.sub('^The ', 'Attending the ', s)
    else:
        s = 'Attending the %s' % s
    return s
def parsecell(s, d=False):
    """Reduce a fragment of schedule HTML to plain, single-spaced text.

    Args:
        s: HTML (or already stripped text) of a table row or cell.
        d: When true, ``<br />`` becomes ', ' and ``</p>`` becomes a space
            before the remaining tags are removed, so separate lines of a
            description stay separated.

    Returns:
        The text with a leading 'REV.1 ' marker and all tags removed,
        non-breaking spaces turned into ordinary spaces, runs of whitespace
        collapsed, surrounding spaces stripped and HTML entities decoded.
    """
    s = re.sub('^REV.1 ', '', s)
    # Non-breaking spaces, both as the two-character sequence U+00C2 U+00A0
    # and as a lone U+00A0.
    s = re.sub('\xc2\xa0', ' ', s)
    s = re.sub('\xa0', ' ', s)
    if d:
        s = re.sub("<br />", ", ", s)
        s = re.sub("</p>", " ", s)
    # NOTE(review): tag stripping is unconditional because parse() matches
    # times against the d=False output; confirm the extent of the `if d`
    # block against the upstream file.
    s = re.sub("<[^>]*>", "", s)
    # Turn the nbsp entity into an ordinary space here, before unescape()
    # would otherwise decode it to U+00A0.
    s = re.sub("&nbsp;", " ", s)
    s = re.sub("&quot;", '"', s)
    s = re.sub(r"\s+", " ", s)
    s = s.strip(" ")
    s = unescape(s)
    return s
def unescape(text):
    """Decode HTML character references and named entities in ``text``.

    Handles decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``)
    character references and named entities listed in
    ``html.entities.name2codepoint`` (``&amp;``).  Anything that cannot be
    decoded - an unknown name, malformed digits or a code point outside the
    valid range - is left in the text unchanged.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference.
            try:
                if ref[:3].lower() == "&#x":
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: bad digits or code point above 0x10FFFF;
                # OverflowError: number too large for chr() to accept.
                pass
        else:
            # Named entity.
            try:
                return chr(html.entities.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
# Script entry point: construct the bot with the identifier 'secgen' and call
# its run() method.  There is no __main__ guard, so this also executes when
# the module is imported.
SecgenBot('secgen').run()