-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathytm_wd
executable file
·370 lines (315 loc) · 14 KB
/
ytm_wd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#!/usr/bin/env python3
"""
Yandex Transport Monitor - WebDriver Version
Very, very, very, very, very dirty hack thing.
Uses Selenium web automation tool to scan for public transport.
Exceptionally heavy, or maybe not so heavy if used in headless mode.
"""
# Workflow pipeline:
# 1. Get the page source code via Selenium Automation + WebDriver
# 2. Parse the page.
# 3. Save contents of the page to the database.
# 4. Delete the page source code
# 5. Wait (1 minute is enough).
# 6. Repeat.
# String Quotation Policy
# The string quotation policy is as follows:
# - Strings which are visible to end user should be double-quoted (print, log).
# - Strings which are "internal", such as values, dictionary keys etc. are single-quoted.
# - Do not mix single-quoted and double-quoted strings in one statement.
# - Since SQL queries usually contain single-quotes, it's better to put the whole query
# in double quotes.
import argparse
import contextlib
import csv
import datetime
import os
import re
import signal
import sys
import time
import uuid

import selenium
import setproctitle
from selenium import webdriver

from ytm_pageparser import YTMPageParser
class Application:
    """
    Yandex traffic monitor application.

    Loads a Yandex Maps stop page in headless Chrome, hands the saved page
    source to YTMPageParser, prints the parsed result and, optionally, asks
    the parser to store it in a PostgreSQL database. The cycle repeats every
    `wait_time` seconds unless `run_once` is set.
    """
    VERSION = "1.1.7"

    # Active WebDriver instance; set by run() so that sigint_handler() can close it.
    driver = None

    # Return codes of configure()
    RESULT_OK = 0
    RESULT_VERSION = 1
    ERROR_NO_URL = 2

    # Log levels; a message is printed when self.verbose >= its level.
    LOG_ERROR = 1
    LOG_WARNING = 2
    LOG_INFO = 3
    LOG_DEBUG = 4

    # Prefix printed in front of a message for each known log level.
    _LOG_PREFIXES = {
        LOG_ERROR: "ERROR:",
        LOG_WARNING: "WARNING:",
        LOG_INFO: "INFO:",
        LOG_DEBUG: "DEBUG:",
    }

    # The leading greedy '.*' anchors on the LAST 'stopId' in the URL; the question
    # marks make the remaining groups non-greedy. The value ends at the next '&'
    # or at the end of the URL, so 'stopId' may also be the last query parameter.
    _STATION_ID_RE = re.compile(r".*stopId.*?=(.*?)(?:&|$)")

    def __init__(self):
        setproctitle.setproctitle('ytm_wd')

        # Verbosity level
        # 0 - stdout output only, no errors/warnings/info/debug
        # 1 - show errors
        # 2 - show warnings
        # 3 - show info
        # 4 - full debug
        self.verbose = 1

        # Chrome WebDriver location
        self.chrome_driver_location = '/usr/bin/chromedriver'

        # Time to wait between queries, in seconds.
        self.wait_time = 60

        # Run the script only once, wait_time will be ignored
        self.run_once = False

        # Output mode; only 'plain' and 'csv' are handled by print_data().
        self.out_mode = 'plain'

        # Yandex Maps query URL, if empty the program will exit.
        self.url = ''

        # Station_id, if empty - will try and get station_id from url.
        self.station_id = ''

        # Save data to database (PostgreSQL)?
        self.save_to_database = False

        # Database settings
        self.db_host = 'localhost'
        self.db_port = '5432'
        self.db_name = 'ytmonitor'
        self.db_username = 'ytmonitor'
        self.db_password = 'password'

        # Cleared to stop the polling loop in run().
        self.is_running = True

        # Timestamp of the current query, set at the start of every polling cycle.
        self.time_now = None

        # Temporary file the page source is dumped to; unique per process instance.
        self._savefile = 'page-' + str(uuid.uuid4()) + '.html'

        signal.signal(signal.SIGINT, self.sigint_handler)

    def log(self, log_level, text):
        """
        Log data to stderr.

        :param log_level: log level, see Application.LOG_SOMETHING constants.
                          Unknown levels are ignored.
        :param text: text to print
        :return: nothing
        """
        prefix = self._LOG_PREFIXES.get(log_level)
        if prefix is not None and self.verbose >= log_level:
            print(prefix, str(text), file=sys.stderr)

    def _cleanup(self):
        """
        Quit the WebDriver (if one is running) and delete the temporary page file.

        Safe to call more than once: it runs from sigint_handler() and again from
        the cleanup path of run() while SystemExit propagates.
        """
        driver, self.driver = self.driver, None
        try:
            if driver is not None:
                driver.quit()
        finally:
            # The file does not exist until the first page has been saved.
            with contextlib.suppress(FileNotFoundError):
                os.remove(self._savefile)

    def sigint_handler(self, _signal, _time):
        """
        SIGINT signal handler: release resources and exit with code 0.

        :param _signal: signal number (unused)
        :param _time: second argument passed by the signal module; despite the
                      name this is the interrupted stack frame, not a time (unused)
        :return: nothing, raises SystemExit
        """
        self.log(self.LOG_INFO, "SIGINT received! Graceful termination is in progress...")
        self.is_running = False
        self._cleanup()
        self.log(self.LOG_INFO, "Program terminated.")
        sys.exit(0)

    @staticmethod
    def _derive_station_id(url):
        """
        Extract the station ID from a Yandex Maps URL.

        :param url: URL expected to contain a '...stopId...=<value>' query parameter
        :return: the station ID, or an empty string if the URL has no such parameter
        """
        found = Application._STATION_ID_RE.match(url)
        return found.group(1) if found is not None else ''

    def configure(self, argv=None):
        """
        Parse command line arguments and store them on the instance.

        :param argv: list of arguments to parse; None (default) means sys.argv[1:]
        :return: RESULT_OK on success, RESULT_VERSION if only the version was
                 printed, ERROR_NO_URL if no URL was supplied
        """
        parser = argparse.ArgumentParser(description="Yandex Transport Monitor, this script "
                                                     "monitors one stop for incoming public "
                                                     "transport prognosis. Designed to be used "
                                                     "with Yandex maps for Russian GPS/GNSS "
                                                     "enabled public transport. Is itself a "
                                                     "very dirty hack, since Yandex.Transport "
                                                     "does not provide any API.\n"
                                                     "This script reqires comatible versions of "
                                                     "Chrome/Chromium web browser and "
                                                     "Chrome Web Driver.")
        parser.add_argument("--version", action="store_true", default=False,
                            help="Show version info.")
        # type=int makes argparse reject non-numeric values with a usage error
        # instead of a ValueError traceback later on.
        parser.add_argument("--verbose", default=self.verbose, type=int,
                            help="Verbosity level. \n"
                                 " 0: Output only \n"
                                 " 1: Error messages only\n"
                                 " 2: Errors and warnings\n"
                                 " 3: Errors, warning and info\n"
                                 " 4: Debug")
        parser.add_argument("--chrome_driver_location", default=self.chrome_driver_location,
                            help="Chrome/Chromium Web Driver location.")
        parser.add_argument("--wait_time", default=self.wait_time, type=int,
                            help="Time to wait between queries.")
        parser.add_argument("--out_mode", default=self.out_mode,
                            help="Output mode (plain, csv)")
        parser.add_argument("--run_once", action="store_true", default=self.run_once,
                            help="Run the script only once.")
        parser.add_argument("--url", default=self.url,
                            help="URL to query, should not be empty.")
        parser.add_argument("--station_id", default=self.station_id,
                            help="Station ID, will try to derive from URL if empty.")
        parser.add_argument("--save_to_database", action="store_true",
                            default=self.save_to_database,
                            help="Save data to PostgreSQL database.")
        parser.add_argument("--db_host", default=self.db_host,
                            help="PostgreSQL Database host address.")
        parser.add_argument("--db_port", default=self.db_port,
                            help="PostgreSQL Database port.")
        parser.add_argument("--db_name", default=self.db_name,
                            help="PostgreSQL Database name.")
        parser.add_argument("--db_username", default=self.db_username,
                            help="PostgreSQL Database username.")
        parser.add_argument("--db_password", default=self.db_password,
                            help="PostgreSQL Database password.")

        # Parsing the CLI arguments
        args = parser.parse_args(argv)

        if args.version:
            print("Yandex Transport Monitor, Version:", str(self.VERSION))
            return self.RESULT_VERSION

        self.verbose = args.verbose
        self.chrome_driver_location = args.chrome_driver_location
        self.wait_time = args.wait_time
        self.run_once = args.run_once
        self.out_mode = args.out_mode

        self.url = args.url
        if self.url == "":
            self.log(self.LOG_ERROR, "No URL being provided, terminating the program.")
            return self.ERROR_NO_URL

        self.station_id = args.station_id
        if self.station_id == "":
            self.log(self.LOG_WARNING, "No station ID provided, trying to derive from URL.")
            self.station_id = self._derive_station_id(self.url)
            if self.station_id == "":
                self.log(self.LOG_WARNING, "Could not parse Station ID, it will be left blank.\n"
                                           "You sure this is a Yandex Maps URL of public transport top?")
            else:
                self.log(self.LOG_INFO, "Station ID value = " + str(self.station_id))

        # Database settings
        self.save_to_database = args.save_to_database
        self.db_host = args.db_host
        self.db_port = args.db_port
        self.db_name = args.db_name
        self.db_username = args.db_username
        self.db_password = args.db_password

        # Printing the configuration in debug mode
        self.log(self.LOG_DEBUG, "Configuration values:")
        self.log(self.LOG_DEBUG, " chrome_driver_location = " + "\""
                 + str(self.chrome_driver_location) + "\"")
        self.log(self.LOG_DEBUG, " wait_time = " + str(self.wait_time))
        self.log(self.LOG_DEBUG, " url = " + str(self.url))
        self.log(self.LOG_DEBUG, " station_id = " + str(self.station_id))
        self.log(self.LOG_DEBUG, " out_mode = " + str(self.out_mode))
        self.log(self.LOG_DEBUG, "Use database: " + str(self.save_to_database))
        if self.save_to_database:
            self.log(self.LOG_DEBUG, " db_host = " + "\"" + str(self.db_host) + "\"")
            self.log(self.LOG_DEBUG, " db_port = " + str(self.db_port))
            self.log(self.LOG_DEBUG, " db_name = " + "\"" + str(self.db_name) + "\"")
            self.log(self.LOG_DEBUG, " db_username = " + "\"" + str(self.db_username) + "\"")
            # NOTE(review): this writes the database password to stderr in clear
            # text when running with --verbose 4; consider masking it.
            self.log(self.LOG_DEBUG, " db_password = " + "\"" + str(self.db_password) + "\"")
        if self.run_once:
            self.log(self.LOG_DEBUG, "Run once: " + str(self.run_once))

        return self.RESULT_OK

    def print_data(self, mode, data):
        """
        Print the data in the desired format.

        :param mode: print mode ('plain', 'csv'); any other value prints nothing
                     and logs a warning
        :param data: data tuple to print, containing parsed info of transit schedule.
        :return: nothing
        """
        printers = {
            'plain': self.print_data_plain,
            'csv': self.print_data_csv,
        }
        printer = printers.get(mode)
        if printer is None:
            self.log(self.LOG_WARNING,
                     "Unknown output mode \"" + str(mode) + "\", nothing will be printed.")
            return
        printer(data)

    def print_data_plain(self, data):
        """
        Print data in plain format: a timestamp / station header, one line per
        record, then an empty separator line.

        :param data: data tuple to print, containing parsed info of transit schedule.
        :return: nothing
        """
        print("Timestamp :", self.time_now)
        print("Station ID:", self.station_id)
        for line in data:
            print(line)
        print("")

    def print_data_csv(self, data):
        """
        Print data in CSV format, one row per record.

        Each row is: first field of the record, timestamp, station ID, then the
        remaining fields of the record. Fields containing commas or quotes are
        quoted by the csv module; empty records are skipped.

        :param data: data tuple to print, containing parsed info of transit schedule.
        :return: nothing
        """
        writer = csv.writer(sys.stdout, lineterminator="\n")
        for line in data:
            if not line:
                continue
            row = [line[0], self.time_now, self.station_id] + list(line[1:])
            # Explicit str() keeps None rendered as "None", as plain print() would.
            writer.writerow([str(field) for field in row])

    def _save_to_database(self, parser):
        """
        Ask the page parser to store its parsed data in the configured database.

        :param parser: YTMPageParser instance that has already parsed the page
        :return: nothing
        """
        parser.set_database(self.db_host,
                            self.db_port,
                            self.db_name,
                            self.db_username,
                            self.db_password)
        self.log(self.LOG_DEBUG, "Saving to database...")
        db_result = parser.write_to_database(self.station_id, self.time_now, parser.data)
        if db_result == YTMPageParser.DB_RESULT_OK:
            self.log(self.LOG_DEBUG, "Saved!")
        elif db_result == YTMPageParser.DB_RESULT_NODATA:
            self.log(self.LOG_DEBUG, "No data supplied!")
        else:
            self.log(self.LOG_DEBUG, "This should not happen...")

    def _poll_once(self):
        """
        Run one polling cycle: load the page, dump it to the temporary file,
        parse it, print the result and optionally save it to the database.
        """
        self.time_now = datetime.datetime.now()
        self.driver.get(self.url)

        # The parser is constructed from a file name, so the page goes to disk first.
        with open(self._savefile, 'w', encoding='utf-8') as page_file:
            page_file.write(self.driver.page_source)

        # Parse the page
        parser = YTMPageParser(self._savefile)
        parsed = parser.parse()

        # Printing the result
        self.log(self.LOG_DEBUG, "Printing output...")
        if self.verbose >= self.LOG_DEBUG:
            print("", file=sys.stderr)
        self.print_data(self.out_mode, parsed)

        # Write to database
        if self.save_to_database:
            self._save_to_database(parser)

    def run(self):
        """
        Main body of the script.

        Starts headless Chrome and polls the URL until is_running is cleared
        (immediately after the first cycle when run_once is set). Always ends
        with SystemExit: code 0 on normal termination, code 1 if Selenium raised
        a WebDriverException. The driver and the temporary page file are released
        in both cases.
        """
        try:
            try:
                options = webdriver.ChromeOptions()
                options.add_argument('--headless')
                self.log(self.LOG_INFO, "Running Chrome in headless mode...")
                if self.verbose >= self.LOG_INFO:
                    print("", file=sys.stderr)
                # Stored on the instance (not in a local) so sigint_handler() can quit it.
                # NOTE(review): the positional driver path and `chrome_options=` are the
                # legacy calling convention; confirm against the installed Selenium version.
                self.driver = webdriver.Chrome(self.chrome_driver_location,
                                               chrome_options=options)

                while self.is_running:
                    self._poll_once()
                    # Exit if run_once was specified.
                    if self.run_once:
                        self.is_running = False
                    else:
                        self.log(self.LOG_DEBUG,
                                 "Waiting " + str(self.wait_time) + " seconds.")
                        time.sleep(self.wait_time)
            finally:
                # Runs on success, on WebDriverException and on SystemExit from SIGINT.
                self._cleanup()
        except selenium.common.exceptions.WebDriverException as err:
            self.log(self.LOG_ERROR, "Exception:" + str(err))
            sys.exit(1)

        self.log(self.LOG_INFO, "Program terminated.")
        sys.exit(0)
# pylint: disable=C0103
# Script entry point: configure from the command line, then start monitoring.
if __name__ == '__main__':
    application = Application()
    status = application.configure()
    if status != Application.RESULT_OK:
        # Printing the version is a successful outcome; any other non-OK
        # status is passed on as the process exit code.
        sys.exit(0 if status == Application.RESULT_VERSION else status)
    application.run()