-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathweb_data_scraper.py
38 lines (33 loc) · 1.95 KB
/
web_data_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
import urllib.request
import os.path
# Country names matched as substrings against league page URLs; only league
# links containing one of these are scraped.
WANTED_COUNTRIES = [
    "england",
    "germany",
    "italy",
    "spain",
    "france",
    "portugal",
    "netherlands",
    "turkey",
    "greece",
]

# Upper bound on how many season headers (counted from the top of a league
# page) are processed per league.
SEASON_TO_DOWNLOAD_COUNT = 11
def download_data_from_web(download_to_path):
    """Download match data files for every wanted league.

    Opens the football-data.co.uk data index page in a Chrome session,
    collects the league page links from it, and calls
    ``download_info_about_matches`` for each link whose URL contains one of
    the names in ``WANTED_COUNTRIES``.

    Args:
        download_to_path: Destination directory, passed through unchanged to
            ``download_info_about_matches``.
    """
    web_handler: WebDriver = webdriver.Chrome()
    try:
        web_handler.get("http://www.football-data.co.uk/data.php")
        # The generic ``find_elements(by, value)`` call is available in both
        # Selenium 3 and 4, whereas ``find_elements_by_xpath`` was removed in
        # Selenium 4. "xpath" is the locator strategy string.
        league_link_elements = web_handler.find_elements(
            "xpath", "//td[@valign='top'][2]//table//a"
        )
        # Read every href before the loop below starts loading other pages
        # with the same driver.
        league_links = [
            element.get_attribute("href") for element in league_link_elements
        ]
        for link in league_links:
            # An anchor without an href yields None; skip it instead of
            # failing on the substring test.
            if link and any(country in link for country in WANTED_COUNTRIES):
                download_info_about_matches(web_handler, link, download_to_path)
    finally:
        # quit() ends the whole driver session; close() would only close the
        # current window and leave the driver process running.
        web_handler.quit()
def download_info_about_matches(web_handler, link, download_to_path):
    """Download the data files listed under the first seasons of a league page.

    Loads ``link`` in ``web_handler``, treats every ``<i>`` element found
    directly inside a ``<td>`` as a season header, and for each of the first
    ``SEASON_TO_DOWNLOAD_COUNT`` headers downloads up to two files linked
    right after it. Each file is saved as
    ``<link text><season years>.<extension>`` inside ``download_to_path``.

    Files under the first header are always downloaded again; files under
    later headers are downloaded only when no file with that name exists yet.

    Args:
        web_handler: Selenium WebDriver used to load and query the page.
        link: URL of the league page to scrape.
        download_to_path: Directory in which the downloaded files are stored.
    """
    web_handler.get(link)
    # ``find_elements("xpath", ...)`` works on Selenium 3 and 4; the
    # ``find_elements_by_xpath`` helper was removed in Selenium 4.
    season_headers = web_handler.find_elements("xpath", "//td/i")
    # Slice instead of testing the index on every header so the loop stops
    # once the season limit is reached.
    for i, header in enumerate(season_headers[:SEASON_TO_DOWNLOAD_COUNT]):
        header_text = header.text
        header_words = header_text.split()
        if len(header_words) < 2:
            # The second word is expected to hold the season years; skip
            # headers that do not have one.
            continue
        season_years = header_words[1].replace("/", "")
        # Select at most the first two <a> siblings after this header whose
        # nearest preceding <i> sibling carries this header's text.
        season_links_xpath = (
            "./following-sibling::a[position()<=2 and preceding-sibling::i[text()='"
            + header_text
            + "' and position() = 1]]"
        )
        for league_download_element in header.find_elements(
            "xpath", season_links_xpath
        ):
            download_link = league_download_element.get_attribute("href")
            if not download_link:
                # No href means there is nothing to download.
                continue
            extension = download_link.split(".")[-1]
            file_name = league_download_element.text + season_years + "." + extension
            # os.path.join uses the platform's separator rather than a
            # hard-coded Windows backslash.
            full_file_name = os.path.join(download_to_path, file_name)
            # NOTE(review): index 0 is presumably the season still in
            # progress, hence the unconditional refresh; confirm.
            if i == 0 or not os.path.isfile(full_file_name):
                urllib.request.urlretrieve(download_link, full_file_name)