From 746369d037c5e4aef7ac07fb0a9eb5b53a714787 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:37:18 -0400 Subject: [PATCH] ca_on_markham: Align with original code --- ca_on_markham/people.py | 81 +++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index 8fe1e105..276f91f0 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -7,45 +7,50 @@ class MarkhamPersonScraper(CanadianScraper): def scrape(self): + regional_councillor_seat_number = 1 + + page = self.lxmlize(COUNCIL_PAGE) + yield self.scrape_mayor(MAYOR_PAGE) - groups = self.lxmlize(COUNCIL_PAGE).xpath( - '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]' + councillors = page.xpath( + '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]/div' ) - assert len(groups) == 2, "No councillors found" + assert len(councillors), "No councillors found" - regional_councillor_seat_number = 1 - for i, group in enumerate(groups): - for councillor in group: - name = councillor.xpath(".//h3/text()")[0].strip() - district = councillor.xpath(".//p/text()")[0].strip() + for councillor in councillors: + name = councillor.xpath(".//h3/text()")[0].strip() + district = councillor.xpath(".//p/text()")[0].strip() - if i == 0: - role = "Regional Councillor" - district = f"Markham (seat {regional_councillor_seat_number})" - regional_councillor_seat_number += 1 - else: - role = "Councillor" - district = district.replace("Councillor", "").strip() + if "Ward" in district: + district = district.replace("Councillor", "").strip() + role = "Councillor" + elif "Regional" in district: + role = "Regional Councillor" + district = f"Markham (seat {regional_councillor_seat_number})" + regional_councillor_seat_number += 1 + else: + role = district + district = "Markham" - image = councillor.xpath(".//img/@src")[0] - url = councillor.xpath(".//a/@href")[0] + image = councillor.xpath(".//img/@src")[0] + url = councillor.xpath(".//a/@href")[0] - address, phone, email, links = self.get_contact(url) + address, phone, email, links = self.get_contact(url) - p = Person(primary_org="legislature", name=name, district=district, role=role) - p.add_source(COUNCIL_PAGE) - p.add_source(url) + p = Person(primary_org="legislature", name=name, district=district, role=role) + p.add_source(COUNCIL_PAGE) + p.add_source(url) - p.image = image - p.add_contact("address", address, "legislature") - p.add_contact("voice", phone, "legislature") - p.add_contact("email", email) + p.image = image + p.add_contact("address", address, "legislature") + p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) - for link in links: - p.add_link(link) + for link in links: + p.add_link(link) - yield p + yield p def get_contact(self, url): page = self.lxmlize(url) @@ -56,22 +61,20 @@ def get_contact(self, url): links = [] if contact_node.xpath('.//span[@class="address-line1"]/text()'): - address = ( - contact_node.xpath('.//span[@class="address-line1"]/text()')[0] - + " " - + contact_node.xpath('.//span[@class="locality"]/text()')[0] - + " " - + contact_node.xpath('.//span[@class="administrative-area"]/text()')[0] - + " " - + contact_node.xpath('.//span[@class="postal-code"]/text()')[0] - + " " - + contact_node.xpath('.//span[@class="country"]/text()')[0] + address = " ".join( + ( + contact_node.xpath('.//span[@class="address-line1"]/text()')[0], + contact_node.xpath('.//span[@class="locality"]/text()')[0], + contact_node.xpath('.//span[@class="administrative-area"]/text()')[0], + contact_node.xpath('.//span[@class="postal-code"]/text()')[0], + contact_node.xpath('.//span[@class="country"]/text()')[0], + ) ) else: contact_node = page.xpath( '//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]' )[0] - address = contact_node.xpath(".//p/text()")[0] + " " + contact_node.xpath(".//p/text()")[1] + address = f'{contact_node.xpath(".//p/text()")[0]} {contact_node.xpath(".//p/text()")[1]}' links = get_links(contact_node) phone = self.get_phone(contact_node)