From f77ea69f27735b49a0644e28d30a2697c417b8be Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 17 Apr 2024 11:33:43 -0500 Subject: [PATCH 1/3] remove dynamic fields from plugin template --- .../xmls/plugin_indexing_template.xml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml index a9b5bd86..b7a9ce63 100644 --- a/config_generation/xmls/plugin_indexing_template.xml +++ b/config_generation/xmls/plugin_indexing_template.xml @@ -31,22 +31,8 @@ 10 -1 -1 - true - false - false - false - false - false - true - true - false - true - true true true - false - 1 - 1000 ms true no @@ -54,7 +40,6 @@ false false - false false false @@ -224,8 +209,6 @@ false - true - false From 35036ac21f716b44dcbf3a21d4a3f3017d41a33e Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 17 Apr 2024 14:56:04 -0500 Subject: [PATCH 2/3] add strict version of get_tag_value, add dynamic plugin indexer generator --- config_generation/db_to_xml.py | 71 ++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 4 deletions(-) diff --git a/config_generation/db_to_xml.py b/config_generation/db_to_xml.py index 87269dc0..844f9f9c 100644 --- a/config_generation/db_to_xml.py +++ b/config_generation/db_to_xml.py @@ -23,12 +23,31 @@ def _get_tree(self, xml_string) -> ET.ElementTree: """takes the path of an xml file and opens it as an ElementTree object""" return ET.ElementTree(ET.fromstring(xml_string)) - def get_tag_value(self, tag_name: str) -> list: + def get_tag_value(self, tag_name: str, strict: bool = False) -> str | list[str]: """ - tag_name can be either the top level tag - or you can get a child by saying 'parent/child' + Retrieves the value of the specified XML tag. If 'strict' is True, the function will + raise an error if more than one value is found, and it will return the single value. + + Parameters: + - tag_name (str): Can be either the top level tag or a path specifying a child tag, e.g., 'parent/child'. + - strict (bool): If True, raises an error when more than one value is found, or if no values are found. + + Returns: + - str: The text of the single XML element matching the tag_name if strict is True and exactly one match exists. + + Raises: + - ValueError: If 'strict' is True and either no values or more than one value is found. """ - return [element.text for element in self.xml_tree.findall(tag_name)] + + elements = self.xml_tree.findall(tag_name) + if strict: + if len(elements) == 0: + raise ValueError(f"No elements found for the tag '{tag_name}'") + elif len(elements) > 1: + raise ValueError(f"Multiple elements found for the tag '{tag_name}': expected exactly one.") + return elements[0].text + else: + return [element.text for element in elements] def _add_declaration(self, xml_string: str): """adds xml declaration to xml string""" @@ -129,6 +148,50 @@ def convert_template_to_scraper(self, collection) -> None: scraper_config = self.update_config_xml() return scraper_config + def convert_template_to_plugin_indexer(self, scraper_editor) -> None: + """ + assuming this class has been instantiated with the scraper_template.xml + """ + + transfer_fields = [ + "KeepHashFragmentInUrl", + "CorrectDomainCookies", + "IgnoreSessionCookies", + "DownloadImages", + "DownloadMedia", + "DownloadCss", + "DownloadFtp", + "DownloadFile", + "IndexJs", + "FollowJs", + "CrawlFlash", + "NormalizeSecureSchemesWhenTestingVisited", + "RetryCount", + "RetryPause", + "AddBaseHref", + "AddMetaContentType", + "NormalizeUrls", + ] + + double_transfer_fields = [ + ("UrlAccess", "AllowXPathCookies"), + ("UrlAccess", "UseBrowserForWebRequests"), + ("UrlAccess", "UseHttpClientForWebRequests"), + ] + + for field in transfer_fields: + print(field, scraper_editor.get_tag_value(field, strict=True)) + self.update_or_add_element_value(field, scraper_editor.get_tag_value(field, strict=True)) + + for parent, child in double_transfer_fields: + print(parent, child, scraper_editor.get_tag_value(f"{parent}/{child}", strict=True)) + self.update_or_add_element_value( + f"{parent}/{child}", scraper_editor.get_tag_value(f"{parent}/{child}", strict=True) + ) + + scraper_config = self.update_config_xml() + return scraper_config + def convert_template_to_indexer(self, collection) -> None: """ assuming this class has been instantiated with the indexer_template.xml From 8ba0f00cff07957ad3b36930de2527dd09f43740 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 17 Apr 2024 14:57:07 -0500 Subject: [PATCH 3/3] add code to dynamically create the plugin config --- sde_collections/models/collection.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 999203e1..a70ffd0f 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -207,7 +207,20 @@ def create_plugin_config(self, overwrite: bool = False): if overwrite is True, it will overwrite the existing file """ - plugin_config = open("config_generation/xmls/plugin_indexing_template.xml").read() + + # there needs to be a scraper config file before creating the plugin config + gh = GitHubHandler() + scraper_exists = gh.check_file_exists(self._scraper_config_path) + if not scraper_exists: + raise ValueError(f"Scraper does not exist for the collection {self.config_folder}") + else: + scraper_content = gh._get_file_contents(self._scraper_config_path) + scraper_content = scraper_content.decoded_content.decode("utf-8") + scraper_editor = XmlEditor(scraper_content) + + plugin_template = open("config_generation/xmls/plugin_indexing_template.xml").read() + plugin_editor = XmlEditor(plugin_template) + plugin_config = plugin_editor.convert_template_to_plugin_indexer(scraper_editor) self._write_to_github(self._plugin_config_path, plugin_config, overwrite) def create_indexer_config(self, overwrite: bool = False):