Skip to content

Commit

Permalink
[PLGN-405] ExtractIT - Adding in extra logic to better handle wrappin…
Browse files Browse the repository at this point in the history
…g of lines in pdf (#2089) (#2096)

* PLGN-405-Adding in extra logic to better handle wrapping of lines in pdf

* PLGN-405-Reformatting to black format

* PLGN-405-Bumping version of validators and making changes to unit tests to reflect changes

* PLGN-405-Removing unit test that is not working with validators 2.20.0

* PLGN-405-Updating the docstring message to make it clearer
  • Loading branch information
rbowden-r7 authored Nov 2, 2023
1 parent a563f05 commit e4de8b0
Show file tree
Hide file tree
Showing 21 changed files with 175 additions and 27 deletions.
6 changes: 3 additions & 3 deletions plugins/extractit/.CHECKSUM
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"spec": "be710f9ada138c8d3846edaa9f980a4d",
"manifest": "265bc77dda5ab0fb4ed0197441d8bdde",
"setup": "1a6b0e425121a9fec055f16607f73d4d",
"spec": "126c6d57195d65c04fead360d7f647c2",
"manifest": "53add6ecee5b3ff23eda706c6b9def30",
"setup": "4280a169f7e84a9188cb28bc67c31d22",
"schemas": [
{
"identifier": "cve_extractor/schema.py",
Expand Down
2 changes: 1 addition & 1 deletion plugins/extractit/bin/icon_extractit
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from sys import argv

Name = "ExtractIt"
Vendor = "rapid7"
Version = "3.0.7"
Version = "3.0.8"
Description = "The ExtractIt plugin has a collection of actions used to extract various information from text. These include URLs, domains, emails, IPs, and more"


Expand Down
1 change: 1 addition & 0 deletions plugins/extractit/help.md
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ Example output:

# Version History

* 3.0.8 - Adding in extra logic to handle wrapping of lines in pdfs
* 3.0.7 - Resolved issues related to `PDF`, `ODP`, `ODT`, `ODF` files extractions
* 3.0.6 - Resolved issue where users experienced a `Not in list` value error when submitted multiple URLs with the same linked URL in the URL Extractor action
* 3.0.5 - Updated error message in extractor.py | Updated help.md description | Updated URL extraction | Fix issue with extracting dates from `.XLSX` files
Expand Down
32 changes: 25 additions & 7 deletions plugins/extractit/icon_extractit/util/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def extract(
)
matches = regex.findall(provided_regex, provided_file)
except UnicodeDecodeError:
file_content = extract_content_from_file(base64.b64decode(provided_file))
file_content = extract_content_from_file(base64.b64decode(provided_file), provided_regex)
matches = regex.findall(provided_regex, file_content)
return list(dict.fromkeys(matches))

Expand Down Expand Up @@ -152,14 +152,14 @@ def extract_filepath(provided_regex: str, provided_string: str, provided_file: s
try:
new_file = base64.b64decode(provided_file.encode(DEFAULT_ENCODING)).decode(DEFAULT_ENCODING)
except UnicodeDecodeError:
new_file = extract_content_from_file(base64.b64decode(provided_file))
new_file = extract_content_from_file(base64.b64decode(provided_file), provided_regex)
new_file = regex.sub(Regex.URL, "", new_file)
new_file = regex.sub(Regex.Date, "", new_file)
matches = regex.findall(provided_regex, new_file)
return list(dict.fromkeys(matches))


def extract_content_from_file(provided_file: bytes) -> str: # noqa: C901
def extract_content_from_file(provided_file: bytes, provided_regex: str = "") -> str: # noqa: C901
with io.BytesIO(provided_file) as file_:
try:
# extracting content from DOCX, PPTX, XLSX, ODT, ODP, ODF files
Expand Down Expand Up @@ -193,7 +193,7 @@ def extract_content_from_file(provided_file: bytes) -> str: # noqa: C901
with pdfplumber.open(file_) as pdf_file:
for page in pdf_file.pages:
page_content = page.extract_text()
for word in extract_wrapped_words_from_pdf_page(page):
for word in extract_wrapped_words_from_pdf_page(page, provided_regex):
page_content = page_content.replace(word, word.replace("\n", ""))
pdf_content += page_content
return pdf_content
Expand All @@ -204,13 +204,20 @@ def extract_content_from_file(provided_file: bytes) -> str: # noqa: C901
)


def extract_wrapped_words_from_pdf_page(page: Page, tolerance: float = DEFAULT_PDF_WRAPPING_TOLERANCE) -> List[str]:
def extract_wrapped_words_from_pdf_page(
page: Page,
provided_regex: str = "",
tolerance: float = DEFAULT_PDF_WRAPPING_TOLERANCE,
) -> List[str]:
"""
Extract wrapped words from a PDF page.
:param page: The PDF page from which to extract wrapped words.
:type: Page
:param provided_regex: The regex for the type of words to be searched for, e.g. email/domain format.
:type: str
:param tolerance: The tolerance value for detecting wrapped words. Defaults to DEFAULT_PDF_WRAPPING_TOLERANCE.
:type: float
Expand All @@ -220,9 +227,20 @@ def extract_wrapped_words_from_pdf_page(page: Page, tolerance: float = DEFAULT_P

wrapped_words = []
max_x1 = max(character.get("x1") for character in page.chars)
for word in page.extract_words():
extracted_words = page.extract_words(use_text_flow=True)

for index, word in enumerate(extracted_words):
if (max_x1 - word.get("x1")) < tolerance:
wrapped_words.append(f"{word.get('text')}\n")
# if the current or next word in the list is a valid match then do not try to join them
if provided_regex:
if (
not regex.findall(provided_regex, word.get("text", ""))
and (index + 1) < len(extracted_words)
and not regex.findall(provided_regex, extracted_words[index + 1].get("text", ""))
):
wrapped_words.append(f"{word.get('text')}\n")
else:
wrapped_words.append(f"{word.get('text')}\n")
return wrapped_words


Expand Down
1 change: 0 additions & 1 deletion plugins/extractit/icon_extractit/util/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ class Regex:


class DateFormatStrings:

human_to_linux_mapping = {
"dd/mm/yyyy": "%d/%m/%Y",
"dd\\mm\\yyyy": "%d\\%m\\%Y",
Expand Down
2 changes: 1 addition & 1 deletion plugins/extractit/plugin.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ products: ["insightconnect"]
name: extractit
title: ExtractIt
description: The ExtractIt plugin has a collection of actions used to extract various information from text. These include URLs, domains, emails, IPs, and more
version: 3.0.7
version: 3.0.8
vendor: rapid7
support: rapid7
supported_versions: ["2022-10-19"]
Expand Down
2 changes: 1 addition & 1 deletion plugins/extractit/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# See: https://pip.pypa.io/en/stable/user_guide/#requirements-files
tldextract==3.4.4
regex==2023.8.8
validators==0.20.0
validators==0.22.0
pdfplumber==0.10.2
openpyxl==3.1.2
parameterized==0.8.1
Expand Down
2 changes: 1 addition & 1 deletion plugins/extractit/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


setup(name="extractit-rapid7-plugin",
version="3.0.7",
version="3.0.8",
description="The ExtractIt plugin has a collection of actions used to extract various information from text. These include URLs, domains, emails, IPs, and more",
author="rapid7",
author_email="",
Expand Down
12 changes: 12 additions & 0 deletions plugins/extractit/unit_test/payloads/cve_extractor.json.resp

Large diffs are not rendered by default.

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions plugins/extractit/unit_test/payloads/domain_extractor.json.resp

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions plugins/extractit/unit_test/payloads/email_extractor.json.resp

Large diffs are not rendered by default.

Loading

0 comments on commit e4de8b0

Please sign in to comment.