Skip to content

Commit

Permalink
[PLGN-405] ExtractIT - Adding in extra logic to better handle wrappin…
Browse files Browse the repository at this point in the history
…g of lines in pdf (#2089) (#2096)

* PLGN-405-Adding in extra logic to better handle wrapping of lines in pdf

* PLGN-405-Reformatting to black format

* PLGN-405-Bumping version of validators and making changes to unit tests to reflect changes

* PLGN-405-Removing unit test that is not working with validators 2.20.0

* PLGN-405-Updating the docstring message to make it clearer
  • Loading branch information
rbowden-r7 authored Nov 2, 2023
1 parent a563f05 commit e4de8b0
Show file tree
Hide file tree
Showing 21 changed files with 175 additions and 27 deletions.
6 changes: 3 additions & 3 deletions plugins/extractit/.CHECKSUM
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"spec": "be710f9ada138c8d3846edaa9f980a4d",
"manifest": "265bc77dda5ab0fb4ed0197441d8bdde",
"setup": "1a6b0e425121a9fec055f16607f73d4d",
"spec": "126c6d57195d65c04fead360d7f647c2",
"manifest": "53add6ecee5b3ff23eda706c6b9def30",
"setup": "4280a169f7e84a9188cb28bc67c31d22",
"schemas": [
{
"identifier": "cve_extractor/schema.py",
Expand Down
2 changes: 1 addition & 1 deletion plugins/extractit/bin/icon_extractit
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from sys import argv

Name = "ExtractIt"
Vendor = "rapid7"
Version = "3.0.7"
Version = "3.0.8"
Description = "The ExtractIt plugin has a collection of actions used to extract various information from text. These include URLs, domains, emails, IPs, and more"


Expand Down
1 change: 1 addition & 0 deletions plugins/extractit/help.md
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ Example output:

# Version History

* 3.0.8 - Adding in extra logic to handle wrapping of lines in pdfs
* 3.0.7 - Resolved issues related to `PDF`, `ODP`, `ODT`, `ODF` files extractions
* 3.0.6 - Resolved issue where users experienced a `Not in list` value error when submitted multiple URLs with the same linked URL in the URL Extractor action
* 3.0.5 - Updated error message in extractor.py | Updated help.md description | Updated URL extraction | Fix issue with extracting dates from `.XLSX` files
Expand Down
32 changes: 25 additions & 7 deletions plugins/extractit/icon_extractit/util/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def extract(
)
matches = regex.findall(provided_regex, provided_file)
except UnicodeDecodeError:
file_content = extract_content_from_file(base64.b64decode(provided_file))
file_content = extract_content_from_file(base64.b64decode(provided_file), provided_regex)
matches = regex.findall(provided_regex, file_content)
return list(dict.fromkeys(matches))

Expand Down Expand Up @@ -152,14 +152,14 @@ def extract_filepath(provided_regex: str, provided_string: str, provided_file: s
try:
new_file = base64.b64decode(provided_file.encode(DEFAULT_ENCODING)).decode(DEFAULT_ENCODING)
except UnicodeDecodeError:
new_file = extract_content_from_file(base64.b64decode(provided_file))
new_file = extract_content_from_file(base64.b64decode(provided_file), provided_regex)
new_file = regex.sub(Regex.URL, "", new_file)
new_file = regex.sub(Regex.Date, "", new_file)
matches = regex.findall(provided_regex, new_file)
return list(dict.fromkeys(matches))


def extract_content_from_file(provided_file: bytes) -> str: # noqa: C901
def extract_content_from_file(provided_file: bytes, provided_regex: str = "") -> str: # noqa: C901
with io.BytesIO(provided_file) as file_:
try:
# extracting content from DOCX, PPTX, XLSX, ODT, ODP, ODF files
Expand Down Expand Up @@ -193,7 +193,7 @@ def extract_content_from_file(provided_file: bytes) -> str: # noqa: C901
with pdfplumber.open(file_) as pdf_file:
for page in pdf_file.pages:
page_content = page.extract_text()
for word in extract_wrapped_words_from_pdf_page(page):
for word in extract_wrapped_words_from_pdf_page(page, provided_regex):
page_content = page_content.replace(word, word.replace("\n", ""))
pdf_content += page_content
return pdf_content
Expand All @@ -204,13 +204,20 @@ def extract_content_from_file(provided_file: bytes) -> str: # noqa: C901
)


def extract_wrapped_words_from_pdf_page(page: Page, tolerance: float = DEFAULT_PDF_WRAPPING_TOLERANCE) -> List[str]:
def extract_wrapped_words_from_pdf_page(
page: Page,
provided_regex: str = "",
tolerance: float = DEFAULT_PDF_WRAPPING_TOLERANCE,
) -> List[str]:
"""
Extract wrapped words from a PDF page.
:param page: The PDF page from which to extract wrapped words.
:type: Page
:param provided_regex: The regex for the type of words to be searched for, e.g. email/domain format.
:type: str
:param tolerance: The tolerance value for detecting wrapped words. Defaults to DEFAULT_PDF_WRAPPING_TOLERANCE.
:type: float
Expand All @@ -220,9 +227,20 @@ def extract_wrapped_words_from_pdf_page(page: Page, tolerance: float = DEFAULT_P

wrapped_words = []
max_x1 = max(character.get("x1") for character in page.chars)
for word in page.extract_words():
extracted_words = page.extract_words(use_text_flow=True)

for index, word in enumerate(extracted_words):
if (max_x1 - word.get("x1")) < tolerance:
wrapped_words.append(f"{word.get('text')}\n")
# if the current or next word in the list is a valid match then do not try to join them
if provided_regex:
if (
not regex.findall(provided_regex, word.get("text", ""))
and (index + 1) < len(extracted_words)
and not regex.findall(provided_regex, extracted_words[index + 1].get("text", ""))
):
wrapped_words.append(f"{word.get('text')}\n")
else:
wrapped_words.append(f"{word.get('text')}\n")
return wrapped_words


Expand Down
1 change: 0 additions & 1 deletion plugins/extractit/icon_extractit/util/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ class Regex:


class DateFormatStrings:

human_to_linux_mapping = {
"dd/mm/yyyy": "%d/%m/%Y",
"dd\\mm\\yyyy": "%d\\%m\\%Y",
Expand Down
2 changes: 1 addition & 1 deletion plugins/extractit/plugin.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ products: ["insightconnect"]
name: extractit
title: ExtractIt
description: The ExtractIt plugin has a collection of actions used to extract various information from text. These include URLs, domains, emails, IPs, and more
version: 3.0.7
version: 3.0.8
vendor: rapid7
support: rapid7
supported_versions: ["2022-10-19"]
Expand Down
2 changes: 1 addition & 1 deletion plugins/extractit/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# See: https://pip.pypa.io/en/stable/user_guide/#requirements-files
tldextract==3.4.4
regex==2023.8.8
validators==0.20.0
validators==0.22.0
pdfplumber==0.10.2
openpyxl==3.1.2
parameterized==0.8.1
Expand Down
2 changes: 1 addition & 1 deletion plugins/extractit/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


setup(name="extractit-rapid7-plugin",
version="3.0.7",
version="3.0.8",
description="The ExtractIt plugin has a collection of actions used to extract various information from text. These include URLs, domains, emails, IPs, and more",
author="rapid7",
author_email="",
Expand Down
12 changes: 12 additions & 0 deletions plugins/extractit/unit_test/payloads/cve_extractor.json.resp

Large diffs are not rendered by default.

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions plugins/extractit/unit_test/payloads/domain_extractor.json.resp

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions plugins/extractit/unit_test/payloads/email_extractor.json.resp

Large diffs are not rendered by default.

Loading

0 comments on commit e4de8b0

Please sign in to comment.