Skip to content

Commit

Permalink
Merge pull request #16 from ArtLabss/pdfAnonymizer
Browse files Browse the repository at this point in the history
Minor Changes
  • Loading branch information
artkulak authored May 1, 2022
2 parents c7c8a6e + 20120ed commit f275c9f
Show file tree
Hide file tree
Showing 9 changed files with 2,305 additions and 2,380 deletions.
2 changes: 1 addition & 1 deletion anonympy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
such as tabular, text, images and sound.
'''

__version__ = "0.3.5"
__version__ = "0.3.7"
# Check if all dependencies have been installed
hard_dependencies = ("faker", "pandas", "numpy", "cv2", "PyPDF2",
"pytesseract", "pdf2image",
Expand Down
14 changes: 7 additions & 7 deletions anonympy/pdf/core_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,12 +197,12 @@ def anonymize(self,

if self.number_of_pages == 1:
# return str if 1 page PDF, else a list of str
text = self.images2text(self.images)[0]
ner = self._nlp(text)
self.images2text(self.images)
ner = self._nlp(self.texts[0])

find_emails(text=text, matches=self.PII_objects)
find_numbers(text=text, matches=self.PII_objects)
find_months(text=text, matches=self.PII_objects)
find_emails(text=self.texts[0], matches=self.PII_objects)
find_numbers(text=self.texts[0], matches=self.PII_objects)
find_months(text=self.texts[0], matches=self.PII_objects)

find_EOI(pipeline=ner, matches=self.PII_objects, EOI="PER")
find_EOI(pipeline=ner, matches=self.PII_objects, EOI="ORG")
Expand All @@ -217,9 +217,9 @@ def anonymize(self,
fill=fill,
outline=outline)
else:
text = self.images2text(self.images)
self.images2text(self.images)

for excerpt in text:
for excerpt in self.texts:
temp_pii = []
temp_bbox = []
ner = self._nlp(excerpt)
Expand Down
64 changes: 64 additions & 0 deletions anonympy/tests/pdf/test_core_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# These tests can be run locally. Before running them, update the
# `pytesseract_path` and `poppler_path` arguments passed to
# `pdfAnonymizer` inside the `anonym_obj` fixture.

# import cv2
# import pytest
# import urllib
# import numpy as np
# from anonympy.pdf import pdfAnonymizer


# def fetch_image(url):
# req = urllib.request.urlopen(url)
# arr = np.asarray(bytearray(req.read()), dtype=np.uint8)
# img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
# return img


# @pytest.fixture(scope="session")
# def anonym_obj():
# '''
# Initialize `pdfAnonymizer` object
# '''
# url = ('https://raw.githubusercontent.com/ArtLabss/open-data-anonymizer'
# '/pdfAnonymizer/anonympy/tests/pdf/expected/test.pdf')
# try:
# anonym = pdfAnonymizer(
# url=url,
# pytesseract_path=r"C:\Program Files\Tesseract-OCR\tesseract.exe",
# poppler_path=(r"C:\Users\shakhansho.sabzaliev\Downloads"
# r"\Release-22.01.0-0\\"
# r"poppler-22.01.0\Library\bin"),
# model=("dbmdz/bert-large-cased-"
# "finetuned-conll03-english"),
# tokenizer=("dbmdz/bert-large-cased"
# "-finetuned-conll03-english"))
# except: # noqa: E722
# anonym = None

# return anonym


# def test_anonym_obj(anonym_obj):
# if anonym_obj is None:
# assert False, ("Failed to initialize `pdfAnonymizer` object with "
# "`anonympy/tests/pdf/expected/test.pdf` file")
# assert isinstance(anonym_obj, pdfAnonymizer), ("Expected to return "
# "`pdfAnonymizer` object")

# anonym_obj.pdf2images()

# assert len(anonym_obj.images) == 1, ("`pdf2images` didn't return"
# " expected value")

# anonym_obj.images2text(anonym_obj.images)

# assert len(anonym_obj.texts) == 1, ("`images2text` method didn't"
# " return expected value")
# assert type(anonym_obj.texts[0]) is str, ("Expected Type `str`")

# assert anonym_obj.number_of_pages == 1, ('Unexpected value returned')

# assert type(anonym_obj.pages_data[0]) is dict, ("Unexpected value"
# " returned")
153 changes: 130 additions & 23 deletions anonympy/tests/pdf/test_utils_pdf.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,137 @@
import pytest
from anonympy.pdf import pdfAnonymizer
from anonympy.pdf.utils_pdf import find_EOI # noqa: F401
# These tests can be run locally. Before running them, update the
# `pytesseract_path` and `poppler_path` arguments passed to
# `pdfAnonymizer` inside the `anonym_obj` fixture.


# Module-scoped pytest fixture: builds a single `pdfAnonymizer` from the
# sample PDF URL below and returns it (or None if construction failed).
@pytest.fixture(scope="module")
def anonym_pdf():
'''
Initialize `pdfAnonymizer` object
'''
url = 'https://raw.githubusercontent.com/ArtLabss/open-data-anonymizer'\
'/pdfAnonymizer/anonympy/tests/pdf/expected/test.pdf'
try:
anonym = pdfAnonymizer(url=url)
# NOTE(review): the bare `except` swallows every error raised while
# constructing the object, so the original cause is lost; the fixture
# signals failure to its consumers only by returning None.
except: # noqa: E722
anonym = None
# import cv2
# import pytest
# import urllib
# import numpy as np
# from PIL import ImageDraw
# from anonympy.pdf import pdfAnonymizer
# from anonympy.pdf.utils_pdf import draw_black_box_pytesseract, find_EOI
# from anonympy.pdf.utils_pdf import find_coordinates_pytesseract
# from anonympy.pdf.utils_pdf import find_months, find_emails, find_numbers

return anonym

# def fetch_image(url):
# req = urllib.request.urlopen(url)
# arr = np.asarray(bytearray(req.read()), dtype=np.uint8)
# img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
# return img

# Verifies that the `anonym_pdf` fixture yielded a `pdfAnonymizer` instance.
def test_anonym_pdf(anonym_pdf):
# The fixture returns None when constructing the object raised, so fail
# here with an explicit message before the type check.
if anonym_pdf is None:
assert False, "Failed to initialize `pdfAnonymizer` object with "\
"`anonympy/tests/pdf/expected/test.pdf` file"
assert isinstance(anonym_pdf, pdfAnonymizer), "Didn't return "\
"`pdfAnonymizer` object`"

# def is_similar(image1, image2):
# return (image1.size == image2.size) and \
# not(np.bitwise_xor(image1, image2).any())

# def test_find_EOI():
# pass

# @pytest.fixture(scope="session")
# def anonym_obj():
# '''
# Initialize `pdfAnonymizer` object
# '''
# url = ('https://raw.githubusercontent.com/ArtLabss/open-data-anonymizer'
# '/pdfAnonymizer/anonympy/tests/pdf/expected/test.pdf')
# try:
# anonym = pdfAnonymizer(
# url=url,
# pytesseract_path=r"C:\Program Files\Tesseract-OCR\tesseract.exe",
# poppler_path=(r"C:\Users\shakhansho.sabzaliev\Downloads"
# r"\Release-22.01.0-0\\"
# r"poppler-22.01.0\Library\bin"),
# model=("dbmdz/bert-large-cased-"
# "finetuned-conll03-english"),
# tokenizer=("dbmdz/bert-large-cased"
# "-finetuned-conll03-english"))
# except: # noqa: E722
# anonym = None

# return anonym


# def test_draw_black_box_pytesseract(anonym_obj):
# anonym_obj.pdf2images()
# bbox = [(570, 280, 561, 28)]

# expected = anonym_obj.images[0].copy()
# draw = ImageDraw.Draw(expected)

# for box in bbox:
# x, y, w, h = box
# draw.rectangle([x, y, x + w, y + h], fill='black', outline='black')

# output = anonym_obj.images[0].copy()
# draw_black_box_pytesseract(bbox, output)

# assert is_similar(expected, output), ("Method `cover_box` "
# "didn't return expected values")


# def test_find_coordinates_pytesseract(anonym_obj):
# anonym_obj.images2text(anonym_obj.images)

# expected = [(570, 280, 561, 28)]
# PII_object = ['shakhansho.sabzaliev_2023@ucentralasia.org']

# output = []
# find_coordinates_pytesseract(PII_object,
# anonym_obj.pages_data[0],
# output)

# assert output == expected, ('Expected email coordinates (570, 280, 561'
# f', 28), but function returned {output[0]}')


# def test_find_EOI(anonym_obj):
# pipeline = anonym_obj._nlp(anonym_obj.texts[0])

# expected_PER = ['hansho', 'Sabzal', 'Elvira', 'Sagyntay', 'hansh']
# output_PER = []

# find_EOI(pipeline=pipeline, matches=output_PER, EOI='PER')
# assert expected_PER == output_PER, ("`find_EOI` returned unexpected "
# "values for `EOI='PER'`")

# expected_LOC = ['Parkovaya', 'Moscow', 'Russia', 'Bishkek', 'Kyrgystan']
# output_LOC = []

# find_EOI(pipeline=pipeline, matches=output_LOC, EOI='LOC')
# assert expected_LOC == output_LOC, ("`find_EOI` returned unexpected "
# "values for `EOI='LOC'`")

# expected_ORG = ['Shak', 'ucent', 'CRM', 'Technologies', 'Panphilova',
# 'Inter', 'CRM', 'Technologies']
# output_ORG = []

# find_EOI(pipeline=pipeline, matches=output_ORG, EOI='ORG')
# assert expected_ORG == output_ORG, ("`find_EOI` returned unexpected "
# "values for `EOI='ORG'`")


# def test_find_emails(anonym_obj):
# expected = ['shakhansho.sabzaliev_2023@ucentralasia.org']
# output = []

# find_emails(anonym_obj.texts[0], output)
# assert expected == output, ("Method `find_emails` "
# "didn't return expected values")


# def test_find_months(anonym_obj):
# expected = ['November', 'November']
# output = []

# find_months(anonym_obj.texts[0], output)
# assert expected == output, ("Method `find_months` "
# "didn't return expected values")


# def test_find_numbers(anonym_obj):
# expected = ['11', '39', '1', '105264', '2023',
# '19', '2020', '1', '18', '2020']
# output = []

# find_numbers(anonym_obj.texts[0], output)
# assert expected == output, ("Method `find_numbers` "
# "didn't return expected values")
17 changes: 3 additions & 14 deletions examples/PDF/sample_pdf.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -81,26 +81,15 @@
}
],
"source": [
"!pip install anonympy==0.3.3"
"!pip install anonympy"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 2,
"id": "a80d15e9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'0.3.3'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import anonympy\n",
"\n",
Expand Down
Loading

0 comments on commit f275c9f

Please sign in to comment.