From 51af1fe7b9b19e577466d476fb369cb78e0c97b0 Mon Sep 17 00:00:00 2001
From: anonmanak2000
Date: Wed, 9 Oct 2024 23:41:28 +0530
Subject: [PATCH 1/2] Initial Commit

---
 __init__.py      |  0
 main.py          | 11 +++++++++++
 mask/__init__.py |  1 +
 mask/mask.py     | 45 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+)
 create mode 100644 __init__.py
 create mode 100644 main.py
 create mode 100644 mask/__init__.py
 create mode 100644 mask/mask.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..ac40bbc
--- /dev/null
+++ b/main.py
@@ -0,0 +1,11 @@
+from mask import Mask
+
+if __name__ == "__main__":
+    mask_pii = Mask()
+    try:
+        masked_text = mask_pii.mask_file('file.txt')
+        print("Masked Text: " + masked_text)
+    except Exception as err:
+        # Top-level boundary: report the cause instead of hiding it; a bare
+        # except would also swallow KeyboardInterrupt and SystemExit.
+        print(f'Unexpected error!! {err}')
\ No newline at end of file
diff --git a/mask/__init__.py b/mask/__init__.py
new file mode 100644
index 0000000..986ae4f
--- /dev/null
+++ b/mask/__init__.py
@@ -0,0 +1 @@
+from .mask import Mask
\ No newline at end of file
diff --git a/mask/mask.py b/mask/mask.py
new file mode 100644
index 0000000..8cf2d57
--- /dev/null
+++ b/mask/mask.py
@@ -0,0 +1,45 @@
+import spacy
+import re
+
+class Mask:
+    """Mask PII in text with regular expressions and spaCy NER."""
+
+    def __init__(self) -> None:
+        # Regex categories; mask_text applies them in insertion order.
+        self.patterns = {
+            'phone': r'(?:\+\d{1,3}[-\s]?)?\(?\d{3}\)?[-\s]?\d{3}[-\s]?\d{4}',
+            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
+            'ipv6': r'\b(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\b',
+            'ipv4': r'\b((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.?\b){4}\b'
+        }
+        # spaCy pipeline, loaded on first use and then reused.
+        self._nlp = None
+
+    def mask_text(self, text: str) -> str:
+        """Return text with regex matches and named entities redacted."""
+        for category, pattern in self.patterns.items():
+            text = re.sub(pattern, f'[REDACTED {category.upper()}]', text)
+
+        return self.mask_nlp(text)
+
+    def mask_nlp(self, text: str) -> str:
+        """Replace each entity spaCy finds with '[REDACTED <LABEL>]'."""
+        if self._nlp is None:
+            self._nlp = spacy.load("en_core_web_sm")
+        doc = self._nlp(text)
+
+        for ent in doc.ents:
+            # Placeholders from the regex pass must not be redacted again.
+            if 'REDACTED' in ent.text:
+                continue
+            # Literal replacement: the entity text is not a regex, and its
+            # raw value is deliberately not printed.
+            text = text.replace(ent.text, f'[REDACTED {ent.label_}]')
+        return text
+
+    def mask_file(self, file_name: str) -> str:
+        """Read file_name as UTF-8 and return its masked contents."""
+        print('File Name: ' + file_name)
+        with open(file_name, 'r', encoding='utf-8') as file:
+            file_text = file.read()
+        return self.mask_text(file_text)
\ No newline at end of file

From c1918e982ce94448f5b4cfc8b437bb81ba1aaa1c Mon Sep 17 00:00:00 2001
From: Manak Preet Singh <55709213+anonmanak2000@users.noreply.github.com>
Date: Wed, 9 Oct 2024 23:46:08 +0530
Subject: [PATCH 2/2] Update README.md

---
 README.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 599c1fb..c7c566d 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,46 @@
-# Mask-PII
-Mask Personally identifiable information
+# PII Masking Tool
+
+![Python](https://img.shields.io/badge/Python-3.8%2B-blue.svg)
+![spaCy](https://img.shields.io/badge/Spacy-3.0%2B-green.svg)
+![Regex](https://img.shields.io/badge/Regex-Enabled-yellow.svg)
+
+## Overview
+
+The PII Masking Tool is designed to help mask Personally Identifiable Information (PII) such as phone numbers, email addresses, and IP addresses in text documents. It uses regular expressions and the spaCy library for Named Entity Recognition (NER) to identify and redact sensitive information.
+
+## Installation
+
+To get started with the PII Masking Tool, ensure you have Python 3.8 or higher installed. Then, install the required libraries:
+
+```bash
+pip install spacy
+python -m spacy download en_core_web_sm
+```
+
+## Example
+
+To use the `Mask` class, import it with `from mask import Mask`, create an instance, and call the `mask_text` method with your input text:
+
+```python
+masker = Mask()
+masked_text = masker.mask_text("Contact me at john.doe@example.com or call me at (555) 123-4567.")
+print(masked_text)
+```
+## Using the `mask_file` Method
+
+The `mask_file` method allows you to read a text file, mask any PII present, and return the masked content. Here’s how to use it:
+
+### Example
+
+1. Create a text file (e.g., `example.txt`) with the following content:
+```text
+Please reach out to me at jane.doe@example.com or at (555) 987-6543.
+```
+2. Use the `mask_file` method to mask the PII in the file:
+
+```python
+masker = Mask()
+masked_file_text = masker.mask_file("example.txt")
+print(masked_file_text)
+```
+