-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from anonmanak2000/feature
Feature
- Loading branch information
Showing
5 changed files
with
91 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,46 @@ | ||
# Mask-PII | ||
Mask Personally identifiable information | ||
# PII Masking Tool | ||
|
||
![Python](https://img.shields.io/badge/Python-3.8%2B-blue.svg) | ||
![Spacy](https://img.shields.io/badge/Spacy-3.0%2B-green.svg) | ||
![Re](https://img.shields.io/badge/Regex-Enabled-yellow.svg) | ||
|
||
## Overview | ||
|
||
The PII Masking Tool is designed to help mask Personally Identifiable Information (PII) such as phone numbers, email addresses, and IP addresses in text documents. It uses regular expressions and the spaCy library for Named Entity Recognition (NER) to identify and redact sensitive information. | ||
|
||
## Installation | ||
|
||
To get started with the PII Masking Tool, ensure you have Python 3.8 or higher installed. Then, install the required libraries: | ||
|
||
```bash | ||
pip install spacy | ||
python -m spacy download en_core_web_sm | ||
``` | ||
|
||
## Example | ||
|
||
To use the `Mask` class, create an instance and call the `mask_text` method with your input text: | ||
|
||
```python | ||
from mask import Mask | ||
masker = Mask() | ||
masked_text = masker.mask_text("Contact me at john.doe@example.com or call me at (555) 123-4567.") | ||
print(masked_text) | ||
``` | ||
## Using the `mask_file` Method | ||
|
||
The `mask_file` method allows you to read a text file, mask any PII present, and return the masked content. Here’s how to use it: | ||
|
||
### Example | ||
|
||
1. Create a text file (e.g., `example.txt`) with the following content: | ||
```text | ||
Please reach out to me at jane.doe@example.com or at (555) 987-6543. | ||
``` | ||
2. Use the `mask_file` method to mask the PII in the file: | ||
|
||
```python | ||
masker = Mask() | ||
masked_file_text = masker.mask_file("example.txt") | ||
print(masked_file_text) | ||
``` | ||
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from mask import Mask | ||
|
||
if __name__ == "__main__":
    # Script entry point: mask the PII found in 'file.txt' and print the result.
    mask_pii = Mask()
    try:
        masked_text = mask_pii.mask_file('file.txt')
    except OSError as err:
        # Missing or unreadable input file: report the concrete reason.
        print(f'Could not read file.txt: {err}')
    except Exception as err:
        # Top-level boundary: report the cause instead of hiding it. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        print(f'Unexpected error!! {err}')
    else:
        print("Masked Text: " + masked_text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .mask import Mask |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import spacy | ||
import re | ||
|
||
class Mask:
    """Redact personally identifiable information (PII) from text.

    Phone numbers, email addresses and IPv4/IPv6 addresses are located with
    the regular expressions stored in ``self.patterns``. Afterwards every
    named entity reported by spaCy's ``en_core_web_sm`` pipeline is redacted
    as well. Each hit is replaced with ``[REDACTED <LABEL>]``.
    """

    def __init__(self) -> None:
        # Category name -> regex source. ``mask_text`` applies them in
        # insertion order and uses the upper-cased key as the redaction label.
        self.patterns = {
            'phone': r'(?:\+\d{1,3}[-\s]?)?\(?\d{3}\)?[-\s]?\d{3}[-\s]?\d{4}',
            # TLD class is [A-Za-z]; a '|' inside a character class is a
            # literal pipe, not alternation.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'ipv6': r'\b(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\b',
            'ipv4': r'\b((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.?\b){4}\b',
        }
        # spaCy pipeline; loaded on first use by ``mask_nlp`` and then reused.
        self._nlp = None

    def mask_text(self, text: str) -> str:
        """Return *text* with regex-detected PII and named entities redacted.

        Args:
            text: The text to sanitise.

        Returns:
            The text after every pattern in ``self.patterns`` has been
            substituted and ``mask_nlp`` has been applied to the result.
        """
        for category, pattern in self.patterns.items():
            text = re.sub(pattern, f'[REDACTED {category.upper()}]', text)

        return self.mask_nlp(text)

    def mask_nlp(self, text: str) -> str:
        """Replace every named entity spaCy finds in *text*.

        Args:
            text: The text to scan.

        Returns:
            *text* with each occurrence of an entity's text replaced by
            ``[REDACTED <entity label>]``.
        """
        # Load the model once per instance instead of on every call. getattr
        # keeps this working for instances created without running __init__.
        nlp = getattr(self, "_nlp", None)
        if nlp is None:
            nlp = self._nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)

        for ent in doc.ents:
            # NOTE(review): this echoes the raw, unredacted entity value to
            # stdout; kept for output compatibility, consider removing.
            print(f'Label: {ent.label_}: Value: {ent.text}')
            # Literal replacement: entity text is data, not a regex, so
            # metacharacters such as '+' or '(' must not be interpreted.
            text = text.replace(ent.text, f'[REDACTED {ent.label_}]')
        return text

    def mask_file(self, file_name: str) -> str:
        """Read *file_name* and return its content with PII redacted.

        Args:
            file_name: Path of the text file to read (decoded as UTF-8).

        Returns:
            The masked file content, as produced by ``mask_text``.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        print('File Name: ' + file_name)
        # Explicit encoding avoids depending on the platform's locale default.
        with open(file_name, 'r', encoding='utf-8') as file:
            file_text = file.read()
        # The file is closed before the (potentially slow) masking runs.
        return self.mask_text(file_text)