Skip to content

Commit

Permalink
Added json data masking
Browse files Browse the repository at this point in the history
  • Loading branch information
anonmanak2000 committed Nov 9, 2024
1 parent b8549c0 commit 82ec00e
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 36 deletions.
27 changes: 7 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,15 @@ python -m spacy download en_core_web_sm

## Example

To use the `Mask` class, create an instance and call the `mask_text` method with your input text:
To use the `Mask` class, create an instance and call the `mask` method with your input data and the desired format. The default format is `txt`, so you don't need to pass the `format` parameter when the input is plain text:

```python
masker = Mask()
masked_text = masker.mask_text("Contact me at john.doe@example.com or call me at (555) 123-4567.")
print(masked_text)
```
## Using the `mask_file` Method

The `mask_file` method allows you to read a text file, mask any PII present, and return the masked content. Here’s how to use it:

### Example
# Mask Json Data
masked_json = masker.mask(input_data='{"phone" : "(988) 888 9821"}',format='json')
print("Masked Json: " + masked_json)

1. Create a text file (e.g., `example.txt`) with the following content:
```bash
Please reach out to me at jane.doe@example.com or at (555) 987-6543.
# Mask text
masked_text = masker.mask(input_data='My name is John Doe and I live in Canada.')
print("Masked Text: " + masked_text)
```
2. Use the `mask_file` method to mask the PII in the file:

```python
masker = Mask()
masked_file_text = masker.mask_file("example.txt")
print(masked_file_text)
```

11 changes: 8 additions & 3 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
# Demo entry point: the statements below run only when this file is executed
# as a script, not when it is imported.
if __name__ == "__main__":
mask_pii = Mask()
try:
# Mask the content of a file on disk.
masked_text = mask_pii.mask_file('file.txt')
# Mask Json Data
# The input is a JSON document given as a string; format='json' is passed explicitly.
masked_json = mask_pii.mask(input_data='{"phone" : "(988) 888 9821"}',format='json')
print("Masked Json: " + masked_json)

# Mask text
# No format argument is passed here, so mask() uses its default.
masked_text = mask_pii.mask(input_data='My name is John Doe and I live in Canada.')
print("Masked Text: " + masked_text)
# NOTE(review): two handlers appear below. Python rejects a bare `except:`
# that is followed by another `except` clause ("default 'except:' must be
# last"), so presumably only one of them exists in the real file - confirm.
except:
print('Unexpected error!!')
except Exception as e:
# Print the exception itself rather than a fixed message.
print(e)
39 changes: 26 additions & 13 deletions mask/mask.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
import spacy
import re
from typing import Union
import json

class Mask:
def __init__(self) -> None:
# Regular expressions keyed by PII category. mask_text iterates over this
# mapping and uses the upper-cased key inside the replacement tag.
# NOTE(review): both `self.patterns = {` and `__patterns = {` openers (and two
# closing braces) appear in this view - confirm which assignment the file keeps.
self.patterns = {
__patterns = {
# phone: optional "+<1-3 digit>" prefix, then 3-3-4 digit groups; the first
# group may be wrapped in parentheses, groups may be separated by "-" or whitespace.
'phone': r'(?:\+\d{1,3}[-\s]?)?\(?\d{3}\)?[-\s]?\d{3}[-\s]?\d{4}',
# email: local part, "@", domain, and a TLD of at least two characters.
# NOTE(review): `[A-Z|a-z]` is a character class, so `|` is accepted as a
# literal TLD character - probably unintended; verify.
'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
# ipv6: one alternation covering the full 8-group form, the "::"-compressed
# forms, the fe80: form with a "%zone" suffix, and forms ending in a dotted IPv4 part.
'ipv6': r'\b(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\b',
# ipv4: four groups, each limited to 0-255; the dot after a group is
# optional in the pattern (`\.?`).
'ipv4': r'\b((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.?\b){4}\b'
}
}

def mask_text(self,text:str) -> str:

for category,pattern in self.patterns.items():
text = re.sub(pattern, f'[REDACTED {category.upper()}]', text)
for category,pattern in self.__patterns.items():
text = re.sub(pattern, f'[MASKED {category.upper()}]', text)

text = self.mask_nlp(text)

Expand All @@ -24,12 +24,25 @@ def mask_nlp(self,text:str) -> str:
doc = nlp(text)

for ent in doc.ents:
print(f'Label: {ent.label_}: Value: {ent.text}')
text = re.sub(ent.text,f'[REDACTED {ent.label_}]',text)
# print(f'Label: {ent.label_}: Value: {ent.text}')
text = re.sub(ent.text,f'[MASKED {ent.label_}]',text)
return text

def mask_file(self, file_name: str) -> str:
    """Read a text file and return its content with PII masked.

    The file is decoded as UTF-8 explicitly. Without an ``encoding``
    argument ``open`` falls back to the platform locale, so the same file
    could decode differently (or fail) from one machine to another.

    No debug output is written to stdout; the file name is not echoed.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The file content after it has been passed through ``mask_text``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        file_text = file.read()
    return self.mask_text(file_text)
def mask_dict(self, input_data: dict) -> dict:
    """Mask every string value found anywhere inside a JSON-like structure.

    Dictionaries are updated in place and returned; lists are replaced by
    new lists. Every string value is passed through ``self.mask_text``.
    Values of any other type (numbers, booleans, ``None``) are left
    unchanged, and dictionary keys are never masked.

    Strings and nested lists that sit inside a list are masked as well, so
    data such as ``{"emails": ["a@b.com"]}`` is masked instead of being
    returned untouched.

    Args:
        input_data: Decoded JSON object whose string values should be masked.

    Returns:
        The same ``input_data`` object, with its string values masked.
    """

    def _mask_value(value):
        # Recurse through containers; only leaf strings are rewritten.
        if isinstance(value, dict):
            # Reassigning existing keys during iteration is safe because
            # the set of keys never changes.
            for key, item in value.items():
                value[key] = _mask_value(item)
            return value
        if isinstance(value, list):
            return [_mask_value(item) for item in value]
        if isinstance(value, str):
            return self.mask_text(value)
        return value

    return _mask_value(input_data)

def mask(self, input_data: Union[str, dict], format: str = "txt") -> Union[str, dict]:
    """Mask PII in ``input_data`` according to the given input format.

    Args:
        input_data: The data to mask. For ``"txt"`` it is handed to
            ``mask_text`` as is; for ``"json"`` it is parsed with
            ``json.loads`` and must therefore be a JSON document.
        format: ``"txt"`` (default) or ``"json"``.

    Returns:
        For ``"txt"``, the result of ``mask_text``. For ``"json"``, the
        masked document serialized back to a compact JSON string
        (``","`` and ``":"`` separators, no extra whitespace).

    Raises:
        ValueError: If ``format`` is neither ``"txt"`` nor ``"json"``.
            Such a value would otherwise reach the return statement with
            no result assigned and fail with an UnboundLocalError.
        json.JSONDecodeError: If ``format`` is ``"json"`` and
            ``input_data`` is not valid JSON.
    """
    if format == "txt":
        return self.mask_text(input_data)
    if format == "json":
        parsed = json.loads(input_data)
        return json.dumps(self.mask_dict(parsed), separators=(",", ":"))
    raise ValueError(f"Unsupported format: {format!r} (expected 'txt' or 'json')")

0 comments on commit 82ec00e

Please sign in to comment.