diff --git a/README.md b/README.md index c7c566d..1e3895f 100644 --- a/README.md +++ b/README.md @@ -19,28 +19,15 @@ python -m spacy download en_core_web_sm ## Example -To use the `Mask` class, create an instance and call the `mask_text` method with your input text: +To use the `Mask` class, create an instance and call the `mask` method with your input text and desired format. The default format is `txt`, so you don't need to pass the `format` parameter if the input data is plain text: ```python masker = Mask() -masked_text = masker.mask_text("Contact me at john.doe@example.com or call me at (555) 123-4567.") -print(masked_text) -``` -## Using the `mask_file` Method - -The `mask_file` method allows you to read a text file, mask any PII present, and return the masked content. Here’s how to use it: - -### Example +# Mask Json Data +masked_json = masker.mask(input_data='{"phone" : "(988) 888 9821"}',format='json') +print("Masked Json: " + masked_json) -1. Create a text file (e.g., `example.txt`) with the following content: -```bash -Please reach out to me at jane.doe@example.com or at (555) 987-6543. +# Mask text +masked_text = masker.mask(input_data='My name is John Doe and I live in Canada.') +print("Masked Text: " + masked_text) ``` -2. 
Use the `mask_file` method to mask the PII in the file: - -```python -masker = Mask() -masked_file_text = masker.mask_file("example.txt") -print(masked_file_text) -``` - diff --git a/main.py b/main.py index ac40bbc..73ad7d7 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,12 @@ if __name__ == "__main__": mask_pii = Mask() try: - masked_text = mask_pii.mask_file('file.txt') + # Mask Json Data + masked_json = mask_pii.mask(input_data='{"phone" : "(988) 888 9821"}',format='json') + print("Masked Json: " + masked_json) + + # Mask text + masked_text = mask_pii.mask(input_data='My name is John Doe and I live in Canada.') print("Masked Text: " + masked_text) - except: - print('Unexpected error!!') \ No newline at end of file + except Exception as e: + print(e) \ No newline at end of file diff --git a/mask/mask.py b/mask/mask.py index 8cf2d57..2ba5135 100644 --- a/mask/mask.py +++ b/mask/mask.py @@ -1,19 +1,19 @@ import spacy import re +from typing import Union +import json class Mask: - def __init__(self) -> None: - self.patterns = { + __patterns = { 'phone': r'(?:\+\d{1,3}[-\s]?)?\(?\d{3}\)?[-\s]?\d{3}[-\s]?\d{4}', 'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', 'ipv6': r'\b(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\b', 'ipv4': r'\b((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.?\b){4}\b' - } + } def mask_text(self,text:str) -> str: - - for category,pattern in 
self.patterns.items(): - text = re.sub(pattern, f'[REDACTED {category.upper()}]', text) + for category,pattern in self.__patterns.items(): + text = re.sub(pattern, f'[MASKED {category.upper()}]', text) text = self.mask_nlp(text) @@ -24,12 +24,30 @@ def mask_nlp(self,text:str) -> str: doc = nlp(text) for ent in doc.ents: - print(f'Label: {ent.label_}: Value: {ent.text}') - text = re.sub(ent.text,f'[REDACTED {ent.label_}]',text) + # Entity text is a literal, not a regex: str.replace keeps metacharacters such as "(" or "+" from raising or mis-matching + text = text.replace(ent.text,f'[MASKED {ent.label_}]') return text - def mask_file(self,file_name:str) -> str: - print('File Name: ' + file_name) - with open(file_name,'r') as file: - file_text = file.read() - return self.mask_text(file_text) \ No newline at end of file + def mask_dict(self, input_data: dict) -> dict: + """Mask every string value in input_data in place, recursing into nested dicts and lists, and return it.""" + for key,value in input_data.items(): + input_data[key] = self._mask_value(value) + return input_data + + def _mask_value(self, value): + """Mask one JSON value: strings are masked, dicts and lists are walked recursively, anything else is returned unchanged.""" + if isinstance(value,dict): + return self.mask_dict(value) + if isinstance(value,list): + return [self._mask_value(item) for item in value] + if isinstance(value,str): + return self.mask_text(value) + return value + + def mask(self, input_data: Union[str,dict], format: str = "txt") -> Union[str,dict]: + """Mask PII in input_data. format is "txt" (plain text, the default) or "json" (a JSON document string, returned as compact JSON text); any other value raises ValueError.""" + if format == "txt": + return self.mask_text(input_data) + if format == "json": + return json.dumps(self._mask_value(json.loads(input_data)),separators=(",",":")) + raise ValueError(f"Unsupported format: {format!r}") \ No newline at end of file