Skip to content

Commit

Permalink
Add more augmentation methods (#519)
Browse files Browse the repository at this point in the history
* fix version update error and import errors

* clear output

* fix typo in data_select example

* add typo replacement augmentation and its example

* add copyright text

* fix documentations for typo replacement

* combine op and generator files and add unit tests

* allow dict_path to be a local file or an url

* commit deleted files and fix init in generator

* fix naming, add code-blocks and format python file

Co-authored-by: wanglechuan-gif <wanglechuan@evocolabs.com>
  • Loading branch information
wanglec and wanglechuan-gif authored Sep 8, 2021
1 parent c65814b commit 8bbf1a0
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 5 deletions.
57 changes: 53 additions & 4 deletions examples/data_augmentation/tutorial_for_data_augmentation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@
"metadata": {},
"outputs": [],
"source": [
"from forte.processors.base.data_augment_processor import ReplacementDataAugmentProcessor\n",
"from forte.processors.data_augment import ReplacementDataAugmentProcessor\n",
"from forte.pipeline import Pipeline\n",
"from forte.data.multi_pack import MultiPack\n",
"\n",
"nlp = Pipeline[MultiPack]()\n",
"\n",
Expand All @@ -48,7 +49,43 @@
"}\n",
"\n",
"processor = ReplacementDataAugmentProcessor()\n",
"nlp.add(component=processor, configs=processor_config)"
"nlp.add(component=processor, config=processor_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here is another example for typo data augmentation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from forte.data.data_pack import DataPack\n",
"from ft.onto.base_ontology import Token\n",
"from forte.processors.data_augment.algorithms.typo_replacement_op import (\n",
" TypoReplacementOp,\n",
")\n",
"\n",
"opr = TypoReplacementOp(\n",
" configs={\n",
" \"prob\": 0.6,\n",
" 'typo_generator': 'uniform',\n",
" 'dict_path': 'https://raw.githubusercontent.com/wanglec/temporaryJson/main/misspelling.json'\n",
" }\n",
")\n",
"data_pack = DataPack()\n",
"data_pack.set_text(\"commonly addressable\")\n",
"token_1 = Token(data_pack, 0, 8)\n",
"token_2 = Token(data_pack, 9, 20)\n",
"data_pack.add_entry(token_1)\n",
"data_pack.add_entry(token_2)\n",
"print(opr.replace(token_1))\n",
"print(opr.replace(token_2))"
]
},
{
Expand Down Expand Up @@ -145,13 +182,25 @@
"\n",
"To see how to use these two classes to build the RL-based DA model, and to see an example that uses this algorithm for text classification, please refer to `examples/data_augmentation/reinforcemennt/README.md` for details."
]
},
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.11"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion forte/processors/base/data_selector_for_da.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def initialize(self, resources: Resources, configs: Config):
super().initialize(resources, configs)
self.index = create_class_with_kwargs(
self.configs.indexer_class,
class_args={"config": self.configs.index_configs},
class_args={"config": self.configs.index_config},
)

def _create_search_key(self, data: Optional[str]) -> Dict[str, Any]:
Expand Down
117 changes: 117 additions & 0 deletions forte/processors/data_augment/algorithms/typo_replacement_op.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import random
import json
from typing import Tuple, Union, Dict, Any

import requests
from forte.data.ontology import Annotation
from forte.processors.data_augment.algorithms.text_replacement_op import (
TextReplacementOp,
)
from forte.common.configuration import Config

__all__ = [
"UniformTypoGenerator",
"TypoReplacementOp",
]


class UniformTypoGenerator:
    r"""
    A uniform generator that generates a typo from a typo dictionary.

    Args:
        dict_path: the url or the path to the pre-defined typo json file.
            The key is a word we want to replace. The value is a list
            containing various typos of the corresponding key.

    .. code-block:: python

        {
            "apparent": ["aparent", "apparant"],
            "bankruptcy": ["bankrupcy", "banruptcy"],
            "barbecue": ["barbeque"]
        }
    """

    # Upper bound, in seconds, on the dictionary download so that an
    # unresponsive server cannot block construction forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, dict_path: str):
        try:
            # First treat `dict_path` as a URL.
            r = requests.get(dict_path, timeout=self._REQUEST_TIMEOUT)
            self.data = r.json()
        except requests.exceptions.RequestException:
            # Any request failure (presumably including a plain local path,
            # which is not a valid URL) falls back to reading a local file.
            with open(dict_path, encoding="utf8") as json_file:
                self.data = json.load(json_file)

    def generate(self, word: str) -> str:
        r"""
        Pick a typo for `word` uniformly at random.

        Args:
            word: input word that needs to be replaced.

        Returns:
            One of the typos listed for `word` in the dictionary, or `word`
            itself when the dictionary has no entry (or an empty list of
            typos) for it.
        """
        candidates = self.data.get(word)
        # An absent key and an empty typo list both mean "nothing to pick";
        # `random.choice` would raise IndexError on an empty list.
        if not candidates:
            return word
        result: str = random.choice(candidates)
        return result


class TypoReplacementOp(TextReplacementOp):
    r"""
    This class is a replacement op using a pre-defined
    spelling mistake dictionary to simulate spelling mistake.

    Args:
        configs:
            The config should contain
                `prob` (float): The probability of replacement,
                    should fall in [0, 1].
                `dict_path` (str): the url or the path to the pre-defined
                    typo json file. The key is a word we want to replace.
                    The value is a list containing various typos
                    of the corresponding key. Optional; a default
                    dictionary url is used when it is absent.
                `typo_generator` (str): The name of the generator that
                    takes in a word and outputs the replacement typo.
                    The only accepted value is ``"uniform"``.

    Raises:
        ValueError: if `typo_generator` is not ``"uniform"``.
    """

    def __init__(self, configs: Union[Config, Dict[str, Any]]):
        super().__init__(configs)
        if "dict_path" in configs.keys():
            self.dict_path = configs["dict_path"]
        else:
            # default typo dictionary
            self.dict_path = (
                "https://raw.githubusercontent.com/wanglec/"
                + "temporaryJson/main/misspelling.json"
            )
        if configs["typo_generator"] == "uniform":
            self.typo_generator = UniformTypoGenerator(self.dict_path)
        else:
            raise ValueError(
                "The valid options for typo_generator are [uniform]"
            )

    def replace(self, input_anno: Annotation) -> Tuple[bool, str]:
        r"""
        This function replaces a word from a typo dictionary.

        Args:
            input_anno (Annotation): The input annotation.

        Returns:
            A tuple, where the first element is a boolean value indicating
            whether the replacement was attempted, and the second element
            is the resulting string. Note that the flag is `True` whenever
            the random draw selects the annotation, even if the generator
            has no typo for the word and hands the text back unchanged.
        """
        # `random.random()` lies in [0.0, 1.0), so `>=` makes the replacement
        # probability exactly `prob`: never for 0, always for 1.
        # NOTE(review): `self.configs` is presumably set by the parent
        # constructor - confirm against `TextReplacementOp`.
        if random.random() >= self.configs.prob:
            return False, input_anno.text
        word: str = self.typo_generator.generate(input_anno.text)
        return True, word
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Unit tests for typo replacement op.
"""

import unittest
from forte.data.data_pack import DataPack
from ft.onto.base_ontology import Token
from forte.processors.data_augment.algorithms.typo_replacement_op import (
TypoReplacementOp,
)


class TestTypoReplacementOp(unittest.TestCase):
    """Checks that `TypoReplacementOp` swaps known words for listed typos."""

    def setUp(self):
        # With `prob` at 1.0 every call to `replace` goes through the typo
        # generator. No `dict_path` is given here.
        op_configs = {
            "prob": 1.0,
            "typo_generator": "uniform",
        }
        self.tyre = TypoReplacementOp(configs=op_configs)

    def test_replace(self):
        pack = DataPack()
        pack.set_text("auxiliary colleague apple")

        # One token per word of the text above.
        spans = [(0, 9), (10, 19), (20, 25)]
        tokens = [Token(pack, begin, end) for begin, end in spans]
        for token in tokens:
            pack.add_entry(token)

        # Acceptable outputs per token; "apple" is expected to come back
        # unchanged.
        acceptable = [
            ["auxilliary", "auxilary", "auxillary"],
            ["collegue", "colleaque"],
            ["apple"],
        ]
        for token, candidates in zip(tokens, acceptable):
            self.assertIn(self.tyre.replace(token)[1], candidates)


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 8bbf1a0

Please sign in to comment.