diff --git a/examples/data_augmentation/tutorial_for_data_augmentation.ipynb b/examples/data_augmentation/tutorial_for_data_augmentation.ipynb index b488cb164..9acf3a75f 100644 --- a/examples/data_augmentation/tutorial_for_data_augmentation.ipynb +++ b/examples/data_augmentation/tutorial_for_data_augmentation.ipynb @@ -20,8 +20,9 @@ "metadata": {}, "outputs": [], "source": [ - "from forte.processors.base.data_augment_processor import ReplacementDataAugmentProcessor\n", + "from forte.processors.data_augment import ReplacementDataAugmentProcessor\n", "from forte.pipeline import Pipeline\n", + "from forte.data.multi_pack import MultiPack\n", "\n", "nlp = Pipeline[MultiPack]()\n", "\n", @@ -48,7 +49,43 @@ "}\n", "\n", "processor = ReplacementDataAugmentProcessor()\n", - "nlp.add(component=processor, configs=processor_config)" + "nlp.add(component=processor, config=processor_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is another example for typo data augmentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from forte.data.data_pack import DataPack\n", + "from ft.onto.base_ontology import Token\n", + "from forte.processors.data_augment.algorithms.typo_replacement_op import (\n", + " TypoReplacementOp,\n", + ")\n", + "\n", + "opr = TypoReplacementOp(\n", + " configs={\n", + " \"prob\": 0.6,\n", + " 'typo_generator': 'uniform',\n", + " 'dict_path': 'https://raw.githubusercontent.com/wanglec/temporaryJson/main/misspelling.json'\n", + " }\n", + ")\n", + "data_pack = DataPack()\n", + "data_pack.set_text(\"commonly addressable\")\n", + "token_1 = Token(data_pack, 0, 8)\n", + "token_2 = Token(data_pack, 9, 20)\n", + "data_pack.add_entry(token_1)\n", + "data_pack.add_entry(token_2)\n", + "print(opr.replace(token_1))\n", + "print(opr.replace(token_2))" ] }, { @@ -145,13 +182,25 @@ "\n", "To see how to use these two classes to build the RL-based DA model, and to see an example that uses this algorithm for text classification, please refer to `examples/data_augmentation/reinforcemennt/README.md` for details." 
] - }, + } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" } }, "nbformat": 4, diff --git a/forte/processors/base/data_selector_for_da.py b/forte/processors/base/data_selector_for_da.py index ba8ca0a07..cc378465e 100644 --- a/forte/processors/base/data_selector_for_da.py +++ b/forte/processors/base/data_selector_for_da.py @@ -54,7 +54,7 @@ def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) self.index = create_class_with_kwargs( self.configs.indexer_class, - class_args={"config": self.configs.index_configs}, + class_args={"config": self.configs.index_config}, ) def _create_search_key(self, data: Optional[str]) -> Dict[str, Any]: diff --git a/forte/processors/data_augment/algorithms/typo_replacement_op.py b/forte/processors/data_augment/algorithms/typo_replacement_op.py new file mode 100644 index 000000000..16350faec --- /dev/null +++ b/forte/processors/data_augment/algorithms/typo_replacement_op.py @@ -0,0 +1,117 @@ +# Copyright 2020 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import random
import json
from typing import Tuple, Union, Dict, Any

import requests
from forte.data.ontology import Annotation
from forte.processors.data_augment.algorithms.text_replacement_op import (
    TextReplacementOp,
)
from forte.common.configuration import Config

__all__ = [
    "UniformTypoGenerator",
    "TypoReplacementOp",
]


class UniformTypoGenerator:
    r"""
    A uniform generator that picks a typo for a word from a typo dictionary.

    The dictionary is loaded once, at construction time, either from an
    ``http(s)`` URL or from a local JSON file.

    Args:
        dict_path: the url or the path to the pre-defined typo json file.
            The key is a word we want to replace. The value is a list
            containing various typos of the corresponding key.
        timeout: number of seconds to wait for the server when
            ``dict_path`` is a URL. Ignored for local files.

    .. code-block:: python

        {
            "apparent": ["aparent", "apparant"],
            "bankruptcy": ["bankrupcy", "banruptcy"],
            "barbecue": ["barbeque"]
        }
    """

    def __init__(self, dict_path: str, timeout: float = 30.0):
        # Decide up front whether this is a remote resource. Falling back to
        # `open()` after a failed download would only replace the real
        # network error with a misleading "file not found".
        if str(dict_path).lower().startswith(("http://", "https://")):
            # A timeout keeps construction from hanging on a dead server;
            # `raise_for_status` surfaces 4xx/5xx instead of trying to parse
            # an error page as JSON.
            response = requests.get(dict_path, timeout=timeout)
            response.raise_for_status()
            self.data = response.json()
        else:
            with open(dict_path, encoding="utf8") as json_file:
                self.data = json.load(json_file)

    def generate(self, word: str) -> str:
        r"""
        Return a typo for ``word`` chosen uniformly at random.

        Args:
            word: the word to be replaced.

        Returns:
            One of the typos listed for ``word`` in the dictionary, or
            ``word`` itself when the dictionary has no entry (or an empty
            entry) for it.
        """
        candidates = self.data.get(word)
        # An empty list would make `random.choice` raise IndexError.
        if not candidates:
            return word
        result: str = random.choice(candidates)
        return result


class TypoReplacementOp(TextReplacementOp):
    r"""
    This class is a replacement op using a pre-defined
    spelling mistake dictionary to simulate spelling mistake.

    Args:
        configs:
            The config should contain
            `prob` (float): The probability of replacement,
            should fall in [0, 1].
            `dict_path` (str): the url or the path to the pre-defined
            typo json file. The key is a word we want to replace.
            The value is a list containing various typos
            of the corresponding key. Optional; a default online
            dictionary is used when it is absent.
            `typo_generator` (str): The name of the generator that takes
            in a word and outputs the replacement typo. The only valid
            option is ``"uniform"``.

    Raises:
        ValueError: if `typo_generator` is not a supported option.
    """

    def __init__(self, configs: Union[Config, Dict[str, Any]]):
        super().__init__(configs)
        if "dict_path" in configs.keys():
            self.dict_path = configs["dict_path"]
        else:
            # default typo dictionary
            self.dict_path = (
                "https://raw.githubusercontent.com/wanglec/"
                "temporaryJson/main/misspelling.json"
            )
        if configs["typo_generator"] == "uniform":
            self.typo_generator = UniformTypoGenerator(self.dict_path)
        else:
            raise ValueError(
                "The valid options for typo_generator are [uniform]"
            )

    def replace(self, input_anno: Annotation) -> Tuple[bool, str]:
        r"""
        This function replaces a word with a typo from the typo dictionary.

        Args:
            input_anno (Annotation): The input annotation.
        Returns:
            A tuple, where the first element is a boolean value indicating
            whether the replacement happens, and the second element is the
            replaced string. When the word has no typo in the dictionary
            the original text is returned and the boolean is False.
        """
        original: str = input_anno.text
        # `random.random()` is in [0, 1), so `>=` makes prob == 0 never
        # replace and prob == 1 always attempt a replacement.
        if random.random() >= self.configs.prob:
            return False, original
        word: str = self.typo_generator.generate(original)
        # Only report a replacement when the text actually changed.
        return word != original, word
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Unit tests for typo replacement op.
"""

import json
import os
import tempfile
import unittest

from forte.data.data_pack import DataPack
from ft.onto.base_ontology import Token
from forte.processors.data_augment.algorithms.typo_replacement_op import (
    TypoReplacementOp,
)


class TestTypoReplacementOp(unittest.TestCase):
    """Checks that `TypoReplacementOp.replace` draws typos from its dictionary."""

    def setUp(self):
        # Use a local fixture instead of the default online dictionary so the
        # test does not depend on network access or on remote content.
        typo_dict = {
            "auxiliary": ["auxilliary", "auxilary", "auxillary"],
            "colleague": ["collegue", "colleaque"],
        }
        with tempfile.NamedTemporaryFile(
            "w", suffix=".json", encoding="utf8", delete=False
        ) as dict_file:
            json.dump(typo_dict, dict_file)
        self.addCleanup(os.remove, dict_file.name)

        self.tyre = TypoReplacementOp(
            configs={
                # prob = 1.0 so every dictionary word is always replaced.
                "prob": 1.0,
                "typo_generator": "uniform",
                "dict_path": dict_file.name,
            }
        )

    def test_replace(self):
        data_pack = DataPack()
        data_pack.set_text("auxiliary colleague apple")
        token_1 = Token(data_pack, 0, 9)
        token_2 = Token(data_pack, 10, 19)
        token_3 = Token(data_pack, 20, 25)
        data_pack.add_entry(token_1)
        data_pack.add_entry(token_2)
        data_pack.add_entry(token_3)

        self.assertIn(
            self.tyre.replace(token_1)[1],
            ["auxilliary", "auxilary", "auxillary"],
        )
        self.assertIn(self.tyre.replace(token_2)[1], ["collegue", "colleaque"])
        # "apple" is not in the dictionary, so the text must come back as is.
        self.assertIn(self.tyre.replace(token_3)[1], ["apple"])


if __name__ == "__main__":
    unittest.main()