Add monocleaner (#991)
* Add monocleaner

* Install packages

* Add more filtering stats

* Clean temporary files

* Configure monocleaner thresholds

* Fix dataset configuration

* Fix test config

* Update test configs

* Fix test

* Add tests for cleaning parameters

* Move packages required for PyICU to the base image

* Revert "Move packages required for PyICU to the base image"

This reverts commit cba9c04.

* Roll back unrelated config changes
eu9ene authored Jan 21, 2025
1 parent 74c6b8a commit a9bf6d9
Showing 13 changed files with 298 additions and 28 deletions.
29 changes: 26 additions & 3 deletions pipeline/clean/clean-mono.sh
@@ -21,6 +21,7 @@ input_prefix=$2 # $MOZ_FETCHES_DIR/news_2007
output_prefix=$3 # /builds/worker/artifacts/news_2007
threads=$4 # auto
dataset=$5 # news-crawl_news.2007
fluency_threshold=$6 # 0.7

# Example output: /builds/worker/artifacts/news_2007.en.zst

@@ -74,14 +75,36 @@ zstdmt -dc "${output_prefix}.${lang}.langid.zst" |
parallel --no-notice --pipe -k -j "${threads}" --block 50M \
"python3 tools/clean_mono.py -l ${lang} --debug" \
2>"${output_prefix}.${lang}.clean.debug.txt" |
-zstdmt >"${output_prefix}.${lang}.zst"
+zstdmt >"${output_prefix}.${lang}.rule-based.zst"

-test -s "${output_prefix}.${lang}.zst" || exit 1
+test -s "${output_prefix}.${lang}.rule-based.zst" || exit 1

######################################################################
echo "### Filter by fluency score"

if [ "${fluency_threshold}" == "0" ] || [ "${fluency_threshold}" == "0.0" ]; then
echo "Threshold is 0, skipping filtering"
cp "${output_prefix}.${lang}.rule-based.zst" "${output_prefix}.${lang}.zst"
else
# the model is 125MB, similar in size to the fastText one, so it's ok to download it here
monocleaner-download $lang ${dir}/monocleaner
test -s "${output_prefix}.${lang}.zst" ||
zstd -dc "${output_prefix}.${lang}.rule-based.zst" |
# memory intensive
parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M "monocleaner --disable_hardrules ${dir}/monocleaner/${lang}" |
awk -F'\t' '$2>'${fluency_threshold} | cut -f1 |
zstdmt >"${output_prefix}.${lang}.zst"

test -s "${output_prefix}.${lang}.zst" || exit 1
fi
echo "Lines before filtering: $(zstdmt -dc "${input_prefix}.${lang}.zst" | wc -l)"
echo "Lines after rule-based filtering: $(zstdmt -dc "${output_prefix}.${lang}.rule-based.zst" | wc -l)"
echo "Lines after fluency filtering: $(zstdmt -dc "${output_prefix}.${lang}.zst" | wc -l)"

######################################################################
echo "### Remove data from intermediate steps"
rm -rf "${output_prefix}".*.nrm.zst "${output_prefix}".*.langid.zst \
"${output_prefix}".*.monofix.zst
"${output_prefix}".*.monofix.zst "${output_prefix}".*.rule-based.zst ${dir}/monocleaner

echo "### Rule-based cleaning log written to: ${output_prefix}.${lang}.clean.debug.txt"
echo "### Clean data is written to: ${output_prefix}.${lang}.zst"
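For reference, monocleaner emits each input line as "<sentence><TAB><fluency score>", which is what the awk/cut stage above relies on; a threshold of 0 (or 0.0) skips the step and the rule-based output is passed through unchanged. A minimal Python sketch of the threshold stage, under that two-column assumption (the script name in the usage comment is hypothetical):

import sys


def filter_by_fluency(lines, threshold):
    """Keep sentences whose monocleaner fluency score exceeds the threshold.

    Each line is assumed to be "<sentence>\t<score>", matching the
    awk -F'\t' '$2>THRESHOLD' | cut -f1 stage in clean-mono.sh.
    """
    for line in lines:
        # Split on the last tab so a stray tab in the sentence cannot
        # shift the score column.
        sentence, score = line.rstrip("\n").rsplit("\t", 1)
        if float(score) > threshold:
            yield sentence


if __name__ == "__main__":
    # Hypothetical usage: python3 filter_fluency.py 0.7 < scored.tsv > clean.txt
    threshold = float(sys.argv[1])
    for sentence in filter_by_fluency(sys.stdin, threshold):
        print(sentence)

Streaming line by line mirrors the shell pipeline: nothing is buffered beyond the current sentence, which matters for huge monolingual corpora.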
15 changes: 13 additions & 2 deletions taskcluster/configs/config.ci.yml
@@ -30,8 +30,19 @@ experiment:
bicleaner:
default-threshold: 0.5
dataset-thresholds:
-opus_ada83/v1: 0.0
-opus_ELRC-3075-wikipedia_health/v1: 0.6
+opus_ada83_v1: 0.0
+opus_ELRC-3075-wikipedia_health_v1: 0.6

monocleaner:
mono-src:
default-threshold: 0.5
dataset-thresholds:
news-crawl_news_2008: 0.0
opus_tldr-pages_v2023-08-29: 0.7
mono-trg:
default-threshold: 0.9
dataset-thresholds:
opus_tldr-pages_v2023-08-29: 0.6

min-fluency-threshold:
mono-src: 0.8
34 changes: 27 additions & 7 deletions taskcluster/configs/config.prod.yml
@@ -29,17 +29,37 @@ experiment:
opuscleaner-mode: "defaults"

# Bicleaner is a tool that aims at detecting noisy sentence pairs in a parallel corpus.
# Use sanitized dataset names for compatibility with Taskcluster (replace ".", "/", ":", "[", and "]" with "_")
# See: docs/bicleaner.md
bicleaner:
default-threshold: 0.5
dataset-thresholds:
-opus_CCAligned/v1: 0.7
-opus_LinguaTools-WikiTitles/v2014: 0.7
-opus_OpenSubtitles/v2018: 0.8 # Example of a higher filtering level (for noisier datasets).
-opus_ParaCrawl/v9: 0.7
-opus_WikiMatrix/v1: 0.7
-opus_bible-uedin/v1: 0.7
-# opus_ParaCrawl/v9: 0.0 # Example of disabled filtering (if the corpus is already filtered)
+opus_CCAligned_v1: 0.7
+opus_LinguaTools-WikiTitles_v2014: 0.7
+opus_OpenSubtitles_v2018: 0.8 # Example of a higher filtering level (for noisier datasets).
+opus_ParaCrawl_v9: 0.7
+opus_WikiMatrix_v1: 0.7
+opus_bible-uedin_v1: 0.7
+# opus_ParaCrawl_v9: 0.0 # Example of disabled filtering (if the corpus is already filtered)

# Monocleaner filters sentences in a monolingual corpus based on language fluency
# Use sanitized dataset names for compatibility with Taskcluster (replace ".", "/", ":", "[", and "]" with "_")
monocleaner:
mono-src:
# News-crawl is typically clean; enable on a dataset-by-dataset basis
default-threshold: 0.0
dataset-thresholds:
# Enable for HPLT; the models were recently updated
hplt_mono_v1_2: 0.8
# Filter only garbage from NLLB
opus_NLLB_v1: 0.5
mono-trg:
# News-crawl is typically clean; enable on a dataset-by-dataset basis
default-threshold: 0.0
dataset-thresholds:
# Sentences used for back-translation should be fluent
hplt_mono_v1_2: 0.9
opus_NLLB_v1: 0.9

# Limits the maximum number of sentences used in the monolingual data for both the source and target languages.
mono-max-sentences-src:
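The sanitization rule mentioned in the comments above (".", "/", ":", "[", "]" become "_") can be illustrated with a small helper. This is a hypothetical sketch for illustration, not the pipeline's actual implementation:

def sanitize_dataset_name(name: str) -> str:
    # Replace characters that Taskcluster identifiers cannot contain.
    for ch in (".", "/", ":", "[", "]"):
        name = name.replace(ch, "_")
    return name


# The old config keys map onto the new ones used in this commit:
assert sanitize_dataset_name("opus_ParaCrawl/v9") == "opus_ParaCrawl_v9"
assert sanitize_dataset_name("news-crawl_news.2008") == "news-crawl_news_2008"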
4 changes: 2 additions & 2 deletions taskcluster/kinds/bicleaner/kind.yml
@@ -33,7 +33,7 @@ tasks:
- pipeline/bicleaner/requirements/bicleaner-ai.txt
from-parameters:
bicleaner_threshold:
- - training_config.experiment.bicleaner.dataset-thresholds.{dataset}
+ - training_config.experiment.bicleaner.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.bicleaner.default-threshold

dataset-config:
@@ -52,7 +52,7 @@
task-context:
from-parameters:
bicleaner_threshold:
- - training_config.experiment.bicleaner.dataset-thresholds.{dataset}
+ - training_config.experiment.bicleaner.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.bicleaner.default-threshold
substitution-fields:
- run.command
49 changes: 48 additions & 1 deletion taskcluster/kinds/clean-mono/kind.yml
@@ -8,12 +8,15 @@ loader: taskgraph.loader.transform:loader
transforms:
- translations_taskgraph.transforms.worker_selection
- translations_taskgraph.transforms.from_datasets:mono
- taskgraph.transforms.task_context
- taskgraph.transforms.run:transforms
- translations_taskgraph.transforms.cached_tasks:transforms
- taskgraph.transforms.task:transforms

kind-dependencies:
- dataset
- fetch
- toolchain

task-defaults:
attributes:
@@ -54,7 +57,14 @@
- name
- dependencies
- fetches
- attributes.cache.from-parameters.monocleaner_threshold
- task-context.from-parameters.monocleaner_threshold
- run.command

task-context:
substitution-fields:
- run.command

worker:
docker-image: {"in-tree": "train"}
# 7 days. yes, it can take a while to clean a huge dataset
@@ -75,10 +85,26 @@ task-defaults:
command:
- bash
- -c
- - $VCS_PATH/pipeline/clean/clean-mono.sh {locale} $MOZ_FETCHES_DIR/{dataset_sanitized} $TASK_WORKDIR/artifacts/{dataset_sanitized} auto {dataset}
+ - >-
+   pip install $MOZ_FETCHES_DIR/cyhunspell-2.0.3-cp310-cp310-linux_x86_64.whl &&
+   pip install $MOZ_FETCHES_DIR/kenlm-0.0.0-cp310-cp310-linux_x86_64.whl &&
+   pip install monocleaner==1.7.0 &&
+   export PATH=$PATH:~/.local/bin &&
+   $VCS_PATH/pipeline/clean/clean-mono.sh
+   {locale}
+   $MOZ_FETCHES_DIR/{dataset_sanitized}
+   $TASK_WORKDIR/artifacts/{dataset_sanitized}
+   auto
+   {dataset}
+   {monocleaner_threshold}
dependencies:
"{provider}-{locale}": dataset-{provider}-{dataset_sanitized}-{locale}
fetches:
toolchain:
- artifact: cyhunspell
extract: false
- artifact: kenlm
extract: false
"{provider}-{locale}":
- artifact: "{dataset_sanitized}.{locale}.zst"
extract: false
@@ -88,12 +114,33 @@
description: Clean {provider} {dataset_sanitized} dataset mono-src {src_locale}
attributes:
dataset-category: mono-src
cache:
from-parameters:
monocleaner_threshold:
- training_config.experiment.monocleaner.mono-src.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.monocleaner.mono-src.default-threshold
dataset-config:
category: mono-src
task-context:
from-parameters:
monocleaner_threshold:
- training_config.experiment.monocleaner.mono-src.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.monocleaner.mono-src.default-threshold


"{provider}-{trg_locale}-{dataset_sanitized}-mono-trg":
description: Clean {provider} {dataset_sanitized} dataset mono-trg {trg_locale}
attributes:
dataset-category: mono-trg
cache:
from-parameters:
monocleaner_threshold:
- training_config.experiment.monocleaner.mono-trg.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.monocleaner.mono-trg.default-threshold
dataset-config:
category: mono-trg
task-context:
from-parameters:
monocleaner_threshold:
- training_config.experiment.monocleaner.mono-trg.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.monocleaner.mono-trg.default-threshold
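The repeated from-parameters lists above name candidate parameter paths for monocleaner_threshold, with the dataset-specific entry before the category default. A rough sketch of that resolution order (an illustration of the idea, not taskgraph's actual implementation); note that dotted paths only resolve cleanly because sanitized dataset keys contain no dots:

def resolve_from_parameters(params, paths):
    """Return the value at the first path that is defined in params."""
    for path in paths:
        node = params
        for key in path.split("."):
            if not isinstance(node, dict) or key not in node:
                node = None
                break
            node = node[key]
        if node is not None:
            return node
    raise KeyError(f"none of {paths} is set")


params = {
    "training_config": {
        "experiment": {
            "monocleaner": {
                "mono-src": {
                    "default-threshold": 0.0,
                    "dataset-thresholds": {"opus_NLLB_v1": 0.5},
                }
            }
        }
    }
}

# The dataset-specific threshold wins over the category default:
assert resolve_from_parameters(params, [
    "training_config.experiment.monocleaner.mono-src.dataset-thresholds.opus_NLLB_v1",
    "training_config.experiment.monocleaner.mono-src.default-threshold",
]) == 0.5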
20 changes: 15 additions & 5 deletions taskcluster/test/params/large-lt-en.yml
@@ -128,12 +128,22 @@ training_config:
bicleaner:
dataset-thresholds:
mtdata_Statmt-wiki_titles-1-lit-eng: 0.7
-opus_CCAligned/v1: 0.7
-opus_OpenSubtitles/v2018: 0.8
-opus_ParaCrawl/v9: 0.0
-opus_WikiMatrix/v1: 0.7
-opus_bible-uedin/v1: 0.7
+opus_CCAligned_v1: 0.7
+opus_OpenSubtitles_v2018: 0.8
+opus_ParaCrawl_v9: 0.0
+opus_WikiMatrix_v1: 0.7
+opus_bible-uedin_v1: 0.7
default-threshold: 0.5
monocleaner:
mono-src:
default-threshold: 0.5
dataset-thresholds:
news-crawl_news_2008: 0.0
mono-trg:
default-threshold: 0.9
dataset-thresholds:
news-crawl_news_2007: 0.0
opus_tldr-pages_v2023-08-29: 0.7
mono-max-sentences-src:
total: 500_000_000
per-dataset: 200_000_000
14 changes: 12 additions & 2 deletions taskcluster/test/params/small-ru-en.yml
@@ -43,9 +43,19 @@ training_config:
best-model: chrf
bicleaner:
dataset-thresholds:
-opus_ELRC-3075-wikipedia_health/v1: 0.6
-opus_ada83/v1: 0.0
+opus_ELRC-3075-wikipedia_health_v1: 0.6
+opus_ada83_v1: 0.0
default-threshold: 0.5
monocleaner:
mono-src:
default-threshold: 0.5
dataset-thresholds:
news-crawl_news_2008: 0.0
mono-trg:
default-threshold: 0.9
dataset-thresholds:
news-crawl_news_2007: 0.0
opus_tldr-pages_v2023-08-29: 0.7
min-fluency-threshold:
mono-src: 0.8
mono-trg: 0.9
69 changes: 69 additions & 0 deletions taskcluster/test/test_cleaning_params.py
@@ -0,0 +1,69 @@
from copy import deepcopy

from taskgraph.taskgraph import TaskGraph

from translations_taskgraph.parameters import get_ci_training_config

PARAMS = deepcopy(get_ci_training_config())


def test_monocleaner_params(full_task_graph: TaskGraph):
tasks = {t.label: t for t in full_task_graph.tasks.values()}

assert (
float(
tasks["clean-mono-news-crawl-ru-news_2008-mono-src"].task["payload"]["command"][-1][
-3:
]
)
== PARAMS["training_config"]["experiment"]["monocleaner"]["mono-src"][
"dataset-thresholds"
]["news-crawl_news_2008"]
)
assert (
float(
tasks["clean-mono-news-crawl-en-news_2007-mono-trg"].task["payload"]["command"][-1][
-3:
]
)
== PARAMS["training_config"]["experiment"]["monocleaner"]["mono-trg"]["default-threshold"]
)
assert (
float(
tasks["clean-mono-opus-ru-tldr-pages_v2023-08-29-mono-src"].task["payload"]["command"][
-1
][-3:]
)
== PARAMS["training_config"]["experiment"]["monocleaner"]["mono-src"][
"dataset-thresholds"
]["opus_tldr-pages_v2023-08-29"]
)
assert (
float(
tasks["clean-mono-opus-en-tldr-pages_v2023-08-29-mono-trg"].task["payload"]["command"][
-1
][-3:]
)
== PARAMS["training_config"]["experiment"]["monocleaner"]["mono-trg"][
"dataset-thresholds"
]["opus_tldr-pages_v2023-08-29"]
)


def test_bicleaner_params(full_task_graph: TaskGraph):
tasks = {t.label: t for t in full_task_graph.tasks.values()}

assert (
str(PARAMS["training_config"]["experiment"]["bicleaner"]["default-threshold"])
in tasks["bicleaner-ai-mtdata-Tilde-airbaltic-1-eng-rus-ru-en"].task["payload"]["command"][
-1
][-1][-50:]
)
assert (
str(
PARAMS["training_config"]["experiment"]["bicleaner"]["dataset-thresholds"][
"opus_ada83_v1"
]
)
in tasks["bicleaner-ai-opus-ada83_v1-ru-en"].task["payload"]["command"][-1][-1][-50:]
)
4 changes: 2 additions & 2 deletions taskcluster/test/test_training_continuation_teacher.py
@@ -2,9 +2,9 @@

from taskgraph.taskgraph import TaskGraph

-from translations_taskgraph.parameters import get_defaults
+from translations_taskgraph.parameters import get_ci_training_config

-PARAMS = deepcopy(get_defaults(None))
+PARAMS = deepcopy(get_ci_training_config(None))
PARAMS["target_tasks_method"] = "train-target-tasks"
PARAMS["training_config"]["experiment"]["pretrained-models"] = {
"train-teacher": {