Add monocleaner (#991)
* Add monocleaner

* Install packages

* Add more filtering stats

* Clean temporary files

* Configure monocleaner thresholds

* Fix dataset configuration

* Fix test config

* Update test configs

* Fix test

* Add tests for cleaning parameters

* Move packages required for PyICU to the base image

* Revert "Move packages required for PyICU to the base image"

This reverts commit cba9c04.

* Roll back unrelated config changes
eu9ene authored Jan 21, 2025
1 parent 74c6b8a commit a9bf6d9
Showing 13 changed files with 298 additions and 28 deletions.
29 changes: 26 additions & 3 deletions pipeline/clean/clean-mono.sh
@@ -21,6 +21,7 @@ input_prefix=$2 # $MOZ_FETCHES_DIR/news_2007
output_prefix=$3 # /builds/worker/artifacts/news_2007
threads=$4 # auto
dataset=$5 # news-crawl_news.2007
fluency_threshold=$6 # 0.7

# Example output: /builds/worker/artifacts/news_2007.en.zst

@@ -74,14 +75,36 @@ zstdmt -dc "${output_prefix}.${lang}.langid.zst" |
parallel --no-notice --pipe -k -j "${threads}" --block 50M \
"python3 tools/clean_mono.py -l ${lang} --debug" \
2>"${output_prefix}.${lang}.clean.debug.txt" |
-zstdmt >"${output_prefix}.${lang}.zst"
+zstdmt >"${output_prefix}.${lang}.rule-based.zst"

-test -s "${output_prefix}.${lang}.zst" || exit 1
+test -s "${output_prefix}.${lang}.rule-based.zst" || exit 1

######################################################################
echo "### Filter by fluency score"

if [ "${fluency_threshold}" == "0" ] || [ "${fluency_threshold}" == "0.0" ]; then
echo "Threshold is 0, skipping filtering"
cp "${output_prefix}.${lang}.rule-based.zst" "${output_prefix}.${lang}.zst"
else
# the model is 125MB, similar in size to the fastText one, so it's ok to download it here
monocleaner-download $lang ${dir}/monocleaner
test -s "${output_prefix}.${lang}.zst" ||
zstd -dc "${output_prefix}.${lang}.rule-based.zst" |
# memory intensive
parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M "monocleaner --disable_hardrules ${dir}/monocleaner/${lang}" |
awk -F'\t' '$2>'${fluency_threshold} | cut -f1 |
zstdmt >"${output_prefix}.${lang}.zst"

test -s "${output_prefix}.${lang}.zst" || exit 1
fi
echo "Lines before filtering: $(zstdmt -dc "${input_prefix}.${lang}.zst" | wc -l)"
echo "Lines after rule-based filtering: $(zstdmt -dc "${output_prefix}.${lang}.rule-based.zst" | wc -l)"
echo "Lines after fluency filtering: $(zstdmt -dc "${output_prefix}.${lang}.zst" | wc -l)"

######################################################################
echo "### Remove data from intermediate steps"
rm -rf "${output_prefix}".*.nrm.zst "${output_prefix}".*.langid.zst \
"${output_prefix}".*.monofix.zst
"${output_prefix}".*.monofix.zst "${output_prefix}".*.rule-based.zst ${dir}/monocleaner

echo "### Rule-based cleaning log written to: ${output_prefix}.${lang}.clean.debug.txt"
echo "### Clean data is written to: ${output_prefix}.${lang}.zst"
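For reference, monocleaner emits each input line as "<sentence><TAB><fluency score>", which is what the awk/cut stage above relies on; a threshold of 0 (or 0.0) skips the step and the rule-based output is passed through unchanged. A minimal Python sketch of the threshold stage, under that two-column assumption (the script name in the usage comment is hypothetical):

import sys


def filter_by_fluency(lines, threshold):
    """Keep sentences whose monocleaner fluency score exceeds the threshold.

    Each line is assumed to be "<sentence>\t<score>", matching the
    awk -F'\t' '$2>THRESHOLD' | cut -f1 stage in clean-mono.sh.
    """
    for line in lines:
        # Split on the last tab so a stray tab in the sentence cannot
        # shift the score column.
        sentence, score = line.rstrip("\n").rsplit("\t", 1)
        if float(score) > threshold:
            yield sentence


if __name__ == "__main__":
    # Hypothetical usage: python3 filter_fluency.py 0.7 < scored.tsv > clean.txt
    threshold = float(sys.argv[1])
    for sentence in filter_by_fluency(sys.stdin, threshold):
        print(sentence)

Streaming line by line mirrors the shell pipeline: nothing is buffered beyond the current sentence, which matters for huge monolingual corpora.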
15 changes: 13 additions & 2 deletions taskcluster/configs/config.ci.yml
@@ -30,8 +30,19 @@ experiment:
bicleaner:
default-threshold: 0.5
dataset-thresholds:
-opus_ada83/v1: 0.0
-opus_ELRC-3075-wikipedia_health/v1: 0.6
+opus_ada83_v1: 0.0
+opus_ELRC-3075-wikipedia_health_v1: 0.6

monocleaner:
mono-src:
default-threshold: 0.5
dataset-thresholds:
news-crawl_news_2008: 0.0
opus_tldr-pages_v2023-08-29: 0.7
mono-trg:
default-threshold: 0.9
dataset-thresholds:
opus_tldr-pages_v2023-08-29: 0.6

min-fluency-threshold:
mono-src: 0.8
34 changes: 27 additions & 7 deletions taskcluster/configs/config.prod.yml
@@ -29,17 +29,37 @@ experiment:
opuscleaner-mode: "defaults"

# Bicleaner is a tool that aims at detecting noisy sentence pairs in a parallel corpus.
# Use sanitized dataset names for compatibility with Taskcluster (replace ".", "/", ":", "[", and "]" with "_")
# See: docs/bicleaner.md
bicleaner:
default-threshold: 0.5
dataset-thresholds:
-opus_CCAligned/v1: 0.7
-opus_LinguaTools-WikiTitles/v2014: 0.7
-opus_OpenSubtitles/v2018: 0.8 # Example of a higher filtering level (for noisier datasets).
-opus_ParaCrawl/v9: 0.7
-opus_WikiMatrix/v1: 0.7
-opus_bible-uedin/v1: 0.7
-# opus_ParaCrawl/v9: 0.0 # Example of disabled filtering (if the corpus is already filtered)
+opus_CCAligned_v1: 0.7
+opus_LinguaTools-WikiTitles_v2014: 0.7
+opus_OpenSubtitles_v2018: 0.8 # Example of a higher filtering level (for noisier datasets).
+opus_ParaCrawl_v9: 0.7
+opus_WikiMatrix_v1: 0.7
+opus_bible-uedin_v1: 0.7
+# opus_ParaCrawl_v9: 0.0 # Example of disabled filtering (if the corpus is already filtered)

# Monocleaner filters sentences in a monolingual corpus based on language fluency
# Use sanitized dataset names for compatibility with Taskcluster (replace ".", "/", ":", "[", and "]" with "_")
monocleaner:
mono-src:
# News-crawl is typically clean; enable on a dataset-by-dataset basis
default-threshold: 0.0
dataset-thresholds:
# Enable for HPLT; the models were recently updated
hplt_mono_v1_2: 0.8
# Filter only garbage from NLLB
opus_NLLB_v1: 0.5
mono-trg:
# News-crawl is typically clean; enable on a dataset-by-dataset basis
default-threshold: 0.0
dataset-thresholds:
# Sentences used for back-translation should be fluent
hplt_mono_v1_2: 0.9
opus_NLLB_v1: 0.9

# Limits the maximum number of sentences used in the monolingual data for both the source and target languages.
mono-max-sentences-src:
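The sanitization rule mentioned in the comments above (".", "/", ":", "[", "]" become "_") can be illustrated with a small helper. This is a hypothetical sketch for illustration, not the pipeline's actual implementation:

def sanitize_dataset_name(name: str) -> str:
    # Replace characters that Taskcluster identifiers cannot contain.
    for ch in (".", "/", ":", "[", "]"):
        name = name.replace(ch, "_")
    return name


# The old config keys map onto the new ones used in this commit:
assert sanitize_dataset_name("opus_ParaCrawl/v9") == "opus_ParaCrawl_v9"
assert sanitize_dataset_name("news-crawl_news.2008") == "news-crawl_news_2008"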
4 changes: 2 additions & 2 deletions taskcluster/kinds/bicleaner/kind.yml
@@ -33,7 +33,7 @@ tasks:
- pipeline/bicleaner/requirements/bicleaner-ai.txt
from-parameters:
bicleaner_threshold:
- - training_config.experiment.bicleaner.dataset-thresholds.{dataset}
+ - training_config.experiment.bicleaner.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.bicleaner.default-threshold

dataset-config:
@@ -52,7 +52,7 @@
task-context:
from-parameters:
bicleaner_threshold:
- - training_config.experiment.bicleaner.dataset-thresholds.{dataset}
+ - training_config.experiment.bicleaner.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.bicleaner.default-threshold
substitution-fields:
- run.command
49 changes: 48 additions & 1 deletion taskcluster/kinds/clean-mono/kind.yml
@@ -8,12 +8,15 @@ loader: taskgraph.loader.transform:loader
transforms:
- translations_taskgraph.transforms.worker_selection
- translations_taskgraph.transforms.from_datasets:mono
- taskgraph.transforms.task_context
- taskgraph.transforms.run:transforms
- translations_taskgraph.transforms.cached_tasks:transforms
- taskgraph.transforms.task:transforms

kind-dependencies:
- dataset
- fetch
- toolchain

task-defaults:
attributes:
@@ -54,7 +57,14 @@
- name
- dependencies
- fetches
- attributes.cache.from-parameters.monocleaner_threshold
- task-context.from-parameters.monocleaner_threshold
- run.command

task-context:
substitution-fields:
- run.command

worker:
docker-image: {"in-tree": "train"}
# 7 days. yes, it can take a while to clean a huge dataset
@@ -75,10 +85,26 @@ task-defaults:
command:
- bash
- -c
- - $VCS_PATH/pipeline/clean/clean-mono.sh {locale} $MOZ_FETCHES_DIR/{dataset_sanitized} $TASK_WORKDIR/artifacts/{dataset_sanitized} auto {dataset}
+ - >-
+   pip install $MOZ_FETCHES_DIR/cyhunspell-2.0.3-cp310-cp310-linux_x86_64.whl &&
+   pip install $MOZ_FETCHES_DIR/kenlm-0.0.0-cp310-cp310-linux_x86_64.whl &&
+   pip install monocleaner==1.7.0 &&
+   export PATH=$PATH:~/.local/bin &&
+   $VCS_PATH/pipeline/clean/clean-mono.sh
+   {locale}
+   $MOZ_FETCHES_DIR/{dataset_sanitized}
+   $TASK_WORKDIR/artifacts/{dataset_sanitized}
+   auto
+   {dataset}
+   {monocleaner_threshold}
dependencies:
"{provider}-{locale}": dataset-{provider}-{dataset_sanitized}-{locale}
fetches:
toolchain:
- artifact: cyhunspell
extract: false
- artifact: kenlm
extract: false
"{provider}-{locale}":
- artifact: "{dataset_sanitized}.{locale}.zst"
extract: false
@@ -88,12 +114,33 @@
description: Clean {provider} {dataset_sanitized} dataset mono-src {src_locale}
attributes:
dataset-category: mono-src
cache:
from-parameters:
monocleaner_threshold:
- training_config.experiment.monocleaner.mono-src.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.monocleaner.mono-src.default-threshold
dataset-config:
category: mono-src
task-context:
from-parameters:
monocleaner_threshold:
- training_config.experiment.monocleaner.mono-src.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.monocleaner.mono-src.default-threshold


"{provider}-{trg_locale}-{dataset_sanitized}-mono-trg":
description: Clean {provider} {dataset_sanitized} dataset mono-trg {trg_locale}
attributes:
dataset-category: mono-trg
cache:
from-parameters:
monocleaner_threshold:
- training_config.experiment.monocleaner.mono-trg.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.monocleaner.mono-trg.default-threshold
dataset-config:
category: mono-trg
task-context:
from-parameters:
monocleaner_threshold:
- training_config.experiment.monocleaner.mono-trg.dataset-thresholds.{provider}_{dataset_sanitized}
- training_config.experiment.monocleaner.mono-trg.default-threshold
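The repeated from-parameters lists above name candidate parameter paths for monocleaner_threshold, with the dataset-specific entry before the category default. A rough sketch of that resolution order (an illustration of the idea, not taskgraph's actual implementation); note that dotted paths only resolve cleanly because sanitized dataset keys contain no dots:

def resolve_from_parameters(params, paths):
    """Return the value at the first path that is defined in params."""
    for path in paths:
        node = params
        for key in path.split("."):
            if not isinstance(node, dict) or key not in node:
                node = None
                break
            node = node[key]
        if node is not None:
            return node
    raise KeyError(f"none of {paths} is set")


params = {
    "training_config": {
        "experiment": {
            "monocleaner": {
                "mono-src": {
                    "default-threshold": 0.0,
                    "dataset-thresholds": {"opus_NLLB_v1": 0.5},
                }
            }
        }
    }
}

# The dataset-specific threshold wins over the category default:
assert resolve_from_parameters(params, [
    "training_config.experiment.monocleaner.mono-src.dataset-thresholds.opus_NLLB_v1",
    "training_config.experiment.monocleaner.mono-src.default-threshold",
]) == 0.5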
20 changes: 15 additions & 5 deletions taskcluster/test/params/large-lt-en.yml
@@ -128,12 +128,22 @@ training_config:
bicleaner:
dataset-thresholds:
mtdata_Statmt-wiki_titles-1-lit-eng: 0.7
-opus_CCAligned/v1: 0.7
-opus_OpenSubtitles/v2018: 0.8
-opus_ParaCrawl/v9: 0.0
-opus_WikiMatrix/v1: 0.7
-opus_bible-uedin/v1: 0.7
+opus_CCAligned_v1: 0.7
+opus_OpenSubtitles_v2018: 0.8
+opus_ParaCrawl_v9: 0.0
+opus_WikiMatrix_v1: 0.7
+opus_bible-uedin_v1: 0.7
default-threshold: 0.5
monocleaner:
mono-src:
default-threshold: 0.5
dataset-thresholds:
news-crawl_news_2008: 0.0
mono-trg:
default-threshold: 0.9
dataset-thresholds:
news-crawl_news_2007: 0.0
opus_tldr-pages_v2023-08-29: 0.7
mono-max-sentences-src:
total: 500_000_000
per-dataset: 200_000_000
14 changes: 12 additions & 2 deletions taskcluster/test/params/small-ru-en.yml
@@ -43,9 +43,19 @@ training_config:
best-model: chrf
bicleaner:
dataset-thresholds:
-opus_ELRC-3075-wikipedia_health/v1: 0.6
-opus_ada83/v1: 0.0
+opus_ELRC-3075-wikipedia_health_v1: 0.6
+opus_ada83_v1: 0.0
default-threshold: 0.5
monocleaner:
mono-src:
default-threshold: 0.5
dataset-thresholds:
news-crawl_news_2008: 0.0
mono-trg:
default-threshold: 0.9
dataset-thresholds:
news-crawl_news_2007: 0.0
opus_tldr-pages_v2023-08-29: 0.7
min-fluency-threshold:
mono-src: 0.8
mono-trg: 0.9
69 changes: 69 additions & 0 deletions taskcluster/test/test_cleaning_params.py
@@ -0,0 +1,69 @@
from copy import deepcopy

from taskgraph.taskgraph import TaskGraph

from translations_taskgraph.parameters import get_ci_training_config

PARAMS = deepcopy(get_ci_training_config())


def test_monocleaner_params(full_task_graph: TaskGraph):
tasks = {t.label: t for t in full_task_graph.tasks.values()}

assert (
float(
tasks["clean-mono-news-crawl-ru-news_2008-mono-src"].task["payload"]["command"][-1][
-3:
]
)
== PARAMS["training_config"]["experiment"]["monocleaner"]["mono-src"][
"dataset-thresholds"
]["news-crawl_news_2008"]
)
assert (
float(
tasks["clean-mono-news-crawl-en-news_2007-mono-trg"].task["payload"]["command"][-1][
-3:
]
)
== PARAMS["training_config"]["experiment"]["monocleaner"]["mono-trg"]["default-threshold"]
)
assert (
float(
tasks["clean-mono-opus-ru-tldr-pages_v2023-08-29-mono-src"].task["payload"]["command"][
-1
][-3:]
)
== PARAMS["training_config"]["experiment"]["monocleaner"]["mono-src"][
"dataset-thresholds"
]["opus_tldr-pages_v2023-08-29"]
)
assert (
float(
tasks["clean-mono-opus-en-tldr-pages_v2023-08-29-mono-trg"].task["payload"]["command"][
-1
][-3:]
)
== PARAMS["training_config"]["experiment"]["monocleaner"]["mono-trg"][
"dataset-thresholds"
]["opus_tldr-pages_v2023-08-29"]
)


def test_bicleaner_params(full_task_graph: TaskGraph):
tasks = {t.label: t for t in full_task_graph.tasks.values()}

assert (
str(PARAMS["training_config"]["experiment"]["bicleaner"]["default-threshold"])
in tasks["bicleaner-ai-mtdata-Tilde-airbaltic-1-eng-rus-ru-en"].task["payload"]["command"][
-1
][-1][-50:]
)
assert (
str(
PARAMS["training_config"]["experiment"]["bicleaner"]["dataset-thresholds"][
"opus_ada83_v1"
]
)
in tasks["bicleaner-ai-opus-ada83_v1-ru-en"].task["payload"]["command"][-1][-1][-50:]
)
4 changes: 2 additions & 2 deletions taskcluster/test/test_training_continuation_teacher.py
@@ -2,9 +2,9 @@

from taskgraph.taskgraph import TaskGraph

-from translations_taskgraph.parameters import get_defaults
+from translations_taskgraph.parameters import get_ci_training_config

-PARAMS = deepcopy(get_defaults(None))
+PARAMS = deepcopy(get_ci_training_config(None))
PARAMS["target_tasks_method"] = "train-target-tasks"
PARAMS["training_config"]["experiment"]["pretrained-models"] = {
"train-teacher": {