-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add monocleaner * Install packages * Add more filtering stats * Clean temporary files * Configure monocleaner thresholds * Fix dataset configuration * Fix test config * Update test configs * Fix test * Add tests for cleaning parameters * Move packages required for PyICU to the base image * Revert "Move packages required for PyICU to the base image" This reverts commit cba9c04. * Roll back unrelated config changes
- Loading branch information
Showing
13 changed files
with
298 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
from copy import deepcopy | ||
|
||
from taskgraph.taskgraph import TaskGraph | ||
|
||
from translations_taskgraph.parameters import get_ci_training_config | ||
|
||
# Independent deep copy of the CI training config; the tests in this module
# read their expected cleaning thresholds from it.
PARAMS = deepcopy(get_ci_training_config())
|
||
|
||
def test_monocleaner_params(full_task_graph: TaskGraph):
    """Check the monocleaner thresholds passed to the clean-mono tasks.

    For every checked task, the trailing three characters of the last
    command element are parsed as a float and compared with the matching
    entry under ``training_config -> experiment -> monocleaner`` in PARAMS.
    """
    by_label = {}
    for task in full_task_graph.tasks.values():
        by_label[task.label] = task

    # (task label, key path below the "monocleaner" config section)
    cases = (
        (
            "clean-mono-news-crawl-ru-news_2008-mono-src",
            ("mono-src", "dataset-thresholds", "news-crawl_news_2008"),
        ),
        (
            "clean-mono-news-crawl-en-news_2007-mono-trg",
            ("mono-trg", "default-threshold"),
        ),
        (
            "clean-mono-opus-ru-tldr-pages_v2023-08-29-mono-src",
            ("mono-src", "dataset-thresholds", "opus_tldr-pages_v2023-08-29"),
        ),
        (
            "clean-mono-opus-en-tldr-pages_v2023-08-29-mono-trg",
            ("mono-trg", "dataset-thresholds", "opus_tldr-pages_v2023-08-29"),
        ),
    )

    for label, key_path in cases:
        # Fixed-width slice: only a three-character suffix (e.g. "0.5") is read.
        last_arg = by_label[label].task["payload"]["command"][-1]
        observed = float(last_arg[-3:])

        expected = PARAMS["training_config"]["experiment"]["monocleaner"]
        for key in key_path:
            expected = expected[key]

        assert observed == expected
|
||
|
||
def test_bicleaner_params(full_task_graph: TaskGraph):
    """Check the bicleaner thresholds passed to the bicleaner-ai tasks.

    For every checked task, the configured threshold (converted with
    ``str``) must occur in the trailing 50-element slice of the last item of
    the last command element -- presumably the tail of a shell command
    string; confirm against the task definition.
    """
    by_label = {}
    for task in full_task_graph.tasks.values():
        by_label[task.label] = task

    # (task label, key path below the "bicleaner" config section)
    cases = (
        (
            "bicleaner-ai-mtdata-Tilde-airbaltic-1-eng-rus-ru-en",
            ("default-threshold",),
        ),
        (
            "bicleaner-ai-opus-ada83_v1-ru-en",
            ("dataset-thresholds", "opus_ada83_v1"),
        ),
    )

    for label, key_path in cases:
        threshold = PARAMS["training_config"]["experiment"]["bicleaner"]
        for key in key_path:
            threshold = threshold[key]
        needle = str(threshold)

        command_tail = by_label[label].task["payload"]["command"][-1][-1][-50:]

        assert needle in command_tail
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.