Skip to content

Commit

Permalink
Adjust data cleaning for CJK (#900)
Browse files (browse the repository at this point in the history)
* Add custom cleaning configs for CJK

* Make custom filters direction specific

* Update tests

* Fix mono cleaning for CJK

* Update Taskcluster kind

* Update config generator to use custom cleaning configs for CJK
  • Loading branch information
eu9ene authored Nov 6, 2024
1 parent 57b1d2c commit 5c3f8cc
Show file tree
Hide file tree
Showing 11 changed files with 513 additions and 7 deletions.
77 changes: 77 additions & 0 deletions pipeline/clean/opuscleaner/configs/en-ja/default.filters.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"version": 1,
"files": [
],
"filters": [
{
"filter": "remove_empty_lines",
"parameters": {},
"language": null
},
{
"filter": "normalize_whitespace",
"parameters": {
"COLLAPSE": true
},
"language": "<src>"
},
{
"filter": "deescape-special-chars",
"parameters": {
"LANG1": "other"
},
"language": "<src>"
},
{
"filter": "max_length",
"parameters": {
"MAXLENGTH": 150,
"MINLENGTH": 1
},
"language": null
},
{
"filter": "fix_wiki",
"parameters": {
"ALWAYS": false,
"FOOTNOTES": true,
"URLS": true,
"WIKILINKS": true,
"CODE": true,
"HEADINGS": true,
"REMOVEEMPTYLINES": true
},
"language": null
},
{
"filter": "alpha_ratio",
"parameters": {
"LANG1": "<src>",
"LANG2": "<trg>",
"SRCWORDRAT": 0.4,
"TRGWORDRAT": 0.0,
"SRCALPHARAT": 0.5,
"TRGALPHARAT": 0.0,
"DEBUG": false
},
"language": null
},
{
"filter": "num_mismatch",
"parameters": {
"RATIO": 1,
"DEBUG": false
},
"language": null
},
{
"filter": "fasttext_filter",
"parameters": {
"FASTTEXT_MODEL_TYPE": "large",
"LANG1": "<src>",
"LANG2": "<trg>"
},
"language": null
}
]
}
77 changes: 77 additions & 0 deletions pipeline/clean/opuscleaner/configs/en-ko/default.filters.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"version": 1,
"files": [
],
"filters": [
{
"filter": "remove_empty_lines",
"parameters": {},
"language": null
},
{
"filter": "normalize_whitespace",
"parameters": {
"COLLAPSE": true
},
"language": "<src>"
},
{
"filter": "deescape-special-chars",
"parameters": {
"LANG1": "other"
},
"language": "<src>"
},
{
"filter": "max_length",
"parameters": {
"MAXLENGTH": 150,
"MINLENGTH": 1
},
"language": null
},
{
"filter": "fix_wiki",
"parameters": {
"ALWAYS": false,
"FOOTNOTES": true,
"URLS": true,
"WIKILINKS": true,
"CODE": true,
"HEADINGS": true,
"REMOVEEMPTYLINES": true
},
"language": null
},
{
"filter": "alpha_ratio",
"parameters": {
"LANG1": "<src>",
"LANG2": "<trg>",
"SRCWORDRAT": 0.4,
"TRGWORDRAT": 0.0,
"SRCALPHARAT": 0.5,
"TRGALPHARAT": 0.0,
"DEBUG": false
},
"language": null
},
{
"filter": "num_mismatch",
"parameters": {
"RATIO": 1,
"DEBUG": false
},
"language": null
},
{
"filter": "fasttext_filter",
"parameters": {
"FASTTEXT_MODEL_TYPE": "large",
"LANG1": "<src>",
"LANG2": "<trg>"
},
"language": null
}
]
}
77 changes: 77 additions & 0 deletions pipeline/clean/opuscleaner/configs/en-zh/default.filters.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"version": 1,
"files": [
],
"filters": [
{
"filter": "remove_empty_lines",
"parameters": {},
"language": null
},
{
"filter": "normalize_whitespace",
"parameters": {
"COLLAPSE": true
},
"language": "<src>"
},
{
"filter": "deescape-special-chars",
"parameters": {
"LANG1": "other"
},
"language": "<src>"
},
{
"filter": "max_length",
"parameters": {
"MAXLENGTH": 150,
"MINLENGTH": 1
},
"language": null
},
{
"filter": "fix_wiki",
"parameters": {
"ALWAYS": false,
"FOOTNOTES": true,
"URLS": true,
"WIKILINKS": true,
"CODE": true,
"HEADINGS": true,
"REMOVEEMPTYLINES": true
},
"language": null
},
{
"filter": "alpha_ratio",
"parameters": {
"LANG1": "<src>",
"LANG2": "<trg>",
"SRCWORDRAT": 0.4,
"TRGWORDRAT": 0.0,
"SRCALPHARAT": 0.5,
"TRGALPHARAT": 0.0,
"DEBUG": false
},
"language": null
},
{
"filter": "num_mismatch",
"parameters": {
"RATIO": 1,
"DEBUG": false
},
"language": null
},
{
"filter": "fasttext_filter",
"parameters": {
"FASTTEXT_MODEL_TYPE": "large",
"LANG1": "<src>",
"LANG2": "<trg>"
},
"language": null
}
]
}
77 changes: 77 additions & 0 deletions pipeline/clean/opuscleaner/configs/ja-en/default.filters.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"version": 1,
"files": [
],
"filters": [
{
"filter": "remove_empty_lines",
"parameters": {},
"language": null
},
{
"filter": "normalize_whitespace",
"parameters": {
"COLLAPSE": true
},
"language": "<trg>"
},
{
"filter": "deescape-special-chars",
"parameters": {
"LANG1": "other"
},
"language": "<trg>"
},
{
"filter": "max_length",
"parameters": {
"MAXLENGTH": 150,
"MINLENGTH": 1
},
"language": null
},
{
"filter": "fix_wiki",
"parameters": {
"ALWAYS": false,
"FOOTNOTES": true,
"URLS": true,
"WIKILINKS": true,
"CODE": true,
"HEADINGS": true,
"REMOVEEMPTYLINES": true
},
"language": null
},
{
"filter": "alpha_ratio",
"parameters": {
"LANG1": "<src>",
"LANG2": "<trg>",
"SRCWORDRAT": 0.0,
"TRGWORDRAT": 0.4,
"SRCALPHARAT": 0.0,
"TRGALPHARAT": 0.5,
"DEBUG": false
},
"language": null
},
{
"filter": "num_mismatch",
"parameters": {
"RATIO": 1,
"DEBUG": false
},
"language": null
},
{
"filter": "fasttext_filter",
"parameters": {
"FASTTEXT_MODEL_TYPE": "large",
"LANG1": "<src>",
"LANG2": "<trg>"
},
"language": null
}
]
}
77 changes: 77 additions & 0 deletions pipeline/clean/opuscleaner/configs/ko-en/default.filters.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"version": 1,
"files": [
],
"filters": [
{
"filter": "remove_empty_lines",
"parameters": {},
"language": null
},
{
"filter": "normalize_whitespace",
"parameters": {
"COLLAPSE": true
},
"language": "<trg>"
},
{
"filter": "deescape-special-chars",
"parameters": {
"LANG1": "other"
},
"language": "<trg>"
},
{
"filter": "max_length",
"parameters": {
"MAXLENGTH": 150,
"MINLENGTH": 1
},
"language": null
},
{
"filter": "fix_wiki",
"parameters": {
"ALWAYS": false,
"FOOTNOTES": true,
"URLS": true,
"WIKILINKS": true,
"CODE": true,
"HEADINGS": true,
"REMOVEEMPTYLINES": true
},
"language": null
},
{
"filter": "alpha_ratio",
"parameters": {
"LANG1": "<src>",
"LANG2": "<trg>",
"SRCWORDRAT": 0.0,
"TRGWORDRAT": 0.4,
"SRCALPHARAT": 0.0,
"TRGALPHARAT": 0.5,
"DEBUG": false
},
"language": null
},
{
"filter": "num_mismatch",
"parameters": {
"RATIO": 1,
"DEBUG": false
},
"language": null
},
{
"filter": "fasttext_filter",
"parameters": {
"FASTTEXT_MODEL_TYPE": "large",
"LANG1": "<src>",
"LANG2": "<trg>"
},
"language": null
}
]
}
Loading

0 comments on commit 5c3f8cc

Please sign in to comment.