diff --git a/src/sotoki/archives.py b/src/sotoki/archives.py
index 233ab17..b5c02f9 100644
--- a/src/sotoki/archives.py
+++ b/src/sotoki/archives.py
@@ -90,14 +90,13 @@ def _run(url, fpath):
 
         # remove other files from ark that we won't need
         for fp in self.build_dir.iterdir():
-            if fp.suffix == ".xml":
-                if fp.stem not in self.dump_parts:
-                    fp.unlink()
-                else:
-                    reencode_file(fp)
-            else:
+            if fp.suffix != ".xml" or fp.stem not in self.dump_parts:
                 fp.unlink()
 
+        # reencode xml files
+        for fp in self.build_dir.iterdir():
+            reencode_file(fp)
+
         futures = {}
         executor = cf.ThreadPoolExecutor(max_workers=len(self.archives))
 
diff --git a/src/sotoki/utils/preparation.py b/src/sotoki/utils/preparation.py
index bd61692..351bef7 100644
--- a/src/sotoki/utils/preparation.py
+++ b/src/sotoki/utils/preparation.py
@@ -26,15 +26,16 @@ def reencode_file(src: pathlib.Path):
 
     This is based on a streaming on-the-fly reencoding of file chunks to limit memory pressure.
 
-    During few instants, there will be two versions of the same content on the filesystem, one
-    in previous encoding and one in target encoding, filesystem needs enough space for that.
+    Content is read line-by-line to ensure it is not split in the middle of a grapheme cluster.
+
+    During reencoding, there will be two versions of the same content on the filesystem, one in
+    previous encoding and one in target encoding, filesystem needs enough space for that.
     """
-    BLOCKSIZE = 1048576
     tmp = src.with_suffix(src.suffix + ".tmp")
     with open(src, "r", encoding=UTF16LE) as sourceFile:
         with open(tmp, "w", encoding=UTF8) as targetFile:
             while True:
-                contents = sourceFile.read(BLOCKSIZE)
+                contents = sourceFile.readline()
                 if not contents:
                     break
                 targetFile.write(contents)
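
For reviewers, here is the post-patch cleanup logic from `archives.py` as a standalone sketch. The two-pass shape is the point: the first `iterdir()` pass deletes everything that is not a wanted `.xml` dump part, so the second pass can unconditionally reencode whatever survived. The wrapper name `prune_and_reencode` and the direct import of `reencode_file` are illustrative, not part of the patch.

```python
import pathlib

from sotoki.utils.preparation import reencode_file  # defined in the second hunk


def prune_and_reencode(build_dir: pathlib.Path, dump_parts: set) -> None:
    """Two-pass cleanup: delete unwanted files first, then reencode the rest."""
    # pass 1: remove anything that is not a needed .xml dump part
    for fp in build_dir.iterdir():
        if fp.suffix != ".xml" or fp.stem not in dump_parts:
            fp.unlink()
    # pass 2: only needed .xml files remain, so every entry gets reencoded
    for fp in build_dir.iterdir():
        reencode_file(fp)
```

The set of files that ends up reencoded is the same as before; splitting deletion from reencoding just removes the nested if/else, at the cost of a second directory walk, which is cheap next to the reencoding itself.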
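And here is the patched `reencode_file` assembled into a self-contained sketch. `UTF16LE` and `UTF8` are module-level constants in `preparation.py` whose exact values are assumed below, and the final `tmp.replace(src)` swap falls outside the hunk, so it is included only to make the sketch run end to end.

```python
import pathlib

UTF16LE = "utf-16-le"  # assumed values for the module constants
UTF8 = "utf-8"


def reencode_file(src: pathlib.Path) -> None:
    """Stream-reencode a utf-16-le file to utf-8, one line at a time."""
    tmp = src.with_suffix(src.suffix + ".tmp")
    with open(src, "r", encoding=UTF16LE) as sourceFile:
        with open(tmp, "w", encoding=UTF8) as targetFile:
            while True:
                contents = sourceFile.readline()
                if not contents:
                    break
                targetFile.write(contents)
    # swap the reencoded copy into place; not part of the diff hunk,
    # included only so the sketch is complete
    tmp.replace(src)
```

One trade-off worth noting: with `readline()` the memory bound is the longest line rather than the old fixed 1 MiB block. That suits the Stack Exchange dumps, which keep each `<row />` element on its own line, but a pathological single-line file would be read whole.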