Skip to content

Commit

Permalink
fixup! Reencode all XML files
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed May 7, 2024
1 parent 314314a commit 9513e57
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 10 deletions.
11 changes: 5 additions & 6 deletions src/sotoki/archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,13 @@ def _run(url, fpath):

# remove other files from ark that we won't need
for fp in self.build_dir.iterdir():
if fp.suffix == ".xml":
if fp.stem not in self.dump_parts:
fp.unlink()
else:
reencode_file(fp)
else:
if fp.suffix != ".xml" or fp.stem not in self.dump_parts:
fp.unlink()

# reencode xml files
for fp in self.build_dir.iterdir():
reencode_file(fp)

futures = {}
executor = cf.ThreadPoolExecutor(max_workers=len(self.archives))

Expand Down
9 changes: 5 additions & 4 deletions src/sotoki/utils/preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,16 @@ def reencode_file(src: pathlib.Path):
This is based on a streaming on-the-fly reencoding of file chunks to limit memory pressure.
For a few instants, there will be two versions of the same content on the filesystem, one
in the previous encoding and one in the target encoding; the filesystem needs enough space for both.
Content is read line-by-line to ensure it is not split in the middle of a grapheme cluster.
During reencoding, there will be two versions of the same content on the filesystem, one in the
previous encoding and one in the target encoding; the filesystem needs enough space for both.
"""
BLOCKSIZE = 1048576
tmp = src.with_suffix(src.suffix + ".tmp")
with open(src, "r", encoding=UTF16LE) as sourceFile:
with open(tmp, "w", encoding=UTF8) as targetFile:
while True:
contents = sourceFile.read(BLOCKSIZE)
contents = sourceFile.readline()
if not contents:
break
targetFile.write(contents)
Expand Down

0 comments on commit 9513e57

Please sign in to comment.