fixup! Reencode all XML files

openzim · May 7, 2024 · 9513e57 · 9513e57
1 parent 314314a
commit 9513e57
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 10 deletions.
diff --git a/src/sotoki/archives.py b/src/sotoki/archives.py
@@ -90,14 +90,13 @@ def _run(url, fpath):
 
             # remove other files from ark that we won't need
             for fp in self.build_dir.iterdir():
-                if fp.suffix == ".xml":
-                    if fp.stem not in self.dump_parts:
-                        fp.unlink()
-                    else:
-                        reencode_file(fp)
-                else:
+                if fp.suffix != ".xml" or fp.stem not in self.dump_parts:
                     fp.unlink()
 
+            # reencode xml files
+            for fp in self.build_dir.iterdir():
+                reencode_file(fp)
+
         futures = {}
         executor = cf.ThreadPoolExecutor(max_workers=len(self.archives))
 

diff --git a/src/sotoki/utils/preparation.py b/src/sotoki/utils/preparation.py
@@ -26,15 +26,16 @@ def reencode_file(src: pathlib.Path):
 
     This is based on a streaming on-the-fly reencoding of file chunks to limit memory pressure.
 
-    During few instants, there will be two versions of the same content on the filesystem, one
-    in previous encoding and one in target encoding, filesystem needs enough space for that.
+    Content is read line-by-line to ensure it is not split in the middle of a grapheme cluster.
+
+    During reencoding, two versions of the same content exist on the filesystem, one in the
+    previous encoding and one in the target encoding; the filesystem needs enough space for both.
     """
-    BLOCKSIZE = 1048576
     tmp = src.with_suffix(src.suffix + ".tmp")
     with open(src, "r", encoding=UTF16LE) as sourceFile:
         with open(tmp, "w", encoding=UTF8) as targetFile:
             while True:
-                contents = sourceFile.read(BLOCKSIZE)
+                contents = sourceFile.readline()
                 if not contents:
                     break
                 targetFile.write(contents)