-
Notifications
You must be signed in to change notification settings - Fork 2
/
report_check_sums.py
130 lines (101 loc) · 3.77 KB
/
report_check_sums.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
import hashlib
import math
from pathlib import Path
from typing import Union
def file_size_formatter(i: int, binary: bool = True, precision: int = 1) -> str:
    """Format a byte count as a human-readable string for prettier printing.

    Parameters
    ----------
    i : int
        Size in bytes. Must not be negative.
    binary : bool
        If True (default), scale by powers of 1024 and insert an "i" into the
        unit (kiB, MiB, ...); otherwise scale by powers of 1000 (kB, MB, ...).
    precision : int
        Number of digits printed after the decimal point.

    Returns
    -------
    str
        The scaled value followed by its unit, e.g. ``"1.5 kiB"``.
        A size of zero is rendered as ``"0 B"``.

    Raises
    ------
    ValueError
        If `i` is negative.

    Notes
    -----
    Adapted from https://github.com/Ouranosinc/miranda/blob/main/miranda/storage.py
    """
    # "{}" is the slot for the binary "i" infix; plain bytes never take one.
    units = ["B", "k{}B", "M{}B", "G{}B", "T{}B", "P{}B", "E{}B", "Z{}B", "Y{}B"]

    if i == 0:
        return "0 B"
    if i < 0:
        raise ValueError(f"File size must not be negative, got {i}.")

    base = 1024 if binary else 1000

    # Find the largest unit not exceeding the size using exact integer
    # comparisons (a float log ratio can land just below a unit boundary),
    # and stop at the last known unit instead of indexing past the table.
    multiple = 0
    threshold = base
    while i >= threshold and multiple < len(units) - 1:
        threshold *= base
        multiple += 1

    value = i / base**multiple
    suffix = units[multiple].format("i" if binary else "")
    return f"{value:.{precision}f} {suffix}"
def file_sha256_checksum(filename: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the sha256 checksum of a file as a hexadecimal string.

    Parameters
    ----------
    filename : Path
        The file to hash.
    chunk_size : int
        Number of bytes read per iteration. The file is hashed piecewise so
        that large data files are never held in memory all at once.

    Returns
    -------
    str
        The hexadecimal sha256 digest of the file's contents.
    """
    hash_sha256 = hashlib.sha256()
    with filename.open("rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()
def valid(path: Path) -> bool:
    """Return True if path should be considered for the creation of sha256 checksum.

    Parameters
    ----------
    path : Path
        The path to the file.

    Returns
    -------
    bool
        False for single-component paths, paths with any component starting
        with ".", ``registry.txt``, ``.py`` files, and anything that is not an
        existing regular file; True otherwise.
    """
    # Exclude top-level files
    if len(path.parts) == 1:
        return False

    # Exclude hidden files (and anything inside a hidden directory)
    if any(part.startswith(".") for part in path.parts):
        return False

    # Exclude the registry
    if path.name == "registry.txt":
        return False

    # Exclude Python scripts
    if path.suffix == ".py":
        return False

    # Always answer with a real bool: directories and missing paths are
    # rejected explicitly instead of falling through and returning None.
    return path.is_file()
def _collect_checksums(data_folder: Path) -> dict:
    """Map every valid file under `data_folder` to its sha256 digest, sorted by path."""
    candidates = sorted(path for path in data_folder.rglob("*") if valid(path))
    return {path: file_sha256_checksum(path) for path in candidates}


def _render_checksum_table(file_checksums: dict, data_folder: Path) -> list:
    """Build the lines of the Markdown "### Files" section, one table row per file."""
    table = [
        "\n",
        "### Files\n",
        "\n",
        "| File | Size | Checksum |\n",
        "| ---- | ---- | -------- |\n",
    ]
    table.extend(
        f"| {file.relative_to(data_folder).as_posix()} "
        f"| {file_size_formatter(file.stat().st_size)} "
        f"| sha256:{checksum} |\n"
        for file, checksum in file_checksums.items()
    )
    return table


def _replace_checksum_table(lines: list, table: list) -> list:
    """Return a copy of the README `lines` with the old "### Files" section replaced by `table`."""
    lines = list(lines)

    # Remove existing checksum table: everything from the last "### Files"
    # heading to the end of the file is dropped.
    for index in range(len(lines) - 1, -1, -1):
        if lines[index].startswith("### Files"):
            del lines[index:]
            break

    # Locate the heading the table goes under. Compare against None so that a
    # heading on the very first line is accepted and a missing heading is
    # always reported.
    anchor = next(
        (
            index
            for index, line in enumerate(lines)
            if line.startswith("## Available datasets")
        ),
        None,
    )
    if anchor is None:
        raise ValueError("Could not find '## Available datasets' in README.md")

    # Guard against a heading that is the unterminated last line of the file.
    if not lines[anchor].endswith("\n"):
        lines[anchor] += "\n"

    # Insert the whole table at once so the rows keep their sorted order.
    lines[anchor + 1 : anchor + 1] = table

    # Remove trailing newline
    if lines[-1].startswith("\n"):
        del lines[-1]
    return lines


def main(dry_run: bool = False, readme: Union[str, Path] = "README.md") -> None:
    """Create checksum files.

    Computes the sha256 checksum of every valid file under ``./data``, rewrites
    the "### Files" table below the "## Available datasets" heading of the
    README, and regenerates ``data/registry.txt``.

    Parameters
    ----------
    dry_run : bool
        If True, compute the checksums and build the new README content, but
        do not write to the README or the registry.
    readme : str or Path
        Path to the README file holding the checksum table.

    Raises
    ------
    ValueError
        If the README has no line starting with "## Available datasets".
    """
    data_folder = Path(".").joinpath("data")
    file_checksums = _collect_checksums(data_folder)

    # Read with the same encoding that is used for writing below.
    readme = Path(readme)
    with readme.open(encoding="utf-8") as f:
        lines = f.readlines()

    lines = _replace_checksum_table(
        lines, _render_checksum_table(file_checksums, data_folder)
    )

    registry = data_folder / "registry.txt"

    if dry_run:
        print(
            f"Dry run: computed {len(file_checksums)} checksums; "
            f"{readme} and {registry} were left unchanged."
        )
        return

    with readme.open("w", encoding="utf-8") as r:
        r.writelines(lines)
    print(f"Successfully wrote {len(file_checksums)} checksums to {readme}.")

    # Update the data registry file
    with registry.open("w", encoding="utf-8") as out:
        out.writelines(
            f"{file.relative_to(data_folder).as_posix()} sha256:{checksum}\n"
            for file, checksum in file_checksums.items()
        )
    print(f"Successfully wrote {len(file_checksums)} checksums to {registry}.")
# Run the checksum update with default arguments when executed as a script.
if __name__ == "__main__":
    main()