-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: diy-1-preprocess-words.py
44 lines (37 loc) · 1.64 KB
/
diy-1-preprocess-words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
import re
# Input: folder of CLIP 'opinions' .txt files produced by gradient ascent.
# Subfolders are searched as well.
words_folder = "texts"

# Output: a deduplicated, sorted word list is written here.
output_folder = "words_processing"
os.makedirs(output_folder, exist_ok=True)  # create the output dir up front
raw_words_file = os.path.join(output_folder, "raw_words.txt")
# Helper to clean words.
# The pattern matches either an ANSI escape sequence (ESC, an intermediate
# byte, then parameter/final bytes) or any run of characters outside the
# printable ASCII range 0x20-0x7E. Compiled once at module level so the
# per-word calls in the processing loop below don't repeat the
# compile/cache lookup on every invocation.
_STRIP_PATTERN = re.compile(r'(?:\x1B[@-_][0-?]*[ -/]*[@-~])|[^\x20-\x7E]+')

def clean_word(word):
    """Return *word* with ANSI escape codes and non-printable-ASCII runs
    removed, then stripped of surrounding whitespace.

    NOTE: this also deletes all non-ASCII characters (e.g. accented
    letters), so the resulting word list is pure printable ASCII.
    """
    return _STRIP_PATTERN.sub('', word).strip()
# Walk every .txt file under words_folder and collect its words.
all_words = set()       # set -> automatic deduplication
total_words_count = 0   # running count including duplicates
for dirpath, _dirs, filenames in os.walk(words_folder):
    for filename in filenames:
        if not filename.endswith(".txt"):
            continue  # skip anything that isn't a text dump
        path = os.path.join(dirpath, filename)
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            for raw_line in handle:
                # Clean each whitespace-separated token, dropping any
                # that become empty after cleaning.
                kept = [w for w in (clean_word(tok) for tok in raw_line.split()) if w]
                total_words_count += len(kept)
                all_words.update(kept)
# Persist the deduplicated word list, one word per line, sorted.
sorted_words = sorted(all_words)
with open(raw_words_file, "w", encoding="utf-8") as out:
    out.write("\n".join(sorted_words))

# Report what was done.
unique_words_count = len(sorted_words)
print(f"Saved words to {raw_words_file}")
print(f"Total words processed: {total_words_count}")
print(f"Unique words found: {unique_words_count}")