"""
Original CLIP Interrogator file:
https://colab.research.google.com/github/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator.ipynb
# CLIP Interrogator 2.4 by [@pharmapsychotic](https://twitter.com/pharmapsychotic)
https://github.com/pharmapsychotic/clip-interrogator
Refactored to use HuggingFace transformers instead of open_clip by zer0int
https://github.com/zer0int
"""
import os
import subprocess
import contextlib
import io
import sys
import re
from colorama import Fore, Style
from functools import wraps
import argparse
import csv
import requests
import time
import hashlib
import math
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from typing import List, Optional
from dataclasses import dataclass
from safetensors.numpy import load_file, save_file
from transformers import AutoProcessor, AutoConfig, AutoModelForCausalLM, BlipForConditionalGeneration, Blip2ForConditionalGeneration
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer
from transformers import CLIPVisionModel, CLIPVisionConfig
from transformers.utils import logging
logging.set_verbosity_error()
"""
------------- TO-DO --------------
- Check flaves correctly working with normal interrogate (not just fast)
- Make quiet so that it still prints the prompt AND the current image
- Ensure custom HF can be loaded (for CLIP; check caption models?)
- Check num_features and how that may be used?!
"""
def validate_clip_model(model_name):
    predefined_models = [
        'openai/clip-vit-large-patch14',    # Original CLIP
        'zer0int/CLIP-GmP-ViT-L-14',        # My GmP finetune
        'zer0int/CLIP-SAE-ViT-L-14',        # My SAE finetune
        'zer0int/LongCLIP-L-Diffusers',     # Original LongCLIP
        'zer0int/LongCLIP-GmP-ViT-L-14',    # My GmP-LongCLIP
        'zer0int/LongCLIP-SAE-ViT-L-14',    # My SAE-LongCLIP
        # Just use this argument to load a different CLIP from HuggingFace: --m_clip SomeDev/Some-CLIP-model
    ]
if model_name in predefined_models:
return model_name
else:
# Attempt to check if the user-defined model is accessible
try:
AutoConfig.from_pretrained(model_name)
return model_name # Valid custom model
except Exception as e:
if "404" in str(e):
print(f"Error: Model '{model_name}' not found.")
elif "gated" in str(e).lower():
print(f"Error: Model '{model_name}' is gated and requires access permissions.")
else:
print(f"{Fore.RED + Style.BRIGHT}Error: Unable to load model '{model_name}' due to: {e}{Style.RESET_ALL}")
print(f"{Fore.YELLOW + Style.BRIGHT}Falling back to default model: 'openai/clip-vit-large-patch14'{Style.RESET_ALL}")
return 'openai/clip-vit-large-patch14'
def parse_arguments():
parser = argparse.ArgumentParser(description='CLIP interrogator 2025')
parser.add_argument('--m_caption', choices=['blip-base', 'blip-large', 'blip2-2.7b', 'blip2-flan-t5-xl', 'git-large-coco'], default='blip-large', type=str, help="Caption Model to use")
parser.add_argument('--m_clip', type=validate_clip_model, default='openai/clip-vit-large-patch14', help="Specify the HuggingFace CLIP model to use. For example: --m_clip zer0int/CLIP-GmP-ViT-L-14")
parser.add_argument('--mode', choices=['best', 'classic', 'fast', 'negative'], default='best', type=str, help="Mode to use for Captioning")
parser.add_argument('--ownwords', action='store_true', help="Use ownwords.txt (put your own file in 'data/ownwords.txt' first!)")
parser.add_argument('--output', choices=['rename', 'csv', 'txt', 'both'], default='csv', type=str, help="Rename image filenames, save captions as .csv, as individual .txt files, or as both .csv + .txt")
parser.add_argument('--outfile', type=str, default='all', help="Filename for .csv; defaults to 'all' -> 'all.csv'")
parser.add_argument('--maxfilename', default=48, type=int, help="Maximum caption / filename length (default: 48), only applies if used with: --output rename")
parser.add_argument('--maxcaption', default=32, type=int, help="Maximum BLIP caption length, default: 32")
parser.add_argument('--chunk_size', default=2048, type=int, help="Batch size, default: 2048; reduce to e.g. 1024 or 512 to use less VRAM")
parser.add_argument('--max_flavors', default=64, type=int, help="Maximum flavors in the Flavor Chain; default: 64")
parser.add_argument('--image_folder', type=str, default='images', help="Defaults to 'images'; expects: /path/to/image/folder")
parser.add_argument('--quiet', action='store_true', help="Run quietly, without verbose output")
parser.add_argument('--ranklooker', action='store_true', help="Watch the ranking process, live! Can slow down the process a bit; only use if you're actually looking. =)")
parser.add_argument('--debug', action='store_true', help="Spams everything with tensors & current stats. Not useful - except for DEBUGGING.")
return parser.parse_args()
args = parse_arguments()
maxcaption = args.maxcaption
caption_model_name = args.m_caption
clip_model_name = args.m_clip
max_flavors = args.max_flavors
default_max_tokens = 77
max_tokens = default_max_tokens
dumptokensconfig = AutoConfig.from_pretrained(clip_model_name)
if hasattr(dumptokensconfig, "text_config") and hasattr(dumptokensconfig.text_config, "max_position_embeddings"):
max_tokens = dumptokensconfig.text_config.max_position_embeddings
else:
max_tokens = default_max_tokens
del dumptokensconfig
print(f"{Fore.MAGENTA + Style.BRIGHT}\nSet {clip_model_name} with max_tokens: {max_tokens}{Style.RESET_ALL}")
CAPTION_MODELS = {
'blip-base': 'Salesforce/blip-image-captioning-base', # 990MB
'blip-large': 'Salesforce/blip-image-captioning-large', # 1.9GB
'blip2-2.7b': 'Salesforce/blip2-opt-2.7b', # 15.5GB
'blip2-flan-t5-xl': 'Salesforce/blip2-flan-t5-xl', # 15.77GB
'git-large-coco': 'microsoft/git-large-coco', # 1.58GB
}
@dataclass
class Config:
# models can optionally be passed in directly
caption_model = None
caption_processor = None
clip_model = None
clip_preprocess = None
# blip settings
caption_max_length: int = maxcaption
caption_model_name: Optional[str] = caption_model_name
caption_offload: bool = False
# clip settings
clip_model_name: str = clip_model_name
clip_model_path: Optional[str] = None
clip_offload: bool = False
# interrogator settings
cache_path: str = 'cache' # path to store cached text embeddings
chunk_size: int = args.chunk_size # batch size for CLIP, use smaller for lower VRAM
data_path: str = os.path.join(os.path.dirname(__file__), 'data')
device: str = ("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
flavor_intermediate_count: int = 2048
quiet: bool = args.quiet # when quiet, progress bars are not shown
def create_tokenizer(clip_model_name: str, max_length: int = max_tokens):
"""
Creates a tokenizer function that replicates open_clip's behavior using the Hugging Face CLIPTokenizer.
"""
tokenizer = CLIPTokenizer.from_pretrained(clip_model_name)
tokenizer.pad_token = "0" # Ensure padding token is set to 0 (instead of eos_token)
def tokenize_text(text_array):
if isinstance(text_array, str):
text_array = [text_array] # Convert single string to a list, if needed
# Tokenize and pad/truncate
tokenized = tokenizer(
text_array,
padding="max_length",
truncation=True,
max_length=max_length,
return_tensors="pt"
)
# Ensure that the padding token is explicitly set to 0
input_ids = tokenized["input_ids"]
input_ids[input_ids == tokenizer.pad_token_id] = 0
# Convert to padded tensor format matching open_clip
padded_tensor = torch.zeros((input_ids.size(0), max_length), dtype=torch.long)
padded_tensor[:, :input_ids.size(1)] = input_ids
return padded_tensor
return tokenize_text
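# Minimal usage sketch for the tokenizer closure above (illustrative, not executed anywhere in this script):
#   tokenize = create_tokenizer('openai/clip-vit-large-patch14', max_length=77)
#   ids = tokenize("a photo of a cat")     # torch.LongTensor of shape (1, 77), zero-padded
#   ids = tokenize(["a cat", "a dog"])     # shape (2, 77)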
class Interrogator():
def __init__(self, config: Config):
self.config = config
self.device = config.device
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
self.caption_offloaded = True
self.clip_offloaded = True
self.load_caption_model()
self.load_clip_model()
def load_caption_model(self):
if self.config.caption_model is None and self.config.caption_model_name:
if not self.config.quiet:
print(f"{Fore.GREEN + Style.BRIGHT}Loading caption model {self.config.caption_model_name}...{Style.RESET_ALL}")
model_path = CAPTION_MODELS[self.config.caption_model_name]
if self.config.caption_model_name.startswith('git-'):
caption_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)
elif self.config.caption_model_name.startswith('blip2-'):
caption_model = Blip2ForConditionalGeneration.from_pretrained(model_path, torch_dtype=self.dtype)
else:
caption_model = BlipForConditionalGeneration.from_pretrained(model_path, torch_dtype=self.dtype)
self.caption_processor = AutoProcessor.from_pretrained(model_path)
caption_model.eval()
if not self.config.caption_offload:
caption_model = caption_model.to(self.config.device)
self.caption_model = caption_model
else:
self.caption_model = self.config.caption_model
self.caption_processor = self.config.caption_processor
def load_clip_model(self):
start_time = time.time()
config = self.config
if config.clip_model is None:
if not config.quiet:
print(f"{Fore.GREEN + Style.BRIGHT}Loading CLIP model {config.clip_model_name}...{Style.RESET_ALL}")
processor = CLIPProcessor.from_pretrained(config.clip_model_name)
clip_model = CLIPModel.from_pretrained(config.clip_model_name)
clip_model.eval()
if not config.clip_offload:
clip_model = clip_model.to(config.device)
self.clip_model = clip_model
self.clip_processor = processor
self.clip_model.eval()
else:
self.clip_model = config.clip_model
self.clip_preprocess = config.clip_preprocess
        self.tokenize = create_tokenizer(config.clip_model_name)
sites = ['Artstation', 'behance', 'cg society', 'cgsociety', 'deviantart', 'dribble',
'flickr', 'instagram', 'pexels', 'pinterest', 'pixabay', 'pixiv', 'polycount',
'reddit', 'shutterstock', 'tumblr', 'unsplash', 'zbrush central']
trending_list = [site for site in sites]
trending_list.extend(["trending on "+site for site in sites])
trending_list.extend(["featured on "+site for site in sites])
trending_list.extend([site+" contest winner" for site in sites])
raw_artists = load_list(config.data_path, 'artists.txt')
artists = [f"by {a}" for a in raw_artists]
artists.extend([f"inspired by {a}" for a in raw_artists])
self._prepare_clip()
self.artists = LabelTable(artists, "artists", self)
self.flavors = LabelTable(load_list(config.data_path, 'flavors.txt'), "flavors", self)
self.mediums = LabelTable(load_list(config.data_path, 'mediums.txt'), "mediums", self)
self.movements = LabelTable(load_list(config.data_path, 'movements.txt'), "movements", self)
self.trendings = LabelTable(trending_list, "trendings", self)
self.negative = LabelTable(load_list(config.data_path, 'negative.txt'), "negative", self)
if args.ownwords:
self.ownwords = LabelTable(load_list(config.data_path, 'ownwords.txt'), "ownwords", self)
end_time = time.time()
if not config.quiet:
print(f"{Fore.MAGENTA + Style.BRIGHT}Loaded CLIP model and data in {end_time-start_time:.2f} seconds.{Style.RESET_ALL}\n")
def chain(
self,
image_features: torch.Tensor,
phrases: List[str],
best_prompt: str="",
best_sim: float=0,
min_count: int=8,
max_count: int=64,
desc="Chaining",
reverse: bool=False
) -> str:
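        # Greedy "flavor chain": start from the single best-ranking phrase (or the provided
        # best_prompt), then repeatedly append whichever remaining phrase gives the highest
        # CLIP similarity. Keep appending while similarity improves (or while idx < min_count),
        # and stop after max_count phrases or once the prompt hits the token limit.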
self._prepare_clip()
phrases = set(phrases)
if not best_prompt:
best_prompt = self.rank_top(image_features, [f for f in phrases], reverse=reverse)
best_sim = self.similarity(image_features, best_prompt)
phrases.remove(best_prompt)
curr_prompt, curr_sim = best_prompt, best_sim
def check(addition: str, idx: int) -> bool:
nonlocal best_prompt, best_sim, curr_prompt, curr_sim
prompt = curr_prompt + ", " + addition
sim = self.similarity(image_features, prompt)
if reverse:
sim = -sim
if sim > best_sim:
best_prompt, best_sim = prompt, sim
if sim > curr_sim or idx < min_count:
curr_prompt, curr_sim = prompt, sim
return True
return False
if args.ranklooker:
pbar = tqdm(range(max_count), desc=desc, disable=self.config.quiet)
for idx in pbar:
processed_phrases = [f"{curr_prompt}, {f} " for f in phrases]
pbar.set_description(f"{Fore.MAGENTA + Style.BRIGHT}Ranklooker: {Fore.CYAN + Style.BRIGHT}{processed_phrases[:5]}{Style.RESET_ALL}") # Update the description dynamically
best = self.rank_top(image_features, processed_phrases, reverse=reverse)
flave = best[len(curr_prompt)+2:]
flavealt = best[len(curr_prompt)+2:-1]
if not check(flave, idx):
break
if _prompt_at_max_len(curr_prompt, self.tokenize):
break
try:
phrases.remove(flave)
except KeyError:
flave = flavealt
phrases.remove(flave)
pbar.close() # Explicitly close the progress bar
return best_prompt
else:
for idx in tqdm(range(max_count), desc=desc, disable=self.config.quiet):
best = self.rank_top(image_features, [f"{curr_prompt}, {f}" for f in phrases], reverse=reverse)
flave = best[len(curr_prompt)+2:]
flavealt = best[len(curr_prompt)+2:-1]
if not check(flave, idx):
break
if _prompt_at_max_len(curr_prompt, self.tokenize):
break
try:
phrases.remove(flave)
except KeyError:
flave = flavealt
phrases.remove(flave)
return best_prompt
def generate_caption(self, pil_image: Image) -> str:
assert self.caption_model is not None, "No caption model loaded."
self._prepare_caption()
inputs = self.caption_processor(images=pil_image, return_tensors="pt").to(self.device)
if not self.config.caption_model_name.startswith('git-'):
inputs = inputs.to(self.dtype)
tokens = self.caption_model.generate(**inputs, max_new_tokens=self.config.caption_max_length)
return self.caption_processor.batch_decode(tokens, skip_special_tokens=True)[0].strip()
def image_to_features(self, image: Image) -> torch.Tensor:
self._prepare_clip()
inputs = self.clip_processor(images=image, return_tensors="pt", padding=True)
images = inputs["pixel_values"].to(self.device)
with torch.no_grad(), torch.amp.autocast('cuda'):
image_features = self.clip_model.get_image_features(images)
image_features /= image_features.norm(dim=-1, keepdim=True)
return image_features
def interrogate_classic(self, image: Image, max_flavors: int=max_flavors, caption: Optional[str]=None) -> str:
"""Classic mode creates a prompt in a standard format first describing the image,
then listing the artist, trending, movement, and flavor text modifiers."""
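        # Illustrative output shape (caption and labels are hypothetical):
        #   "a castle on a hill at sunset, a matte painting by <artist>, trending on artstation,
        #    fantasy art, <flavor 1>, <flavor 2>, ..."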
caption = caption or self.generate_caption(image)
image_features = self.image_to_features(image)
medium = self.mediums.rank(image_features, 1)[0]
artist = self.artists.rank(image_features, 1)[0]
trending = self.trendings.rank(image_features, 1)[0]
movement = self.movements.rank(image_features, 1)[0]
flaves = ", ".join(self.flavors.rank(image_features, max_flavors))
if args.ownwords:
ownwords = ", ".join(self.ownwords.rank(image_features, max_flavors))
if args.debug:
print(f"def interrogate_classic, Ranked labels: {len(flaves)}, Top-ranked: {flaves[:5]}")
        if caption.startswith(medium) and args.ownwords:
            prompt = f"{caption} {artist}, {ownwords}, {trending}, {movement}, {flaves}"
        elif caption.startswith(medium):
            prompt = f"{caption} {artist}, {trending}, {movement}, {flaves}"
        elif args.ownwords:
            prompt = f"{caption}, {ownwords}, {medium} {artist}, {trending}, {movement}, {flaves}"
        else:
            prompt = f"{caption}, {medium} {artist}, {trending}, {movement}, {flaves}"
return _truncate_to_fit(prompt, self.tokenize)
def interrogate_fast(self, image: Image, max_flavors: int=max_flavors, caption: Optional[str]=None) -> str:
"""Fast mode simply adds the top ranked terms after a caption. It generally results in
better similarity between generated prompt and image than classic mode, but the prompts
are less readable."""
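        # Illustrative output shape (hypothetical): "<BLIP caption>, <top term 1>, <top term 2>, ..."
        # with the terms drawn from the merged artist/flavor/medium/movement/trending tables and the
        # result truncated to the CLIP token budget.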
caption = caption or self.generate_caption(image)
image_features = self.image_to_features(image)
if args.ownwords:
merged = _merge_tables([self.artists, self.ownwords, self.flavors, self.mediums, self.movements, self.trendings], self)
else:
merged = _merge_tables([self.artists, self.flavors, self.mediums, self.movements, self.trendings], self)
tops = merged.rank(image_features, max_flavors)
return _truncate_to_fit(caption + ", " + ", ".join(tops), self.tokenize)
def interrogate_negative(self, image: Image, max_flavors: int = 32) -> str:
"""Negative mode chains together the most dissimilar terms to the image. It can be used
to help build a negative prompt to pair with the regular positive prompt and often
improve the results of generated images."""
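        # The chain runs with reverse=True, so it greedily collects the terms *least* similar
        # to the image (e.g. for use as a Stable Diffusion negative prompt).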
image_features = self.image_to_features(image)
flaves = self.flavors.rank(image_features, self.config.flavor_intermediate_count, reverse=True)
flaves = flaves + self.negative.labels
return self.chain(image_features, flaves, max_count=max_flavors, reverse=True, desc="Negative chain")
def interrogate(self, image: Image, min_flavors: int=8, max_flavors: int=max_flavors, caption: Optional[str]=None) -> str:
caption = caption or self.generate_caption(image)
image_features = self.image_to_features(image)
if args.ownwords:
merged = _merge_tables([self.artists, self.ownwords, self.flavors, self.mediums, self.movements, self.trendings], self)
else:
merged = _merge_tables([self.artists, self.flavors, self.mediums, self.movements, self.trendings], self)
if args.debug:
print(f"def interrogate, Merged labels: {len(merged.labels)}")
flaves = merged.rank(image_features, self.config.flavor_intermediate_count)
if not self.config.quiet:
print(f"{Fore.CYAN + Style.BRIGHT}Flaves (total): {Fore.YELLOW + Style.BRIGHT}{len(flaves)}. {Fore.CYAN + Style.BRIGHT}Tops: {Fore.YELLOW + Style.BRIGHT}{flaves[:8]}{Style.RESET_ALL}")
best_prompt, best_sim = caption, self.similarity(image_features, caption)
best_prompt = self.chain(image_features, flaves, best_prompt, best_sim, min_count=min_flavors, max_count=max_flavors, desc="Flavor chain")
fast_prompt = self.interrogate_fast(image, max_flavors, caption=caption)
classic_prompt = self.interrogate_classic(image, max_flavors, caption=caption)
candidates = [caption, classic_prompt, fast_prompt, best_prompt]
return candidates[np.argmax(self.similarities(image_features, candidates))]
def rank_top(self, image_features: torch.Tensor, text_array: List[str], reverse: bool=False) -> str:
self._prepare_clip()
text_tokens = self.tokenize([text for text in text_array]).to(self.device)
with torch.no_grad(), torch.amp.autocast('cuda'):
text_features = self.clip_model.get_text_features(text_tokens)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = text_features @ image_features.T
if reverse:
similarity = -similarity
return text_array[similarity.argmax().item()]
def similarity(self, image_features: torch.Tensor, text: str) -> float:
self._prepare_clip()
text_tokens = self.tokenize([text]).to(self.device)
with torch.no_grad(), torch.amp.autocast('cuda'):
text_features = self.clip_model.get_text_features(text_tokens)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = text_features @ image_features.T
return similarity[0][0].item()
def similarities(self, image_features: torch.Tensor, text_array: List[str]) -> List[float]:
self._prepare_clip()
text_tokens = self.tokenize([text for text in text_array]).to(self.device)
with torch.no_grad(), torch.amp.autocast('cuda'):
text_features = self.clip_model.get_text_features(text_tokens)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = text_features @ image_features.T
return similarity.T[0].tolist()
def _prepare_caption(self):
if self.config.clip_offload and not self.clip_offloaded:
self.clip_model = self.clip_model.to('cpu')
self.clip_offloaded = True
if self.caption_offloaded:
self.caption_model = self.caption_model.to(self.device)
self.caption_offloaded = False
def _prepare_clip(self):
if self.config.caption_offload and not self.caption_offloaded:
self.caption_model = self.caption_model.to('cpu')
self.caption_offloaded = True
if self.clip_offloaded:
self.clip_model = self.clip_model.to(self.device)
self.clip_offloaded = False
class LabelTable():
def __init__(self, labels:List[str], desc:str, ci: Interrogator):
clip_model, config = ci.clip_model, ci.config
self.chunk_size = config.chunk_size
self.config = config
self.device = config.device
self.embeds = []
self.labels = labels
self.tokenize = ci.tokenize
self.clip_processor = ci.clip_processor
hash = hashlib.sha256(",".join(labels).encode()).hexdigest()
sanitized_name = self.config.clip_model_name.replace('/', '_').replace('@', '_')
self._load_cached(desc, hash, sanitized_name)
if len(self.labels) != len(self.embeds):
self.embeds = []
            chunks = np.array_split(self.labels, max(1, math.ceil(len(self.labels) / config.chunk_size)))
if args.debug:
print(f"Total labels: {len(self.labels)}, Chunk size: {self.chunk_size}, Num chunks: {len(chunks)}")
for i, chunk in enumerate(chunks):
print(f"Chunk {i}: {len(chunk)} items")
for chunk in tqdm(chunks, desc=f"Preprocessing {desc}" if desc else None, disable=self.config.quiet):
chunk = list(chunk)
text_tokens = self.tokenize(chunk).to(self.device)
with torch.no_grad(), torch.amp.autocast('cuda'):
text_features = clip_model.get_text_features(text_tokens)
                    if args.debug:
                        print(f"LabelTable Embedding norms: {text_features.norm(dim=-1).detach().cpu().numpy()}")
                    text_features /= text_features.norm(dim=-1, keepdim=True)
                    if args.debug:
                        print(f"LabelTable Embedding norms after .norm: {text_features.norm(dim=-1).detach().cpu().numpy()}")
text_features = text_features.half().cpu().numpy()
for i in range(text_features.shape[0]):
self.embeds.append(text_features[i])
if desc and self.config.cache_path:
os.makedirs(self.config.cache_path, exist_ok=True)
cache_filepath = os.path.join(self.config.cache_path, f"{sanitized_name}_{desc}.safetensors")
tensors = {
"embeds": np.stack(self.embeds),
"hash": np.array([ord(c) for c in hash], dtype=np.int8)
}
save_file(tensors, cache_filepath)
torch.cuda.empty_cache()
if self.device == 'cpu' or self.device == torch.device('cpu'):
self.embeds = [e.astype(np.float32) for e in self.embeds]
def _load_cached(self, desc:str, hash:str, sanitized_name:str) -> bool:
if self.config.cache_path is None or desc is None:
return False
cached_safetensors = os.path.join(self.config.cache_path, f"{sanitized_name}_{desc}.safetensors")
if os.path.exists(cached_safetensors):
try:
tensors = load_file(cached_safetensors)
except Exception as e:
print(f"Failed to load {cached_safetensors}")
print(e)
return False
if 'hash' in tensors and 'embeds' in tensors:
if np.array_equal(tensors['hash'], np.array([ord(c) for c in hash], dtype=np.int8)):
self.embeds = tensors['embeds']
if len(self.embeds.shape) == 2:
self.embeds = [self.embeds[i] for i in range(self.embeds.shape[0])]
return True
return False
    def _rank(self, image_features: torch.Tensor, text_embeds: torch.Tensor, top_count: int=1, reverse: bool=False) -> List[int]:
top_count = min(top_count, len(text_embeds))
text_embeds = torch.stack([torch.from_numpy(t) for t in text_embeds]).to(self.device)
if args.debug:
print(f"def _rank: Image features norm: {torch.norm(image_features, dim=-1).cpu().numpy()}")
print(f"def _rank: Text embeddings norm (batch): {torch.norm(text_embeds, dim=-1).cpu().numpy()}")
with torch.amp.autocast('cuda'):
similarity = image_features @ text_embeds.T
if args.debug:
print(f"Similarity scores (batch): {similarity.cpu().numpy()}")
if reverse:
similarity = -similarity
_, top_labels = similarity.float().cpu().topk(top_count, dim=-1)
return [top_labels[0][i].numpy() for i in range(top_count)]
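    # rank() below works in two stages when the label list is larger than chunk_size: each chunk
    # is ranked on its own and contributes its top (chunk_size / num_chunks) candidates, then the
    # surviving candidates are re-ranked together to produce the final top_count labels.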
def rank(self, image_features: torch.Tensor, top_count: int=5, reverse: bool=False) -> List[str]:
if len(self.labels) <= self.chunk_size:
tops = self._rank(image_features, self.embeds, top_count=top_count, reverse=reverse)
if args.debug:
print(f"def rank: Top labels: {tops}")
return [self.labels[i] for i in tops]
num_chunks = int(math.ceil(len(self.labels)/self.chunk_size))
if args.debug:
print(f"Splitting into {num_chunks} chunks...")
keep_per_chunk = int(self.chunk_size / num_chunks)
top_labels, top_embeds = [], []
for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet):
start = chunk_idx*self.chunk_size
stop = min(start+self.chunk_size, len(self.embeds))
            chunk_embeds = self.embeds[start:stop]
            if args.debug:
                print(f"Processing chunk {chunk_idx}: {len(chunk_embeds)} embeddings")
            tops = self._rank(image_features, chunk_embeds, top_count=keep_per_chunk, reverse=reverse)
if args.debug:
print(f"Chunk {chunk_idx} top labels count: {len(tops)}")
top_labels.extend([self.labels[start+i] for i in tops])
top_embeds.extend([self.embeds[start+i] for i in tops])
tops = self._rank(image_features, top_embeds, top_count=top_count)
if args.debug:
print(f"Final top labels count: {len(tops)}")
return [top_labels[i] for i in tops]
def _merge_tables(tables: List[LabelTable], ci: Interrogator) -> LabelTable:
m = LabelTable([], None, ci)
for table in tables:
if args.debug:
print(f"Merging {len(table.labels)} labels from table: {table}")
m.labels.extend(table.labels)
m.embeds.extend(table.embeds)
if args.debug:
print(f"Total merged labels: {len(m.labels)}, Total embeddings: {len(m.embeds)}")
return m
def _prompt_at_max_len(text: str, tokenize) -> bool:
tokens = tokenize([text])
return tokens[0][-1] != 0
def _truncate_to_fit(text: str, tokenize) -> str:
parts = text.split(', ')
new_text = parts[0]
for part in parts[1:]:
candidate = new_text + ', ' + part
if _prompt_at_max_len(candidate, tokenize):
break
new_text = candidate
return new_text
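# _truncate_to_fit keeps whole comma-separated parts only: it appends parts until adding the next
# one would exhaust the token budget (detected by _prompt_at_max_len seeing no trailing padding),
# so prompts are cut at phrase boundaries rather than mid-token.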
def load_list(data_path: str, filename: Optional[str] = None) -> List[str]:
if filename is not None:
data_path = os.path.join(data_path, filename)
with open(data_path, 'r', encoding='utf-8', errors='replace') as f:
items = [line.strip() for line in f.readlines()]
return items
def image_to_prompt(image, imgname, mode):
ci.config.chunk_size = args.chunk_size
ci.config.flavor_intermediate_count = 2048
image = image.convert('RGB')
if mode == 'best':
print(f"{Fore.YELLOW + Style.BRIGHT}\n\nProcessing image: {Fore.MAGENTA + Style.BRIGHT}{imgname}{Style.RESET_ALL}")
return ci.interrogate(image)
elif mode == 'classic':
print(f"{Fore.YELLOW + Style.BRIGHT}\n\nProcessing image: {Fore.MAGENTA + Style.BRIGHT}{imgname}{Style.RESET_ALL}")
return ci.interrogate_classic(image)
elif mode == 'fast':
print(f"{Fore.YELLOW + Style.BRIGHT}\n\nProcessing image: {Fore.MAGENTA + Style.BRIGHT}{imgname}{Style.RESET_ALL}")
return ci.interrogate_fast(image)
elif mode == 'negative':
print(f"{Fore.YELLOW + Style.BRIGHT}\n\nProcessing image: {Fore.MAGENTA + Style.BRIGHT}{imgname}{Style.RESET_ALL}")
return ci.interrogate_negative(image)
def sanitize_for_filename(prompt: str, max_len: int) -> str:
name = "".join(c for c in prompt if (c.isalnum() or c in ",._-! "))
name = name.strip()[:(max_len-4)] # space for extension
return name
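# Example (illustrative): sanitize_for_filename("a cat, 8k photo!?", 48) -> "a cat, 8k photo!"
# ('?' is dropped; only alphanumerics and ",._-! " survive, and the result is capped at max_len-4).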
def get_csv_filename(outfile, default="all.csv"):
if not outfile:
return default
if not outfile.endswith(".csv"):
outfile = os.path.splitext(outfile)[0] + ".csv"
return outfile
config = Config()
config.clip_model_name = clip_model_name
config.caption_model_name = caption_model_name
ci = Interrogator(config)
folder_path = args.image_folder
prompt_mode = args.mode
output_mode = args.output
max_filename_len = args.maxfilename
files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))] if os.path.exists(folder_path) else []
prompts = []
for idx, file in enumerate(tqdm(files, desc='Generating prompts')):
image = Image.open(os.path.join(folder_path, file)).convert('RGB')
imgname = os.path.basename(os.path.splitext(file)[0])
prompt = image_to_prompt(image, imgname, prompt_mode)
prompts.append(prompt)
print(f"{Fore.GREEN + Style.BRIGHT}{prompt}{Style.RESET_ALL}")
if output_mode == 'rename':
name = sanitize_for_filename(prompt, max_filename_len)
ext = os.path.splitext(file)[1]
filename = name + ext
        suffix = 1
        while os.path.exists(os.path.join(folder_path, filename)):
            print(f"{Fore.YELLOW + Style.BRIGHT}File {filename} already exists, trying {name}_{suffix}{ext}...{Style.RESET_ALL}")
            filename = f"{name}_{suffix}{ext}"
            suffix += 1
os.rename(os.path.join(folder_path, file), os.path.join(folder_path, filename))
if len(prompts):
if output_mode == 'csv':
csvfile = get_csv_filename(args.outfile)
csv_path = os.path.join(folder_path, csvfile)
with open(csv_path, 'w', encoding='utf-8', newline='') as f:
w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
w.writerow(['image', 'prompt'])
for file, prompt in zip(files, prompts):
cleaned_prompt = re.sub(r'\s+,', ',', prompt)
w.writerow([file, cleaned_prompt])
print(f"{Fore.GREEN + Style.BRIGHT}\n\n\n\nGenerated {len(prompts)} prompts and saved to '{csv_path}', enjoy!{Style.RESET_ALL}")
elif output_mode == 'txt':
for file, prompt in zip(files, prompts):
txt_filename = os.path.splitext(file)[0] + ".txt"
txt_path = os.path.join(folder_path, txt_filename)
formatted_prompt = "\n".join([line.strip() for line in prompt.split(",")])
with open(txt_path, 'w', encoding='utf-8') as txt_file:
txt_file.write(formatted_prompt)
print(f"{Fore.GREEN + Style.BRIGHT}\n\n\n\nGenerated {len(prompts)} prompts and saved to '{folder_path}' as .txt files, enjoy!{Style.RESET_ALL}")
elif output_mode == 'both':
for file, prompt in zip(files, prompts):
txt_filename = os.path.splitext(file)[0] + ".txt"
txt_path = os.path.join(folder_path, txt_filename)
formatted_prompt = "\n".join([line.strip() for line in prompt.split(",")])
with open(txt_path, 'w', encoding='utf-8') as txt_file:
txt_file.write(formatted_prompt)
csvfile = get_csv_filename(args.outfile)
csv_path = os.path.join(folder_path, csvfile)
with open(csv_path, 'w', encoding='utf-8', newline='') as f:
w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
w.writerow(['image', 'prompt'])
for file, prompt in zip(files, prompts):
cleaned_prompt = re.sub(r'\s+,', ',', prompt)
w.writerow([file, cleaned_prompt])
print(f"{Fore.GREEN + Style.BRIGHT}\n\n\n\nGenerated {len(prompts)} prompts and saved to '{folder_path}' as .txt files + saved to '{csv_path}', enjoy!{Style.RESET_ALL}")
else:
print(f"{Fore.GREEN + Style.BRIGHT}\n\n\n\nGenerated {len(prompts)} prompts and renamed your files, enjoy!{Style.RESET_ALL}")
else:
print(f"{Fore.RED + Style.BRIGHT}No images in {folder_path}!{Style.RESET_ALL}")