main.py
import argparse

import gradio as gr
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, MllamaProcessor
from huggingface_hub import login


# Function to log into Hugging Face
def login_huggingface(token: str):
    """Logs into Hugging Face using the provided token."""
    login(token=token)
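    # Note: huggingface_hub can also pick up a token from the HF_TOKEN
    # environment variable or from a prior `huggingface-cli login`, so
    # passing --token is only one way to authenticate.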


# Function to load the model and processor
def load_model_and_processor(model_id: str):
    """Loads the specified model and processor from Hugging Face."""
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
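    # Note: device_map="auto" requires the `accelerate` package, and
    # bfloat16 weights for an 11B-parameter model need roughly 22 GB of
    # GPU memory (2 bytes per parameter).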
    processor = MllamaProcessor.from_pretrained(model_id)
    return model, processor


# Function to process the input for the Gradio interface
def process_input(image: Image.Image, text_input: str) -> str:
    """
    Processes the input image and text prompt to generate a response from
    the Llama 3.2 Vision model.

    Args:
        image (PIL.Image): The input image.
        text_input (str): The text prompt to accompany the image.

    Returns:
        str: The generated text from the model.
    """
    # Combine image and text input into the final prompt
    prompt = f"<|image|><|begin_of_text|>{text_input}"
    # Preprocess inputs for the model; `model` and `processor` are
    # module-level globals assigned in the __main__ block below
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    # Generate the model's output with greedy decoding, capped at 25 new tokens
    output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
    # Decode the generated output into readable text
    generated_text = processor.decode(output[0], skip_special_tokens=True)
    return generated_text.replace("\\n", "\n").replace("\\", "")


# Main execution
if __name__ == "__main__":
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(description="Run Llama 3.2 Vision model with Gradio.")
    parser.add_argument("--token", type=str, required=True, help="Hugging Face token for authentication.")
    args = parser.parse_args()

    # Log into Hugging Face
    login_huggingface(args.token)

    # Load model and processor
    model_id = "meta-llama/Llama-3.2-11B-Vision"
    model, processor = load_model_and_processor(model_id)

    # Define and set up the Gradio interface
    demo = gr.Interface(
        fn=process_input,  # Function that handles the inputs
        inputs=[
            gr.Image(type="pil", label="Input Image"),  # Image input
            gr.Textbox(label="Text Prompt"),  # Text input box for the prompt
        ],
        outputs=gr.Textbox(label="Generated Output"),  # Output box for generated text
        title="Llama 3.2 Vision: Image & Text Understanding",  # Title for the UI
        description="Upload an image and provide a text prompt. The Llama 3.2 Vision model will generate a response based on the combined inputs.",
    )

    # Launch the interface
    demo.launch(share=False)
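
# Usage sketch (the token value is a placeholder for your own):
#   python main.py --token hf_your_token_here
# Requires access to the gated meta-llama/Llama-3.2-11B-Vision repository;
# by default Gradio serves the app locally at http://127.0.0.1:7860.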