import gradio as gr
import torch
from transformers import AutoProcessor, Idefics3ForConditionalGeneration

# Run on the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# LoRA adapter fine-tuned on the ROCO radiology dataset.
model_id = "eltorio/IDEFICS3_ROCO"

# Base model the adapter was trained from.
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"

processor = AutoProcessor.from_pretrained(base_model_path)
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path, torch_dtype=torch.bfloat16
).to(device)

# Attach the fine-tuned LoRA weights on top of the base model.
model.load_adapter(model_id)
def infer(image):
    # Chat-style prompt: one user turn containing the image and a question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)

    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
    generated_texts = processor.batch_decode(new_tokens, skip_special_tokens=True)
    # batch_decode returns a list; Gradio's text output expects a single string.
    return generated_texts[0]
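
# Quick local sanity check, as a sketch: assumes a sample image exists at the
# hypothetical path "sample.png".
#
#   from PIL import Image
#   print(infer(Image.open("sample.png").convert("RGB")))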

# Minimal Gradio UI: upload an image, get the model's description back.
demo = gr.Interface(fn=infer, inputs=gr.Image(type="pil"), outputs="text")
demo.launch()
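
# The running demo can also be queried programmatically. A minimal sketch with
# gradio_client, assuming the default local URL, the auto-generated "/predict"
# endpoint, and a hypothetical input file "radiograph.png":
#
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(handle_file("radiograph.png"), api_name="/predict")
#   print(result)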