import os
import re
import subprocess
import numpy as np
from PIL import Image
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
# Hub identifier of the checkpoint this demo serves.
model_name = "PJMixers-Images/Florence-2-base-Castollux-v0.5"

# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both loaders are called with trust_remote_code=True, which lets code shipped
# in the model repository execute locally.
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
model = model.eval()  # inference mode
model.to(device)

# Markdown heading that links to the model page.
TITLE = f"# [{model_name}](https://huggingface.co/{model_name})"
def process_image(image, num_beams=5, min_p=0.0, top_p=1.0):
    """
    Generate a caption for a single image.

    Args:
        image: File path (str), numpy array, or PIL Image to caption.
        num_beams: Beam count forwarded to ``model.generate``.
        min_p: Min-p sampling threshold forwarded to ``model.generate``.
        top_p: Top-p sampling threshold forwarded to ``model.generate``.

    Returns:
        The decoded caption with sequence markers removed and surrounding
        whitespace stripped. On any failure a string starting with
        "Error processing image:" is returned instead; exceptions are not
        propagated, so the caller always receives text.
    """
    # Guard against a missing image so the caller gets a clear message rather
    # than the text of an AttributeError.
    if image is None:
        return "Error processing image: no image provided"

    try:
        # Normalize every supported input type to an RGB PIL image.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        elif isinstance(image, str):
            # convert() loads the pixel data, so the file handle can be
            # released as soon as the with-block exits.
            with Image.open(image) as opened:
                image = opened.convert("RGB")
        if image.mode != "RGB":
            image = image.convert("RGB")

        # The task prompt literal was empty and unterminated in the previous
        # revision. NOTE(review): "<CAPTION>" is assumed to be the captioning
        # task token this checkpoint expects - confirm against the model card.
        inputs = processor(
            text="<CAPTION>",
            images=image,
            return_tensors="pt",
        )
        # Move tensors to the same device as the model.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=1024,
                # UI sliders may deliver numeric values of either type;
                # coerce them to what each option is expected to be.
                num_beams=int(num_beams),
                do_sample=True,
                top_p=float(top_p),
                min_p=float(min_p),
            )

        decoded = processor.batch_decode(
            generated_ids,
            skip_special_tokens=False,
        )[0]
        # The previous revision replaced empty strings, which removed nothing.
        # NOTE(review): these three markers are assumed to be the ones the
        # tokenizer emits - confirm against the model card.
        for marker in ("</s>", "<s>", "<pad>"):
            decoded = decoded.replace(marker, "")
        return decoded.strip()
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text.
        return f"Error processing image: {e}"
# Stylesheet handed to gr.Blocks.
# NOTE(review): no component in this section sets elem_id="output", so this
# rule does not appear to match anything here - confirm whether it is needed.
css = """
#output { height: 500px; overflow: auto; border: 1px solid #ccc; }
"""

# Bundled example images; each becomes a row of
# [image, num_beams, min_p, top_p] using the default generation settings.
_example_files = (
    "eval_img_1.jpg",
    "eval_img_2.jpg",
    "eval_img_3.jpg",
    "eval_img_4.jpg",
    "eval_img_5.jpg",
    "eval_img_6.jpg",
    "eval_img_7.png",
    "eval_img_8.jpg",
)
_example_rows = [[filename, 5, 0.0, 1.0] for filename in _example_files]

# NOTE(review): the indentation of this layout was lost in the source; the
# nesting below (image and textbox side by side, controls underneath inside
# the tab) is a reconstruction - confirm against the intended layout.
with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)

    with gr.Tab(label="Single Image Processing"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        submit_btn = gr.Button(value="Submit")

        # Generation controls passed to process_image.
        num_beams_slider = gr.Slider(
            label="Number of Beams", minimum=1, maximum=5, step=1, value=5
        )
        min_p_slider = gr.Slider(
            label="Min-P", minimum=0, maximum=1, step=0.01, value=0.0
        )
        top_p_slider = gr.Slider(
            label="Top-P", minimum=0, maximum=1, step=0.01, value=1.0
        )

        # Same input order for the examples and the button: it matches the
        # positional parameters of process_image.
        _caption_inputs = [
            input_img,
            num_beams_slider,
            min_p_slider,
            top_p_slider,
        ]

        gr.Examples(
            _example_rows,
            inputs=_caption_inputs,
            outputs=[output_text],
            fn=process_image,
            label="Try captioning on below examples",
        )

        submit_btn.click(
            fn=process_image,
            inputs=_caption_inputs,
            outputs=[output_text],
        )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(debug=True)