import os
import re
import subprocess

import numpy as np
from PIL import Image
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCausalLM

# Load model and processor, enabling trust_remote_code if needed
model_name = "PJMixers-Images/Florence-2-base-Castollux-v0.5"
model = AutoModelForCausalLM.from_pretrained(
    model_name, trust_remote_code=True
).eval()
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

TITLE = f"# [{model_name}](https://huggingface.co/{model_name})"

# Marker tokens removed from the decoded caption. Decoding runs with
# skip_special_tokens=False, so these have to be stripped by hand.
# NOTE(review): the token literals were empty strings in the source (most
# likely stripped as markup); restored to the BART-style markers Florence-2
# checkpoints use -- confirm against processor.tokenizer.all_special_tokens.
_SPECIAL_TOKENS = ("</s>", "<s>", "<pad>")


def _strip_special_tokens(text):
    """Return *text* with every marker in ``_SPECIAL_TOKENS`` removed and trimmed."""
    for token in _SPECIAL_TOKENS:
        text = text.replace(token, "")
    return text.strip()


def process_image(image, num_beams=5, min_p=0.0, top_p=1.0):
    """Generate a caption for a single image.

    Args:
        image: The picture to caption, given as a file path (``str`` or
            ``os.PathLike``), a numpy array, or a PIL ``Image``.
        num_beams: Beam count passed to ``model.generate``.
        min_p: Min-p sampling threshold passed to ``model.generate``.
        top_p: Top-p (nucleus) threshold passed to ``model.generate``.

    Returns:
        The decoded caption with special tokens removed, or a string starting
        with ``"Error processing image: "`` if anything goes wrong. The
        function never raises, so the UI always has text to display.
    """
    # Gradio passes None when Submit is pressed without an uploaded image.
    if image is None:
        return "Error processing image: no image provided"

    try:
        # Convert input to a PIL image if necessary
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        elif isinstance(image, (str, os.PathLike)):
            # Image.open is lazy and keeps the file open; convert() forces the
            # pixel data to load so the handle can be released right away.
            with Image.open(image) as opened:
                image = opened.convert("RGB")

        if image.mode != "RGB":
            image = image.convert("RGB")

        # Prepare inputs for the model (empty text prompt, as in the original)
        inputs = processor(text="", images=image, return_tensors="pt")

        # Move tensors to the appropriate device
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # Disable gradients during inference
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=1024,
                # A UI slider may deliver the value as a float; beam count
                # must be an integer.
                num_beams=int(num_beams),
                do_sample=True,
                top_p=top_p,
                min_p=min_p,
            )

        # Decode and post-process the generated text
        decoded = processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )[0]
        return _strip_special_tokens(decoded)
    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"Error processing image: {e}"


# Custom CSS to style the output box
# NOTE(review): no component in this file sets elem_id="output", so this rule
# currently matches nothing -- confirm whether the Textbox was meant to use it.
css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""

# NOTE(review): the widget nesting below was reconstructed from statement
# order (the source had lost its indentation) -- confirm the intended layout.
with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)
    with gr.Tab(label="Single Image Processing"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        submit_btn = gr.Button(value="Submit")

        # Generation controls; defaults mirror process_image's defaults.
        num_beams_slider = gr.Slider(
            minimum=1,
            maximum=5,
            step=1,
            value=5,
            label="Number of Beams",
        )
        min_p_slider = gr.Slider(
            minimum=0,
            maximum=1,
            step=0.01,
            value=0.0,
            label="Min-P",
        )
        top_p_slider = gr.Slider(
            minimum=0,
            maximum=1,
            step=0.01,
            value=1.0,
            label="Top-P",
        )

        # Each example row is [image path, num_beams, min_p, top_p].
        gr.Examples(
            [
                ["eval_img_1.jpg", 5, 0.0, 1.0],
                ["eval_img_2.jpg", 5, 0.0, 1.0],
                ["eval_img_3.jpg", 5, 0.0, 1.0],
                ["eval_img_4.jpg", 5, 0.0, 1.0],
                ["eval_img_5.jpg", 5, 0.0, 1.0],
                ["eval_img_6.jpg", 5, 0.0, 1.0],
                ["eval_img_7.png", 5, 0.0, 1.0],
                ["eval_img_8.jpg", 5, 0.0, 1.0],
            ],
            inputs=[input_img, num_beams_slider, min_p_slider, top_p_slider],
            outputs=[output_text],
            fn=process_image,
            label="Try captioning on below examples",
        )

        submit_btn.click(
            process_image,
            [input_img, num_beams_slider, min_p_slider, top_p_slider],
            [output_text],
        )

if __name__ == "__main__":
    demo.launch(debug=True)