from PIL import Image
import os
import subprocess

import spaces
import gradio as gr
import torch
from transformers import (
    AutoProcessor,
    AutoModelForCausalLM,
)

# Install flash-attn at runtime; skip the CUDA build so the prebuilt wheel
# installs on Spaces. The environment is merged (not replaced) so pip keeps
# PATH and the rest of the process environment.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

Florence_models = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",
    torch_dtype=torch_dtype,
    trust_remote_code=True,
).to(device)

Florence_processors = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-large", trust_remote_code=True
)


@spaces.GPU
def feifeiflorence(
    image,
    progress=gr.Progress(track_tqdm=True),
):
    image = Image.fromarray(image)
    # Florence-2 selects its task via a special token in the text prompt; the
    # token here was stripped during extraction. "<MORE_DETAILED_CAPTION>" is
    # the documented long-caption task and matches this demo's intent.
    task_prompt = "<MORE_DETAILED_CAPTION>"
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = Florence_processors(
        text=task_prompt, images=image, return_tensors="pt"
    ).to(device, torch_dtype)

    generated_ids = Florence_models.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
        do_sample=False,
    )
    generated_text = Florence_processors.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    # post_process_generation returns a dict keyed by the task prompt, so the
    # caption is looked up with task_prompt rather than a hard-coded string.
    parsed_answer = Florence_processors.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    return parsed_answer[task_prompt]
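
# The section stops at the captioning function and never builds the UI. Below
# is a minimal sketch of the Gradio wiring, assuming a single-image input and
# a plain text output; the component labels, button text, and title are
# illustrative assumptions, not taken from the source.
with gr.Blocks() as demo:
    gr.Markdown("## Florence-2 image captioning")
    with gr.Row():
        input_image = gr.Image(label="Input image")
        output_text = gr.Textbox(label="Caption")
    run_button = gr.Button("Caption")
    # feifeiflorence receives the image as a numpy array (Gradio's default for
    # gr.Image) and returns the caption string.
    run_button.click(feifeiflorence, inputs=input_image, outputs=output_text)

if __name__ == "__main__":
    demo.launch()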