"""Gradio demo for GOT-OCR 2.0 using the Hugging Face Transformers port.

The module loads the model once at import time, exposes ``process_image`` as
the GPU-decorated inference entry point, and builds a Gradio Blocks UI whose
visible inputs depend on the selected OCR task.
"""

import base64
import os
import re
import shutil
import time
import uuid
from pathlib import Path

import cv2
import gradio as gr
import numpy as np
import spaces
import torch
from globe import description, title
from PIL import Image
from render import render_ocr_text
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.image_utils import load_image

model_name = "stepfun-ai/GOT-OCR-2.0-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForImageTextToText.from_pretrained(
    model_name, low_cpu_mem_usage=True, device_map=device
)
model = model.eval().to(device)

UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

stop_str = "<|im_end|>"

for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
    os.makedirs(folder, exist_ok=True)

input_index = 0

# Task names shown in the dropdown; they double as dispatch keys.
TASK_PLAIN = "Plain Text OCR"
TASK_FORMAT = "Format Text OCR"
TASK_BOX = "Fine-grained OCR (Box)"
TASK_COLOR = "Fine-grained OCR (Color)"
TASK_MULTI_CROP = "Multi-crop OCR"
TASK_MULTI_PAGE = "Multi-page OCR"

OCR_TYPE_CHOICES = ["ocr", "format"]
OCR_COLOR_CHOICES = ["red", "green", "blue"]

# Extra processor keyword arguments for tasks that take no user parameters.
_FIXED_TASK_KWARGS = {
    TASK_PLAIN: {},
    TASK_FORMAT: {"format": True},
    TASK_MULTI_CROP: {"format": True, "crop_to_patches": True, "max_patches": 5},
    TASK_MULTI_PAGE: {"multi_page": True, "format": True},
}

# Any one of these characters marks a segment as LaTeX-like.
_LATEX_CHARS = re.compile(r'[{}\[\]\\$_^"]')

# Signed integer or decimal number, used to read box coordinates from text.
_NUMBER = re.compile(r"-?\d+(?:\.\d+)?")


def _save_input_image(img, image_path):
    """Write one UI-supplied image to ``image_path``.

    Accepts an ImageEditor dict (uses its ``"composite"`` entry), an RGB numpy
    array, a PIL image, or a file path.

    Returns:
        ``None`` on success, otherwise the user-facing error message.
    """
    if isinstance(img, dict):
        composite_image = img.get("composite")
        if composite_image is None:
            return "Error: No composite image found in ImageEditor output"
        if isinstance(composite_image, np.ndarray):
            cv2.imwrite(image_path, cv2.cvtColor(composite_image, cv2.COLOR_RGB2BGR))
        elif isinstance(composite_image, Image.Image):
            composite_image.save(image_path)
        else:
            return "Error: Unsupported image format from ImageEditor"
        return None

    if isinstance(img, np.ndarray):
        # OpenCV writes BGR, the UI hands us RGB.
        cv2.imwrite(image_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    elif isinstance(img, Image.Image):
        img.save(image_path)
    elif isinstance(img, str):
        shutil.copy(img, image_path)
    else:
        return "Error: Unsupported image format"
    return None


def _parse_ocr_box(ocr_box):
    """Turn the box input into a list of four numbers ``[x1, y1, x2, y2]``.

    The UI supplies the box as free text (e.g. ``"[100,100,200,200]"``); a
    list or tuple is passed through as a list so programmatic callers keep
    working.

    Raises:
        ValueError: if the text does not contain exactly four numbers.
    """
    if isinstance(ocr_box, (list, tuple)):
        return list(ocr_box)

    tokens = _NUMBER.findall(str(ocr_box or ""))
    if len(tokens) != 4:
        raise ValueError("OCR box must contain exactly four numbers: x1,y1,x2,y2")
    return [int(float(token)) for token in tokens]


def _build_processor_kwargs(task, ocr_type, ocr_box, ocr_color):
    """Map a task and its options to processor kwargs.

    Returns:
        ``(kwargs, format_text)`` where ``format_text`` tells the renderer
        whether the output is formatted text.

    Raises:
        ValueError: if ``task`` is not one of the known task names, or the
            box text is malformed.
    """
    if task in _FIXED_TASK_KWARGS:
        kwargs = dict(_FIXED_TASK_KWARGS[task])
        return kwargs, bool(kwargs.get("format"))

    if task == TASK_BOX:
        kwargs = {"box": _parse_ocr_box(ocr_box)}
    elif task == TASK_COLOR:
        kwargs = {"color": ocr_color}
    else:
        raise ValueError(f"Unsupported task: {task!r}")

    # Honour the OCR type chosen for fine-grained tasks so the model output
    # and the renderer agree on whether the text is formatted.
    format_text = ocr_type == "format"
    if format_text:
        kwargs["format"] = True
    return kwargs, format_text


@spaces.GPU()
def process_image(image, task, ocr_type=None, ocr_box=None, ocr_color=None):
    """Run one OCR task on the given image(s).

    Args:
        image: A single image (file path, RGB numpy array, PIL image or
            ImageEditor dict) or a list/tuple of gallery items, each of which
            is indexed with ``[0]`` to obtain the image.
        task: One of the task names offered by the task dropdown.
        ocr_type: ``"ocr"`` or ``"format"``; only consulted for the
            fine-grained tasks.
        ocr_box: Box for the box task, as text or a list of four numbers.
        ocr_color: Color name for the color task.

    Returns:
        ``(text, html_or_None, unique_id)`` on success. On failure the first
        element is a message starting with ``"Error:"`` and the other two are
        ``None``. Plain text OCR never produces HTML.
    """
    if image is None:
        return "Error: No image provided", None, None

    unique_id = str(uuid.uuid4())
    image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.png")
    result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}.html")

    try:
        if isinstance(image, (tuple, list)):
            raw_images = [item[0] for item in image]
        else:
            raw_images = [image]

        # Every input is normalised through the same temporary PNG and loaded
        # back immediately, so reusing one path for all pages is safe.
        images = []
        for raw in raw_images:
            error = _save_input_image(raw, image_path)
            if error is not None:
                return error, None, None
            images.append(load_image(image_path))

        processor_kwargs, format_text = _build_processor_kwargs(
            task, ocr_type, ocr_box, ocr_color
        )

        # Move inputs to the same device as the model rather than assuming CUDA.
        inputs = processor(images, return_tensors="pt", **processor_kwargs).to(device)
        generate_ids = model.generate(
            **inputs,
            do_sample=False,
            tokenizer=processor.tokenizer,
            stop_strings=stop_str,
            max_new_tokens=4096,
        )
        # Drop the prompt tokens; only the newly generated tail is the answer.
        res = processor.decode(
            generate_ids[0, inputs["input_ids"].shape[1] :],
            skip_special_tokens=True,
        )

        if task == TASK_PLAIN:
            return res, None, unique_id

        render_ocr_text(res, result_path, format_text=format_text)
        if os.path.exists(result_path):
            with open(result_path, "r") as f:
                html_content = f.read()
            return res, html_content, unique_id
        return res, None, unique_id
    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"Error: {str(e)}", None, None
    finally:
        if os.path.exists(image_path):
            os.remove(image_path)


def update_image_input(task):
    """Return visibility updates for the image widgets of ``task``.

    The tuple order is: single image input, image editor, editor submit
    button, gallery input, gallery submit button. Any task other than the
    color and multi-page tasks shows the single image input.
    """
    show_editor = task == TASK_COLOR
    show_gallery = task == TASK_MULTI_PAGE
    show_single = not (show_editor or show_gallery)
    return (
        gr.update(visible=show_single),
        gr.update(visible=show_editor),
        gr.update(visible=show_editor),
        gr.update(visible=show_gallery),
        gr.update(visible=show_gallery),
    )


def update_inputs(task):
    """Return visibility updates for every task-dependent widget.

    The list order is: OCR type dropdown, box textbox, color dropdown, single
    image input, image editor, submit button, editor submit button, gallery
    input, gallery submit button. Unknown tasks get the single-image layout,
    matching ``update_image_input``.
    """
    is_box = task == TASK_BOX
    is_color = task == TASK_COLOR
    is_multi_page = task == TASK_MULTI_PAGE
    show_single = not (is_color or is_multi_page)

    if is_box or is_color:
        ocr_type_update = gr.update(visible=True, choices=["ocr", "format"])
    else:
        ocr_type_update = gr.update(visible=False)

    if is_color:
        color_update = gr.update(visible=True, choices=["red", "green", "blue"])
    else:
        color_update = gr.update(visible=False)

    return [
        ocr_type_update,
        gr.update(visible=is_box),
        color_update,
        gr.update(visible=show_single),
        gr.update(visible=is_color),
        gr.update(visible=show_single),
        gr.update(visible=is_color),
        gr.update(visible=is_multi_page),
        gr.update(visible=is_multi_page),
    ]


def parse_latex_output(res):
    """Wrap runs of LaTeX-looking segments of ``res`` in ``$$`` delimiters.

    The text is split on ``$$...$$`` spans (kept as segments). Consecutive
    segments containing any LaTeX marker character are buffered between an
    opening and a closing ``"$$"``; other segments are emitted unchanged.

    NOTE(review): ``ocr_demo`` currently has its call to this function
    commented out, and the ``"$$\\$$\\n"`` join separator looks unusual -
    confirm the intended output format before re-enabling it.
    """
    # Split the input, preserving newlines and empty lines
    segments = re.split(r"(\$\$.*?\$\$)", res, flags=re.DOTALL)
    parsed_lines = []
    in_latex = False
    latex_buffer = []

    for segment in segments:
        if segment == "\n":
            # A bare newline stays with whatever run is currently open.
            if in_latex:
                latex_buffer.append(segment)
            else:
                parsed_lines.append(segment)
            continue

        segment = segment.strip()
        if _LATEX_CHARS.search(segment):
            if not in_latex:
                in_latex = True
                latex_buffer = ["$$"]
            latex_buffer.append(segment)
        else:
            if in_latex:
                latex_buffer.append("$$")
                parsed_lines.extend(latex_buffer)
                in_latex = False
                latex_buffer = []
            parsed_lines.append(segment)

    # Close a run that reaches the end of the text.
    if in_latex:
        latex_buffer.append("$$")
        parsed_lines.extend(latex_buffer)

    return "$$\\$$\n".join(parsed_lines)


def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
    """Gradio callback: run OCR and build the text and HTML outputs.

    Returns:
        ``(text, html)``. ``html`` is ``None`` when processing failed or no
        rendered HTML is available; otherwise it holds a download link
        followed by an inline preview of the rendered result.
    """
    res, html_content, unique_id = process_image(
        image, task, ocr_type, ocr_box, ocr_color
    )

    if isinstance(res, str) and res.startswith("Error:"):
        return res, None

    res = res.replace("\\title", "\\title ")
    formatted_res = res
    # formatted_res = parse_latex_output(res)

    if not html_content:
        return formatted_res, None

    # Embed the rendered page as a data URI so nothing has to be served
    # from disk.
    encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
    iframe_src = f"data:text/html;base64,{encoded_html}"
    iframe = f'<iframe src="{iframe_src}" width="100%" height="600px"></iframe>'
    download_link = (
        f'<a href="{iframe_src}" download="result_{unique_id}.html">'
        "Download Full Result</a>"
    )
    return formatted_res, f"{download_link}<br>{iframe}"


def cleanup_old_files(max_age_seconds=3600):
    """Delete files in the upload and results folders older than the limit.

    Args:
        max_age_seconds: Maximum age, by modification time, before a file is
            removed. Defaults to one hour.
    """
    current_time = time.time()
    for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
        for file_path in Path(folder).glob("*"):
            try:
                if not file_path.is_file():
                    continue
                if current_time - file_path.stat().st_mtime > max_age_seconds:
                    file_path.unlink()
            except FileNotFoundError:
                # Removed by someone else between listing and deletion.
                continue


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                image_input = gr.Image(type="filepath", label="Input Image")
                gallery_input = gr.Gallery(
                    type="filepath", label="Input images", visible=False
                )
                image_editor = gr.ImageEditor(
                    label="Image Editor", type="pil", visible=False
                )
                task_dropdown = gr.Dropdown(
                    choices=[
                        TASK_PLAIN,
                        TASK_FORMAT,
                        TASK_BOX,
                        TASK_COLOR,
                        TASK_MULTI_CROP,
                        TASK_MULTI_PAGE,
                    ],
                    label="Select Task",
                    value=TASK_PLAIN,
                )
                ocr_type_dropdown = gr.Dropdown(
                    choices=OCR_TYPE_CHOICES, label="OCR Type", visible=False
                )
                ocr_box_input = gr.Textbox(
                    label="OCR Box (x1,y1,x2,y2)",
                    placeholder="[100,100,200,200]",
                    visible=False,
                )
                ocr_color_dropdown = gr.Dropdown(
                    choices=OCR_COLOR_CHOICES, label="OCR Color", visible=False
                )
                # with gr.Row():
                #     max_new_tokens_slider = gr.Slider(50, 500, step=10, value=150, label="Max New Tokens")
                #     no_repeat_ngram_size_slider = gr.Slider(1, 10, step=1, value=2, label="No Repeat N-gram Size")

                submit_button = gr.Button("Process", variant="primary")
                editor_submit_button = gr.Button(
                    "Process Edited Image", visible=False, variant="primary"
                )
                gallery_submit_button = gr.Button(
                    "Process Multiple Images", visible=False, variant="primary"
                )

        with gr.Column(scale=1):
            with gr.Group():
                output_markdown = gr.Textbox(label="Text output")
                output_html = gr.HTML(label="HTML output")

    input_types = [
        image_input,
        image_editor,
        gallery_input,
    ]

    # Inputs shared by every OCR callback apart from the image widget itself.
    option_inputs = [
        task_dropdown,
        ocr_type_dropdown,
        ocr_box_input,
        ocr_color_dropdown,
    ]
    ocr_outputs = [output_markdown, output_html]

    task_dropdown.change(
        update_inputs,
        inputs=[task_dropdown],
        outputs=[
            ocr_type_dropdown,
            ocr_box_input,
            ocr_color_dropdown,
            image_input,
            image_editor,
            submit_button,
            editor_submit_button,
            gallery_input,
            gallery_submit_button,
        ],
    )

    # Targets a subset of the widgets handled by update_inputs above and
    # produces the same visibility for each of them.
    task_dropdown.change(
        update_image_input,
        inputs=[task_dropdown],
        outputs=[
            image_input,
            image_editor,
            editor_submit_button,
            gallery_input,
            gallery_submit_button,
        ],
    )

    submit_button.click(
        ocr_demo,
        inputs=[image_input, *option_inputs],
        outputs=ocr_outputs,
    )
    editor_submit_button.click(
        ocr_demo,
        inputs=[image_editor, *option_inputs],
        outputs=ocr_outputs,
    )
    gallery_submit_button.click(
        ocr_demo,
        inputs=[gallery_input, *option_inputs],
        outputs=ocr_outputs,
    )

    example = gr.Examples(
        examples=[
            [
                "./sheet_music.png",
                "Format Text OCR",
                "format",
                None,
                None,
            ],
            [
                "./latex.png",
                "Format Text OCR",
                "format",
                None,
                None,
            ],
        ],
        inputs=[image_input, *option_inputs],
        outputs=ocr_outputs,
    )
    example_finegrained = gr.Examples(
        examples=[
            [
                "./multi_box.png",
                "Fine-grained OCR (Color)",
                "ocr",
                None,
                "red",
            ]
        ],
        inputs=[image_editor, *option_inputs],
        outputs=ocr_outputs,
        label="Fine-grained example",
    )

    gr.Markdown(
        "Space based on [Tonic's GOT-OCR](https://huggingface.co/spaces/Tonic/GOT-OCR)"
    )


if __name__ == "__main__":
    cleanup_old_files()
    demo.launch()