Spaces:

davanstrien
/

ColPali-Query-Generator

Running on Zero

App Files Files Community

davanstrien HF staff committed on 14 days ago

Commit

30ae6d8

verified ·

1 Parent(s): d334b52

regex parsing

Browse files

Files changed (1) hide show

app.py +26 -10

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ subprocess.run(
 )
 import spaces
 import gradio as gr
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
@@ -18,15 +19,15 @@ from typing import Tuple
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-VL-7B-Instruct",
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
     device_map="auto",
 )
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
 class GeneralRetrievalQuery(BaseModel):
     broad_topical_query: str
@@ -36,6 +37,17 @@ class GeneralRetrievalQuery(BaseModel):
     visual_element_query: str
     visual_element_explanation: str
 def get_retrieval_prompt(prompt_name: str) -> Tuple[str, GeneralRetrievalQuery]:
     if prompt_name != "general":
@@ -76,11 +88,9 @@ Generate the queries based on this image and provide the response in the specifi
     return prompt, GeneralRetrievalQuery
 # defined like this so we can later add more prompting options
 prompt, pydantic_model = get_retrieval_prompt("general")
 def _prep_data_for_input(image):
     messages = [
         {
@@ -109,7 +119,6 @@ def _prep_data_for_input(image):
         return_tensors="pt",
     )
 @spaces.GPU
 def generate_response(image):
     inputs = _prep_data_for_input(image)
@@ -125,13 +134,20 @@ def generate_response(image):
         generated_ids_trimmed,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
-    )
     try:
-        return json.loads(output_text[0])
     except Exception:
         gr.Warning("Failed to parse JSON from output")
-        return output_text[0]
 title = "ColPali Query Generator using Qwen2.5-VL"
 description = """[ColPali](https://huggingface.co/papers/2407.01449) is a very exciting new approach to multimodal document retrieval which aims to replace existing document retrievers which often rely on an OCR step with an end-to-end multimodal approach.

 )
 import spaces
 import gradio as gr
+import re
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-VL-7B-Instruct",
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
     device_map="auto",
 )
+processor = AutoProcessor.from_pretrained(
+    "Qwen/Qwen2.5-VL-7B-Instruct",
+)
 class GeneralRetrievalQuery(BaseModel):
     broad_topical_query: str
     visual_element_query: str
     visual_element_explanation: str
def extract_json_with_regex(text: str):
    """Return the contents of the first Markdown code fence found in *text*.

    A fence is a span delimited by triple backticks, optionally tagged with
    the language name ``json`` (matched case-insensitively, so ``JSON`` and
    ``Json`` are stripped as well). Whitespace between the fence markers and
    the payload is removed. The payload itself is returned verbatim and is
    NOT validated or decoded as JSON here.

    Args:
        text: Raw text that may contain a fenced code block.

    Returns:
        The payload of the first fenced block as a string, or ``None`` when
        *text* contains no complete (opened and closed) fence.
    """
    # DOTALL lets the payload span multiple lines. IGNORECASE only affects the
    # literal "json" tag, so an upper-case tag is not leaked into the payload.
    # re.search stops at the first fence instead of collecting every match.
    match = re.search(
        r"```(?:json)?\s*(.+?)\s*```",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if match is None:
        return None
    return match.group(1)
 def get_retrieval_prompt(prompt_name: str) -> Tuple[str, GeneralRetrievalQuery]:
     if prompt_name != "general":
     return prompt, GeneralRetrievalQuery
 # defined like this so we can later add more prompting options
 prompt, pydantic_model = get_retrieval_prompt("general")
 def _prep_data_for_input(image):
     messages = [
         {
         return_tensors="pt",
     )
 @spaces.GPU
 def generate_response(image):
     inputs = _prep_data_for_input(image)
         generated_ids_trimmed,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
+    )[0]
     try:
+        # Try to extract JSON from code block first
+        json_str = extract_json_with_regex(output_text)
+        if json_str:
+            parsed = json.loads(json_str)
+            return json.dumps(parsed, indent=2)
+        # If no code block found, try direct JSON parsing
+        parsed = json.loads(output_text)
+        return json.dumps(parsed, indent=2)
     except Exception:
         gr.Warning("Failed to parse JSON from output")
+        return output_text
 title = "ColPali Query Generator using Qwen2.5-VL"
 description = """[ColPali](https://huggingface.co/papers/2407.01449) is a very exciting new approach to multimodal document retrieval which aims to replace existing document retrievers which often rely on an OCR step with an end-to-end multimodal approach.