prithivMLmods committed
Commit 09dd649 · verified · 1 Parent(s): 341a429

Update app.py

Files changed (1)
  1. app.py +97 -97
app.py CHANGED
@@ -1,98 +1,98 @@
-import gradio as gr
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
-from transformers.image_utils import load_image
-from threading import Thread
-import time
-import torch
-import spaces
-
-MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16
-).to("cuda").eval()
-
-@spaces.GPU
-def model_inference(input_dict, history):
-    text = input_dict["text"]
-    files = input_dict["files"]
-
-    # Load images if provided
-    if len(files) > 1:
-        images = [load_image(image) for image in files]
-    elif len(files) == 1:
-        images = [load_image(files[0])]
-    else:
-        images = []
-
-    # Validate input
-    if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
-        return
-    if text == "" and images:
-        gr.Error("Please input a text query along with the image(s).")
-        return
-
-    # Prepare messages for the model
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ],
-        }
-    ]
-
-    # Apply chat template and process inputs
-    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt],
-        images=images if images else None,
-        return_tensors="pt",
-        padding=True,
-    ).to("cuda")
-
-    # Set up streamer for real-time output
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-
-    # Start generation in a separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    # Stream the output
-    buffer = ""
-    yield "Thinking..."
-    for new_text in streamer:
-        buffer += new_text
-        time.sleep(0.01)
-        yield buffer
-
-
-# Example inputs
-examples = [
-    [{"text": "Explain the movie scene; screen board", "files": ["example_images/int.png"]}],
-    [{"text": "Describe the document?", "files": ["example_images/document.jpg"]}],
-    [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
-    [{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
-    [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
-    [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
-    [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
-    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
-
-]
-
-demo = gr.ChatInterface(
-    fn=model_inference,
-    description="# **Qwen2.5-VL-3B-Instruct**",
-    examples=examples,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
-    stop_btn="Stop Generation",
-    multimodal=True,
-    cache_examples=False,
-)
-
+import gradio as gr
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
+from transformers.image_utils import load_image
+from threading import Thread
+import time
+import torch
+import spaces
+
+MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"  # alternatively: MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to("cuda").eval()
+
+@spaces.GPU
+def model_inference(input_dict, history):
+    text = input_dict["text"]
+    files = input_dict["files"]
+
+    # Load images if provided
+    if len(files) > 1:
+        images = [load_image(image) for image in files]
+    elif len(files) == 1:
+        images = [load_image(files[0])]
+    else:
+        images = []
+
+    # Validate input
+    if text == "" and not images:
+        raise gr.Error("Please input a query and optionally image(s).")
+    if text == "" and images:
+        raise gr.Error("Please input a text query along with the image(s).")
+
+    # Prepare messages for the model
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *[{"type": "image", "image": image} for image in images],
+                {"type": "text", "text": text},
+            ],
+        }
+    ]
+
+    # Apply chat template and process inputs
+    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(
+        text=[prompt],
+        images=images if images else None,
+        return_tensors="pt",
+        padding=True,
+    ).to("cuda")
+
+    # Set up streamer for real-time output
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+
+    # Start generation in a separate thread
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Stream the output
+    buffer = ""
+    yield "Thinking..."
+    for new_text in streamer:
+        buffer += new_text
+        time.sleep(0.01)
+        yield buffer
+
+
+# Example inputs
+examples = [
+    [{"text": "Explain the movie scene; screen board", "files": ["example_images/int.png"]}],
+    [{"text": "Describe the document?", "files": ["example_images/document.jpg"]}],
+    [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
+    [{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
+    [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
+    [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
+    [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
+    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+
+]
+
+demo = gr.ChatInterface(
+    fn=model_inference,
+    description="# **Qwen2.5-VL-7B-Instruct**",
+    examples=examples,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+    stop_btn="Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+)
+
 demo.launch(debug=True)
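
For reference, a minimal sketch of how the streaming handler above could be smoke-tested outside the Gradio UI. This is hypothetical and not part of the commit: it assumes `model_inference` is in scope (e.g. run at the bottom of app.py before `demo.launch`), reuses one of the bundled example images, and relies on `@spaces.GPU` being designed as a no-op outside a ZeroGPU Space.

# Hypothetical smoke test (not part of the commit): model_inference is a
# generator that first yields "Thinking...", then the growing response
# buffer, which is exactly how gr.ChatInterface consumes it.
query = {"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}
for partial in model_inference(query, history=[]):
    print(partial)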