lixin4ever committed
Commit 504eebe · verified · 1 Parent(s): 1ddcb12

Update app.py

Files changed (1)
  1. app.py +71 -71
app.py CHANGED
@@ -44,78 +44,78 @@ def _on_video_upload(messages, video):
     messages.append({"role": "user", "content": {"path": video}})
     return messages, None
 
-def _on_image_upload(messages, image):
-    if image is not None:
-        # messages.append({"role": "user", "content": gr.Image(image)})
-        messages.append({"role": "user", "content": {"path": image}})
-    return messages, None
-
-def _on_text_submit(messages, text):
-    messages.append({"role": "user", "content": text})
-    return messages, ""
-
-@spaces.GPU(duration=120)
-def _predict(messages, input_text, do_sample, temperature, top_p, max_new_tokens,
-             fps, max_frames):
-    if len(input_text) > 0:
-        messages.append({"role": "user", "content": input_text})
-    new_messages = []
-    contents = []
-    for message in messages:
-        if message["role"] == "assistant":
-            if len(contents):
-                new_messages.append({"role": "user", "content": contents})
-                contents = []
-            new_messages.append(message)
-        elif message["role"] == "user":
-            if isinstance(message["content"], str):
-                contents.append(message["content"])
-            else:
-                media_path = message["content"][0]
-                if media_path.endswith(video_formats):
-                    contents.append({"type": "video", "video": {"video_path": media_path, "fps": fps, "max_frames": max_frames}})
-                elif media_path.endswith(image_formats):
-                    contents.append({"type": "image", "image": {"image_path": media_path}})
-                else:
-                    raise ValueError(f"Unsupported media type: {media_path}")
-
-    if len(contents):
-        new_messages.append({"role": "user", "content": contents})
-
-    if len(new_messages) == 0 or new_messages[-1]["role"] != "user":
-        return messages
-
-    generation_config = {
-        "do_sample": do_sample,
-        "temperature": temperature,
-        "top_p": top_p,
-        "max_new_tokens": max_new_tokens
-    }
-
-    inputs = processor(
-        conversation=new_messages,
-        add_system_prompt=True,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    )
-    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
-    if "pixel_values" in inputs:
-        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
-
-    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        **generation_config,
-        "streamer": streamer,
-    }
-
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    messages.append({"role": "assistant", "content": ""})
-    for token in streamer:
-        messages[-1]['content'] += token
-        yield messages
+def _on_image_upload(messages, image):
+    if image is not None:
+        # messages.append({"role": "user", "content": gr.Image(image)})
+        messages.append({"role": "user", "content": {"path": image}})
+    return messages, None
+
+def _on_text_submit(messages, text):
+    messages.append({"role": "user", "content": text})
+    return messages, ""
+
+@spaces.GPU(duration=120)
+def _predict(messages, input_text, do_sample, temperature, top_p, max_new_tokens,
+             fps, max_frames):
+    if len(input_text) > 0:
+        messages.append({"role": "user", "content": input_text})
+    new_messages = []
+    contents = []
+    for message in messages:
+        if message["role"] == "assistant":
+            if len(contents):
+                new_messages.append({"role": "user", "content": contents})
+                contents = []
+            new_messages.append(message)
+        elif message["role"] == "user":
+            if isinstance(message["content"], str):
+                contents.append(message["content"])
+            else:
+                media_path = message["content"][0]
+                if media_path.endswith(video_formats):
+                    contents.append({"type": "video", "video": {"video_path": media_path, "fps": fps, "max_frames": max_frames}})
+                elif media_path.endswith(image_formats):
+                    contents.append({"type": "image", "image": {"image_path": media_path}})
+                else:
+                    raise ValueError(f"Unsupported media type: {media_path}")
+
+    if len(contents):
+        new_messages.append({"role": "user", "content": contents})
+
+    if len(new_messages) == 0 or new_messages[-1]["role"] != "user":
+        return messages
+
+    generation_config = {
+        "do_sample": do_sample,
+        "temperature": temperature,
+        "top_p": top_p,
+        "max_new_tokens": max_new_tokens
+    }
+
+    inputs = processor(
+        conversation=new_messages,
+        add_system_prompt=True,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    )
+    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
+
+    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        **generation_config,
+        "streamer": streamer,
+    }
+
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    messages.append({"role": "assistant", "content": ""})
+    for token in streamer:
+        messages[-1]['content'] += token
+        yield messages
 
 
  with gr.Blocks() as interface:
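
Note: the hunk ends just as the UI context opens, and the handlers reference names defined elsewhere in app.py (video_formats, image_formats, processor, model, device). The sketch below is not part of this commit; it is a minimal guess at how the handlers might be wired inside gr.Blocks, assuming a gr.Chatbot in messages format and hypothetical component names (chatbot, video_input, image_input, text_input, and the sampling controls). Since str.endswith accepts a tuple of suffixes, video_formats and image_formats are presumably tuples of extensions; the values shown are placeholders.

import gradio as gr

# Placeholder extension tuples (assumed): str.endswith() takes a tuple,
# which is how _predict dispatches between video and image inputs.
video_formats = (".mp4", ".avi", ".mov", ".webm")
image_formats = (".jpg", ".jpeg", ".png", ".bmp", ".webp")

# Assumes _on_video_upload, _on_image_upload, _on_text_submit, and _predict
# from app.py (above) are in scope.
with gr.Blocks() as interface:
    chatbot = gr.Chatbot(type="messages")    # list of {"role", "content"} dicts
    video_input = gr.Video()
    image_input = gr.Image(type="filepath")  # handlers store a file path
    text_input = gr.Textbox(placeholder="Ask about the image or video...")

    # Hypothetical generation controls matching _predict's signature.
    do_sample = gr.Checkbox(value=True, label="do_sample")
    temperature = gr.Slider(0.0, 2.0, value=0.7, label="temperature")
    top_p = gr.Slider(0.0, 1.0, value=0.9, label="top_p")
    max_new_tokens = gr.Slider(1, 2048, value=512, step=1, label="max_new_tokens")
    fps = gr.Slider(1, 10, value=1, step=1, label="fps")
    max_frames = gr.Slider(1, 256, value=128, step=1, label="max_frames")

    # Uploads append a {"path": ...} user message and clear the widget
    # (the handlers return None as the second output).
    video_input.upload(_on_video_upload, [chatbot, video_input], [chatbot, video_input])
    image_input.upload(_on_image_upload, [chatbot, image_input], [chatbot, image_input])

    # _predict is a generator: each `yield messages` streams the partial
    # assistant reply into the Chatbot as tokens arrive from the streamer.
    text_input.submit(_on_text_submit, [chatbot, text_input], [chatbot, text_input]).then(
        _predict,
        [chatbot, text_input, do_sample, temperature, top_p, max_new_tokens, fps, max_frames],
        [chatbot],
    )

interface.launch()

Chaining _predict with .then() would mean it runs after _on_text_submit has already appended the text and cleared the box, so _predict receives an empty input_text, which would explain its `if len(input_text) > 0` guard.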