Spaces:

prithivMLmods
/

QwQ-Edge

Running on Zero

prithivMLmods committed on Jan 22

Commit

f8af0ad

verified ·

1 Parent(s): 1d74de7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -57,19 +57,22 @@ def generate(
     conversation = chat_history.copy()
     conversation.append({"role": "user", "content": message})
-    # Apply chat template and get input_ids and attention_mask
-    inputs = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-    input_ids = inputs["input_ids"]
-    attention_mask = inputs["attention_mask"]
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         attention_mask = attention_mask[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
     attention_mask = attention_mask.to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
@@ -87,6 +90,7 @@ def generate(
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
     outputs = []
     for text in streamer:
         outputs.append(text)
@@ -148,4 +152,4 @@ demo = gr.ChatInterface(
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch()

     conversation = chat_history.copy()
     conversation.append({"role": "user", "content": message})
+    # Apply chat template and get input_ids
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+    # Create attention mask
+    attention_mask = torch.ones_like(input_ids)
+    # Trim input if it exceeds the maximum token length
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         attention_mask = attention_mask[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
     attention_mask = attention_mask.to(model.device)
+    # Set up the streamer for real-time text generation
     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
+    # Stream the output tokens
     outputs = []
     for text in streamer:
         outputs.append(text)
 if __name__ == "__main__":
+    demo.queue(max_size=20).launch(share=True)  # Set `share=True` for a public link