Spaces:

prithivMLmods
/

QwQ-Edge

Running on Zero

App Files Files Community

prithivMLmods committed on 17 days ago

Commit

26f7b76

verified ·

1 Parent(s): 3a6718d

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -163

app.py CHANGED Viewed

@@ -6,34 +6,10 @@ import torch
 import edge_tts
 import asyncio
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from transformers.image_utils import load_image
 import time
-from gradio_client import Client  # For image generation API
-DESCRIPTION = """
-# QwQ Edge 💬
-"""
-css = '''
-h1 {
-  text-align: center;
-  display: block;
-}
-#duplicate-button {
-  margin: auto;
-  color: #fff;
-  background: #1565c0;
-  border-radius: 100vh;
-}
-'''
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load text-only model and tokenizer
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
@@ -45,11 +21,6 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
-TTS_VOICES = [
-    "en-US-JennyNeural",  # @tts1
-    "en-US-GuyNeural",    # @tts2
-]
 # Load multimodal (OCR) model and processor
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -59,8 +30,19 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# Image generation client
-image_gen_client = Client("prithivMLmods/STABLE-HAMSTER")
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3"""
@@ -68,155 +50,86 @@ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     await communicate.save(output_file)
     return output_file
-def image_gen(prompt: str):
-    """Generate an image using the Stable Hamster API"""
-    result = image_gen_client.predict("Image Generation", None, prompt, api_name="/stable_hamster")
-    return result[1]  # Return the generated image
 def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
-    """
-    cleaned = []
-    for msg in chat_history:
-        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
-            cleaned.append(msg)
-    return cleaned
 @spaces.GPU
-def generate(
-    input_dict: dict,
-    chat_history: list[dict],
-    max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-):
-    """
-    Generates chatbot responses with support for multimodal input, TTS, and image generation.
-    If the query starts with an @tts or @image command, previous chat history is cleared.
-    """
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # Process image files if provided
-    if len(files) > 1:
-        images = [load_image(image) for image in files]
-    elif len(files) == 1:
-        images = [load_image(files[0])]
-    else:
-        images = []
-    # Check for TTS or Image Generation commands
-    tts_prefix = "@tts"
-    image_prefix = "@image"
-    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
-    is_image = text.strip().lower().startswith(image_prefix)
-    if is_tts:
-        voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
-        voice = TTS_VOICES[voice_index - 1] if voice_index else None
-        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        # Clear any previous chat history to avoid concatenation issues
-        conversation = [{"role": "user", "content": text}]
-    elif is_image:
-        text = text.replace(image_prefix, "").strip()
-        conversation = [{"role": "user", "content": text}]
     else:
-        voice = None
-        text = text.replace(tts_prefix, "").strip()
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
-    if is_image:
-        # Image generation branch
-        yield "Generating image, please wait..."
-        try:
-            image = image_gen(text)
-            yield gr.Image(image)
-        except Exception as e:
-            yield f"Failed to generate image: {str(e)}"
-    elif images:
-        # Multimodal branch using the OCR model
-        messages = [{
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield "Thinking..."
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-    else:
-        # Text-only branch using the text model
-        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "top_p": top_p,
-            "top_k": top_k,
-            "temperature": temperature,
-            "num_beams": 1,
-            "repetition_penalty": repetition_penalty,
-        }
-        t = Thread(target=model.generate, kwargs=generation_kwargs)
-        t.start()
-        outputs = []
-        for new_text in streamer:
-            outputs.append(new_text)
-            yield "".join(outputs)
-        final_response = "".join(outputs)
-        yield final_response
-        if is_tts and voice:
-            output_file = asyncio.run(text_to_speech(final_response, voice))
-            yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
-        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
         gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        ["@tts1 Who is Nikola Tesla, and why did he die?"],
-        ["@image A futuristic cityscape at sunset"],
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
-        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
-        ["A train travels 60 kilometers per hour. If it travels for 5 hours, how far will it travel in total?"],
-        ["Write a Python function to check if a number is prime."],
-        ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
-    type="messages",
-    description=DESCRIPTION,
-    css=css,
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
     stop_btn="Stop Generation",

 import edge_tts
 import asyncio
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from transformers.image_utils import load_image
+from huggingface_hub import InferenceClient
 import time
 # Load text-only model and tokenizer
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 )
 model.eval()
 # Load multimodal (OCR) model and processor
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
     torch_dtype=torch.float16
 ).to("cuda").eval()
+TTS_VOICES = [
+    "en-US-JennyNeural",  # voice for the "@tts1" query prefix (list index 0)
+    "en-US-GuyNeural",    # voice for the "@tts2" query prefix (list index 1)
+]
def image_gen(prompt):
    """Generate an image for ``prompt`` through ``InferenceClient.text_to_image``.

    The "prithivMLmods/STABLE-HAMSTER" endpoint is tried first; if that call
    fails with an ordinary exception, the request is retried once against
    "black-forest-labs/FLUX.1-schnell".

    Args:
        prompt: Text description of the image to generate.

    Returns:
        Whatever ``InferenceClient.text_to_image`` returns for the endpoint
        that succeeded.

    Raises:
        Any exception raised by the fallback endpoint propagates to the caller.
    """
    try:
        # NOTE(review): confirm this repo id is servable through InferenceClient;
        # if it is not, every call silently takes the fallback path below.
        return InferenceClient("prithivMLmods/STABLE-HAMSTER").text_to_image(prompt)
    except Exception:
        # Deliberate best-effort fallback. `except Exception` (not a bare
        # `except:`) so KeyboardInterrupt / SystemExit are never swallowed.
        return InferenceClient("black-forest-labs/FLUX.1-schnell").text_to_image(prompt)
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3"""
     await communicate.save(output_file)
     return output_file
 def clean_chat_history(chat_history):
     """Return only the history entries that can be fed to a text chat template.

     An entry is kept when it is a ``dict`` whose ``"content"`` value is a
     ``str``; anything else (non-dict entries, entries with missing or
     non-string content) is dropped.

     Args:
         chat_history: Iterable of history entries, or ``None``.

     Returns:
         A new list with the kept entries in their original order. An empty or
         ``None`` history yields ``[]``.
     """
     # Guard against a missing history so callers can concatenate the result
     # unconditionally.
     if not chat_history:
         return []
     return [
         msg
         for msg in chat_history
         if isinstance(msg, dict) and isinstance(msg.get("content"), str)
     ]
 @spaces.GPU
def generate(input_dict: dict, chat_history: list[dict], max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2):
    """Stream a chatbot reply for one user turn.

    The query text selects one of three paths:

    * ``@image <prompt>`` -- calls ``image_gen`` and yields a ``gr.Image``.
    * ``@tts1`` / ``@tts2 <question>`` -- answers the question with the text
      model (ignoring earlier turns) and then yields a ``gr.Audio`` spoken
      with the matching entry of ``TTS_VOICES``.
    * anything else -- answers with the multimodal model when image files are
      attached, otherwise with the text model using the cleaned chat history.

    Args:
        input_dict: Mapping with the query under ``"text"`` and optional
            image files under ``"files"``.
        chat_history: Previous turns; filtered through ``clean_chat_history``.
        max_new_tokens: Generation length cap passed to both models.
        temperature, top_p, top_k, repetition_penalty: Sampling settings
            passed to the text model only.

    Yields:
        Progressively longer reply strings while tokens stream in, and, where
        applicable, a final ``gr.Image`` or ``gr.Audio`` component.
    """
    text = input_dict["text"]
    files = input_dict.get("files", [])
    images = [load_image(file) for file in files] if files else []

    # Bound up front so the text branch can test it on every path; leaving it
    # unset raised UnboundLocalError for ordinary (non-@tts) queries.
    voice = None
    command = text.strip()

    if command.startswith("@image"):
        # Strip only the leading command token, not later "@image" mentions.
        query = command.removeprefix("@image").strip()
        yield "Generating Image, Please wait..."
        try:
            image = image_gen(query)
        except Exception as exc:
            # Surface the failure in the chat instead of aborting the stream.
            yield f"Failed to generate image: {exc}"
            return
        yield gr.Image(image)
        return

    if command.startswith("@tts"):
        voice_index = next(
            (i for i in range(1, len(TTS_VOICES) + 1) if command.startswith(f"@tts{i}")),
            None,
        )
        if voice_index:
            voice = TTS_VOICES[voice_index - 1]
            text = command.removeprefix(f"@tts{voice_index}").strip()
        else:
            # "@tts" without a known voice number: answer as plain text.
            text = command.removeprefix("@tts").strip()
        # TTS queries start a fresh conversation (no earlier turns).
        conversation = [{"role": "user", "content": text}]
    else:
        conversation = clean_chat_history(chat_history) + [{"role": "user", "content": text}]

    if images:
        # Multimodal branch: only the current turn (images + text) is sent.
        messages = [{
            "role": "user",
            "content": [
                *[{"type": "image", "image": img} for img in images],
                {"type": "text", "text": text},
            ]
        }]
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        thread = Thread(target=model_m.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens})
        thread.start()
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            # Clean the accumulated text so an end marker that arrives split
            # across two chunks is still removed.
            buffer = buffer.replace("<|im_end|>", "")
            yield buffer
        return

    # Text-only branch.
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(target=model.generate, kwargs={
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
    })
    thread.start()
    response = ""
    for new_text in streamer:
        # Yield the partial reply as it grows so the UI actually streams.
        response += new_text
        yield response
    # Always emit the final text once, even if the stream produced nothing.
    yield response
    if voice:
        output_file = asyncio.run(text_to_speech(response, voice))
        yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
+        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=1024),
         gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
+        ["@tts1 Who is Nikola Tesla?"],
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
+        ["@image futuristic city at sunset"],
+        ["A train travels 60 kilometers per hour. How far will it travel in 5 hours?"],
     ],
     cache_examples=False,
+    description="# QwQ Edge 💬",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
     stop_btn="Stop Generation",