Spaces:

prithivMLmods
/

QwQ-Edge

Running on Zero

App Files Files Community

QwQ-Edge / app.py

prithivMLmods

Update app.py

37efd95 verified 17 days ago

raw

history blame

5.15 kB

	import os
	from collections.abc import Iterator
	from threading import Thread
	import gradio as gr
	import spaces
	import torch
	import edge_tts
	import asyncio
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

	# Upper bound and default for the "Max new tokens" slider.
	MAX_MAX_NEW_TOKENS = 2048
	DEFAULT_MAX_NEW_TOKENS = 1024
	# Prompt budget in tokens; can be overridden through the environment.
	MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

	# Prefer the first CUDA device when one is visible, otherwise use the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda:0")
	else:
	    device = torch.device("cpu")

	# Tokenizer and causal LM are loaded once at import time and shared by
	# every request.
	model_id = "prithivMLmods/FastThink-0.5B-Tiny"
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
	# Switch to inference mode.
	model.eval()


	# Markdown text handed to gr.ChatInterface as its ``description``.
	DESCRIPTION = """
	# QwQ Edge 💬
	"""

	# Custom stylesheet handed to gr.ChatInterface as its ``css``: centers h1
	# headings and styles the element whose id is ``duplicate-button``.
	css = '''
	h1 {
	text-align: center;
	display: block;
	}

	#duplicate-button {
	margin: auto;
	color: #fff;
	background: #1565c0;
	border-radius: 100vh;
	}
	'''

	# Edge TTS voice names, in the order addressed by the "@ttsN" chat tag:
	# @tts1 is the first entry (Jenny) and @tts7 is the last (Tony).
	voices = [
	    f"en-US-{speaker}Neural"
	    for speaker in ("Jenny", "Guy", "Aria", "Jane", "Jason", "Nancy", "Tony")
	]


	async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
	    """Synthesize ``text`` with Edge TTS and write the audio to ``output_file``.

	    Args:
	        text: The text to speak.
	        voice: Edge TTS voice name, e.g. one of the entries in ``voices``.
	        output_file: Destination path for the audio; defaults to
	            ``output.mp3`` in the current working directory.

	    Returns:
	        The ``output_file`` path, once the audio has been saved.
	    """
	    synthesizer = edge_tts.Communicate(text, voice)
	    await synthesizer.save(output_file)
	    return output_file


	# Chat prefix that switches a request to spoken output, e.g. "@tts3 hello".
	_TTS_TAG = "@tts"


	def _parse_tts_request(message: str) -> "tuple[int | None, str]":
	    """Split an optional leading ``@tts`` / ``@ttsN`` tag off ``message``.

	    Returns ``(None, message)`` with the message untouched when it does not
	    start with the tag (case-insensitive, ignoring surrounding whitespace).
	    Otherwise returns ``(voice_index, remaining_text)`` where ``voice_index``
	    is a valid 0-based index into ``voices``: a tag without a number selects
	    the first voice, and a malformed or out-of-range number falls back to the
	    first voice after showing a Gradio warning.
	    """
	    stripped = message.strip()
	    if stripped[: len(_TTS_TAG)].lower() != _TTS_TAG:
	        return None, message

	    # Only the first whitespace-delimited token is the tag; everything after
	    # it is the prompt. Slicing by position (rather than str.replace) keeps
	    # later occurrences of the tag in the prompt and works for any letter case.
	    tag, *rest = stripped.split(maxsplit=1)
	    remainder = rest[0] if rest else ""

	    # The voice number starts right after the 4-character "@tts" tag.
	    number = tag[len(_TTS_TAG):]
	    if not number:
	        return 0, remainder

	    try:
	        voice_index = int(number) - 1  # user-facing numbers are 1-based
	    except ValueError:
	        voice_index = -1  # force the out-of-range fallback below
	    if not 0 <= voice_index < len(voices):
	        gr.Warning("Invalid TTS voice index. Using default voice.")
	        voice_index = 0
	    return voice_index, remainder


	@spaces.GPU
	def generate(
	    message: str,
	    chat_history: list[dict],
	    max_new_tokens: int = 1024,
	    temperature: float = 0.6,
	    top_p: float = 0.9,
	    top_k: int = 50,
	    repetition_penalty: float = 1.2,
	) -> Iterator:
	    """Stream a chat reply and, for ``@tts`` requests, speak the final answer.

	    A message starting with ``@tts`` or ``@ttsN`` (N being a 1-based position
	    in ``voices``) is answered as text while streaming and then converted to
	    audio with the selected voice.

	    Args:
	        message: The latest user message, optionally prefixed with a TTS tag.
	        chat_history: Earlier turns as ``{"role": ..., "content": ...}`` dicts.
	        max_new_tokens: Maximum number of tokens to generate.
	        temperature: Sampling temperature.
	        top_p: Nucleus-sampling probability mass.
	        top_k: Number of highest-probability tokens kept for sampling.
	        repetition_penalty: Penalty applied to repeated tokens.

	    Yields:
	        The reply text accumulated so far after every streamed chunk, then
	        either a ``gr.Audio`` component (TTS requests) or the complete reply
	        text once more (plain requests).
	    """
	    tts_index, message = _parse_tts_request(message)
	    is_tts = tts_index is not None

	    # NOTE(review): turns whose content is not plain text (presumably earlier
	    # audio replies kept in the history) are skipped so that only string
	    # content reaches the chat template -- confirm against the Gradio
	    # "messages" history format.
	    text_turns = [turn for turn in chat_history if isinstance(turn.get("content"), str)]
	    conversation = [*text_turns, {"role": "user", "content": message}]

	    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
	    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
	        # Keep the most recent tokens so the newest turns survive trimming.
	        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
	        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
	    input_ids = input_ids.to(model.device)

	    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
	    generate_kwargs = dict(
	        {"input_ids": input_ids},
	        streamer=streamer,
	        max_new_tokens=max_new_tokens,
	        do_sample=True,
	        top_p=top_p,
	        top_k=top_k,
	        temperature=temperature,
	        num_beams=1,
	        repetition_penalty=repetition_penalty,
	    )
	    # Generation runs in a background thread so this generator can forward
	    # text chunks from the streamer while the model is still producing them.
	    t = Thread(target=model.generate, kwargs=generate_kwargs)
	    t.start()

	    outputs = []
	    for text in streamer:
	        outputs.append(text)
	        yield "".join(outputs)

	    final_response = "".join(outputs)

	    if is_tts:
	        voice = voices[tts_index]  # index was validated by _parse_tts_request
	        output_file = asyncio.run(text_to_speech(final_response, voice))
	        yield gr.Audio(output_file, autoplay=True)  # Return playable audio
	    else:
	        yield final_response  # Return text response


	# Extra inputs for the chat UI. Their order matches the parameters of
	# ``generate`` that follow ``message`` and ``chat_history``.
	_generation_controls = [
	    gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
	    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
	    gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
	    gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
	    gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
	]

	# Sample prompts; each one is wrapped in a single-element list below.
	_example_prompts = [
	    [prompt]
	    for prompt in (
	        "@tts7 Who is Nikola Tesla, and why did he die?",
	        "A train travels 60 kilometers per hour. If it travels for 5 hours, how far will it travel in total?",
	        "Write a Python function to check if a number is prime.",
	        "@tts6 What causes rainbows to form?",
	        "Rewrite the following sentence in passive voice: 'The dog chased the cat.'",
	        "@tts4 What is the capital of France?",
	    )
	]

	# Chat UI wired to ``generate``; history is exchanged in "messages" format.
	demo = gr.ChatInterface(
	    fn=generate,
	    additional_inputs=_generation_controls,
	    stop_btn=None,
	    examples=_example_prompts,
	    cache_examples=False,
	    type="messages",
	    description=DESCRIPTION,
	    css=css,
	    fill_height=True,
	)

	if __name__ == "__main__":
	    # Enable the request queue with max_size=20, then start the app.
	    queued_app = demo.queue(max_size=20)
	    queued_app.launch()