|
from __future__ import annotations |
|
import os |
|
|
|
os.system('python -m unidic download') |
|
|
|
|
|
|
|
os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11') |
|
|
|
|
|
os.environ["COQUI_TOS_AGREED"] = "1" |
|
|
|
|
|
|
|
|
|
import textwrap |
|
from scipy.io.wavfile import write |
|
from pydub import AudioSegment |
|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
import nltk |
|
nltk.download("punkt") |
|
|
|
import noisereduce as nr |
|
import subprocess |
|
import langid |
|
import uuid |
|
import emoji |
|
import pathlib |
|
|
|
import datetime |
|
|
|
from scipy.io.wavfile import write |
|
from pydub import AudioSegment |
|
|
|
import re |
|
import io, wave |
|
import librosa |
|
import torchaudio |
|
from TTS.api import TTS |
|
from TTS.tts.configs.xtts_config import XttsConfig |
|
from TTS.tts.models.xtts import Xtts |
|
from TTS.utils.generic_utils import get_user_data_dir |
|
|
|
|
|
import gradio as gr |
|
import os |
|
import time |
|
|
|
import gradio as gr |
|
from transformers import pipeline |
|
import numpy as np |
|
|
|
from gradio_client import Client |
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
print("Downloading if not downloaded Coqui XTTS V2") |
|
|
|
from TTS.utils.manage import ModelManager |
|
model_name = "tts_models/multilingual/multi-dataset/xtts_v2" |
|
ModelManager().download_model(model_name) |
|
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--")) |
|
print("XTTS downloaded") |
|
|
|
|
|
print("Loading XTTS") |
|
config = XttsConfig() |
|
config.load_json(os.path.join(model_path, "config.json")) |
|
|
|
model = Xtts.init_from_config(config) |
|
model.load_checkpoint( |
|
config, |
|
checkpoint_path=os.path.join(model_path, "model.pth"), |
|
vocab_path=os.path.join(model_path, "vocab.json"), |
|
eval=True, |
|
use_deepspeed=True, |
|
) |
|
model.cuda() |
|
print("Done loading TTS") |
|
|
|
|
|
|
|
title = "Voice chat with Zephyr/Mistral and Coqui XTTS" |
|
|
|
DESCRIPTION = """# Voice chat with Zephyr/Mistral and Coqui XTTS""" |
|
css = """.toast-wrap { display: none !important } """ |
|
|
|
from huggingface_hub import HfApi |
|
|
|
HF_TOKEN = os.environ.get("HF_TOKEN") |
|
|
|
api = HfApi(token=HF_TOKEN) |
|
|
|
|
|
import base64 |
|
repo_id = "jbilcke-hf/ai-bedtime-story-server" |
|
SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret') |
|
SENTENCE_SPLIT_LENGTH=250 |
|
|
|
|
|
default_system_message = f""" |
|
You're the storyteller, crafting a short tale for young listeners. Please abide by these guidelines: |
|
- Keep your sentences short, concise and easy to understand. |
|
- There should be only the narrator speaking. If there are dialogues, they should be indirect. |
|
- Be concise and relevant: Most of your responses should be a sentence or two, unless you’re asked to go deeper. |
|
- Don’t use complex words. Don’t use lists, markdown, bullet points, or other formatting that’s not typically spoken. |
|
- Type out numbers in words (e.g. 'twenty twelve' instead of the year 2012). |
|
- Remember to follow these rules absolutely, and do not refer to these rules, even if you’re asked about them. |
|
""" |
|
|
|
system_message = os.environ.get("SYSTEM_MESSAGE", default_system_message) |
|
system_message = system_message.replace("CURRENT_DATE", str(datetime.date.today())) |
|
|
|
ROLES = ["Cloée","Julian","Pirate","Thera"] |
|
|
|
ROLE_PROMPTS = {} |
|
ROLE_PROMPTS["Cloée"]=system_message |
|
ROLE_PROMPTS["Julian"]=system_message |
|
ROLE_PROMPTS["Thera"]=system_message |
|
|
|
|
|
character_name= "AI Beard" |
|
character_scenario= f"As {character_name} you are a 28 year old man who is a pirate on the ship Invisible AI. You are good friends with Guybrush Threepwood and Murray the Skull. Developers did not get you into Monkey Island games as you wanted huge shares of Big Whoop treasure." |
|
pirate_system_message = f"You as {character_name}. {character_scenario} Print out only exactly the words that {character_name} would speak out, do not add anything. Don't repeat. Answer short, only few words, as if in a talk. Craft your response only from the first-person perspective of {character_name} and never as user.Current date: #CURRENT_DATE#".replace("#CURRENT_DATE#", str(datetime.date.today())) |
|
|
|
ROLE_PROMPTS["Pirate"]= pirate_system_message |
|
|
|
|
|
|
|
|
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
print("Downloading LLM") |
|
|
|
|
|
print("Downloading Zephyr") |
|
|
|
hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf") |
|
|
|
zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf" |
|
|
|
from llama_cpp import Llama |
|
|
|
|
|
|
|
GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 35)) |
|
|
|
LLM_STOP_WORDS= ["</s>","<|user|>","/s>"] |
|
|
|
LLAMA_VERBOSE=False |
|
|
|
print("Running LLM Zephyr") |
|
llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-10,max_new_tokens=512, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE) |
|
|
|
def split_sentences(text, max_len): |
|
|
|
text = re.sub(r"(\s*\.{2})\s*", r".\1 ", text) |
|
text = re.sub(r"(\s*\!{2})\s*", r"!\1 ", text) |
|
|
|
|
|
sentences = nltk.sent_tokenize(text) |
|
|
|
|
|
sentence_list = [] |
|
for sent in sentences: |
|
if len(sent) > max_len: |
|
wrapped = textwrap.wrap(sent, max_len, break_long_words=True) |
|
sentence_list.extend(wrapped) |
|
else: |
|
sentence_list.append(sent) |
|
|
|
return sentence_list |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def format_prompt_zephyr(message, history, system_message=system_message): |
|
prompt = ( |
|
"<|system|>\n" + system_message + "</s>" |
|
) |
|
for user_prompt, bot_response in history: |
|
prompt += f"<|user|>\n{user_prompt}</s>" |
|
prompt += f"<|assistant|>\n{bot_response}</s>" |
|
if message=="": |
|
message="Hello" |
|
prompt += f"<|user|>\n{message}</s>" |
|
prompt += f"<|assistant|>" |
|
print(prompt) |
|
return prompt |
|
|
|
import struct |
|
|
|
|
|
def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16): |
|
|
|
if pcm_data.startswith(b"RIFF"): |
|
return pcm_data |
|
|
|
|
|
fmt_subchunk_size = 16 |
|
data_subchunk_size = len(pcm_data) |
|
chunk_size = 4 + (8 + fmt_subchunk_size) + (8 + data_subchunk_size) |
|
|
|
|
|
wav_header = struct.pack('<4sI4s', b'RIFF', chunk_size, b'WAVE') |
|
fmt_subchunk = struct.pack('<4sIHHIIHH', |
|
b'fmt ', fmt_subchunk_size, 1, channels, |
|
sample_rate, sample_rate * channels * bit_depth // 8, |
|
channels * bit_depth // 8, bit_depth) |
|
|
|
data_subchunk = struct.pack('<4sI', b'data', data_subchunk_size) |
|
|
|
return wav_header + fmt_subchunk + data_subchunk + pcm_data |
|
|
|
def generate_local( |
|
prompt, |
|
history, |
|
system_message=None, |
|
temperature=0.8, |
|
max_tokens=256, |
|
top_p=0.95, |
|
stop = LLM_STOP_WORDS |
|
): |
|
temperature = float(temperature) |
|
if temperature < 1e-2: |
|
temperature = 1e-2 |
|
top_p = float(top_p) |
|
|
|
generate_kwargs = dict( |
|
temperature=temperature, |
|
max_tokens=max_tokens, |
|
top_p=top_p, |
|
stop=stop |
|
) |
|
|
|
sys_message= system_message.replace("##LLM_MODEL###","Zephyr").replace("##LLM_MODEL_PROVIDER###","Hugging Face") |
|
formatted_prompt = format_prompt_zephyr(prompt, history,system_message=sys_message) |
|
llm = llm_zephyr |
|
|
|
|
|
try: |
|
print("LLM Input:", formatted_prompt) |
|
stream = llm( |
|
formatted_prompt, |
|
**generate_kwargs, |
|
stream=True, |
|
) |
|
output = "" |
|
for response in stream: |
|
character= response["choices"][0]["text"] |
|
|
|
if "<|user|>" in character: |
|
|
|
return |
|
|
|
if emoji.is_emoji(character): |
|
|
|
return |
|
|
|
|
|
output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","") |
|
yield output |
|
|
|
except Exception as e: |
|
if "Too Many Requests" in str(e): |
|
print("ERROR: Too many requests on mistral client") |
|
gr.Warning("Unfortunately Mistral is unable to process") |
|
output = "Unfortunately I am not able to process your request now !" |
|
else: |
|
print("Unhandled Exception: ", str(e)) |
|
gr.Warning("Unfortunately Mistral is unable to process") |
|
output = "I do not know what happened but I could not understand you ." |
|
|
|
return output |
|
|
|
def get_latents(speaker_wav,voice_cleanup=False): |
|
if (voice_cleanup): |
|
try: |
|
cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02" |
|
resample_filter="-ac 1 -ar 22050" |
|
out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" |
|
|
|
shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ") |
|
|
|
command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True) |
|
speaker_wav=out_filename |
|
print("Filtered microphone input") |
|
except subprocess.CalledProcessError: |
|
|
|
print("Error: failed filtering, use original microphone input") |
|
else: |
|
speaker_wav=speaker_wav |
|
|
|
|
|
( |
|
gpt_cond_latent, |
|
speaker_embedding, |
|
) = model.get_conditioning_latents(audio_path=speaker_wav) |
|
return gpt_cond_latent, speaker_embedding |
|
|
|
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000): |
|
|
|
|
|
|
|
wav_buf = io.BytesIO() |
|
with wave.open(wav_buf, "wb") as vfout: |
|
vfout.setnchannels(channels) |
|
vfout.setsampwidth(sample_width) |
|
vfout.setframerate(sample_rate) |
|
vfout.writeframes(frame_input) |
|
|
|
wav_buf.seek(0) |
|
return wav_buf.read() |
|
|
|
|
|
|
|
|
|
|
|
xtts_supported_languages=config.languages |
|
def detect_language(prompt): |
|
|
|
if len(prompt)>15: |
|
language_predicted=langid.classify(prompt)[0].strip() |
|
if language_predicted == "zh": |
|
|
|
language_predicted = "zh-cn" |
|
|
|
if language_predicted not in xtts_supported_languages: |
|
print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now") |
|
gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet' ") |
|
language= "en" |
|
else: |
|
language = language_predicted |
|
print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}") |
|
else: |
|
|
|
language = "en" |
|
print(f"Language: Prompt is short or autodetect language disabled using english for xtts") |
|
|
|
return language |
|
|
|
def get_voice_streaming(prompt, language, latent_tuple, suffix="0"): |
|
gpt_cond_latent, speaker_embedding = latent_tuple |
|
|
|
try: |
|
t0 = time.time() |
|
chunks = model.inference_stream( |
|
prompt, |
|
language, |
|
gpt_cond_latent, |
|
speaker_embedding, |
|
|
|
temperature=0.85, |
|
) |
|
|
|
first_chunk = True |
|
for i, chunk in enumerate(chunks): |
|
if first_chunk: |
|
first_chunk_time = time.time() - t0 |
|
metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n" |
|
first_chunk = False |
|
|
|
|
|
|
|
chunk = chunk.detach().cpu().numpy().squeeze() |
|
chunk = (chunk * 32767).astype(np.int16) |
|
|
|
yield chunk.tobytes() |
|
|
|
except RuntimeError as e: |
|
if "device-side assert" in str(e): |
|
|
|
print( |
|
f"Exit due to: Unrecoverable exception caused by prompt:{prompt}", |
|
flush=True, |
|
) |
|
gr.Warning("Unhandled Exception encounter, please retry in a minute") |
|
print("Cuda device-assert Runtime encountered need restart") |
|
|
|
|
|
api.restart_space(repo_id=repo_id) |
|
else: |
|
print("RuntimeError: non device-side assert error:", str(e)) |
|
|
|
|
|
return None |
|
return None |
|
except: |
|
return None |
|
|
|
|
|
|
|
def add_text(history, text): |
|
history = [] if history is None else history |
|
history = history + [(text, None)] |
|
return history, gr.update(value="", interactive=False) |
|
|
|
|
|
def add_file(history, file): |
|
history = [] if history is None else history |
|
|
|
try: |
|
text = transcribe(file) |
|
print("Transcribed text:", text) |
|
except Exception as e: |
|
print(str(e)) |
|
gr.Warning("There was an issue with transcription, please try writing for now") |
|
|
|
text = "Transcription seems failed, please tell me a joke about chickens" |
|
|
|
history = history + [(text, None)] |
|
return history, gr.update(value="", interactive=False) |
|
|
|
|
|
def get_sentence(history, chatbot_role): |
|
|
|
history = [["", None]] if history is None else history |
|
|
|
history[-1][1] = "" |
|
|
|
sentence_list = [] |
|
sentence_hash_list = [] |
|
|
|
text_to_generate = "" |
|
stored_sentence = None |
|
stored_sentence_hash = None |
|
|
|
print(chatbot_role) |
|
|
|
for character in generate_local(history[-1][0], history[:-1], system_message=ROLE_PROMPTS[chatbot_role]): |
|
history[-1][1] = character.replace("<|assistant|>","") |
|
|
|
|
|
text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip()) |
|
if len(text_to_generate) > 1: |
|
|
|
dif = len(text_to_generate) - len(sentence_list) |
|
|
|
if dif == 1 and len(sentence_list) != 0: |
|
continue |
|
|
|
if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None: |
|
continue |
|
|
|
|
|
if stored_sentence is not None and stored_sentence_hash is None and dif>1: |
|
|
|
sentence = text_to_generate[len(sentence_list)+1] |
|
elif stored_sentence is not None and len(text_to_generate)>2 and stored_sentence_hash is not None: |
|
print("Appending stored") |
|
sentence = stored_sentence + text_to_generate[len(sentence_list)+1] |
|
stored_sentence_hash = None |
|
else: |
|
sentence = text_to_generate[len(sentence_list)] |
|
|
|
|
|
|
|
if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None: |
|
if sentence[-1] in [".","!","?"]: |
|
if stored_sentence_hash != hash(sentence): |
|
stored_sentence = sentence |
|
stored_sentence_hash = hash(sentence) |
|
print("Storing:",stored_sentence) |
|
continue |
|
|
|
|
|
sentence_hash = hash(sentence) |
|
if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash: |
|
continue |
|
|
|
if sentence_hash not in sentence_hash_list: |
|
sentence_hash_list.append(sentence_hash) |
|
sentence_list.append(sentence) |
|
print("New Sentence: ", sentence) |
|
yield (sentence, history) |
|
|
|
|
|
try: |
|
last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1] |
|
sentence_hash = hash(last_sentence) |
|
if sentence_hash not in sentence_hash_list: |
|
if stored_sentence is not None and stored_sentence_hash is not None: |
|
last_sentence = stored_sentence + last_sentence |
|
stored_sentence = stored_sentence_hash = None |
|
print("Last Sentence with stored:",last_sentence) |
|
|
|
sentence_hash_list.append(sentence_hash) |
|
sentence_list.append(last_sentence) |
|
print("Last Sentence: ", last_sentence) |
|
|
|
yield (last_sentence, history) |
|
except: |
|
print("ERROR on last sentence history is :", history) |
|
|
|
|
|
from scipy.io.wavfile import write |
|
from pydub import AudioSegment |
|
|
|
second_of_silence = AudioSegment.silent() |
|
second_of_silence.export("sil.wav", format='wav') |
|
|
|
|
|
def generate_speech_from_history(history, chatbot_role, sentence): |
|
language = "autodetect" |
|
|
|
|
|
|
|
if len(sentence)==0: |
|
print("EMPTY SENTENCE") |
|
return |
|
|
|
|
|
|
|
sentence = sentence.replace("</s>", "") |
|
|
|
sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL) |
|
sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL) |
|
|
|
sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL) |
|
|
|
sentence = sentence.replace("```", "") |
|
sentence = sentence.replace("...", " ") |
|
sentence = sentence.replace("(", " ") |
|
sentence = sentence.replace(")", " ") |
|
sentence = sentence.replace("<|assistant|>","") |
|
|
|
if len(sentence)==0: |
|
print("EMPTY SENTENCE after processing") |
|
return |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sentence = re.sub("([^\x00-\x7F]|\w)([\.。?!]+)",r"\1 \2",sentence) |
|
|
|
print("Sentence for speech:", sentence) |
|
|
|
results = [] |
|
|
|
try: |
|
if len(sentence) < SENTENCE_SPLIT_LENGTH: |
|
|
|
sentence_list = [sentence] |
|
else: |
|
|
|
|
|
|
|
|
|
sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH) |
|
|
|
print("detected sentences:", sentence_list) |
|
|
|
for sentence in sentence_list: |
|
|
|
print("- sentence = ", sentence) |
|
|
|
if any(c.isalnum() for c in sentence): |
|
if language=="autodetect": |
|
|
|
language = detect_language(sentence) |
|
|
|
|
|
audio_stream = get_voice_streaming( |
|
sentence, language, latent_map[chatbot_role] |
|
) |
|
else: |
|
|
|
audio_stream = None |
|
continue |
|
|
|
|
|
|
|
if audio_stream is not None: |
|
sentence_wav_bytestream = b"" |
|
|
|
|
|
for chunk in audio_stream: |
|
try: |
|
if chunk is not None: |
|
sentence_wav_bytestream += chunk |
|
|
|
except: |
|
|
|
continue |
|
|
|
|
|
filter_output=True |
|
if filter_output: |
|
try: |
|
data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0) |
|
float_data = data_s16 * 0.5**15 |
|
reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024) |
|
sentence_wav_bytestream = (reduced_noise * 32767).astype(np.int16) |
|
sentence_wav_bytestream = sentence_wav_bytestream.tobytes() |
|
except: |
|
print("failed to remove noise") |
|
|
|
|
|
base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8') |
|
|
|
results.append({ "text": sentence, "audio": base64_audio }) |
|
else: |
|
|
|
results.append({ "text": sentence, "audio": "" }) |
|
|
|
except RuntimeError as e: |
|
if "device-side assert" in str(e): |
|
|
|
print( |
|
f"Exit due to: Unrecoverable exception caused by prompt:{sentence}", |
|
flush=True, |
|
) |
|
gr.Warning("Unhandled Exception encounter, please retry in a minute") |
|
print("Cuda device-assert Runtime encountered need restart") |
|
|
|
|
|
api.restart_space(repo_id=repo_id) |
|
else: |
|
print("RuntimeError: non device-side assert error:", str(e)) |
|
raise e |
|
|
|
return results |
|
|
|
latent_map = {} |
|
latent_map["Cloée"] = get_latents("voices/cloee-1.wav") |
|
latent_map["Julian"] = get_latents("voices/julian-bedtime-style-1.wav") |
|
latent_map["Pirate"] = get_latents("voices/pirate_by_coqui.wav") |
|
latent_map["Thera"] = get_latents("voices/thera-1.wav") |
|
|
|
|
|
def generate_story_and_speech(secret_token, input_text, chatbot_role): |
|
if secret_token != SECRET_TOKEN: |
|
raise gr.Error( |
|
f'Invalid secret token. Please fork the original space if you want to use it for yourself.') |
|
|
|
|
|
history = [[input_text, None]] |
|
story_sentences = get_sentence(history, chatbot_role) |
|
|
|
story_text = "" |
|
last_history = None |
|
|
|
|
|
for sentence, updated_history in story_sentences: |
|
if sentence: |
|
story_text += sentence.strip() + " " |
|
last_history = updated_history |
|
|
|
if last_history is not None: |
|
|
|
history_tuples = [tuple(entry) for entry in last_history] |
|
|
|
return generate_speech_from_history(history_tuples, chatbot_role, story_text) |
|
|
|
else: |
|
return [] |
|
|
|
|
|
demo = gr.Interface( |
|
fn=generate_story_and_speech, |
|
inputs=[gr.Text(label='Secret Token'),gr.Textbox(placeholder="Enter your text here"), gr.Dropdown(choices=ROLES, label="Select Chatbot Role")], |
|
outputs="json" |
|
) |
|
|
|
demo.queue() |
|
demo.launch(debug=True) |