"""Generate speech from text with the XTTS v2 model and a denoised speaker sample."""

import json
import os
import re
import shutil
import string
import subprocess
from glob import glob

import noisereduce as nr
import pyloudnorm as pyln
import soundfile as sf
import torch
import torchaudio
from TTS.api import TTS

# from moviepy.editor import *

# Translation table: deletes ASCII punctuation and '\r', maps '\n' to a space.
_PUNCTUATION_TABLE = str.maketrans("\n", " ", string.punctuation + "\r")

# File locations used by the generation pipeline (relative to the working dir).
_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
_MODEL_LIST_PATH = 'models.txt'
_SPEAKER_SOURCE_PATH = "./tmp/audio/input_src/0.wav"
_SPEAKER_CLEAN_PATH = './tmp/audio/speaker_wav.wav'
_OUTPUT_PATH = "./tmp/audio/generated-custom.wav"


def remove_punctuation(sentence):
    """Return *sentence* without ASCII punctuation or line breaks.

    Every character in ``string.punctuation`` and every ``'\\r'`` is removed;
    every ``'\\n'`` is replaced by a single space.
    """
    return sentence.translate(_PUNCTUATION_TABLE)


def _spell_out_acronyms(text):
    """Return *text* with all-uppercase tokens split into spaced letters.

    A token qualifies when, after punctuation removal, it has at least two
    characters and ``str.isupper()`` is true (e.g. ``"USA"`` -> ``"U S A"``).
    Presumably this makes the synthesizer read acronyms letter by letter --
    TODO confirm intent.
    """
    tokens = remove_punctuation(text).split()
    acronyms = {token for token in tokens if len(token) >= 2 and token.isupper()}
    if not acronyms:
        return text

    # Longest first so a longer acronym wins over one that is its prefix.
    ordered = sorted(acronyms, key=lambda token: (-len(token), token))
    alternatives = "|".join(re.escape(token) for token in ordered)

    # Match whole tokens only: an acronym must not be glued to another letter
    # or digit. A plain str.replace() would also rewrite "US" inside "USA"
    # (giving "U SA") and leave the longer acronym unexpanded.
    pattern = re.compile(rf"(?<![^\W_])(?:{alternatives})(?![^\W_])")
    return pattern.sub(lambda match: " ".join(match.group(0)), text)


def run_audio_generation_v1(new_text, accent='None'):
    """Synthesize *new_text* to ``./tmp/audio/generated-custom.wav``.

    Steps:
      1. Flatten line breaks in the text and spell out uppercase acronyms.
      2. Write the list of available TTS models to ``models.txt``.
      3. Noise-reduce ``./tmp/audio/input_src/0.wav`` and store it as 16-bit
         PCM in ``./tmp/audio/speaker_wav.wav``.
      4. Run XTTS v2 with that file as ``speaker_wav`` and language ``"en"``.

    Args:
        new_text: Text to synthesize.
        accent: Currently unused; kept for interface compatibility.

    Returns:
        None. The result is written to disk.
    """
    new_text = new_text.replace('\n', ' ').replace('\r', '')
    new_text = _spell_out_acronyms(new_text)

    # Dump the available model names, one per line.
    models = TTS().list_models()
    with open(_MODEL_LIST_PATH, 'w') as f:
        f.writelines(f"{model}\n" for model in models)

    # Use the GPU whenever CUDA is available (expected on the server).
    gpu = torch.cuda.is_available()
    tts = TTS(_MODEL_NAME, gpu=gpu)

    # Pre-process the speaker sample: remove noise and store as 16-bit PCM.
    speaker_wav_data, speaker_wav_rate = sf.read(_SPEAKER_SOURCE_PATH)
    speaker_wav_data_no_noise = nr.reduce_noise(
        y=speaker_wav_data, sr=speaker_wav_rate
    )
    sf.write(
        _SPEAKER_CLEAN_PATH,
        speaker_wav_data_no_noise,
        speaker_wav_rate,
        subtype='PCM_16',
    )

    tts.tts_to_file(
        new_text,
        speaker_wav=_SPEAKER_CLEAN_PATH,
        language="en",
        file_path=_OUTPUT_PATH,
    )