# custom script for kokoro removing phonemizer dependency, which had excessive sub-dependencies; full explanation at the bottom
import re
import torch
import numpy as np
import subprocess
import os


def split_num(num):
    """Turn a matched clock time or 4-digit year into a speakable string.

    Args:
        num: A regex match object; only ``num.group()`` is read.

    Returns:
        The rewritten text:
        * anything containing ``.`` is returned untouched;
        * ``H:MM`` becomes ``"H o'clock"``, ``"H oh M"`` or ``"H MM"``;
        * a 4-digit year (optionally followed by ``s``) is split into two
          2-digit halves, e.g. ``1999`` -> ``19 99``, ``1905`` -> ``19 oh 5``,
          ``1900s`` -> ``19 hundreds``. Years below 1100, or whose last three
          digits are below 10 (e.g. ``2005``), are returned unchanged.
    """
    num = num.group()
    if '.' in num:
        # Decimals are not handled here; pass them through unchanged.
        return num
    if ':' in num:
        h, m = [int(n) for n in num.split(':')]
        if m == 0:
            return f"{h} o'clock"
        if m < 10:
            return f'{h} oh {m}'
        return f'{h} {m}'
    year = int(num[:4])
    if year < 1100 or year % 1000 < 10:
        return num
    left, right = num[:2], int(num[2:4])
    s = 's' if num.endswith('s') else ''
    if 100 <= year % 1000 <= 999:
        if right == 0:
            return f'{left} hundred{s}'
        if right < 10:
            return f'{left} oh {right}{s}'
    return f'{left} {right}{s}'


def flip_money(m):
    """Move the currency word after the amount for a matched ``$``/``£`` sum.

    Args:
        m: A regex match object whose text starts with a currency symbol;
            ``$`` means dollars, any other first character is treated as
            pounds.

    Returns:
        E.g. ``$5 million`` -> ``5 million dollars``, ``$1`` -> ``1 dollar``,
        ``$2.50`` -> ``2 dollars and 50 cents``,
        ``£1.01`` -> ``1 pound and 1 penny``.
    """
    m = m.group()
    bill = 'dollar' if m[0] == '$' else 'pound'
    if m[-1].isalpha():
        # Amount ends in a word (e.g. "million"): always plural.
        return f'{m[1:]} {bill}s'
    if '.' not in m:
        s = '' if m[1:] == '1' else 's'
        return f'{m[1:]} {bill}{s}'
    b, c = m[1:].split('.')
    s = '' if b == '1' else 's'
    # A single fractional digit means tens: "$1.5" is 50 cents, not 5.
    c = int(c.ljust(2, '0'))
    if m[0] == '$':
        coins = f"cent{'' if c == 1 else 's'}"
    else:
        coins = 'penny' if c == 1 else 'pence'
    return f'{b} {bill}{s} and {c} {coins}'


def point_num(num):
    """Spell out a matched decimal as ``<integer part> point <d> <d> ...``.

    Args:
        num: A regex match object whose text contains exactly one ``.``.

    Returns:
        E.g. ``3.14`` -> ``3 point 1 4``. An empty integer part is kept
        empty, so ``.5`` -> `` point 5``.
    """
    a, b = num.group().split('.')
    return ' point '.join([a, ' '.join(b)])


def expand_acronym(m):
    """Rewrite a matched dotted acronym as space-separated ``X.`` letters.

    Args:
        m: A regex match object; ``m.group(0)`` is the acronym text.

    Returns:
        E.g. ``U.S.A.`` -> ``U. S. A.``.
    """
    # NOTE(review): the rest of this file is still whitespace-collapsed, and
    # the text right after this block begins with the old tail of this
    # function ("for letter in letters] return ..."). Remove that leftover
    # fragment when those lines are re-split.
    text = m.group(0).replace('.', '')
    letters = list(text)
    letters_with_periods = [letter + '.' for letter in letters]
    return ' '.join(letters_with_periods)
for letter in letters] return ' '.join(letters_with_periods) def normalize_text(text): text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace('«', chr(8220)).replace('»', chr(8221)) text = text.replace(chr(8220), '"').replace(chr(8221), '"') text = text.replace('(', '«').replace(')', '»') for a, b in zip('、。!,:;?', ',.!,:;?'): text = text.replace(a, b+' ') text = re.sub(r'[^\S \n]', ' ', text) text = re.sub(r' +', ' ', text) text = re.sub(r'(?<=\n) +(?=\n)', '', text) text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text) text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text) text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text) text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text) text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text) text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text) text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(? 510: tokens = tokens[:510] print('Truncated to 510 tokens') ref_s = voicepack[len(tokens)] out = forward(model, tokens, ref_s, speed) ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens) return out, ps def generate_full(model, text, voicepack, lang='a', speed=1, ps=None): ps = ps or phonemize(text, lang) tokens = tokenize(ps) if not tokens: return None outs = [] loop_count = len(tokens)//510 + (1 if len(tokens) % 510 != 0 else 0) for i in range(loop_count): ref_s = voicepack[len(tokens[i*510:(i+1)*510])] out = forward(model, tokens[i*510:(i+1)*510], ref_s, speed) outs.append(out) outs = np.concatenate(outs) ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens) return outs, ps """ THE ORIGINAL SCRIPT relied on "phonemizer" a wrapper around espeak-ng ```python phonemizers = dict( a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True), b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True), ) ``` - language selection (en-us or en-gb) - preserve_punctuation=True (keep punctuation in output) - 
with_stress=True (keep stress markers in phonemes)

This modified script uses espeak-ng directly, providing the appropriate flags for identical functionality:

```python
def direct_espeak(text, lang='en-us'):
    espeak_path = r"C:\Program Files\eSpeak NG\espeak-ng.exe"
    cmd = [espeak_path, '-q', '--ipa', '-v', lang, text]
    result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8', shell=True)
```

- `-q`: Quiet mode (no audio output)
- `--ipa`: Output in International Phonetic Alphabet notation
- `-v lang`: Language selection (en-us or en-gb)

THE ORIGINAL SCRIPT set certain environment variables:

```python
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
os.environ["PHONEMIZER_ESPEAK_PATH"] = r"C:\Program Files\eSpeak NG\espeak-ng.exe"
```

- PHONEMIZER_ESPEAK_LIBRARY pointed to the DLL file that phonemizer used to interface with espeak
- PHONEMIZER_ESPEAK_PATH pointed to the espeak executable that phonemizer would call

This modified script does not set them, because it no longer uses the phonemizer library, which is what required them. Instead:

1. This script explicitly sets the path to the espeak-ng binary.
2. When distributing the script, it will be necessary either to bundle the binary and copy it to the same directory in which kokoro.py is located (in which case amending the PATH is not necessary) or to add functionality to search for and locate espeak-ng.

espeak-ng requires three files to function:

- espeak-ng-data -- the data directory (all the languages and related data)
- espeak-ng.exe -- the binary, which links to the .dll
- libespeak-ng.dll
"""