Spaces:
Runtime error
Runtime error
# ----------------------------------------------------------------------------
# SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329)
# Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM
# Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4
#
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# ----------------------------------------------------------------------------
"""
Modified from https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py
"""
import argparse | |
import numpy as np | |
import sys | |
from g2p_en import G2p | |
from tqdm import tqdm | |
import logging | |
# Configure root logging once at import time: INFO and above, with a
# timestamped, pipe-separated record layout.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def get_parser():
    """Build the command-line parser for the word-to-phone conversion script.

    Options:
        --sil-prob / -s: float, default 0; probability of inserting a
            silence token between two consecutive words.
        --surround: flag; surround every output line with silence tokens.
        --lexicon: required; path to the word -> phones lexicon.
        --strict: flag; keep lines that contain out-of-vocabulary words.
        --input / -i: required; path of the text file to read.
        --output / -o: required; path of the phone file to write.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser(
        description="converts words to phones adding optional silences around in between words"
    )
    parser.add_argument(
        "--sil-prob",
        "-s",
        type=float,
        default=0,
        help="probability of inserting silence between each word",
    )
    parser.add_argument(
        "--surround",
        action="store_true",
        help="if set, surrounds each example with silence",
    )
    parser.add_argument(
        "--lexicon",
        help="lexicon to convert to phones",
        required=True,
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        # The script never raises on OOV words: with this flag the line is
        # kept and OOV words go through G2P; without it the line is skipped.
        help=(
            "if set, lines containing OOV words are kept and the OOV words "
            "are phonemized with G2P instead of the line being skipped "
            "(for train/valid set)"
        ),
    )
    parser.add_argument(
        "--input",
        "-i",
        help="input text file",
        required=True,
    )
    parser.add_argument(
        "--output",
        "-o",
        # Previously a copy-paste of the --input help ("input text file").
        help="output text file",
        required=True,
    )
    return parser
def normalize_phn(phons):
    """
    Map g2p-style phones onto the 39-phone set by dropping the trailing
    digits (e.g. "AH0" -> "AH"); only digits at the end of a phone are removed.
    """
    trailing_digits = '0123456789'
    normalized = []
    for phone in phons:
        normalized.append(phone.rstrip(trailing_digits))
    return normalized
def _load_lexicon(path):
    """Read a lexicon file into a dict mapping each word to its phone list.

    Every line must hold a word followed by at least one whitespace-separated
    phone; a line with fewer than two fields or a repeated word trips an
    assertion. The file is decoded as UTF-8, matching the output encoding.
    """
    wrd_to_phn = {}
    with open(path, "r", encoding="utf-8") as lf:
        for line in lf:
            items = line.rstrip().split()
            assert len(items) > 1, line
            assert items[0] not in wrd_to_phn, items
            wrd_to_phn[items[0]] = items[1:]
    return wrd_to_phn


def main():
    """Convert each line of the input text file into a line of phones.

    Words are upper-cased and looked up in the lexicon. A line containing a
    word missing from the lexicon is skipped unless ``--strict`` is set, in
    which case the missing words are phonemized with G2P and normalized via
    ``normalize_phn``. With ``--sil-prob`` > 0, a ``<SIL>`` token is inserted
    after each non-final word with that probability; with ``--surround``,
    ``<SIL>`` is also added at the start and end of every output line.
    """
    parser = get_parser()
    args = parser.parse_args()

    sil_prob = args.sil_prob
    surround = args.surround
    sil = "<SIL>"

    g2p = G2p()
    wrd_to_phn = _load_lexicon(args.lexicon)

    # Read the input as UTF-8 so it agrees with the UTF-8 output instead of
    # depending on the platform's default encoding.
    with open(args.input, "r", encoding="utf-8") as fin, open(
        args.output, "w", encoding="utf-8"
    ) as fout:
        for line in tqdm(fin):
            words = line.strip().upper().split()

            # Lines with OOV words are dropped unless --strict asks to keep
            # them (their OOV words then go through G2P below).
            has_oov = not all(w in wrd_to_phn for w in words)
            if has_oov and not args.strict:
                continue

            phones = []
            if surround:
                phones.append(sil)

            # One random draw per gap between consecutive words.
            sample_sil_probs = None
            if sil_prob > 0 and len(words) > 1:
                sample_sil_probs = np.random.random(len(words) - 1)

            for i, w in enumerate(words):
                if w in wrd_to_phn:
                    phones.extend(wrd_to_phn[w])
                else:
                    phones.extend(normalize_phn(g2p(w)))
                # The last word has no following gap, hence the bound check.
                if (
                    sample_sil_probs is not None
                    and i < len(sample_sil_probs)
                    and sample_sil_probs[i] < sil_prob
                ):
                    phones.append(sil)

            if surround:
                phones.append(sil)
            print(" ".join(phones), file=fout)
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()