amupd's picture
SpeechT5 upload
62e9ca6
raw
history blame
3.82 kB
# ----------------------------------------------------------------------------
# SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329)
# Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM
# Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4
#
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# ----------------------------------------------------------------------------
"""
Modified from https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py
"""
import argparse
import numpy as np
import sys
from g2p_en import G2p
from tqdm import tqdm
import logging
# Configure the root logger once at import time: timestamped records of the
# form "<time> | <level> | <logger name> | <message>", emitted at INFO and above.
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the phonemizer script.

    Options:
        --sil-prob / -s: float, default 0; probability of inserting a
            silence token between two consecutive words.
        --surround: flag; put a silence token at the start and end of
            every output line.
        --lexicon: required; path to the word -> phones lexicon file.
        --strict: flag; controls how lines with out-of-vocabulary (OOV)
            words are treated by ``main`` (kept and phonemized with g2p
            when set, skipped otherwise).
        --input / -i: required; path of the text file to read.
        --output / -o: required; path of the phone file to write.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="converts words to phones adding optional silences around in between words"
    )
    parser.add_argument(
        "--sil-prob",
        "-s",
        type=float,
        default=0,
        help="probability of inserting silence between each word",
    )
    parser.add_argument(
        "--surround",
        action="store_true",
        help="if set, surrounds each example with silence",
    )
    parser.add_argument(
        "--lexicon",
        help="lexicon to convert to phones",
        required=True,
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        # The help text describes what main() actually does with this flag;
        # nothing is raised for OOV words.
        help=(
            "if set, lines containing OOV words are kept and the OOV words are "
            "phonemized with g2p; otherwise such lines are skipped"
        ),
    )
    parser.add_argument(
        "--input",
        "-i",
        help="input text file",
        required=True,
    )
    parser.add_argument(
        "--output",
        "-o",
        # Previously read "input text file" (copy-paste slip).
        help="output text file",
        required=True,
    )
    return parser
def normalize_phn(phons):
    """
    Convert g2p-style phones to the 39-phone set by stripping any trailing
    digits from each symbol (e.g. "AH0" -> "AH"); returns a new list.
    """
    trailing_digits = "0123456789"
    normalized = []
    for phone in phons:
        normalized.append(phone.rstrip(trailing_digits))
    return normalized
def _load_lexicon(path):
    """Read a lexicon file (``WORD PHONE PHONE ...`` per line) into a dict of word -> phone list."""
    wrd_to_phn = {}
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding (the output file is written as UTF-8 too).
    with open(path, "r", encoding="utf-8") as lf:
        for line in lf:
            items = line.rstrip().split()
            # Every entry needs a word plus at least one phone, and a word
            # may appear only once in the lexicon.
            assert len(items) > 1, line
            assert items[0] not in wrd_to_phn, items
            wrd_to_phn[items[0]] = items[1:]
    return wrd_to_phn


def _phonemize_words(words, wrd_to_phn, g2p, sil_prob, surround, sil):
    """Turn one line's words into a phone list, inserting silence tokens as configured."""
    phones = []
    if surround:
        phones.append(sil)

    # One random draw per gap between consecutive words; drawn up front so
    # the sequence of random numbers consumed per line stays the same.
    sample_sil_probs = None
    if sil_prob > 0 and len(words) > 1:
        sample_sil_probs = np.random.random(len(words) - 1)

    for i, w in enumerate(words):
        if w in wrd_to_phn:
            phones.extend(wrd_to_phn[w])
        else:
            # OOV word: fall back to the g2p model and strip trailing digits.
            phones.extend(normalize_phn(g2p(w)))
        # `i < len(sample_sil_probs)` excludes the last word, so a sampled
        # silence is only ever placed between two words.
        if (
            sample_sil_probs is not None
            and i < len(sample_sil_probs)
            and sample_sil_probs[i] < sil_prob
        ):
            phones.append(sil)

    if surround:
        phones.append(sil)
    return phones


def main():
    """Phonemize a text file using a lexicon, optionally inserting silences.

    Reads the command-line arguments from ``get_parser()``, loads the
    lexicon, then converts each input line (upper-cased and split on
    whitespace) into a space-separated phone sequence written to the
    output file, one line per input line.

    Lines containing words missing from the lexicon are skipped unless
    ``--strict`` is given, in which case those words are phonemized with
    ``G2p``. The ``<SIL>`` token is inserted between words with probability
    ``--sil-prob`` and around each line when ``--surround`` is set.

    Raises:
        AssertionError: if a lexicon line has no phones or repeats a word.
    """
    parser = get_parser()
    args = parser.parse_args()

    sil = "<SIL>"
    wrd_to_phn = _load_lexicon(args.lexicon)

    # The g2p model is only reachable in strict mode (otherwise every line
    # with an OOV word is skipped), so avoid constructing it when unused.
    g2p = G2p() if args.strict else None

    num_lines = 0
    num_skipped = 0
    with open(args.input, "r", encoding="utf-8") as fin, open(
        args.output, "w", encoding="utf-8"
    ) as fout:
        for line in tqdm(fin):
            num_lines += 1
            words = line.strip().upper().split()
            # Without --strict, drop any line that has an OOV word.
            if not args.strict and not all(w in wrd_to_phn for w in words):
                num_skipped += 1
                continue
            phones = _phonemize_words(
                words, wrd_to_phn, g2p, args.sil_prob, args.surround, sil
            )
            print(" ".join(phones), file=fout)

    if num_skipped:
        # Make the otherwise silent data loss visible to the user.
        logger.info(
            "skipped %d of %d lines containing OOV words", num_skipped, num_lines
        )
if __name__ == "__main__":
main()