Spaces:
Runtime error
Runtime error
File size: 3,821 Bytes
62e9ca6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# ----------------------------------------------------------------------------
# SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329)
# Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM
# Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4
#
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# ----------------------------------------------------------------------------
"""
Modified from https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py
"""
import argparse
import numpy as np
import sys
from g2p_en import G2p
from tqdm import tqdm
import logging
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO,
)
logger = logging.getLogger(__name__)
def get_parser():
parser = argparse.ArgumentParser(
description="converts words to phones adding optional silences around in between words"
)
parser.add_argument(
"--sil-prob",
"-s",
type=float,
default=0,
help="probability of inserting silence between each word",
)
parser.add_argument(
"--surround",
action="store_true",
help="if set, surrounds each example with silence",
)
parser.add_argument(
"--lexicon",
help="lexicon to convert to phones",
required=True,
)
parser.add_argument(
"--strict",
action="store_true",
help="if set, OOV words will raise a error (for train/valid set)",
)
parser.add_argument(
"--input",
"-i",
help="input text file",
required=True,
)
parser.add_argument(
"--output",
"-o",
help="input text file",
required=True,
)
return parser
def normalize_phn(phons):
"""
convert g2p style phone to 39-phone set
"""
return [p.rstrip('0123456789') for p in phons]
def main():
parser = get_parser()
args = parser.parse_args()
sil_prob = args.sil_prob
surround = args.surround
sil = "<SIL>"
wrd_to_phn = {}
g2p = G2p()
with open(args.lexicon, "r") as lf:
for line in lf:
items = line.rstrip().split()
assert len(items) > 1, line
assert items[0] not in wrd_to_phn, items
wrd_to_phn[items[0]] = items[1:]
with open(args.input, "r") as fin, open(args.output, "w", encoding="utf-8") as fout:
for line in tqdm(fin):
words = line.strip().upper().split()
if not all(w in wrd_to_phn for w in words):
if args.strict:
# logger.warning(f"| Warning: OOV words found: {line}")
pass
else:
continue
phones = []
if surround:
phones.append(sil)
sample_sil_probs = None
if sil_prob > 0 and len(words) > 1:
sample_sil_probs = np.random.random(len(words) - 1)
for i, w in enumerate(words):
if w in wrd_to_phn:
phones.extend(wrd_to_phn[w])
else:
phones.extend(normalize_phn(g2p(w)))
if (
sample_sil_probs is not None
and i < len(sample_sil_probs)
and sample_sil_probs[i] < sil_prob
):
phones.append(sil)
if surround:
phones.append(sil)
print(" ".join(phones), file=fout)
if __name__ == "__main__":
main()
|