Spaces:

MBZUAI
/

artst-demo-asr

Runtime error

App Files Files Community

artst-demo-asr / SpeechT5 /SpeechLM /speechlm /data_process /phoneize_with_sil.py

amupd

SpeechT5 upload

62e9ca6 about 1 year ago

raw

history blame

3.82 kB

	# ----------------------------------------------------------------------------
	# SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329)
	# Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM
	# Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4
	#
	# Copyright (c) 2022 Microsoft
	# Licensed under The MIT License [see LICENSE for details]
	# ----------------------------------------------------------------------------

	"""
	Modified from https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py
	"""

	import argparse
	import numpy as np
	import sys
	from g2p_en import G2p
	from tqdm import tqdm
	import logging
	# Configure root logging at import time. Each record is rendered as
	# "timestamp | level | logger name | message". The separators are plain
	# pipes: a backslash before "|" is not a valid Python escape and would end
	# up verbatim in every log line.
	logging.basicConfig(
	    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
	    datefmt="%Y-%m-%d %H:%M:%S",
	    level=logging.INFO,
	)
	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	def get_parser():
	    """Build the command-line parser for the word-to-phone conversion script.

	    Options:
	        --sil-prob / -s: float, default 0. Probability of inserting a
	            silence token between two consecutive words.
	        --surround: flag. Surround every output line with silence tokens.
	        --lexicon: required. Path of the word-to-phones lexicon file.
	        --strict: flag. Keep lines that contain out-of-vocabulary (OOV)
	            words and phonemize those words with g2p; without it such
	            lines are skipped.
	        --input / -i: required. Path of the input text file.
	        --output / -o: required. Path of the output text file.

	    Returns:
	        argparse.ArgumentParser: the configured parser.
	    """
	    parser = argparse.ArgumentParser(
	        description="converts words to phones adding optional silences around in between words"
	    )
	    parser.add_argument(
	        "--sil-prob",
	        "-s",
	        type=float,
	        default=0,
	        help="probability of inserting silence between each word",
	    )
	    parser.add_argument(
	        "--surround",
	        action="store_true",
	        help="if set, surrounds each example with silence",
	    )
	    parser.add_argument(
	        "--lexicon",
	        help="lexicon to convert to phones",
	        required=True,
	    )
	    parser.add_argument(
	        "--strict",
	        action="store_true",
	        # The help text describes what the script actually does in strict
	        # mode: nothing is raised, OOV words are sent through g2p.
	        help=(
	            "if set, lines containing OOV words are kept and the OOV words "
	            "are phonemized with g2p instead of the line being skipped "
	            "(for train/valid set)"
	        ),
	    )
	    parser.add_argument(
	        "--input",
	        "-i",
	        help="input text file",
	        required=True,
	    )
	    parser.add_argument(
	        "--output",
	        "-o",
	        help="output text file",
	        required=True,
	    )

	    return parser


	def normalize_phn(phons):
	    """
	    Map g2p-style phones to the 39-phone set.

	    Every trailing decimal digit is stripped from each phone
	    (e.g. "AH0" -> "AH"); the result is returned as a new list.
	    """
	    digits = '0123456789'
	    stripped = []
	    for phone in phons:
	        stripped.append(phone.rstrip(digits))
	    return stripped


	def _load_lexicon(path):
	    """Read a lexicon file (``WORD PHONE [PHONE ...]`` per line) into a dict.

	    Raises:
	        ValueError: if a line has fewer than two whitespace-separated
	            fields, or if a word appears more than once.
	    """
	    wrd_to_phn = {}
	    # Read as UTF-8 so the result does not depend on the platform locale;
	    # the output file is written as UTF-8 as well.
	    with open(path, "r", encoding="utf-8") as lf:
	        for line in lf:
	            items = line.rstrip().split()
	            # Explicit raises instead of assert: asserts vanish under -O.
	            if len(items) < 2:
	                raise ValueError(f"malformed lexicon line: {line!r}")
	            word, *phones = items
	            if word in wrd_to_phn:
	                raise ValueError(f"duplicate lexicon entry: {items}")
	            wrd_to_phn[word] = phones
	    return wrd_to_phn


	def _words_to_phones(words, wrd_to_phn, g2p, sil, sil_prob, surround):
	    """Turn the words of one line into a flat phone list with optional silences."""
	    phones = [sil] if surround else []

	    # One uniform random draw per gap between consecutive words; a gap gets
	    # a silence token when its draw falls below sil_prob.
	    sample_sil_probs = None
	    if sil_prob > 0 and len(words) > 1:
	        sample_sil_probs = np.random.random(len(words) - 1)

	    for i, w in enumerate(words):
	        if w in wrd_to_phn:
	            phones.extend(wrd_to_phn[w])
	        else:
	            # Out-of-vocabulary word: fall back to g2p and strip the
	            # trailing digits from its phones.
	            phones.extend(normalize_phn(g2p(w)))
	        # The last word has no following gap, hence the index bound check.
	        if (
	            sample_sil_probs is not None
	            and i < len(sample_sil_probs)
	            and sample_sil_probs[i] < sil_prob
	        ):
	            phones.append(sil)

	    if surround:
	        phones.append(sil)
	    return phones


	def main():
	    """Convert each line of the input text file to a line of phones.

	    Command-line arguments come from ``get_parser()``. Words are
	    upper-cased and looked up in the lexicon. A line containing a word
	    missing from the lexicon is skipped, unless ``--strict`` is set, in
	    which case the missing words are phonemized with g2p. ``<SIL>`` tokens
	    are inserted between words with probability ``--sil-prob`` and around
	    the line when ``--surround`` is set. One space-joined phone sequence is
	    written to the output file per kept input line.

	    Raises:
	        ValueError: if the lexicon file is malformed or has duplicate words.
	    """
	    args = get_parser().parse_args()
	    sil = "<SIL>"

	    wrd_to_phn = _load_lexicon(args.lexicon)

	    # g2p is only reachable in strict mode (otherwise every line with an
	    # OOV word is skipped), so the model is not built when it cannot be used.
	    g2p = G2p() if args.strict else None

	    with open(args.input, "r", encoding="utf-8") as fin, open(
	        args.output, "w", encoding="utf-8"
	    ) as fout:
	        for line in tqdm(fin):
	            words = line.strip().upper().split()

	            # Non-strict mode drops any line that has an OOV word.
	            if not args.strict and not all(w in wrd_to_phn for w in words):
	                continue

	            phones = _words_to_phones(
	                words, wrd_to_phn, g2p, sil, args.sil_prob, args.surround
	            )
	            print(" ".join(phones), file=fout)


	# Run the conversion only when this file is executed as a script, so that
	# importing the module has no side effect beyond the definitions above.
	if __name__ == "__main__":
	    main()