{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Style-Bert-VITS2ライブラリの使用例\n",
    "\n",
    "`pip install style-bert-vits2`を使った、Jupyter Notebookでの使用例です。Google Colab等でも動きます。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up a PyTorch environment (only needed if PyTorch is not installed yet).\n",
    "# Reference: https://pytorch.org/get-started/locally/\n",
    "# %pip (instead of !pip) installs into the environment of the running kernel.\n",
    "\n",
    "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "LLrngKcQEAyP"
   },
   "outputs": [],
   "source": [
    "# Install style-bert-vits2 into the running kernel's environment.\n",
    "\n",
    "%pip install style-bert-vits2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9xRtfUg5EZkx"
   },
   "outputs": [],
   "source": [
    "# Load the BERT model and tokenizer (no need to download them manually beforehand).\n",
    "\n",
    "from style_bert_vits2.nlp import bert_models\n",
    "from style_bert_vits2.constants import Languages\n",
    "\n",
    "\n",
    "bert_models.load_model(Languages.JP, \"ku-nlp/deberta-v2-large-japanese-char-wwm\")\n",
    "bert_models.load_tokenizer(Languages.JP, \"ku-nlp/deberta-v2-large-japanese-char-wwm\")\n",
    "# Equivalent calls for English (EN) and Chinese (ZH), left disabled:\n",
    "# bert_models.load_model(Languages.EN, \"microsoft/deberta-v3-large\")\n",
    "# bert_models.load_tokenizer(Languages.EN, \"microsoft/deberta-v3-large\")\n",
    "# bert_models.load_model(Languages.ZH, \"hfl/chinese-roberta-wwm-ext-large\")\n",
    "# bert_models.load_tokenizer(Languages.ZH, \"hfl/chinese-roberta-wwm-ext-large\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "q2V9d3HyFAr_"
   },
   "outputs": [],
   "source": [
    "# Download a default model from Hugging Face as a trial, to use it for speech synthesis.\n",
    "# The files are downloaded into the model_assets directory.\n",
    "\n",
    "from pathlib import Path\n",
    "from huggingface_hub import hf_hub_download\n",
    "\n",
    "\n",
    "# Paths of the three files to fetch, relative to the root of the Hugging Face repo.\n",
    "model_file = \"jvnv-F1-jp/jvnv-F1-jp_e160_s14000.safetensors\"\n",
    "config_file = \"jvnv-F1-jp/config.json\"\n",
    "style_file = \"jvnv-F1-jp/style_vectors.npy\"\n",
    "\n",
    "for file in [model_file, config_file, style_file]:\n",
    "    print(file)\n",
    "    hf_hub_download(\n",
    "        \"litagin/style_bert_vits2_jvnv\",\n",
    "        file,\n",
    "        local_dir=\"model_assets\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "hJa31MEUFhe4"
   },
   "outputs": [],
   "source": [
    "# Speech-synthesis test using the model files downloaded above.\n",
    "\n",
    "from style_bert_vits2.tts_model import TTSModel\n",
    "\n",
    "# Same directory that was passed as local_dir to hf_hub_download in the previous cell.\n",
    "assets_root = Path(\"model_assets\")\n",
    "\n",
    "model = TTSModel(\n",
    "    model_path=assets_root / model_file,\n",
    "    config_path=assets_root / config_file,\n",
    "    style_vec_path=assets_root / style_file,\n",
    "    device=\"cpu\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Gal0tqrtGXZx"
   },
   "outputs": [],
   "source": [
    "from IPython.display import Audio, display\n",
    "\n",
    "# infer() returns a (sample rate, audio) pair; render it as an inline audio player.\n",
    "sr, audio = model.infer(text=\"こんにちは\")\n",
    "display(Audio(audio, rate=sr))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}