{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Style-Bert-VITS2ライブラリの使用例\n",
        "\n",
        "`pip install style-bert-vits2`でインストールしたライブラリの、Jupyter Notebookでの使用例です。Google Colab等でも動きます。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Set up PyTorch (only needed when it is not installed yet).\n",
        "# Reference: https://pytorch.org/get-started/locally/\n",
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "\n",
        "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LLrngKcQEAyP"
      },
      "outputs": [],
      "source": [
        "# Install style-bert-vits2.\n",
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "\n",
        "%pip install style-bert-vits2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9xRtfUg5EZkx"
      },
      "outputs": [],
      "source": [
        "# Load the BERT model and tokenizer (no manual local download is required).\n",
        "\n",
        "from style_bert_vits2.constants import Languages\n",
        "from style_bert_vits2.nlp import bert_models\n",
        "\n",
        "# One entry per language to load; uncomment a line to enable that language as well.\n",
        "bert_model_ids = {\n",
        "    Languages.JP: \"ku-nlp/deberta-v2-large-japanese-char-wwm\",\n",
        "    # Languages.EN: \"microsoft/deberta-v3-large\",\n",
        "    # Languages.ZH: \"hfl/chinese-roberta-wwm-ext-large\",\n",
        "}\n",
        "\n",
        "# The model is loaded first and then the tokenizer, both from the same model id.\n",
        "for language, model_id in bert_model_ids.items():\n",
        "    bert_models.load_model(language, model_id)\n",
        "    bert_models.load_tokenizer(language, model_id)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "q2V9d3HyFAr_"
      },
      "outputs": [],
      "source": [
        "# Download a default model from the Hugging Face Hub to try speech synthesis with it.\n",
        "# The files are saved under the model_assets directory.\n",
        "\n",
        "from pathlib import Path\n",
        "\n",
        "from huggingface_hub import hf_hub_download\n",
        "\n",
        "# Repository-relative paths of the weights, the config and the style vectors.\n",
        "model_file = \"jvnv-F1-jp/jvnv-F1-jp_e160_s14000.safetensors\"\n",
        "config_file = \"jvnv-F1-jp/config.json\"\n",
        "style_file = \"jvnv-F1-jp/style_vectors.npy\"\n",
        "\n",
        "# Print each path before fetching it so the current download is visible.\n",
        "for file in (model_file, config_file, style_file):\n",
        "    print(file)\n",
        "    hf_hub_download(\n",
        "        repo_id=\"litagin/style_bert_vits2_jvnv\",\n",
        "        filename=file,\n",
        "        local_dir=\"model_assets\",\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hJa31MEUFhe4"
      },
      "outputs": [],
      "source": [
        "# Build a TTS model from the files downloaded above, as a speech synthesis test.\n",
        "\n",
        "from style_bert_vits2.tts_model import TTSModel\n",
        "\n",
        "# Directory the download cell saved the model files into.\n",
        "assets_root = Path(\"model_assets\")\n",
        "\n",
        "model = TTSModel(\n",
        "    model_path=assets_root / model_file,\n",
        "    config_path=assets_root / config_file,\n",
        "    style_vec_path=assets_root / style_file,\n",
        "    device=\"cpu\",\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Gal0tqrtGXZx"
      },
      "outputs": [],
      "source": [
        "# Synthesize a short Japanese greeting and play it back inline.\n",
        "\n",
        "from IPython.display import Audio, display\n",
        "\n",
        "# infer() yields a pair; the first item is passed to Audio as the playback rate.\n",
        "sample_rate, waveform = model.infer(text=\"こんにちは\")\n",
        "display(Audio(waveform, rate=sample_rate))"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}