{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "## Converting to ggml using h5" ], "metadata": { "id": "Kbq33zFd4QXE" } }, { "cell_type": "code", "source": [ "!pip install torch safetensors h5py -q" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "M43EPtRB33cE", "outputId": "5081aebc-8864-48f4-c869-683e9511f082" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m60.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m28.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m76.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "e2sGa27b3pvJ", "outputId": "5660cafb-830e-409c-9921-feb584e4481c" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cloning into 'whisper'...\n", "remote: Enumerating objects: 828, done.\u001b[K\n", "remote: Counting objects: 100% (370/370), done.\u001b[K\n", "remote: Compressing objects: 100% (69/69), done.\u001b[K\n", "remote: Total 828 (delta 333), reused 301 (delta 301), pack-reused 458 (from 2)\u001b[K\n", "Receiving objects: 100% (828/828), 8.26 MiB | 10.11 MiB/s, done.\n", "Resolving deltas: 100% (496/496), done.\n" ] } ], "source": [ "!git clone https://github.com/openai/whisper" ] }, { "cell_type": "code", "source": [ "!git clone https://github.com/ggerganov/whisper.cpp\n", "!cd whisper.cpp && make" ], "metadata": { "id": "bnJW45ez3sx0" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!git lfs install\n", "!git clone https://huggingface.co/sadeghk/whisper-base" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "K7wQ0-d53sve", "outputId": "d1736412-81c1-4a64-c3e0-4d46093d3dcd" }, "execution_count": 
null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Git LFS initialized.\n", "Cloning into 'whisper-base'...\n", "remote: Enumerating objects: 571, done.\u001b[K\n", "remote: Counting objects: 100% (568/568), done.\u001b[K\n", "remote: Compressing objects: 100% (568/568), done.\u001b[K\n", "remote: Total 571 (delta 167), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n", "Receiving objects: 100% (571/571), 692.35 KiB | 3.48 MiB/s, done.\n", "Resolving deltas: 100% (167/167), done.\n" ] } ] }, { "cell_type": "code", "source": [ "import torch\n", "import h5py\n", "from safetensors.torch import load_file\n", "\n", "# Load the safetensors file\n", "safetensors_path = \"whisper-base/model.safetensors\" # Replace with your file path\n", "state_dict = load_file(safetensors_path)\n", "\n", "# Create an HDF5 file and store the tensors\n", "h5_path = \"whisper-base/model.h5\"\n", "with h5py.File(h5_path, \"w\") as h5f:\n", " for key, tensor in state_dict.items():\n", " h5f.create_dataset(key, data=tensor.numpy()) # Convert tensor to numpy before saving\n", "\n", "print(f\"Conversion complete: {h5_path} saved successfully!\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b931-wDb36Bf", "outputId": "59337d93-e872-4ed4-dc9b-7b2c29b53c89" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Conversion complete: whisper-base/model.h5 saved successfully!\n" ] } ] }, { "cell_type": "code", "source": [ "# if there is an error about max_length, set max_length = 448 in ./whisper-base/config.json\n", "\n", "!python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-base/ ./whisper ." ], "metadata": { "id": "oNohyE-B3ss5" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!./whisper.cpp/build/bin/whisper-cli -m ggml-model.bin -l fa -f cab991ea4681b712417e0d7569c94dccc48f134ed1e6353f8ee69a85.wav" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yj_lHrgE3sqW", "outputId": "b7435036-0a31-4ca5-927d-9952c72f9bcf" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "whisper_init_from_file_with_params_no_state: loading model from 'ggml-model.bin'\n", "whisper_init_with_params_no_state: use gpu = 1\n", "whisper_init_with_params_no_state: flash attn = 0\n", "whisper_init_with_params_no_state: gpu_device = 0\n", "whisper_init_with_params_no_state: dtw = 0\n", "whisper_init_with_params_no_state: devices = 1\n", "whisper_init_with_params_no_state: backends = 1\n", "whisper_model_load: loading model\n", "whisper_model_load: n_vocab = 51865\n", "whisper_model_load: n_audio_ctx = 1500\n", "whisper_model_load: n_audio_state = 512\n", "whisper_model_load: n_audio_head = 8\n", "whisper_model_load: n_audio_layer = 6\n", "whisper_model_load: n_text_ctx = 448\n", "whisper_model_load: n_text_state = 512\n", "whisper_model_load: n_text_head = 8\n", "whisper_model_load: n_text_layer = 6\n", "whisper_model_load: n_mels = 80\n", "whisper_model_load: ftype = 1\n", "whisper_model_load: qntvr = 0\n", "whisper_model_load: type = 2 (base)\n", "whisper_model_load: adding 1607 extra tokens\n", "whisper_model_load: n_langs = 99\n", "whisper_model_load: CPU total size = 147.37 MB\n", "whisper_model_load: model size = 147.37 MB\n", "whisper_backend_init_gpu: no GPU found\n", "whisper_init_state: kv self size = 6.29 MB\n", "whisper_init_state: kv cross size = 18.87 MB\n", "whisper_init_state: kv pad size = 3.15 MB\n", "whisper_init_state: 
{ "cell_type": "code", "source": [ "!./whisper.cpp/build/bin/whisper-cli -m ggml-model.bin -l fa -f cab991ea4681b712417e0d7569c94dccc48f134ed1e6353f8ee69a85.wav" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yj_lHrgE3sqW", "outputId": "b7435036-0a31-4ca5-927d-9952c72f9bcf" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [
"whisper_init_from_file_with_params_no_state: loading model from 'ggml-model.bin'\n",
"whisper_init_with_params_no_state: use gpu = 1\n",
"whisper_init_with_params_no_state: flash attn = 0\n",
"whisper_init_with_params_no_state: gpu_device = 0\n",
"whisper_init_with_params_no_state: dtw = 0\n",
"whisper_init_with_params_no_state: devices = 1\n",
"whisper_init_with_params_no_state: backends = 1\n",
"whisper_model_load: loading model\n",
"whisper_model_load: n_vocab = 51865\n",
"whisper_model_load: n_audio_ctx = 1500\n",
"whisper_model_load: n_audio_state = 512\n",
"whisper_model_load: n_audio_head = 8\n",
"whisper_model_load: n_audio_layer = 6\n",
"whisper_model_load: n_text_ctx = 448\n",
"whisper_model_load: n_text_state = 512\n",
"whisper_model_load: n_text_head = 8\n",
"whisper_model_load: n_text_layer = 6\n",
"whisper_model_load: n_mels = 80\n",
"whisper_model_load: ftype = 1\n",
"whisper_model_load: qntvr = 0\n",
"whisper_model_load: type = 2 (base)\n",
"whisper_model_load: adding 1607 extra tokens\n",
"whisper_model_load: n_langs = 99\n",
"whisper_model_load: CPU total size = 147.37 MB\n",
"whisper_model_load: model size = 147.37 MB\n",
"whisper_backend_init_gpu: no GPU found\n",
"whisper_init_state: kv self size = 6.29 MB\n",
"whisper_init_state: kv cross size = 18.87 MB\n",
"whisper_init_state: kv pad size = 3.15 MB\n",
"whisper_init_state: compute buffer (conv) = 16.26 MB\n",
"whisper_init_state: compute buffer (encode) = 85.86 MB\n",
"whisper_init_state: compute buffer (cross) = 4.65 MB\n",
"whisper_init_state: compute buffer (decode) = 96.35 MB\n",
"\n",
"system_info: n_threads = 2 / 2 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | COREML = 0 | OPENVINO = 0 | \n",
"\n",
"main: processing 'cab991ea4681b712417e0d7569c94dccc48f134ed1e6353f8ee69a85.wav' (167711 samples, 10.5 sec), 2 threads, 1 processors, 5 beams + best of 5, lang = fa, task = transcribe, timestamps = 1 ...\n",
"\n",
"\n",
"[00:00:00.000 --> 00:00:30.000] لطه اصلی بارش باران جا به جایی هوای مرطوب به علت اختلاف دم آوروتو و تسه که به جبه‌های هواشناسی معروف است\n",
"\n",
"\n",
"whisper_print_timings: load time = 165.24 ms\n",
"whisper_print_timings: fallbacks = 0 p / 0 h\n",
"whisper_print_timings: mel time = 39.49 ms\n",
"whisper_print_timings: sample time = 722.76 ms / 315 runs ( 2.29 ms per run)\n",
"whisper_print_timings: encode time = 6999.46 ms / 1 runs ( 6999.46 ms per run)\n",
"whisper_print_timings: decode time = 0.00 ms / 1 runs ( 0.00 ms per run)\n",
"whisper_print_timings: batchd time = 4939.69 ms / 313 runs ( 15.78 ms per run)\n",
"whisper_print_timings: prompt time = 0.00 ms / 1 runs ( 0.00 ms per run)\n",
"whisper_print_timings: total time = 12942.70 ms\n" ] } ] },
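{ "cell_type": "markdown", "source": [ "Optionally, the converted model can be shrunk with whisper.cpp's `quantize` tool, which also speeds up CPU inference at a small accuracy cost. The cell below is a sketch: it assumes the `make` build above placed `quantize` next to `whisper-cli` in `build/bin`, and `q5_0` is one of the supported quantization types." ], "metadata": {} },
{ "cell_type": "code", "source": [ "# Optional: quantize the converted model to q5_0 (assumes build/bin/quantize\n", "# exists alongside whisper-cli from the make build above).\n", "!./whisper.cpp/build/bin/quantize ggml-model.bin ggml-model-q5_0.bin q5_0\n", "\n", "# The quantized file can then be passed to whisper-cli via -m ggml-model-q5_0.bin" ], "metadata": {}, "execution_count": null, "outputs": [] }
] }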