{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "## Converting to ggml using h5" ], "metadata": { "id": "Kbq33zFd4QXE" } }, { "cell_type": "code", "source": [ "!pip install torch safetensors h5py -q" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "M43EPtRB33cE", "outputId": "5081aebc-8864-48f4-c869-683e9511f082" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m60.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m28.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m76.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "e2sGa27b3pvJ", "outputId": "5660cafb-830e-409c-9921-feb584e4481c" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cloning into 'whisper'...\n", "remote: Enumerating objects: 828, done.\u001b[K\n", "remote: Counting objects: 100% (370/370), done.\u001b[K\n", "remote: Compressing objects: 100% (69/69), done.\u001b[K\n", "remote: Total 828 (delta 333), reused 301 (delta 301), pack-reused 458 (from 2)\u001b[K\n", "Receiving objects: 100% (828/828), 8.26 MiB | 10.11 MiB/s, done.\n", "Resolving deltas: 100% (496/496), done.\n" ] } ], "source": [ "!git clone https://github.com/openai/whisper" ] }, { "cell_type": "code", "source": [ "!git clone https://github.com/ggerganov/whisper.cpp\n", "!cd whisper.cpp && make" ], "metadata": { "id": "bnJW45ez3sx0" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!git lfs install\n", "!git clone https://huggingface.co/sadeghk/whisper-base" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "K7wQ0-d53sve", "outputId": "d1736412-81c1-4a64-c3e0-4d46093d3dcd" }, "execution_count": 
null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Git LFS initialized.\n", "Cloning into 'whisper-base'...\n", "remote: Enumerating objects: 571, done.\u001b[K\n", "remote: Counting objects: 100% (568/568), done.\u001b[K\n", "remote: Compressing objects: 100% (568/568), done.\u001b[K\n", "remote: Total 571 (delta 167), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n", "Receiving objects: 100% (571/571), 692.35 KiB | 3.48 MiB/s, done.\n", "Resolving deltas: 100% (167/167), done.\n" ] } ] }, { "cell_type": "code", "source": [ "import torch\n", "import h5py\n", "from safetensors.torch import load_file\n", "\n", "# Load the safetensors file\n", "safetensors_path = \"whisper-base/model.safetensors\" # Replace with your file path\n", "state_dict = load_file(safetensors_path)\n", "\n", "# Create an HDF5 file and store the tensors\n", "h5_path = \"whisper-base/model.h5\"\n", "with h5py.File(h5_path, \"w\") as h5f:\n", " for key, tensor in state_dict.items():\n", " h5f.create_dataset(key, data=tensor.numpy()) # Convert tensor to numpy before saving\n", "\n", "print(f\"Conversion complete: {h5_path} saved successfully!\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b931-wDb36Bf", "outputId": "59337d93-e872-4ed4-dc9b-7b2c29b53c89" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Conversion complete: whisper-base/model.h5 saved successfully!\n" ] } ] }, { "cell_type": "code", "source": [ "# if there is an error about max_length, set max_length = 448 in ./whisper-base/config.json\n", "\n", "!python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-base/ ./whisper ." ], "metadata": { "id": "oNohyE-B3ss5" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!./whisper.cpp/build/bin/whisper-cli -m ggml-model.bin -l fa -f cab991ea4681b712417e0d7569c94dccc48f134ed1e6353f8ee69a85.wav" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yj_lHrgE3sqW", "outputId": "b7435036-0a31-4ca5-927d-9952c72f9bcf" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "whisper_init_from_file_with_params_no_state: loading model from 'ggml-model.bin'\n", "whisper_init_with_params_no_state: use gpu = 1\n", "whisper_init_with_params_no_state: flash attn = 0\n", "whisper_init_with_params_no_state: gpu_device = 0\n", "whisper_init_with_params_no_state: dtw = 0\n", "whisper_init_with_params_no_state: devices = 1\n", "whisper_init_with_params_no_state: backends = 1\n", "whisper_model_load: loading model\n", "whisper_model_load: n_vocab = 51865\n", "whisper_model_load: n_audio_ctx = 1500\n", "whisper_model_load: n_audio_state = 512\n", "whisper_model_load: n_audio_head = 8\n", "whisper_model_load: n_audio_layer = 6\n", "whisper_model_load: n_text_ctx = 448\n", "whisper_model_load: n_text_state = 512\n", "whisper_model_load: n_text_head = 8\n", "whisper_model_load: n_text_layer = 6\n", "whisper_model_load: n_mels = 80\n", "whisper_model_load: ftype = 1\n", "whisper_model_load: qntvr = 0\n", "whisper_model_load: type = 2 (base)\n", "whisper_model_load: adding 1607 extra tokens\n", "whisper_model_load: n_langs = 99\n", "whisper_model_load: CPU total size = 147.37 MB\n", "whisper_model_load: model size = 147.37 MB\n", "whisper_backend_init_gpu: no GPU found\n", "whisper_init_state: kv self size = 6.29 MB\n", "whisper_init_state: kv cross size = 18.87 MB\n", "whisper_init_state: kv pad size = 3.15 MB\n", "whisper_init_state: 
{ "cell_type": "code", "source": [ "!./whisper.cpp/build/bin/whisper-cli -m ggml-model.bin -l fa -f cab991ea4681b712417e0d7569c94dccc48f134ed1e6353f8ee69a85.wav" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yj_lHrgE3sqW", "outputId": "b7435036-0a31-4ca5-927d-9952c72f9bcf" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [
"whisper_init_from_file_with_params_no_state: loading model from 'ggml-model.bin'\n",
"whisper_init_with_params_no_state: use gpu = 1\n",
"whisper_init_with_params_no_state: flash attn = 0\n",
"whisper_init_with_params_no_state: gpu_device = 0\n",
"whisper_init_with_params_no_state: dtw = 0\n",
"whisper_init_with_params_no_state: devices = 1\n",
"whisper_init_with_params_no_state: backends = 1\n",
"whisper_model_load: loading model\n",
"whisper_model_load: n_vocab = 51865\n",
"whisper_model_load: n_audio_ctx = 1500\n",
"whisper_model_load: n_audio_state = 512\n",
"whisper_model_load: n_audio_head = 8\n",
"whisper_model_load: n_audio_layer = 6\n",
"whisper_model_load: n_text_ctx = 448\n",
"whisper_model_load: n_text_state = 512\n",
"whisper_model_load: n_text_head = 8\n",
"whisper_model_load: n_text_layer = 6\n",
"whisper_model_load: n_mels = 80\n",
"whisper_model_load: ftype = 1\n",
"whisper_model_load: qntvr = 0\n",
"whisper_model_load: type = 2 (base)\n",
"whisper_model_load: adding 1607 extra tokens\n",
"whisper_model_load: n_langs = 99\n",
"whisper_model_load: CPU total size = 147.37 MB\n",
"whisper_model_load: model size = 147.37 MB\n",
"whisper_backend_init_gpu: no GPU found\n",
"whisper_init_state: kv self size = 6.29 MB\n",
"whisper_init_state: kv cross size = 18.87 MB\n",
"whisper_init_state: kv pad size = 3.15 MB\n",
"whisper_init_state: compute buffer (conv) = 16.26 MB\n",
"whisper_init_state: compute buffer (encode) = 85.86 MB\n",
"whisper_init_state: compute buffer (cross) = 4.65 MB\n",
"whisper_init_state: compute buffer (decode) = 96.35 MB\n",
"\n",
"system_info: n_threads = 2 / 2 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | COREML = 0 | OPENVINO = 0 | \n",
"\n",
"main: processing 'cab991ea4681b712417e0d7569c94dccc48f134ed1e6353f8ee69a85.wav' (167711 samples, 10.5 sec), 2 threads, 1 processors, 5 beams + best of 5, lang = fa, task = transcribe, timestamps = 1 ...\n",
"\n",
"\n",
"[00:00:00.000 --> 00:00:30.000] لطه اصلی بارش باران جا به جایی هوای مرطوب به علت اختلاف دم آوروتو و تسه که به جبه‌های هواشناسی معروف است\n",
"\n",
"\n",
"whisper_print_timings: load time = 165.24 ms\n",
"whisper_print_timings: fallbacks = 0 p / 0 h\n",
"whisper_print_timings: mel time = 39.49 ms\n",
"whisper_print_timings: sample time = 722.76 ms / 315 runs ( 2.29 ms per run)\n",
"whisper_print_timings: encode time = 6999.46 ms / 1 runs ( 6999.46 ms per run)\n",
"whisper_print_timings: decode time = 0.00 ms / 1 runs ( 0.00 ms per run)\n",
"whisper_print_timings: batchd time = 4939.69 ms / 313 runs ( 15.78 ms per run)\n",
"whisper_print_timings: prompt time = 0.00 ms / 1 runs ( 0.00 ms per run)\n",
"whisper_print_timings: total time = 12942.70 ms\n" ] } ] },
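{ "cell_type": "markdown", "source": [ "Optionally, the converted model can be shrunk with whisper.cpp's `quantize` tool, which also speeds up CPU inference at a small accuracy cost. The cell below is a sketch: it assumes the `make` build above placed `quantize` next to `whisper-cli` in `build/bin`, and `q5_0` is one of the supported quantization types." ], "metadata": {} },
{ "cell_type": "code", "source": [ "# Optional: quantize the converted model to q5_0 (assumes build/bin/quantize\n", "# exists alongside whisper-cli from the make build above).\n", "!./whisper.cpp/build/bin/quantize ggml-model.bin ggml-model-q5_0.bin q5_0\n", "\n", "# The quantized file can then be passed to whisper-cli via -m ggml-model-q5_0.bin" ], "metadata": {}, "execution_count": null, "outputs": [] }
] }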