|
--- |
|
tags: |
|
- text-generation-inference |
|
- whisper |
|
- audio |
|
base_model: |
|
- openai/whisper-large-v3 |
|
--- |
|
|
|
|
|
# Whisper Large v3 with Key-Value-Cache enabled in ONNX fp16 format |
|
- Model creator: [OpenAI](https://huggingface.co/openai)
|
- Original model: [Whisper Large v3](https://huggingface.co/openai/whisper-large-v3) |
|
|
|
<!-- description start --> |
|
## Description |
|
|
|
This repo contains the ONNX files for Whisper Large v3, converted to ONNX by Esperanto Technologies.

The model is in fp16 format and has the key-value cache (KVC) enabled.
|
|
|
<!-- description end --> |
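
As a quick sanity check, you can inspect the decoder graph with the `onnx` Python package and confirm that the key-value cache is exposed as explicit inputs. This is a minimal sketch that assumes the files have already been downloaded to a local `whisper-large-v3-kvc-fp16-onnx/` directory, as described in the next section:

```python
import onnx

# Load only the graph definition, not the external fp16 weights
decoder = onnx.load("whisper-large-v3-kvc-fp16-onnx/decoder/model.onnx", load_external_data=False)

# The KV cache appears as past_key_values.{i}.key / past_key_values.{i}.value inputs
for tensor in decoder.graph.input:
    print(tensor.name)
```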
|
|
|
## How to download ONNX model and weight files |
|
|
|
The easiest way to obtain the model is to clone this whole repo. |
|
Alternatively, you can download the files using the `huggingface-hub` Python library.
|
|
|
```shell |
|
pip3 install 'huggingface-hub>=0.17.1'
|
``` |
|
|
|
Then you can download the model files to a local directory, at high speed, with a command like this:
|
|
|
```shell |
|
huggingface-cli download Esperanto/whisper-large-v3-kvc-fp16-onnx --local-dir whisper-large-v3-kvc-fp16-onnx --local-dir-use-symlinks False |
|
``` |
|
|
|
For more documentation on downloading with `huggingface-cli`, please see: [HF -> Hub Python Library -> Download files -> Download from the CLI](https://huggingface.co/docs/huggingface_hub/guides/download#download-from-the-cli). |
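
If you prefer to stay in Python, a minimal sketch using `huggingface_hub.snapshot_download` to fetch the same repository into the same local directory would be:

```python
from huggingface_hub import snapshot_download

# Download the whole repository (encoder, decoder and external weight files)
# into ./whisper-large-v3-kvc-fp16-onnx
snapshot_download(
    repo_id="Esperanto/whisper-large-v3-kvc-fp16-onnx",
    local_dir="whisper-large-v3-kvc-fp16-onnx",
)
```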
|
|
|
## How to run from Python code using ONNXRuntime |
|
|
|
This model can easily be run on a CPU using [ONNXRuntime](https://onnxruntime.ai/).
|
|
|
Here is a sample script to run this model:
|
|
|
```python |
|
#!/usr/bin/env python3 |
|
import whisper |
|
import onnx |
|
import sys |
|
import time |
|
import onnxruntime |
|
from typing import Sequence, Optional |
|
import numpy as np |
|
from pathlib import Path |
|
|
|
def run_whisper_decoder(decoder_model_path, execution_provider, session_options, decoder_output_names, cross_attn_tensors, num_new_tokens, provider_options = {}): |
|
start = time.time() |
|
decoder_session = onnxruntime.InferenceSession(decoder_model_path, sess_options=session_options, providers=[execution_provider], provider_options=[provider_options]) |
|
compile_time = time.time() |
|
transcription = decoder_loop(decoder_session, decoder_output_names, cross_attn_tensors, num_new_tokens) |
|
inference_time = time.time() |
|
return transcription |
|
|
|
|
|
def decoder_loop(decoder_session, decoder_output_names, cross_attn_tensors, num_new_tokens): |
|
# Generate start of transcription tokens |
|
tokenizer = whisper.tokenizer.get_tokenizer(multilingual=True) |
|
first_tokens = np.array([tokenizer.sot, 0, tokenizer.transcribe, tokenizer.no_timestamps], dtype=np.int64) |
|
|
|
    # Self-attention past key/value tensors, one pair per decoder layer (past length 447 = 448-token context minus the new token)
|
self_attn_past_k = [] |
|
self_attn_past_v = [] |
|
for i in range(32): |
|
self_attn_past_k.append(np.zeros((1, 20, 447, 64), dtype=np.float16)) |
|
self_attn_past_v.append(np.zeros((1, 20, 447, 64), dtype=np.float16)) |
|
|
|
# Cross attention |
|
cross_attn_k = cross_attn_tensors[0::2] |
|
cross_attn_v = cross_attn_tensors[1::2] |
|
|
|
    # Attention mask over the fixed 448-token context; positions are enabled from the right as tokens are added
|
attn_mask_size = 448 |
|
attn_mask = np.zeros((1,attn_mask_size), dtype=np.int64) |
|
|
|
# Process first tokens |
|
for j in range(len(first_tokens)): |
|
tokens = np.array([first_tokens[j]], dtype=np.int64).reshape(1, 1) |
|
attn_mask[0,-1 - j] = 1 |
|
|
|
decoder_input = {"input_ids": tokens, "attention_mask": attn_mask} |
|
for i in range(32): |
|
decoder_input[f"past_key_values.{str(i)}.key"] = self_attn_past_k[i] |
|
decoder_input[f"past_key_values.{str(i)}.value"] = self_attn_past_v[i] |
|
decoder_input[f"cross_attn.{str(i)}.key"] = cross_attn_k[i] |
|
decoder_input[f"cross_attn.{str(i)}.value"] = cross_attn_v[i] |
|
|
|
logits, *cache_tensors = decoder_session.run(decoder_output_names, decoder_input) |
|
next_token = np.argmax(logits[0,0]) |
|
|
|
self_attn_k = cache_tensors[0::2] |
|
self_attn_v = cache_tensors[1::2] |
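        # Drop the oldest cached position so the past length stays at 447
        # (448-token context minus the new token) for the fixed-size graph inputs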
|
for i in range(32): |
|
self_attn_past_k[i] = self_attn_k[i][:,:,1:,:] |
|
self_attn_past_v[i] = self_attn_v[i][:,:,1:,:] |
|
|
|
if (j == 0): |
|
# set language token |
|
first_tokens[1] = next_token |
|
|
|
transcribed_tokens = [next_token] |
|
for j in range(4, 4 + num_new_tokens): |
|
tokens = np.array([transcribed_tokens[-1]], dtype=np.int64).reshape(1, 1) |
|
attn_mask[0,-1 - j] = 1 |
|
|
|
decoder_input = {"input_ids": tokens, "attention_mask": attn_mask} |
|
for i in range(32): |
|
decoder_input[f"past_key_values.{str(i)}.key"] = self_attn_past_k[i] |
|
decoder_input[f"past_key_values.{str(i)}.value"] = self_attn_past_v[i] |
|
decoder_input[f"cross_attn.{str(i)}.key"] = cross_attn_k[i] |
|
decoder_input[f"cross_attn.{str(i)}.value"] = cross_attn_v[i] |
|
|
|
logits, *cache_tensors = decoder_session.run(decoder_output_names, decoder_input) |
|
next_token = np.argmax(logits[0,0]) |
|
# print(j, next_token) |
|
if next_token == tokenizer.eot: # end_of_transcription |
|
break |
|
transcribed_tokens.append(next_token) |
|
self_attn_k = cache_tensors[0::2] |
|
self_attn_v = cache_tensors[1::2] |
|
for i in range(32): |
|
self_attn_past_k[i] = self_attn_k[i][:,:,1:,:] |
|
self_attn_past_v[i] = self_attn_v[i][:,:,1:,:] |
|
|
|
return tokenizer.decode(transcribed_tokens) |
|
|
|
|
|
def main(argv: Optional[Sequence[str]] = None): |
|
num_seconds = 28.8 |
|
|
|
speech_path = 'sample_audio.wav' |
|
encoder_model_path = 'whisper-large-v3-kvc-fp16-onnx/encoder/model.onnx' |
|
decoder_model_path = 'whisper-large-v3-kvc-fp16-onnx/decoder/model.onnx' |
|
|
|
# Load audio |
|
print(f"Spectrogram speech audio file {speech_path}... ", end="") |
|
audio = whisper.load_audio(speech_path) |
|
audio = whisper.pad_or_trim(audio, length=int(num_seconds*16000)) |
|
mel = whisper.log_mel_spectrogram(audio, n_mels=128).unsqueeze(0) # Unsqueeze to set batch=1 |
|
print("OK") |
|
|
|
print("Running encoder... ", end="") |
|
|
|
# Session options |
|
session_options = onnxruntime.SessionOptions() |
|
    # Enable all graph optimizations
|
session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL |
|
|
|
# Encode |
|
encoder = onnx.load(encoder_model_path, load_external_data=False) |
|
encoder_input = {"mel": mel.numpy().astype('float16')} |
|
encoder_output_names = [tensor.name for tensor in encoder.graph.output] |
|
# CPU encoding |
|
cpu_provider = 'CPUExecutionProvider' |
|
enc_session_cpu = onnxruntime.InferenceSession(encoder_model_path, sess_options=session_options, providers=[cpu_provider]) |
|
cross_attn_tensors_cpu = enc_session_cpu.run(encoder_output_names, encoder_input) |
|
|
|
print("OK") |
|
|
|
# DECODE API PARAMS |
|
max_context = 448 |
|
new_tokens = 20 |
|
|
|
# Run decoder model CPU |
|
decoder = onnx.load(decoder_model_path, load_external_data=False) |
|
decoder_output_names = [tensor.name for tensor in decoder.graph.output] |
|
|
|
    transcription = run_whisper_decoder(decoder_model_path, cpu_provider, session_options, decoder_output_names, cross_attn_tensors_cpu, new_tokens)
    print(transcription)
|
|
|
|
|
if __name__ == "__main__": |
|
sys.exit(main(sys.argv[1:])) |
|
``` |
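
The script expects the ONNX files in the `whisper-large-v3-kvc-fp16-onnx/` directory created by the download command above, plus an audio file named `sample_audio.wav` in the working directory, and it relies on the `openai-whisper`, `onnx`, `onnxruntime`, and `numpy` Python packages.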