Spaces:
Running
Running
File size: 7,764 Bytes
8453b3d 5206df6 8453b3d 296b5e1 8453b3d 84a6b12 ac4bff9 ca26f25 b8faf9d 45fcb1d abc6224 899eb46 661e83c a2bf507 b8faf9d 661e83c b8faf9d 8453b3d b3ffc71 a2bf507 5206df6 a2bf507 5206df6 a2bf507 5206df6 a2bf507 5206df6 a2bf507 5206df6 a2bf507 5206df6 a2bf507 5206df6 84a6b12 5206df6 84a6b12 5206df6 ca26f25 ac4bff9 ca26f25 b8faf9d 45fcb1d abc6224 45fcb1d 8808c7b 45fcb1d abc6224 899eb46 661e83c a2bf507 296b5e1 b8faf9d a2bf507 8453b3d 296b5e1 661e83c 296b5e1 8453b3d 84a6b12 ac4bff9 ca26f25 b8faf9d 45fcb1d abc6224 899eb46 661e83c a2bf507 296b5e1 a2bf507 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
from dataclasses import dataclass, fields
import gradio as gr
from typing import Optional
@dataclass
class WhisperParameters:
    """
    A data class for the Gradio components of the Whisper parameters. Use "before" Gradio pre-processing.

    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
    See more about Gradio pre-processing: https://www.gradio.app/docs/components

    The declaration order of the fields is significant: `to_list()` emits the components in this order
    and `post_process()` reads the resulting values back by position.

    Attributes
    ----------
    model_size: gr.Dropdown
        Whisper model size.

    lang: gr.Dropdown
        Source language of the file to transcribe.

    is_translate: gr.Checkbox
        Boolean value that determines whether to translate to English.
        It's Whisper's feature to translate speech from another language directly into English end-to-end.

    beam_size: gr.Number
        Int value that is used for the decoding option.

    log_prob_threshold: gr.Number
        If the average log probability over sampled tokens is below this value, treat as failed.

    no_speech_threshold: gr.Number
        If the no_speech probability is higher than this value AND
        the average log probability over sampled tokens is below `log_prob_threshold`,
        consider the segment as silent.

    compute_type: gr.Dropdown
        Compute type for transcription.
        See more info: https://opennmt.net/CTranslate2/quantization.html

    best_of: gr.Number
        Number of candidates when sampling with non-zero temperature.

    patience: gr.Number
        Beam search patience factor.

    condition_on_previous_text: gr.Checkbox
        If True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    initial_prompt: gr.Textbox
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those words correctly.

    temperature: gr.Slider
        Temperature for sampling. It can be a tuple of temperatures,
        which will be successively used upon failures according to either
        `compression_ratio_threshold` or `log_prob_threshold`.

    compression_ratio_threshold: gr.Number
        If the gzip compression ratio is above this value, treat as failed.

    vad_filter: gr.Checkbox
        Enable the voice activity detection (VAD) to filter out parts of the audio
        without speech. This step is using the Silero VAD model
        https://github.com/snakers4/silero-vad.

    threshold: gr.Slider
        This parameter is related to Silero VAD. Speech threshold.
        Silero VAD outputs speech probabilities for each audio chunk,
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

    min_speech_duration_ms: gr.Number
        This parameter is related to Silero VAD. Final speech chunks shorter than
        min_speech_duration_ms are thrown out.

    max_speech_duration_s: gr.Number
        This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
        split aggressively just before max_speech_duration_s.

    min_silence_duration_ms: gr.Number
        This parameter is related to Silero VAD. At the end of each speech chunk, wait for
        min_silence_duration_ms before separating it.

    window_size_sample: gr.Number
        This parameter is related to Silero VAD. Audio chunks of this size (in samples) are fed to the
        Silero VAD model. Its value is passed on as `window_size_samples` by `post_process()`.
        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
        Values other than these may affect model performance!!

    speech_pad_ms: gr.Number
        This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side.

    chunk_length_s: gr.Number
        This parameter is related to the insanely-fast-whisper pipe.
        Maximum length of each chunk.

    batch_size: gr.Number
        This parameter is related to the insanely-fast-whisper pipe. Batch size to pass to the pipe.
    """
    model_size: gr.Dropdown
    lang: gr.Dropdown
    is_translate: gr.Checkbox
    beam_size: gr.Number
    log_prob_threshold: gr.Number
    no_speech_threshold: gr.Number
    compute_type: gr.Dropdown
    best_of: gr.Number
    patience: gr.Number
    condition_on_previous_text: gr.Checkbox
    initial_prompt: gr.Textbox
    temperature: gr.Slider
    compression_ratio_threshold: gr.Number
    vad_filter: gr.Checkbox
    threshold: gr.Slider
    min_speech_duration_ms: gr.Number
    max_speech_duration_s: gr.Number
    min_silence_duration_ms: gr.Number
    window_size_sample: gr.Number
    speech_pad_ms: gr.Number
    chunk_length_s: gr.Number
    batch_size: gr.Number

    def to_list(self) -> list:
        """
        Converts the data class attributes into a list. Use in the Gradio UI before Gradio pre-processing.
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        Returns
        ----------
        A list of Gradio components, in field declaration order
        """
        return [getattr(self, f.name) for f in fields(self)]

    @staticmethod
    def post_process(*args) -> 'WhisperValues':
        """
        To use Whisper parameters in a function after Gradio post-processing.
        See more about Gradio post-processing: https://www.gradio.app/docs/components

        Parameters
        ----------
        *args
            The parameter values, given positionally in the same order as the fields of this class
            (i.e. the order produced by `to_list()`). Only the first 22 values are read;
            any additional values are ignored.

        Returns
        ----------
        WhisperValues
            Data class that has values of parameters

        Raises
        ----------
        IndexError
            If fewer than 22 values are passed.
        """
        return WhisperValues(
            model_size=args[0],
            lang=args[1],
            is_translate=args[2],
            beam_size=args[3],
            log_prob_threshold=args[4],
            no_speech_threshold=args[5],
            compute_type=args[6],
            best_of=args[7],
            patience=args[8],
            condition_on_previous_text=args[9],
            initial_prompt=args[10],
            temperature=args[11],
            compression_ratio_threshold=args[12],
            vad_filter=args[13],
            threshold=args[14],
            min_speech_duration_ms=args[15],
            max_speech_duration_s=args[16],
            min_silence_duration_ms=args[17],
            # Position 18 is the `window_size_sample` component; the value class
            # names the same setting `window_size_samples`.
            window_size_samples=args[18],
            speech_pad_ms=args[19],
            chunk_length_s=args[20],
            batch_size=args[21],
        )
@dataclass
class WhisperValues:
    """
    A data class holding the plain values of the Whisper parameters.

    Attributes
    ----------
    model_size: str
        Whisper model size.

    lang: str
        Source language of the file to transcribe.

    is_translate: bool
        Whether to translate to English.

    beam_size: int
        Value that is used for the decoding option.

    log_prob_threshold: float
        If the average log probability over sampled tokens is below this value, treat as failed.

    no_speech_threshold: float
        If the no_speech probability is higher than this value AND
        the average log probability over sampled tokens is below `log_prob_threshold`,
        consider the segment as silent.

    compute_type: str
        Compute type for transcription.

    best_of: int
        Number of candidates when sampling with non-zero temperature.

    patience: float
        Beam search patience factor.

    condition_on_previous_text: bool
        If True, the previous output of the model is provided as a prompt for the next window.

    initial_prompt: Optional[str]
        Optional text to provide as a prompt for the first window.

    temperature: float
        Temperature for sampling.

    compression_ratio_threshold: float
        If the gzip compression ratio is above this value, treat as failed.

    vad_filter: bool
        Enable the voice activity detection (VAD) to filter out parts of the audio without speech.

    threshold: float
        Silero VAD speech threshold; probabilities ABOVE this value are considered as SPEECH.

    min_speech_duration_ms: int
        Silero VAD: final speech chunks shorter than this are thrown out.

    max_speech_duration_s: float
        Silero VAD: maximum duration of speech chunks in seconds.

    min_silence_duration_ms: int
        Silero VAD: silence to wait for at the end of each speech chunk before separating it.

    window_size_samples: int
        Silero VAD: size of the audio chunks fed to the VAD model.

    speech_pad_ms: int
        Silero VAD: padding applied to each side of the final speech chunks.

    chunk_length_s: int
        insanely-fast-whisper pipe: maximum length of each chunk.

    batch_size: int
        insanely-fast-whisper pipe: batch size to pass to the pipe.
    """
    model_size: str
    lang: str
    is_translate: bool
    beam_size: int
    log_prob_threshold: float
    no_speech_threshold: float
    compute_type: str
    best_of: int
    patience: float
    condition_on_previous_text: bool
    initial_prompt: Optional[str]
    temperature: float
    compression_ratio_threshold: float
    vad_filter: bool
    threshold: float
    min_speech_duration_ms: int
    max_speech_duration_s: float
    min_silence_duration_ms: int
    window_size_samples: int
    speech_pad_ms: int
    chunk_length_s: int
    batch_size: int
|