File size: 13,776 Bytes
8453b3d
 
afc1f96
e5ef0df
8453b3d
 
 
296b5e1
8453b3d
 
 
 
 
 
 
84a6b12
 
ac4bff9
20f9596
ca26f25
b8faf9d
45fcb1d
abc6224
899eb46
 
 
 
 
661e83c
 
f1d9939
 
201b316
19ab4f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2bf507
b8faf9d
661e83c
 
b8faf9d
8453b3d
b3ffc71
a2bf507
 
 
5206df6
a2bf507
 
5206df6
a2bf507
 
 
5206df6
a2bf507
 
5206df6
a2bf507
 
5206df6
a2bf507
 
 
 
5206df6
a2bf507
 
 
5206df6
84a6b12
 
5206df6
84a6b12
 
5206df6
ca26f25
ac4bff9
 
 
ca26f25
 
 
 
 
b8faf9d
45fcb1d
abc6224
 
 
45fcb1d
8808c7b
45fcb1d
abc6224
 
 
 
 
899eb46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661e83c
 
 
 
 
 
 
f1d9939
 
 
 
 
 
ca8ee6a
201b316
 
 
19ab4f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2bf507
 
6148cfe
a2bf507
296b5e1
b8faf9d
a2bf507
 
 
 
 
8453b3d
 
296b5e1
6148cfe
296b5e1
 
 
 
 
 
 
 
 
f24a6e8
296b5e1
8453b3d
 
 
 
 
 
 
 
 
 
84a6b12
 
ac4bff9
20f9596
ca26f25
b8faf9d
45fcb1d
abc6224
899eb46
 
 
 
 
661e83c
 
f1d9939
 
201b316
19ab4f1
 
 
 
 
 
 
 
 
 
f24a6e8
 
 
19ab4f1
f24a6e8
19ab4f1
a2bf507
296b5e1
e5ef0df
 
afc1f96
e5ef0df
 
 
fdb3baa
e5ef0df
 
 
 
 
 
 
 
fdb3baa
e5ef0df
 
077980d
e5ef0df
 
 
 
fdb3baa
e5ef0df
 
 
 
 
 
 
 
 
fdb3baa
e5ef0df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
afc1f96
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
from dataclasses import dataclass, fields
import gradio as gr
from typing import Optional, Dict
import yaml


@dataclass
class WhisperParameters:
    """
    A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
    See more about Gradio pre-processing: https://www.gradio.app/docs/components

    The declaration order of the fields is significant: `as_list()` emits the components in
    that order and `as_value()` consumes the resulting values positionally.

    Attributes
    ----------
    model_size: gr.Dropdown
        Whisper model size.

    lang: gr.Dropdown
        Source language of the file to transcribe.

    is_translate: gr.Checkbox
        Boolean value that determines whether to translate to English.
        It's Whisper's feature to translate speech from another language directly into English end-to-end.

    beam_size: gr.Number
        Int value that is used for decoding option.

    log_prob_threshold: gr.Number
        If the average log probability over sampled tokens is below this value, treat as failed.

    no_speech_threshold: gr.Number
        If the no_speech probability is higher than this value AND
        the average log probability over sampled tokens is below `log_prob_threshold`,
        consider the segment as silent.

    compute_type: gr.Dropdown
        Compute type for transcription.
        See more info: https://opennmt.net/CTranslate2/quantization.html

    best_of: gr.Number
        Number of candidates when sampling with non-zero temperature.

    patience: gr.Number
        Beam search patience factor.

    condition_on_previous_text: gr.Checkbox
        If True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    prompt_reset_on_temperature: gr.Slider
        This parameter is related to faster-whisper. Resets the prompt if the temperature is above
        this value. Only has an effect when `condition_on_previous_text` is True.

    initial_prompt: gr.Textbox
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those words correctly.

    temperature: gr.Slider
        Temperature for sampling. It can be a tuple of temperatures,
        which will be successively used upon failures according to either
        `compression_ratio_threshold` or `log_prob_threshold`.

    compression_ratio_threshold: gr.Number
        If the gzip compression ratio is above this value, treat as failed.

    vad_filter: gr.Checkbox
        Enable the voice activity detection (VAD) to filter out parts of the audio
        without speech. This step is using the Silero VAD model
        https://github.com/snakers4/silero-vad.

    threshold: gr.Slider
        This parameter is related with Silero VAD. Speech threshold.
        Silero VAD outputs speech probabilities for each audio chunk,
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

    min_speech_duration_ms: gr.Number
        This parameter is related with Silero VAD. Final speech chunks shorter than
        min_speech_duration_ms are thrown out.

    max_speech_duration_s: gr.Number
        This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
        split aggressively just before max_speech_duration_s.

    min_silence_duration_ms: gr.Number
        This parameter is related with Silero VAD. At the end of each speech chunk wait for
        min_silence_duration_ms before separating it.

    speech_pad_ms: gr.Number
        This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side.

    chunk_length_s: gr.Number
        This parameter is related with insanely-fast-whisper pipe.
        Maximum length of each chunk.

    batch_size: gr.Number
        This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe.

    is_diarize: gr.Checkbox
        This parameter is related with whisperx. Boolean value that determines whether to diarize or not.

    hf_token: gr.Textbox
        This parameter is related with whisperx. Huggingface token is needed to download diarization models.
        Read more about: https://huggingface.co/pyannote/speaker-diarization-3.1#requirements

    diarization_device: gr.Dropdown
        This parameter is related with whisperx. Device to run diarization model.

    length_penalty: gr.Number
        This parameter is related to faster-whisper. Exponential length penalty constant.

    repetition_penalty: gr.Number
        This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
        (set > 1 to penalize).

    no_repeat_ngram_size: gr.Number
        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).

    prefix: gr.Textbox
        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.

    suppress_blank: gr.Checkbox
        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.

    suppress_tokens: gr.Textbox
        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
        of symbols as defined in the model config.json file.

    max_initial_timestamp: gr.Number
        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.

    word_timestamps: gr.Checkbox
        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
        and dynamic time warping, and include the timestamps for each word in each segment.

    prepend_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the next word.

    append_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the previous word.

    max_new_tokens: gr.Number
        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
        the maximum will be set by the default max_length.

    chunk_length: gr.Number
        This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will
        overwrite the default chunk_length of the FeatureExtractor.

    hallucination_silence_threshold: gr.Number
        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer
        than this threshold (in seconds) when a possible hallucination is detected.

    hotwords: gr.Textbox
        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with.
        Has no effect if prefix is not None.

    language_detection_threshold: gr.Number
        This parameter is related to faster-whisper. If the maximum probability of the language tokens is
        higher than this value, the language is detected.

    language_detection_segments: gr.Number
        This parameter is related to faster-whisper. Number of segments to consider for the language detection.
    """
    # NOTE: do not reorder these fields -- as_list()/as_value() rely on declaration order.
    model_size: gr.Dropdown
    lang: gr.Dropdown
    is_translate: gr.Checkbox
    beam_size: gr.Number
    log_prob_threshold: gr.Number
    no_speech_threshold: gr.Number
    compute_type: gr.Dropdown
    best_of: gr.Number
    patience: gr.Number
    condition_on_previous_text: gr.Checkbox
    prompt_reset_on_temperature: gr.Slider
    initial_prompt: gr.Textbox
    temperature: gr.Slider
    compression_ratio_threshold: gr.Number
    vad_filter: gr.Checkbox
    threshold: gr.Slider
    min_speech_duration_ms: gr.Number
    max_speech_duration_s: gr.Number
    min_silence_duration_ms: gr.Number
    speech_pad_ms: gr.Number
    chunk_length_s: gr.Number
    batch_size: gr.Number
    is_diarize: gr.Checkbox
    hf_token: gr.Textbox
    diarization_device: gr.Dropdown
    length_penalty: gr.Number
    repetition_penalty: gr.Number
    no_repeat_ngram_size: gr.Number
    prefix: gr.Textbox
    suppress_blank: gr.Checkbox
    suppress_tokens: gr.Textbox
    max_initial_timestamp: gr.Number
    word_timestamps: gr.Checkbox
    prepend_punctuations: gr.Textbox
    append_punctuations: gr.Textbox
    max_new_tokens: gr.Number
    chunk_length: gr.Number
    hallucination_silence_threshold: gr.Number
    hotwords: gr.Textbox
    language_detection_threshold: gr.Number
    language_detection_segments: gr.Number

    def as_list(self) -> list:
        """
        Converts the data class attributes into a list. Use in Gradio UI before Gradio pre-processing.
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        Returns
        ----------
        A list of Gradio components, in field declaration order
        """
        return [getattr(self, f.name) for f in fields(self)]

    @staticmethod
    def as_value(*args) -> 'WhisperValues':
        """
        To use Whisper parameters in function after Gradio post-processing.
        See more about Gradio post-processing: https://www.gradio.app/docs/components

        Parameters
        ----------
        *args
            Positional values forwarded unchanged to the `WhisperValues` constructor, so they
            must be supplied in the order in which `WhisperValues` declares its fields.

        Returns
        ----------
        WhisperValues
           Data class that has values of parameters
        """
        return WhisperValues(*args)


@dataclass
class WhisperValues:
    """
    A data class to use Whisper parameters.

    Holds plain Python values, one per field. `to_yaml()` exports most of them as a nested
    dictionary grouped into "whisper", "vad" and "diarization" sections.
    """
    # NOTE: do not reorder these fields -- instances may be built positionally.
    model_size: str
    lang: str
    is_translate: bool
    beam_size: int
    log_prob_threshold: float
    no_speech_threshold: float
    compute_type: str
    best_of: int
    patience: float
    condition_on_previous_text: bool
    prompt_reset_on_temperature: float
    initial_prompt: Optional[str]
    temperature: float
    compression_ratio_threshold: float
    vad_filter: bool
    threshold: float
    min_speech_duration_ms: int
    max_speech_duration_s: float
    min_silence_duration_ms: int
    speech_pad_ms: int
    chunk_length_s: int
    batch_size: int
    is_diarize: bool
    hf_token: str
    diarization_device: str
    length_penalty: float
    repetition_penalty: float
    no_repeat_ngram_size: int
    prefix: Optional[str]
    suppress_blank: bool
    suppress_tokens: Optional[str]
    max_initial_timestamp: float
    word_timestamps: bool
    prepend_punctuations: Optional[str]
    append_punctuations: Optional[str]
    max_new_tokens: Optional[int]
    chunk_length: Optional[int]
    hallucination_silence_threshold: Optional[float]
    hotwords: Optional[str]
    language_detection_threshold: Optional[float]
    language_detection_segments: int

    def to_yaml(self) -> Dict:
        """
        Build a nested, plain-Python dictionary of the parameter values.

        Despite the name, this returns a dictionary rather than YAML text; serializing it
        is left to the caller.

        Normalization applied while building the dictionary:
          * `lang` of None is written as "Automatic Detection".
          * Empty/falsy `initial_prompt`, `prefix` and `hotwords` are written as None.

        `compute_type` and `diarization_device` are not included in the output.

        Returns
        ----------
        Dict
            Dictionary with three sections: "whisper", "vad" and "diarization".
        """
        whisper_section = {
            "model_size": self.model_size,
            "lang": "Automatic Detection" if self.lang is None else self.lang,
            "is_translate": self.is_translate,
            "beam_size": self.beam_size,
            "log_prob_threshold": self.log_prob_threshold,
            "no_speech_threshold": self.no_speech_threshold,
            "best_of": self.best_of,
            "patience": self.patience,
            "condition_on_previous_text": self.condition_on_previous_text,
            "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
            # Collapse empty text to None so "no prompt" has one representation.
            "initial_prompt": self.initial_prompt or None,
            "temperature": self.temperature,
            "compression_ratio_threshold": self.compression_ratio_threshold,
            "chunk_length_s": self.chunk_length_s,
            "batch_size": self.batch_size,
            "length_penalty": self.length_penalty,
            "repetition_penalty": self.repetition_penalty,
            "no_repeat_ngram_size": self.no_repeat_ngram_size,
            "prefix": self.prefix or None,
            "suppress_blank": self.suppress_blank,
            "suppress_tokens": self.suppress_tokens,
            "max_initial_timestamp": self.max_initial_timestamp,
            "word_timestamps": self.word_timestamps,
            "prepend_punctuations": self.prepend_punctuations,
            "append_punctuations": self.append_punctuations,
            "max_new_tokens": self.max_new_tokens,
            "chunk_length": self.chunk_length,
            "hallucination_silence_threshold": self.hallucination_silence_threshold,
            "hotwords": self.hotwords or None,
            "language_detection_threshold": self.language_detection_threshold,
            "language_detection_segments": self.language_detection_segments,
        }
        vad_section = {
            "vad_filter": self.vad_filter,
            "threshold": self.threshold,
            "min_speech_duration_ms": self.min_speech_duration_ms,
            "max_speech_duration_s": self.max_speech_duration_s,
            "min_silence_duration_ms": self.min_silence_duration_ms,
            "speech_pad_ms": self.speech_pad_ms,
        }
        diarization_section = {
            "is_diarize": self.is_diarize,
            "hf_token": self.hf_token,
        }
        return {
            "whisper": whisper_section,
            "vad": vad_section,
            "diarization": diarization_section,
        }