jhj0517 commited on
Commit
2b4b831
·
unverified ·
2 Parent(s): 6a24751 20f9596

Merge pull request #200 from jhj0517/feature/add-parameter

Browse files
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import argparse
 
3
 
4
  from modules.whisper.whisper_Inference import WhisperInference
5
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
@@ -59,20 +60,126 @@ class App:
59
  )
60
  return whisper_inf
61
 
62
- @staticmethod
63
- def open_folder(folder_path: str):
64
- if os.path.exists(folder_path):
65
- os.system(f"start {folder_path}")
66
- else:
67
- print(f"The folder {folder_path} does not exist.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- @staticmethod
70
- def on_change_models(model_size: str):
71
- translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
72
- if model_size not in translatable_model:
73
- return gr.Checkbox(visible=False, value=False, interactive=False)
74
- else:
75
- return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  def launch(self):
78
  with self.app:
@@ -84,47 +191,13 @@ class App:
84
  with gr.Column():
85
  input_file = gr.Files(type="filepath", label="Upload File here")
86
  tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
87
- info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
88
  " Leave this field empty if you do not wish to use a local path.",
89
  visible=self.args.colab,
90
  value="")
91
- with gr.Row():
92
- dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
93
- label="Model")
94
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
95
- value="Automatic Detection", label="Language")
96
- dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
97
- with gr.Row():
98
- cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
99
- with gr.Row():
100
- cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
101
- with gr.Accordion("Advanced Parameters", open=False):
102
- nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
103
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
104
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
105
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
106
- nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
107
- nb_patience = gr.Number(label="Patience", value=1, interactive=True)
108
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
109
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
110
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
111
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
112
- with gr.Accordion("VAD", open=False):
113
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
114
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
115
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
116
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
117
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
118
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
119
- with gr.Accordion("Diarization", open=False):
120
- cb_diarize = gr.Checkbox(label="Enable Diarization")
121
- tb_hf_token = gr.Text(label="HuggingFace Token", value="",
122
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
123
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
124
- dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
125
- with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
126
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
127
- nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
128
  with gr.Row():
129
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
130
  with gr.Row():
@@ -133,36 +206,10 @@ class App:
133
  btn_openfolder = gr.Button('📂', scale=1)
134
 
135
  params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
136
- whisper_params = WhisperParameters(model_size=dd_model,
137
- lang=dd_lang,
138
- is_translate=cb_translate,
139
- beam_size=nb_beam_size,
140
- log_prob_threshold=nb_log_prob_threshold,
141
- no_speech_threshold=nb_no_speech_threshold,
142
- compute_type=dd_compute_type,
143
- best_of=nb_best_of,
144
- patience=nb_patience,
145
- condition_on_previous_text=cb_condition_on_previous_text,
146
- initial_prompt=tb_initial_prompt,
147
- temperature=sd_temperature,
148
- compression_ratio_threshold=nb_compression_ratio_threshold,
149
- vad_filter=cb_vad_filter,
150
- threshold=sd_threshold,
151
- min_speech_duration_ms=nb_min_speech_duration_ms,
152
- max_speech_duration_s=nb_max_speech_duration_s,
153
- min_silence_duration_ms=nb_min_silence_duration_ms,
154
- speech_pad_ms=nb_speech_pad_ms,
155
- chunk_length_s=nb_chunk_length_s,
156
- batch_size=nb_batch_size,
157
- is_diarize=cb_diarize,
158
- hf_token=tb_hf_token,
159
- diarization_device=dd_diarization_device)
160
-
161
  btn_run.click(fn=self.whisper_inf.transcribe_file,
162
  inputs=params + whisper_params.as_list(),
163
  outputs=[tb_indicator, files_subtitles])
164
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
165
- dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
166
 
167
  with gr.TabItem("Youtube"): # tab2
168
  with gr.Row():
@@ -173,45 +220,9 @@ class App:
173
  with gr.Column():
174
  tb_title = gr.Label(label="Youtube Title")
175
  tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
176
- with gr.Row():
177
- dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
178
- label="Model")
179
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
180
- value="Automatic Detection", label="Language")
181
- dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
182
- with gr.Row():
183
- cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
184
- with gr.Row():
185
- cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
186
- interactive=True)
187
- with gr.Accordion("Advanced Parameters", open=False):
188
- nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
189
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
190
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
191
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
192
- nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
193
- nb_patience = gr.Number(label="Patience", value=1, interactive=True)
194
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
195
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
196
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
197
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
198
- with gr.Accordion("VAD", open=False):
199
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
200
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
201
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
202
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
203
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
204
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
205
- with gr.Accordion("Diarization", open=False):
206
- cb_diarize = gr.Checkbox(label="Enable Diarization")
207
- tb_hf_token = gr.Text(label="HuggingFace Token", value="",
208
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
209
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
210
- dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
211
- with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
212
- visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
213
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
214
- nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
215
  with gr.Row():
216
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
217
  with gr.Row():
@@ -220,30 +231,6 @@ class App:
220
  btn_openfolder = gr.Button('📂', scale=1)
221
 
222
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
223
- whisper_params = WhisperParameters(model_size=dd_model,
224
- lang=dd_lang,
225
- is_translate=cb_translate,
226
- beam_size=nb_beam_size,
227
- log_prob_threshold=nb_log_prob_threshold,
228
- no_speech_threshold=nb_no_speech_threshold,
229
- compute_type=dd_compute_type,
230
- best_of=nb_best_of,
231
- patience=nb_patience,
232
- condition_on_previous_text=cb_condition_on_previous_text,
233
- initial_prompt=tb_initial_prompt,
234
- temperature=sd_temperature,
235
- compression_ratio_threshold=nb_compression_ratio_threshold,
236
- vad_filter=cb_vad_filter,
237
- threshold=sd_threshold,
238
- min_speech_duration_ms=nb_min_speech_duration_ms,
239
- max_speech_duration_s=nb_max_speech_duration_s,
240
- min_silence_duration_ms=nb_min_silence_duration_ms,
241
- speech_pad_ms=nb_speech_pad_ms,
242
- chunk_length_s=nb_chunk_length_s,
243
- batch_size=nb_batch_size,
244
- is_diarize=cb_diarize,
245
- hf_token=tb_hf_token,
246
- diarization_device=dd_diarization_device)
247
 
248
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
249
  inputs=params + whisper_params.as_list(),
@@ -251,48 +238,13 @@ class App:
251
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
252
  outputs=[img_thumbnail, tb_title, tb_description])
253
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
254
- dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
255
 
256
  with gr.TabItem("Mic"): # tab3
257
  with gr.Row():
258
  mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
259
- with gr.Row():
260
- dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
261
- label="Model")
262
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
263
- value="Automatic Detection", label="Language")
264
- dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
265
- with gr.Row():
266
- cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
267
- with gr.Accordion("Advanced Parameters", open=False):
268
- nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
269
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
270
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
271
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
272
- nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
273
- nb_patience = gr.Number(label="Patience", value=1, interactive=True)
274
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
275
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
276
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
277
- with gr.Accordion("VAD", open=False):
278
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
279
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
280
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
281
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
282
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
283
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
284
- with gr.Accordion("Diarization", open=False):
285
- cb_diarize = gr.Checkbox(label="Enable Diarization")
286
- tb_hf_token = gr.Text(label="HuggingFace Token", value="",
287
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
288
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
289
- dd_diarization_device = gr.Dropdown(label="Device",
290
- choices=self.whisper_inf.diarizer.get_available_device(),
291
- value=self.whisper_inf.diarizer.get_device())
292
- with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
293
- visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
294
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
295
- nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
296
  with gr.Row():
297
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
298
  with gr.Row():
@@ -301,36 +253,11 @@ class App:
301
  btn_openfolder = gr.Button('📂', scale=1)
302
 
303
  params = [mic_input, dd_file_format]
304
- whisper_params = WhisperParameters(model_size=dd_model,
305
- lang=dd_lang,
306
- is_translate=cb_translate,
307
- beam_size=nb_beam_size,
308
- log_prob_threshold=nb_log_prob_threshold,
309
- no_speech_threshold=nb_no_speech_threshold,
310
- compute_type=dd_compute_type,
311
- best_of=nb_best_of,
312
- patience=nb_patience,
313
- condition_on_previous_text=cb_condition_on_previous_text,
314
- initial_prompt=tb_initial_prompt,
315
- temperature=sd_temperature,
316
- compression_ratio_threshold=nb_compression_ratio_threshold,
317
- vad_filter=cb_vad_filter,
318
- threshold=sd_threshold,
319
- min_speech_duration_ms=nb_min_speech_duration_ms,
320
- max_speech_duration_s=nb_max_speech_duration_s,
321
- min_silence_duration_ms=nb_min_silence_duration_ms,
322
- speech_pad_ms=nb_speech_pad_ms,
323
- chunk_length_s=nb_chunk_length_s,
324
- batch_size=nb_batch_size,
325
- is_diarize=cb_diarize,
326
- hf_token=tb_hf_token,
327
- diarization_device=dd_diarization_device)
328
 
329
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
330
  inputs=params + whisper_params.as_list(),
331
  outputs=[tb_indicator, files_subtitles])
332
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
333
- dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
334
 
335
  with gr.TabItem("T2T Translation"): # tab 4
336
  with gr.Row():
@@ -389,7 +316,8 @@ class App:
389
  md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
390
 
391
  btn_run.click(fn=self.nllb_inf.translate_file,
392
- inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang, nb_max_length, cb_timestamp],
 
393
  outputs=[tb_indicator, files_subtitles])
394
 
395
  btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
@@ -412,10 +340,26 @@ class App:
412
 
413
  self.app.queue(api_open=False).launch(**launch_args)
414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
  # Create the parser for command-line arguments
417
  parser = argparse.ArgumentParser()
418
- parser.add_argument('--whisper_type', type=str, default="faster-whisper", help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
 
419
  parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
420
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
421
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -425,11 +369,17 @@ parser.add_argument('--password', type=str, default=None, help='Gradio authentic
425
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
426
  parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
427
  parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
428
- parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
429
- parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
430
- parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
431
- parser.add_argument('--diarization_model_dir', type=str, default=os.path.join("models", "Diarization"), help='Directory path of the diarization model')
432
- parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"), help='Directory path of the Facebook NLLB model')
 
 
 
 
 
 
433
  parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
434
  _args = parser.parse_args()
435
 
 
1
  import os
2
  import argparse
3
+ import gradio as gr
4
 
5
  from modules.whisper.whisper_Inference import WhisperInference
6
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
 
60
  )
61
  return whisper_inf
62
 
63
+ def create_whisper_parameters(self):
64
+ with gr.Row():
65
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
66
+ label="Model")
67
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
68
+ value="Automatic Detection", label="Language")
69
+ dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
70
+ with gr.Row():
71
+ cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
72
+ with gr.Row():
73
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
74
+ interactive=True)
75
+ with gr.Accordion("Advanced Parameters", open=False):
76
+ nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True,
77
+ info="Beam size to use for decoding.")
78
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True,
79
+ info="If the average log probability over sampled tokens is below this value, treat as failed.")
80
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True,
81
+ info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
82
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
83
+ value=self.whisper_inf.current_compute_type, interactive=True,
84
+ info="Select the type of computation to perform.")
85
+ nb_best_of = gr.Number(label="Best Of", value=5, interactive=True,
86
+ info="Number of candidates when sampling with non-zero temperature.")
87
+ nb_patience = gr.Number(label="Patience", value=1, interactive=True,
88
+ info="Beam search patience factor.")
89
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
90
+ interactive=True,
91
+ info="Condition on previous text during decoding.")
92
+ sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=0.5,
93
+ minimum=0, maximum=1, step=0.01, interactive=True,
94
+ info="Resets prompt if temperature is above this value."
95
+ " Arg has effect only if 'Condition On Previous Text' is True.")
96
+ tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
97
+ info="Initial prompt to use for decoding.")
98
+ sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True,
99
+ info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
100
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True,
101
+ info="If the gzip compression ratio is above this value, treat as failed.")
102
+ with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
103
+ nb_length_penalty = gr.Number(label="Length Penalty", value=1,
104
+ info="Exponential length penalty constant.")
105
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
106
+ info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
107
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
108
+ info="Prevent repetitions of n-grams with this size (set 0 to disable).")
109
+ tb_prefix = gr.Textbox(label="Prefix", value=lambda: None,
110
+ info="Optional text to provide as a prefix for the first window.")
111
+ cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
112
+ info="Suppress blank outputs at the beginning of the sampling.")
113
+ tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="[-1]",
114
+ info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
115
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
116
+ info="The initial timestamp cannot be later than this.")
117
+ cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
118
+ info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
119
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
120
+ info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
121
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations", value="\"'.。,,!!??::”)]}、",
122
+ info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
123
+ nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: None, precision=0,
124
+ info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
125
+ nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: None, precision=0,
126
+ info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
127
+ nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
128
+ value=lambda: None,
129
+ info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
130
+ tb_hotwords = gr.Textbox(label="Hotwords", value=None,
131
+ info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
132
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=None,
133
+ info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
134
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1, precision=0,
135
+ info="Number of segments to consider for the language detection.")
136
+ with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
137
+ nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
138
+ nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
139
 
140
+ with gr.Accordion("VAD", open=False):
141
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
142
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
143
+ info="Lower it to be more sensitive to small sounds.")
144
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
145
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
146
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
147
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
148
+
149
+ with gr.Accordion("Diarization", open=False):
150
+ cb_diarize = gr.Checkbox(label="Enable Diarization")
151
+ tb_hf_token = gr.Text(label="HuggingFace Token", value="",
152
+ info="This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
153
+ dd_diarization_device = gr.Dropdown(label="Device",
154
+ choices=self.whisper_inf.diarizer.get_available_device(),
155
+ value=self.whisper_inf.diarizer.get_device())
156
+
157
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
158
+
159
+ return (
160
+ WhisperParameters(
161
+ model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
162
+ log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
163
+ compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
164
+ condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
165
+ temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
166
+ vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
167
+ max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
168
+ speech_pad_ms=nb_speech_pad_ms, chunk_length_s=nb_chunk_length_s, batch_size=nb_batch_size,
169
+ is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
170
+ length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
171
+ no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
172
+ suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
173
+ word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
174
+ append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens, chunk_length=nb_chunk_length,
175
+ hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
176
+ language_detection_threshold=nb_language_detection_threshold,
177
+ language_detection_segments=nb_language_detection_segments,
178
+ prompt_reset_on_temperature=sld_prompt_reset_on_temperature
179
+ ),
180
+ dd_file_format,
181
+ cb_timestamp
182
+ )
183
 
184
  def launch(self):
185
  with self.app:
 
191
  with gr.Column():
192
  input_file = gr.Files(type="filepath", label="Upload File here")
193
  tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
194
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
195
  " Leave this field empty if you do not wish to use a local path.",
196
  visible=self.args.colab,
197
  value="")
198
+
199
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
200
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  with gr.Row():
202
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
203
  with gr.Row():
 
206
  btn_openfolder = gr.Button('📂', scale=1)
207
 
208
  params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  btn_run.click(fn=self.whisper_inf.transcribe_file,
210
  inputs=params + whisper_params.as_list(),
211
  outputs=[tb_indicator, files_subtitles])
212
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
213
 
214
  with gr.TabItem("Youtube"): # tab2
215
  with gr.Row():
 
220
  with gr.Column():
221
  tb_title = gr.Label(label="Youtube Title")
222
  tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
223
+
224
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
225
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  with gr.Row():
227
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
228
  with gr.Row():
 
231
  btn_openfolder = gr.Button('📂', scale=1)
232
 
233
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
236
  inputs=params + whisper_params.as_list(),
 
238
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
239
  outputs=[img_thumbnail, tb_title, tb_description])
240
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
241
 
242
  with gr.TabItem("Mic"): # tab3
243
  with gr.Row():
244
  mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
245
+
246
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
247
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  with gr.Row():
249
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
250
  with gr.Row():
 
253
  btn_openfolder = gr.Button('📂', scale=1)
254
 
255
  params = [mic_input, dd_file_format]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
258
  inputs=params + whisper_params.as_list(),
259
  outputs=[tb_indicator, files_subtitles])
260
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
261
 
262
  with gr.TabItem("T2T Translation"): # tab 4
263
  with gr.Row():
 
316
  md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
317
 
318
  btn_run.click(fn=self.nllb_inf.translate_file,
319
+ inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang,
320
+ nb_max_length, cb_timestamp],
321
  outputs=[tb_indicator, files_subtitles])
322
 
323
  btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
 
340
 
341
  self.app.queue(api_open=False).launch(**launch_args)
342
 
343
+ @staticmethod
344
+ def open_folder(folder_path: str):
345
+ if os.path.exists(folder_path):
346
+ os.system(f"start {folder_path}")
347
+ else:
348
+ print(f"The folder {folder_path} does not exist.")
349
+
350
+ @staticmethod
351
+ def on_change_models(model_size: str):
352
+ translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
353
+ if model_size not in translatable_model:
354
+ return gr.Checkbox(visible=False, value=False, interactive=False)
355
+ else:
356
+ return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
357
+
358
 
359
  # Create the parser for command-line arguments
360
  parser = argparse.ArgumentParser()
361
+ parser.add_argument('--whisper_type', type=str, default="faster-whisper",
362
+ help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
363
  parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
364
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
365
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
 
369
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
370
  parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
371
  parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
372
+ parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"),
373
+ help='Directory path of the whisper model')
374
+ parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"),
375
+ help='Directory path of the faster-whisper model')
376
+ parser.add_argument('--insanely_fast_whisper_model_dir', type=str,
377
+ default=os.path.join("models", "Whisper", "insanely-fast-whisper"),
378
+ help='Directory path of the insanely-fast-whisper model')
379
+ parser.add_argument('--diarization_model_dir', type=str, default=os.path.join("models", "Diarization"),
380
+ help='Directory path of the diarization model')
381
+ parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"),
382
+ help='Directory path of the Facebook NLLB model')
383
  parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
384
  _args = parser.parse_args()
385
 
modules/whisper/faster_whisper_inference.py CHANGED
@@ -5,6 +5,7 @@ import torch
5
  from typing import BinaryIO, Union, Tuple, List
6
  import faster_whisper
7
  from faster_whisper.vad import VadOptions
 
8
  import ctranslate2
9
  import whisper
10
  import gradio as gr
@@ -62,6 +63,8 @@ class FasterWhisperInference(WhisperBase):
62
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
63
  self.update_model(params.model_size, params.compute_type, progress)
64
 
 
 
65
  segments, info = self.model.transcribe(
66
  audio=audio,
67
  language=params.lang,
@@ -73,6 +76,23 @@ class FasterWhisperInference(WhisperBase):
73
  patience=params.patience,
74
  temperature=params.temperature,
75
  compression_ratio_threshold=params.compression_ratio_threshold,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  )
77
  progress(0, desc="Loading audio..")
78
 
@@ -147,3 +167,13 @@ class FasterWhisperInference(WhisperBase):
147
  return "cuda"
148
  else:
149
  return "auto"
 
 
 
 
 
 
 
 
 
 
 
5
  from typing import BinaryIO, Union, Tuple, List
6
  import faster_whisper
7
  from faster_whisper.vad import VadOptions
8
+ import ast
9
  import ctranslate2
10
  import whisper
11
  import gradio as gr
 
63
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
64
  self.update_model(params.model_size, params.compute_type, progress)
65
 
66
+ params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
67
+
68
  segments, info = self.model.transcribe(
69
  audio=audio,
70
  language=params.lang,
 
76
  patience=params.patience,
77
  temperature=params.temperature,
78
  compression_ratio_threshold=params.compression_ratio_threshold,
79
+ length_penalty=params.length_penalty,
80
+ repetition_penalty=params.repetition_penalty,
81
+ no_repeat_ngram_size=params.no_repeat_ngram_size,
82
+ prefix=params.prefix,
83
+ suppress_blank=params.suppress_blank,
84
+ suppress_tokens=params.suppress_tokens,
85
+ max_initial_timestamp=params.max_initial_timestamp,
86
+ word_timestamps=params.word_timestamps,
87
+ prepend_punctuations=params.prepend_punctuations,
88
+ append_punctuations=params.append_punctuations,
89
+ max_new_tokens=params.max_new_tokens,
90
+ chunk_length=params.chunk_length,
91
+ hallucination_silence_threshold=params.hallucination_silence_threshold,
92
+ hotwords=params.hotwords,
93
+ language_detection_threshold=params.language_detection_threshold,
94
+ language_detection_segments=params.language_detection_segments,
95
+ prompt_reset_on_temperature=params.prompt_reset_on_temperature
96
  )
97
  progress(0, desc="Loading audio..")
98
 
 
167
  return "cuda"
168
  else:
169
  return "auto"
170
+
171
+ @staticmethod
172
+ def format_suppress_tokens_str(suppress_tokens_str: str) -> List[int]:
173
+ try:
174
+ suppress_tokens = ast.literal_eval(suppress_tokens_str)
175
+ if not isinstance(suppress_tokens, list) or not all(isinstance(item, int) for item in suppress_tokens):
176
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
177
+ return suppress_tokens
178
+ except Exception as e:
179
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
modules/whisper/whisper_parameter.py CHANGED
@@ -15,6 +15,7 @@ class WhisperParameters:
15
  best_of: gr.Number
16
  patience: gr.Number
17
  condition_on_previous_text: gr.Checkbox
 
18
  initial_prompt: gr.Textbox
19
  temperature: gr.Slider
20
  compression_ratio_threshold: gr.Number
@@ -29,6 +30,22 @@ class WhisperParameters:
29
  is_diarize: gr.Checkbox
30
  hf_token: gr.Textbox
31
  diarization_device: gr.Dropdown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
34
  This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -129,6 +146,62 @@ class WhisperParameters:
129
 
130
  diarization_device: gr.Dropdown
131
  This parameter is related to whisperx. Device to run the diarization model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  """
133
 
134
  def as_list(self) -> list:
@@ -153,32 +226,7 @@ class WhisperParameters:
153
  WhisperValues
154
  Data class that has values of parameters
155
  """
156
- return WhisperValues(
157
- model_size=args[0],
158
- lang=args[1],
159
- is_translate=args[2],
160
- beam_size=args[3],
161
- log_prob_threshold=args[4],
162
- no_speech_threshold=args[5],
163
- compute_type=args[6],
164
- best_of=args[7],
165
- patience=args[8],
166
- condition_on_previous_text=args[9],
167
- initial_prompt=args[10],
168
- temperature=args[11],
169
- compression_ratio_threshold=args[12],
170
- vad_filter=args[13],
171
- threshold=args[14],
172
- min_speech_duration_ms=args[15],
173
- max_speech_duration_s=args[16],
174
- min_silence_duration_ms=args[17],
175
- speech_pad_ms=args[18],
176
- chunk_length_s=args[19],
177
- batch_size=args[20],
178
- is_diarize=args[21],
179
- hf_token=args[22],
180
- diarization_device=args[23]
181
- )
182
 
183
 
184
  @dataclass
@@ -193,6 +241,7 @@ class WhisperValues:
193
  best_of: int
194
  patience: float
195
  condition_on_previous_text: bool
 
196
  initial_prompt: Optional[str]
197
  temperature: float
198
  compression_ratio_threshold: float
@@ -207,6 +256,22 @@ class WhisperValues:
207
  is_diarize: bool
208
  hf_token: str
209
  diarization_device: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  """
211
  A data class to use Whisper parameters.
212
  """
 
15
  best_of: gr.Number
16
  patience: gr.Number
17
  condition_on_previous_text: gr.Checkbox
18
+ prompt_reset_on_temperature: gr.Slider
19
  initial_prompt: gr.Textbox
20
  temperature: gr.Slider
21
  compression_ratio_threshold: gr.Number
 
30
  is_diarize: gr.Checkbox
31
  hf_token: gr.Textbox
32
  diarization_device: gr.Dropdown
33
+ length_penalty: gr.Number
34
+ repetition_penalty: gr.Number
35
+ no_repeat_ngram_size: gr.Number
36
+ prefix: gr.Textbox
37
+ suppress_blank: gr.Checkbox
38
+ suppress_tokens: gr.Textbox
39
+ max_initial_timestamp: gr.Number
40
+ word_timestamps: gr.Checkbox
41
+ prepend_punctuations: gr.Textbox
42
+ append_punctuations: gr.Textbox
43
+ max_new_tokens: gr.Number
44
+ chunk_length: gr.Number
45
+ hallucination_silence_threshold: gr.Number
46
+ hotwords: gr.Textbox
47
+ language_detection_threshold: gr.Number
48
+ language_detection_segments: gr.Number
49
  """
50
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
51
  This data class is used to mitigate the key-value problem between Gradio components and function parameters.
 
146
 
147
  diarization_device: gr.Dropdown
148
  This parameter is related to whisperx. Device to run the diarization model
149
+
150
+ length_penalty:
151
+ This parameter is related to faster-whisper. Exponential length penalty constant.
152
+
153
+ repetition_penalty:
154
+ This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
155
+ (set > 1 to penalize).
156
+
157
+ no_repeat_ngram_size:
158
+ This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
159
+
160
+ prefix:
161
+ This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
162
+
163
+ suppress_blank:
164
+ This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
165
+
166
+ suppress_tokens:
167
+ This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
168
+ of symbols as defined in the model config.json file.
169
+
170
+ max_initial_timestamp:
171
+ This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
172
+
173
+ word_timestamps:
174
+ This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
175
+ and dynamic time warping, and include the timestamps for each word in each segment.
176
+
177
+ prepend_punctuations:
178
+ This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
179
+ with the next word.
180
+
181
+ append_punctuations:
182
+ This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
183
+ with the previous word.
184
+
185
+ max_new_tokens:
186
+ This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
187
+ the maximum will be set by the default max_length.
188
+
189
+ chunk_length:
190
+ This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will override the
191
+ default chunk_length of the FeatureExtractor.
192
+
193
+ hallucination_silence_threshold:
194
+ This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
195
+ (in seconds) when a possible hallucination is detected.
196
+
197
+ hotwords:
198
+ This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
199
+
200
+ language_detection_threshold:
201
+ This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
202
+
203
+ language_detection_segments:
204
+ This parameter is related to faster-whisper. Number of segments to consider for the language detection.
205
  """
206
 
207
  def as_list(self) -> list:
 
226
  WhisperValues
227
  Data class that has values of parameters
228
  """
229
+ return WhisperValues(*args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
 
232
  @dataclass
 
241
  best_of: int
242
  patience: float
243
  condition_on_previous_text: bool
244
+ prompt_reset_on_temperature: float
245
  initial_prompt: Optional[str]
246
  temperature: float
247
  compression_ratio_threshold: float
 
256
  is_diarize: bool
257
  hf_token: str
258
  diarization_device: str
259
+ length_penalty: float
260
+ repetition_penalty: float
261
+ no_repeat_ngram_size: int
262
+ prefix: Optional[str]
263
+ suppress_blank: bool
264
+ suppress_tokens: Optional[str]
265
+ max_initial_timestamp: float
266
+ word_timestamps: bool
267
+ prepend_punctuations: Optional[str]
268
+ append_punctuations: Optional[str]
269
+ max_new_tokens: Optional[int]
270
+ chunk_length: Optional[int]
271
+ hallucination_silence_threshold: Optional[float]
272
+ hotwords: Optional[str]
273
+ language_detection_threshold: Optional[float]
274
+ language_detection_segments: int
275
  """
276
  A data class to use Whisper parameters.
277
  """