jhj0517 committed
Commit · 84a6b12
1 Parent(s): 875278a

add parameters
- app.py: +15 -3
- modules/faster_whisper_inference.py: +2 -0
- modules/whisper_Inference.py: +2 -0
- modules/whisper_data_class.py: +8 -0
app.py
CHANGED
@@ -63,6 +63,8 @@ class App:
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                 nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
+                nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
+                nb_patience = gr.Number(label="Patience", value=1, interactive=True)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():
@@ -77,7 +79,9 @@ class App:
                     beam_size=nb_beam_size,
                     log_prob_threshold=nb_log_prob_threshold,
                     no_speech_threshold=nb_no_speech_threshold,
-                    compute_type=dd_compute_type)
+                    compute_type=dd_compute_type,
+                    best_of=nb_best_of,
+                    patience=nb_patience)
             btn_run.click(fn=self.whisper_inf.transcribe_file,
                           inputs=params + whisper_params.to_list(),
                           outputs=[tb_indicator, files_subtitles])
@@ -109,6 +113,8 @@ class App:
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                 nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
+                nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
+                nb_patience = gr.Number(label="Patience", value=1, interactive=True)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():
@@ -123,7 +129,9 @@ class App:
                     beam_size=nb_beam_size,
                     log_prob_threshold=nb_log_prob_threshold,
                     no_speech_threshold=nb_no_speech_threshold,
-                    compute_type=dd_compute_type)
+                    compute_type=dd_compute_type,
+                    best_of=nb_best_of,
+                    patience=nb_patience)
             btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                           inputs=params + whisper_params.to_list(),
                           outputs=[tb_indicator, files_subtitles])
@@ -148,6 +156,8 @@ class App:
                 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                 nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
+                nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
+                nb_patience = gr.Number(label="Patience", value=1, interactive=True)
             with gr.Row():
                 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
             with gr.Row():
@@ -162,7 +172,9 @@ class App:
                     beam_size=nb_beam_size,
                     log_prob_threshold=nb_log_prob_threshold,
                     no_speech_threshold=nb_no_speech_threshold,
-                    compute_type=dd_compute_type)
+                    compute_type=dd_compute_type,
+                    best_of=nb_best_of,
+                    patience=nb_patience)
             btn_run.click(fn=self.whisper_inf.transcribe_mic,
                           inputs=params + whisper_params.to_list(),
                           outputs=[tb_indicator, files_subtitles])
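For orientation, the block below is a minimal, self-contained sketch of how the two new controls feed a Gradio click handler. It is not the repository's app.py: the stub handler and the beam-size default are illustrative assumptions; only the nb_best_of and nb_patience definitions mirror the diff above.

import gradio as gr

def transcribe_stub(beam_size, best_of, patience):
    # Stand-in for self.whisper_inf.transcribe_file in the real app.
    return f"beam_size={beam_size}, best_of={best_of}, patience={patience}"

with gr.Blocks() as demo:
    nb_beam_size = gr.Number(label="Beam Size", value=1, interactive=True)   # assumed default
    nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)       # added in this commit
    nb_patience = gr.Number(label="Patience", value=1, interactive=True)     # added in this commit
    tb_indicator = gr.Textbox(label="Output")
    btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
    # Gradio passes the component values positionally, in the order listed in inputs.
    btn_run.click(fn=transcribe_stub,
                  inputs=[nb_beam_size, nb_best_of, nb_patience],
                  outputs=[tb_indicator])

if __name__ == "__main__":
    demo.launch()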
modules/faster_whisper_inference.py
CHANGED
@@ -264,6 +264,8 @@ class FasterWhisperInference(BaseInterface):
             beam_size=params.beam_size,
             log_prob_threshold=params.log_prob_threshold,
             no_speech_threshold=params.no_speech_threshold,
+            best_of=params.best_of,
+            patience=params.patience
         )
         progress(0, desc="Loading audio..")

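As a standalone reference, the sketch below shows how faster-whisper accepts the two options being threaded through here. The model size, device, and audio path are placeholder assumptions; the keyword arguments match the ones used in the hunk above.

from faster_whisper import WhisperModel

# Placeholder model/device/audio choices; only the keyword arguments matter here.
model = WhisperModel("base", device="cpu", compute_type="int8")

segments, info = model.transcribe(
    "audio.wav",
    beam_size=5,
    log_prob_threshold=-1.0,
    no_speech_threshold=0.6,
    best_of=5,      # candidates considered when sampling with non-zero temperature
    patience=1.0,   # beam search patience factor
)

# segments is a generator of Segment objects with start/end times and text.
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")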
modules/whisper_Inference.py
CHANGED
@@ -255,6 +255,8 @@ class WhisperInference(BaseInterface):
             no_speech_threshold=params.no_speech_threshold,
             task="translate" if params.is_translate and self.current_model_size in self.translatable_model else "transcribe",
             fp16=True if params.compute_type == "float16" else False,
+            best_of=params.best_of,
+            patience=params.patience,
             progress_callback=progress_callback)["segments"]
         elapsed_time = time.time() - start_time

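For comparison, this is roughly how the same two options are passed to stock openai-whisper. The model size and audio path are placeholders, and progress_callback is omitted because it comes from this repository's patched transcribe rather than upstream whisper. In whisper's transcribe loop, best_of only takes effect on the sampling (non-zero temperature) fallback passes, while patience applies to beam search and requires beam_size.

import whisper

# Placeholder model size and audio path.
model = whisper.load_model("base")

result = model.transcribe(
    "audio.wav",
    beam_size=5,
    best_of=5,       # used only on the higher-temperature retry passes
    patience=1.0,    # beam search patience; requires beam_size
    fp16=False,
    no_speech_threshold=0.6,
)

for segment in result["segments"]:
    print(f"[{segment['start']:.2f} -> {segment['end']:.2f}] {segment['text']}")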
modules/whisper_data_class.py
CHANGED
@@ -11,6 +11,8 @@ class WhisperGradioComponents:
     log_prob_threshold: gr.Number
     no_speech_threshold: gr.Number
     compute_type: gr.Dropdown
+    best_of: gr.Number
+    patience: gr.Number
     """
     A data class to pass Gradio components to the function before Gradio pre-processing.
     See this documentation for more information about Gradio pre-processing: https://www.gradio.app/docs/components
@@ -35,6 +37,10 @@ class WhisperGradioComponents:
     compute_type: gr.Dropdown
         compute type for transcription.
         see more info : https://opennmt.net/CTranslate2/quantization.html
+    best_of: gr.Number
+        Number of candidates when sampling with non-zero temperature.
+    patience: gr.Number
+        Beam search patience factor.
     """

     def to_list(self) -> list:
@@ -74,6 +80,8 @@ class WhisperValues:
     log_prob_threshold: float
     no_speech_threshold: float
     compute_type: str
+    best_of: int
+    patience: float
     """
     A data class to use Whisper parameters in the function after Gradio pre-processing.
     See this documentation for more information about Gradio pre-processing: : https://www.gradio.app/docs/components