Merge pull request #255 from jhj0517/feature/remember-settings
Changed files:
- .gitignore (+2 -1)
- app.py (+88 -67)
- configs/default_parameters.yaml (+58 -0)
- modules/diarize/diarize_pipeline.py (+2 -1)
- modules/diarize/diarizer.py (+2 -1)
- modules/translation/deepl_api.py (+27 -2)
- modules/translation/nllb_inference.py (+3 -2)
- modules/translation/translation_base.py (+26 -2)
- {ui → modules/ui}/__init__.py (+0 -0)
- {ui → modules/ui}/htmls.py (+0 -0)
- modules/utils/files_manager.py (+25 -1)
- modules/utils/paths.py (+24 -0)
- modules/whisper/faster_whisper_inference.py (+5 -6)
- modules/whisper/insanely_fast_whisper_inference.py (+4 -3)
- modules/whisper/whisper_Inference.py (+4 -3)
- modules/whisper/whisper_base.py (+28 -5)
- modules/whisper/whisper_factory.py (+7 -5)
- modules/whisper/whisper_parameter.py (+53 -2)
- requirements.txt (+1 -0)
.gitignore
CHANGED
@@ -3,8 +3,9 @@
 *.mp4
 *.mp3
 venv/
-ui/__pycache__/
+modules/ui/__pycache__/
 outputs/
 modules/__pycache__/
 models/
 modules/yt_tmp.wav
+configs/default_parameters.yaml
app.py
CHANGED
@@ -1,12 +1,16 @@
 import os
 import argparse
 import gradio as gr
+import yaml
 
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
+from modules.utils.files_manager import load_yaml
 from modules.whisper.whisper_factory import WhisperFactory
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.translation.nllb_inference import NLLBInference
-from ui.htmls import *
+from modules.ui.htmls import *
 from modules.utils.youtube_manager import get_ytmetas
 from modules.translation.deepl_api import DeepLAPI
 from modules.whisper.whisper_parameter import *
@@ -32,103 +36,117 @@ class App:
         self.deepl_api = DeepLAPI(
             output_dir=os.path.join(self.args.output_dir, "translations")
         )
+        self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
 
     def create_whisper_parameters(self):
+        whisper_params = self.default_params["whisper"]
+        vad_params = self.default_params["vad"]
+        diarization_params = self.default_params["diarization"]
+
         with gr.Row():
-            dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="
+            dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
                                    label="Model")
             dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
-                                  value="
+                                  value=whisper_params["lang"], label="Language")
             dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
         with gr.Row():
-            cb_translate = gr.Checkbox(value=
+            cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English?",
+                                       interactive=True)
         with gr.Row():
-            cb_timestamp = gr.Checkbox(value=
+            cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename",
                                        interactive=True)
         with gr.Accordion("Advanced Parameters", open=False):
-            nb_beam_size = gr.Number(label="Beam Size", value=
+            nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
                                      info="Beam size to use for decoding.")
-            nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value
+            nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True,
                                               info="If the average log probability over sampled tokens is below this value, treat as failed.")
-            nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=
+            nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True,
                                                info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
             dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
                                           value=self.whisper_inf.current_compute_type, interactive=True,
                                           info="Select the type of computation to perform.")
-            nb_best_of = gr.Number(label="Best Of", value=
+            nb_best_of = gr.Number(label="Best Of", value=whisper_params["best_of"], interactive=True,
                                    info="Number of candidates when sampling with non-zero temperature.")
-            nb_patience = gr.Number(label="Patience", value=
+            nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
                                     info="Beam search patience factor.")
-            cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=
+            cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"],
                                                         interactive=True,
                                                         info="Condition on previous text during decoding.")
-            sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=
+            sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"],
                                                         minimum=0, maximum=1, step=0.01, interactive=True,
                                                         info="Resets prompt if temperature is above this value."
                                                              " Arg has effect only if 'Condition On Previous Text' is True.")
             tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
                                            info="Initial prompt to use for decoding.")
-            sd_temperature = gr.Slider(label="Temperature", value=
+            sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
+                                       step=0.01, maximum=1.0, interactive=True,
                                        info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
-            nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=
+            nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
+                                                       interactive=True,
                                                        info="If the gzip compression ratio is above this value, treat as failed.")
         with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
-            nb_length_penalty = gr.Number(label="Length Penalty", value=
+            nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
                                           info="Exponential length penalty constant.")
-            nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=
+            nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
                                               info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
-            nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=
+            nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
+                                                precision=0,
                                                 info="Prevent repetitions of n-grams with this size (set 0 to disable).")
-            tb_prefix = gr.Textbox(label="Prefix", value=lambda:
+            tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
                                    info="Optional text to provide as a prefix for the first window.")
-            cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=
+            cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=whisper_params["suppress_blank"],
                                             info="Suppress blank outputs at the beginning of the sampling.")
-            tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="
+            tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
                                             info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
-            nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=
+            nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
                                                  info="The initial timestamp cannot be later than this.")
-            cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=
+            cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
                                              info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
-            tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="
+            tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
                                                  info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
-            tb_append_punctuations = gr.Textbox(label="Append Punctuations", value="
+            tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
                                                 info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
-            nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda:
+            nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
+                                          precision=0,
                                           info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
-            nb_chunk_length = gr.Number(label="Chunk Length", value=lambda:
+            nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: whisper_params["chunk_length"],
+                                        precision=0,
                                         info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
             nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
-                                                           value=lambda:
+                                                           value=lambda: whisper_params["hallucination_silence_threshold"],
                                                            info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
-            tb_hotwords = gr.Textbox(label="Hotwords", value=
+            tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
                                      info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
-            nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=
+            nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
                                                         info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
-            nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=
+            nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
+                                                       precision=0,
                                                        info="Number of segments to consider for the language detection.")
         with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
-            nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=
+            nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=whisper_params["chunk_length_s"],
+                                          precision=0)
+            nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
 
         with gr.Accordion("VAD", open=False):
-            cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=
+            cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
+                                        interactive=True)
+            sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=vad_params["threshold"],
                                      info="Lower it to be more sensitive to small sounds.")
-            nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=
+            nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=vad_params["min_speech_duration_ms"],
                                                   info="Final speech chunks shorter than this time are thrown out")
-            nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=
+            nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=vad_params["max_speech_duration_s"],
                                                  info="Maximum duration of speech chunks in \"seconds\". Chunks longer"
                                                       " than this time will be split at the timestamp of the last silence that"
                                                       " lasts more than 100ms (if any), to prevent aggressive cutting.")
-            nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=
+            nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=vad_params["min_silence_duration_ms"],
                                                    info="In the end of each speech chunk wait for this time"
                                                         " before separating it")
-            nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=
+            nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
                                          info="Final speech chunks are padded by this time each side")
 
         with gr.Accordion("Diarization", open=False):
-            cb_diarize = gr.Checkbox(label="Enable Diarization")
-            tb_hf_token = gr.Text(label="HuggingFace Token", value="",
+            cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
+            tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
                                   info="This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
             dd_diarization_device = gr.Dropdown(label="Device",
                                                 choices=self.whisper_inf.diarizer.get_available_device(),
@@ -162,6 +180,10 @@ class App:
         )
 
     def launch(self):
+        translation_params = self.default_params["translation"]
+        deepl_params = translation_params["deepl"]
+        nllb_params = translation_params["nllb"]
+
         with self.app:
             with gr.Row():
                 with gr.Column():
@@ -246,19 +268,17 @@ class App:
 
                 with gr.TabItem("DeepL API"):  # sub tab1
                     with gr.Row():
-                        value="")
+                        tb_api_key = gr.Textbox(label="Your Auth Key (API KEY)", value=deepl_params["api_key"])
                     with gr.Row():
+                        dd_source_lang = gr.Dropdown(label="Source Language", value=deepl_params["source_lang"],
+                                                     choices=list(
                             self.deepl_api.available_source_langs.keys()))
-                        self.deepl_api.available_target_langs.keys()))
+                        dd_target_lang = gr.Dropdown(label="Target Language", value=deepl_params["target_lang"],
+                                                     choices=list(self.deepl_api.available_target_langs.keys()))
                     with gr.Row():
+                        cb_is_pro = gr.Checkbox(label="Pro User?", value=deepl_params["is_pro"])
                     with gr.Row():
-                        cb_timestamp = gr.Checkbox(value=
+                        cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
                                                    interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
@@ -268,26 +288,27 @@ class App:
                         btn_openfolder = gr.Button('📂', scale=1)
 
                     btn_run.click(fn=self.deepl_api.translate_deepl,
-                                  inputs=[
+                                  inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
+                                          cb_is_pro, cb_timestamp],
                                   outputs=[tb_indicator, files_subtitles])
 
-                    btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(
+                    btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
                                          inputs=None,
                                          outputs=None)
 
                 with gr.TabItem("NLLB"):  # sub tab2
                     with gr.Row():
+                        dd_model_size = gr.Dropdown(label="Model", value=nllb_params["model_size"],
                                                     choices=self.nllb_inf.available_models)
+                        dd_source_lang = gr.Dropdown(label="Source Language", value=nllb_params["source_lang"],
+                                                     choices=self.nllb_inf.available_source_langs)
+                        dd_target_lang = gr.Dropdown(label="Target Language", value=nllb_params["target_lang"],
+                                                     choices=self.nllb_inf.available_target_langs)
                    with gr.Row():
-                        nb_max_length = gr.Number(label="Max Length Per Line", value=
+                        nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
+                                                  precision=0)
                     with gr.Row():
-                        cb_timestamp = gr.Checkbox(value=
+                        cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
                                                    interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
@@ -299,11 +320,11 @@ class App:
                         md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
 
                     btn_run.click(fn=self.nllb_inf.translate_file,
-                                  inputs=[file_subs,
+                                  inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
                                           nb_max_length, cb_timestamp],
                                   outputs=[tb_indicator, files_subtitles])
 
-                    btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(
+                    btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
                                          inputs=None,
                                          outputs=None)
 
@@ -351,18 +372,18 @@ parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
 parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
 parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
 parser.add_argument('--inbrowser', type=bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
-parser.add_argument('--whisper_model_dir', type=str, default=
+parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
                     help='Directory path of the whisper model')
-parser.add_argument('--faster_whisper_model_dir', type=str, default=
+parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
                     help='Directory path of the faster-whisper model')
 parser.add_argument('--insanely_fast_whisper_model_dir', type=str,
-                    default=
+                    default=INSANELY_FAST_WHISPER_MODELS_DIR,
                     help='Directory path of the insanely-fast-whisper model')
-parser.add_argument('--diarization_model_dir', type=str, default=
+parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MODELS_DIR,
                     help='Directory path of the diarization model')
-parser.add_argument('--nllb_model_dir', type=str, default=
+parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
                     help='Directory path of the Facebook NLLB model')
-parser.add_argument('--output_dir', type=str, default=
+parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
 _args = parser.parse_args()
 
 if __name__ == "__main__":
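Note, not part of the diff: the hunks above use two kinds of default. A plain value such as value=whisper_params["model_size"] is read once when the UI is built, while value=lambda: whisper_params["chunk_length"] hands Gradio a callable that is re-evaluated on every page load, so freshly cached settings appear on refresh without restarting the app. A minimal sketch of the difference (component names are illustrative):

import gradio as gr

defaults = {"prefix": None}  # stands in for the loaded YAML subtree

with gr.Blocks() as demo:
    # Evaluated once, at build time:
    tb_static = gr.Textbox(value=defaults["prefix"], label="Static default")
    # Re-evaluated on each page load, so later edits to `defaults` are picked up:
    tb_dynamic = gr.Textbox(value=lambda: defaults["prefix"], label="Dynamic default")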
configs/default_parameters.yaml
ADDED
@@ -0,0 +1,58 @@
+whisper:
+  model_size: "large-v2"
+  lang: "Automatic Detection"
+  is_translate: false
+  beam_size: 5
+  log_prob_threshold: -1
+  no_speech_threshold: 0.6
+  best_of: 5
+  patience: 1
+  condition_on_previous_text: true
+  prompt_reset_on_temperature: 0.5
+  initial_prompt: null
+  temperature: 0
+  compression_ratio_threshold: 2.4
+  chunk_length_s: 30
+  batch_size: 24
+  length_penalty: 1
+  repetition_penalty: 1
+  no_repeat_ngram_size: 0
+  prefix: null
+  suppress_blank: true
+  suppress_tokens: "[-1]"
+  max_initial_timestamp: 1
+  word_timestamps: false
+  prepend_punctuations: "\"'“¿([{-"
+  append_punctuations: "\"'.。,,!!??::”)]}、"
+  max_new_tokens: null
+  chunk_length: null
+  hallucination_silence_threshold: null
+  hotwords: null
+  language_detection_threshold: null
+  language_detection_segments: 1
+  add_timestamp: true
+
+vad:
+  vad_filter: false
+  threshold: 0.5
+  min_speech_duration_ms: 250
+  max_speech_duration_s: 9999
+  min_silence_duration_ms: 2000
+  speech_pad_ms: 400
+
+diarization:
+  is_diarize: false
+  hf_token: ""
+
+translation:
+  deepl:
+    api_key: ""
+    is_pro: false
+    source_lang: "Automatic Detection"
+    target_lang: "English"
+  nllb:
+    model_size: "facebook/nllb-200-1.3B"
+    source_lang: null
+    target_lang: null
+    max_length: 200
+  add_timestamp: true
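Note, not part of the diff: this file doubles as the shipped defaults and as the cache the app rewrites after each run. A small sketch of reading it back with the helper this PR adds (the printed value is just the initial default):

from modules.utils.files_manager import load_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH

defaults = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
print(defaults["whisper"]["model_size"])  # "large-v2" until a first run overwrites it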
modules/diarize/diarize_pipeline.py
CHANGED
@@ -7,6 +7,7 @@ from pyannote.audio import Pipeline
 from typing import Optional, Union
 import torch
 
+from modules.utils.paths import DIARIZATION_MODELS_DIR
 from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
 
 
@@ -14,7 +15,7 @@ class DiarizationPipeline:
     def __init__(
         self,
         model_name="pyannote/speaker-diarization-3.1",
-        cache_dir: str =
+        cache_dir: str = DIARIZATION_MODELS_DIR,
         use_auth_token=None,
         device: Optional[Union[str, torch.device]] = "cpu",
     ):
modules/diarize/diarizer.py
CHANGED
@@ -5,13 +5,14 @@ import numpy as np
 import time
 import logging
 
+from modules.utils.paths import DIARIZATION_MODELS_DIR
 from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
 from modules.diarize.audio_loader import load_audio
 
 
 class Diarizer:
     def __init__(self,
-                 model_dir: str =
+                 model_dir: str = DIARIZATION_MODELS_DIR
                  ):
         self.device = self.get_device()
         self.available_device = self.get_available_device()
modules/translation/deepl_api.py
CHANGED
@@ -4,7 +4,9 @@ import os
 from datetime import datetime
 import gradio as gr
 
+from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
 from modules.utils.subtitle_manager import *
+from modules.utils.files_manager import load_yaml, save_yaml
 
 """
 This is written with reference to the DeepL API documentation.
@@ -83,7 +85,7 @@ DEEPL_AVAILABLE_SOURCE_LANGS = {
 
 class DeepLAPI:
     def __init__(self,
-                 output_dir: str =
+                 output_dir: str = TRANSLATION_OUTPUT_DIR
                  ):
         self.api_interval = 1
         self.max_text_batch_size = 50
@@ -124,6 +126,13 @@ class DeepLAPI:
             String to return to gr.Textbox()
             Files to return to gr.Files()
         """
+        self.cache_parameters(
+            api_key=auth_key,
+            is_pro=is_pro,
+            source_lang=source_lang,
+            target_lang=target_lang,
+            add_timestamp=add_timestamp
+        )
 
         files_info = {}
         for fileobj in fileobjs:
@@ -198,4 +207,20 @@ class DeepLAPI:
         }
         response = requests.post(url, headers=headers, data=data).json()
         time.sleep(self.api_interval)
-        return response["translations"]
+        return response["translations"]
+
+    @staticmethod
+    def cache_parameters(api_key: str,
+                         is_pro: bool,
+                         source_lang: str,
+                         target_lang: str,
+                         add_timestamp: bool):
+        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        cached_params["translation"]["deepl"] = {
+            "api_key": api_key,
+            "is_pro": is_pro,
+            "source_lang": source_lang,
+            "target_lang": target_lang
+        }
+        cached_params["translation"]["add_timestamp"] = add_timestamp
+        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
modules/translation/nllb_inference.py
CHANGED
@@ -2,13 +2,14 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import gradio as gr
 import os
 
+from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
 from modules.translation.translation_base import TranslationBase
 
 
 class NLLBInference(TranslationBase):
     def __init__(self,
-                 model_dir: str =
-                 output_dir: str =
+                 model_dir: str = NLLB_MODELS_DIR,
+                 output_dir: str = TRANSLATION_OUTPUT_DIR
                  ):
         super().__init__(
             model_dir=model_dir,
modules/translation/translation_base.py
CHANGED
@@ -7,12 +7,14 @@ from datetime import datetime
 
 from modules.whisper.whisper_parameter import *
 from modules.utils.subtitle_manager import *
+from modules.utils.files_manager import load_yaml, save_yaml
+from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
 
 
 class TranslationBase(ABC):
     def __init__(self,
-                 model_dir: str =
-                 output_dir: str =
+                 model_dir: str = NLLB_MODELS_DIR,
+                 output_dir: str = TRANSLATION_OUTPUT_DIR
                  ):
         super().__init__()
         self.model = None
@@ -75,6 +77,12 @@ class TranslationBase(ABC):
             Files to return to gr.Files()
         """
         try:
+            self.cache_parameters(model_size=model_size,
+                                  src_lang=src_lang,
+                                  tgt_lang=tgt_lang,
+                                  max_length=max_length,
+                                  add_timestamp=add_timestamp)
+
             self.update_model(model_size=model_size,
                               src_lang=src_lang,
                               tgt_lang=tgt_lang,
@@ -149,3 +157,19 @@ class TranslationBase(ABC):
         for file_path in file_paths:
             if file_path and os.path.exists(file_path):
                 os.remove(file_path)
+
+    @staticmethod
+    def cache_parameters(model_size: str,
+                         src_lang: str,
+                         tgt_lang: str,
+                         max_length: int,
+                         add_timestamp: bool):
+        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        cached_params["translation"]["nllb"] = {
+            "model_size": model_size,
+            "source_lang": src_lang,
+            "target_lang": tgt_lang,
+            "max_length": max_length,
+        }
+        cached_params["translation"]["add_timestamp"] = add_timestamp
+        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
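Note, not part of the diff: both cache_parameters() implementations above perform the same round trip — load the whole config, replace one subtree, write everything back — so sections owned by other features survive each save. A minimal sketch (the edited value is illustrative):

from modules.utils.files_manager import load_yaml, save_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH

cached = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)   # full config as a dict
cached["translation"]["nllb"]["max_length"] = 300    # update one cached value
save_yaml(cached, DEFAULT_PARAMETERS_CONFIG_PATH)    # "whisper", "vad", etc. untouched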
{ui → modules/ui}/__init__.py
RENAMED
File without changes

{ui → modules/ui}/htmls.py
RENAMED
File without changes
modules/utils/files_manager.py
CHANGED
@@ -1,8 +1,32 @@
 import os
 import fnmatch
-
+from ruamel.yaml import YAML
 from gradio.utils import NamedString
 
+from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
+
+
+def load_yaml(path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
+    yaml = YAML(typ="safe")
+    yaml.preserve_quotes = True
+    with open(path, 'r', encoding='utf-8') as file:
+        config = yaml.load(file)
+    return config
+
+
+def save_yaml(data: dict, path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
+    yaml = YAML(typ="safe")
+    yaml.map_indent = 2
+    yaml.sequence_indent = 4
+    yaml.sequence_dash_offset = 2
+    yaml.preserve_quotes = True
+    yaml.default_flow_style = False
+    yaml.sort_base_mapping_type_on_output = False
+
+    with open(path, 'w', encoding='utf-8') as file:
+        yaml.dump(data, file)
+    return path
+
 
 def get_media_files(folder_path, include_sub_directory=False):
     video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv']
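Note, not part of the diff: hypothetical usage of the two helpers above. Both default to DEFAULT_PARAMETERS_CONFIG_PATH, and YAML(typ="safe") yields plain Python dicts, so the result can be edited like any mapping before saving:

from modules.utils.files_manager import load_yaml, save_yaml

config = load_yaml()               # reads configs/default_parameters.yaml by default
config["vad"]["threshold"] = 0.4   # illustrative tweak
save_yaml(config)                  # dumped in block style with 2-space map indents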
modules/utils/paths.py
ADDED
@@ -0,0 +1,24 @@
+import os
+
+WEBUI_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+MODELS_DIR = os.path.join(WEBUI_DIR, "models")
+WHISPER_MODELS_DIR = os.path.join(MODELS_DIR, "Whisper")
+FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
+INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
+NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
+DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
+CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
+DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
+OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
+TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
+
+for dir_path in [MODELS_DIR,
+                 WHISPER_MODELS_DIR,
+                 FASTER_WHISPER_MODELS_DIR,
+                 INSANELY_FAST_WHISPER_MODELS_DIR,
+                 NLLB_MODELS_DIR,
+                 DIARIZATION_MODELS_DIR,
+                 CONFIGS_DIR,
+                 OUTPUT_DIR,
+                 TRANSLATION_OUTPUT_DIR]:
+    os.makedirs(dir_path, exist_ok=True)
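Note, not part of the diff: because the makedirs loop runs at module import, any module that imports one of these constants also guarantees the directory tree exists:

import os
from modules.utils.paths import OUTPUT_DIR  # importing triggers the loop above

assert os.path.isdir(OUTPUT_DIR)  # .../outputs exists even on a fresh clone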
modules/whisper/faster_whisper_inference.py
CHANGED
@@ -11,15 +11,16 @@ import whisper
 import gradio as gr
 from argparse import Namespace
 
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
 
 
 class FasterWhisperInference(WhisperBase):
     def __init__(self,
-                 model_dir: str =
-                 diarization_model_dir: str =
-                 output_dir: str =
+                 model_dir: str = FASTER_WHISPER_MODELS_DIR,
+                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
@@ -163,14 +164,12 @@ class FasterWhisperInference(WhisperBase):
         wrong_dirs = [".locks"]
         existing_models = list(set(existing_models) - set(wrong_dirs))
 
-        webui_dir = os.getcwd()
-
         for model_name in existing_models:
             if faster_whisper_prefix in model_name:
                 model_name = model_name[len(faster_whisper_prefix):]
 
             if model_name not in whisper.available_models():
-                model_paths[model_name] = os.path.join(
+                model_paths[model_name] = os.path.join(self.model_dir, model_name)
         return model_paths
 
     @staticmethod
modules/whisper/insanely_fast_whisper_inference.py
CHANGED
@@ -11,15 +11,16 @@ import whisper
 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace
 
+from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
 
 
 class InsanelyFastWhisperInference(WhisperBase):
     def __init__(self,
-                 model_dir: str =
-                 diarization_model_dir: str =
-                 output_dir: str =
+                 model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
+                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
modules/whisper/whisper_Inference.py
CHANGED
@@ -7,15 +7,16 @@ import torch
 import os
 from argparse import Namespace
 
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_base import WhisperBase
 from modules.whisper.whisper_parameter import *
 
 
 class WhisperInference(WhisperBase):
     def __init__(self,
-                 model_dir: str =
-                 diarization_model_dir: str =
-                 output_dir: str =
+                 model_dir: str = WHISPER_MODELS_DIR,
+                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
modules/whisper/whisper_base.py
CHANGED
@@ -9,9 +9,10 @@ from datetime import datetime
 from faster_whisper.vad import VadOptions
 from dataclasses import astuple
 
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
 from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.utils.youtube_manager import get_ytdata, get_ytaudio
-from modules.utils.files_manager import get_media_files, format_gradio_files
+from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
 from modules.whisper.whisper_parameter import *
 from modules.diarize.diarizer import Diarizer
 from modules.vad.silero_vad import SileroVAD
@@ -19,9 +20,9 @@ from modules.vad.silero_vad import SileroVAD
 
 class WhisperBase(ABC):
     def __init__(self,
-                 model_dir: str =
-                 diarization_model_dir: str =
-                 output_dir: str =
+                 model_dir: str = WHISPER_MODELS_DIR,
+                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 output_dir: str = OUTPUT_DIR,
                  ):
         self.model_dir = model_dir
         self.output_dir = output_dir
@@ -61,7 +62,8 @@ class WhisperBase(ABC):
 
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
-            progress: gr.Progress,
+            progress: gr.Progress = gr.Progress(),
+            add_timestamp: bool = True,
            *whisper_params,
            ) -> Tuple[List[dict], float]:
        """
@@ -75,6 +77,8 @@ class WhisperBase(ABC):
             Audio input. This can be file path or binary type.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        add_timestamp: bool
+            Whether to add a timestamp at the end of the filename.
         *whisper_params: tuple
             Parameters related with whisper. This will be dealt with "WhisperParameters" data class
 
@@ -87,6 +91,11 @@ class WhisperBase(ABC):
         """
         params = WhisperParameters.as_value(*whisper_params)
 
+        self.cache_parameters(
+            whisper_params=params,
+            add_timestamp=add_timestamp
+        )
+
         if params.lang == "Automatic Detection":
             params.lang = None
         else:
@@ -178,6 +187,7 @@ class WhisperBase(ABC):
             transcribed_segments, time_for_task = self.run(
                 file.name,
                 progress,
+                add_timestamp,
                 *whisper_params,
             )
 
@@ -301,6 +311,7 @@ class WhisperBase(ABC):
             transcribed_segments, time_for_task = self.run(
                 audio,
                 progress,
+                add_timestamp,
                 *whisper_params,
             )
 
@@ -434,3 +445,15 @@ class WhisperBase(ABC):
         for file_path in file_paths:
             if file_path and os.path.exists(file_path):
                 os.remove(file_path)
+
+    @staticmethod
+    def cache_parameters(
+        whisper_params: WhisperValues,
+        add_timestamp: bool
+    ):
+        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        cached_whisper_param = whisper_params.to_yaml()
+        cached_yaml = {**cached_params, **cached_whisper_param}
+        cached_yaml["whisper"]["add_timestamp"] = add_timestamp
+
+        save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
modules/whisper/whisper_factory.py
CHANGED
@@ -1,6 +1,8 @@
 from typing import Optional
 import os
 
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR)
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
@@ -11,11 +13,11 @@ class WhisperFactory:
     @staticmethod
     def create_whisper_inference(
             whisper_type: str,
-            whisper_model_dir: str =
-            faster_whisper_model_dir: str =
-            insanely_fast_whisper_model_dir: str =
-            diarization_model_dir: str =
-            output_dir: str =
+            whisper_model_dir: str = WHISPER_MODELS_DIR,
+            faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
+            insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
+            diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+            output_dir: str = OUTPUT_DIR,
     ) -> "WhisperBase":
         """
         Create a whisper inference class based on the provided whisper_type.
modules/whisper/whisper_parameter.py
CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, fields
 import gradio as gr
-from typing import Optional
+from typing import Optional, Dict
+import yaml
 
 
 @dataclass
@@ -274,4 +275,54 @@ class WhisperValues:
     language_detection_segments: int
     """
     A data class to use Whisper parameters.
-    """
+    """
+
+    def to_yaml(self) -> Dict:
+        data = {
+            "whisper": {
+                "model_size": self.model_size,
+                "lang": "Automatic Detection" if self.lang is None else self.lang,
+                "is_translate": self.is_translate,
+                "beam_size": self.beam_size,
+                "log_prob_threshold": self.log_prob_threshold,
+                "no_speech_threshold": self.no_speech_threshold,
+                "best_of": self.best_of,
+                "patience": self.patience,
+                "condition_on_previous_text": self.condition_on_previous_text,
+                "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
+                "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
+                "temperature": self.temperature,
+                "compression_ratio_threshold": self.compression_ratio_threshold,
+                "chunk_length_s": None if self.chunk_length_s is None else self.chunk_length_s,
+                "batch_size": self.batch_size,
+                "length_penalty": self.length_penalty,
+                "repetition_penalty": self.repetition_penalty,
+                "no_repeat_ngram_size": self.no_repeat_ngram_size,
+                "prefix": None if not self.prefix else self.prefix,
+                "suppress_blank": self.suppress_blank,
+                "suppress_tokens": self.suppress_tokens,
+                "max_initial_timestamp": self.max_initial_timestamp,
+                "word_timestamps": self.word_timestamps,
+                "prepend_punctuations": self.prepend_punctuations,
+                "append_punctuations": self.append_punctuations,
+                "max_new_tokens": self.max_new_tokens,
+                "chunk_length": self.chunk_length,
+                "hallucination_silence_threshold": self.hallucination_silence_threshold,
+                "hotwords": None if not self.hotwords else self.hotwords,
+                "language_detection_threshold": self.language_detection_threshold,
+                "language_detection_segments": self.language_detection_segments,
+            },
+            "vad": {
+                "vad_filter": self.vad_filter,
+                "threshold": self.threshold,
+                "min_speech_duration_ms": self.min_speech_duration_ms,
+                "max_speech_duration_s": self.max_speech_duration_s,
+                "min_silence_duration_ms": self.min_silence_duration_ms,
+                "speech_pad_ms": self.speech_pad_ms,
+            },
+            "diarization": {
+                "is_diarize": self.is_diarize,
+                "hf_token": self.hf_token
+            }
+        }
+        return data
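Note, not part of the diff: to_yaml() emits only the "whisper", "vad", and "diarization" sections, so the top-level dict merge in WhisperBase.cache_parameters() replaces exactly those three and leaves "translation" as it was; lang also round-trips through the UI sentinel ("Automatic Detection" here, mapped back to None in WhisperBase.run()). A sketch of the merge semantics with plain dicts:

cached = {"whisper": {"model_size": "base"}, "translation": {"deepl": {}}}
fresh = {"whisper": {"model_size": "large-v2"}, "vad": {}, "diarization": {}}

merged = {**cached, **fresh}  # top-level keys from `fresh` win wholesale
assert merged["whisper"]["model_size"] == "large-v2"
assert "translation" in merged  # preserved, since to_yaml() never emits it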
requirements.txt
CHANGED
@@ -11,4 +11,5 @@ faster-whisper==1.0.3
 transformers==4.42.3
 gradio==4.29.0
 pytubefix
+ruamel.yaml==0.18.6
 pyannote.audio==3.3.1