jhj0517 committed on
Commit d843d51 · unverified · 2 Parent(s): d868316 091209e

Merge pull request #173 from jhj0517/fix/refactor-scalability

app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import os
3
  import argparse
4
- import webbrowser
5
 
6
  from modules.whisper_Inference import WhisperInference
7
  from modules.faster_whisper_inference import FasterWhisperInference
@@ -16,17 +15,26 @@ class App:
16
  def __init__(self, args):
17
  self.args = args
18
  self.app = gr.Blocks(css=CSS, theme=self.args.theme)
19
- self.whisper_inf = WhisperInference() if self.args.disable_faster_whisper else FasterWhisperInference()
20
- if isinstance(self.whisper_inf, FasterWhisperInference):
21
- self.whisper_inf.model_dir = args.faster_whisper_model_dir
22
- print("Use Faster Whisper implementation")
23
- else:
24
- self.whisper_inf.model_dir = args.whisper_model_dir
25
- print("Use Open AI Whisper implementation")
26
  print(f"Device \"{self.whisper_inf.device}\" is detected")
27
  self.nllb_inf = NLLBInference()
28
  self.deepl_api = DeepLAPI()
29
30
  @staticmethod
31
  def open_folder(folder_path: str):
32
  if os.path.exists(folder_path):
@@ -61,7 +69,7 @@ class App:
61
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
62
  with gr.Row():
63
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
64
- with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
65
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
66
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
67
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
@@ -135,7 +143,7 @@ class App:
135
  with gr.Row():
136
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
137
  interactive=True)
138
- with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
139
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
140
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
141
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
@@ -201,7 +209,7 @@ class App:
201
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
202
  with gr.Row():
203
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
204
- with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
205
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
206
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
207
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
@@ -289,7 +297,7 @@ class App:
289
 
290
  with gr.TabItem("NLLB"): # sub tab2
291
  with gr.Row():
292
- dd_nllb_model = gr.Dropdown(label="Model", value=self.nllb_inf.default_model_size,
293
  choices=self.nllb_inf.available_models)
294
  dd_nllb_sourcelang = gr.Dropdown(label="Source Language",
295
  choices=self.nllb_inf.available_source_langs)
@@ -332,7 +340,7 @@ class App:
332
 
333
  # Create the parser for command-line arguments
334
  parser = argparse.ArgumentParser()
335
- parser.add_argument('--disable_faster_whisper', type=bool, default=False, nargs='?', const=True, help='Disable the faster_whisper implementation. faster_whipser is implemented by https://github.com/guillaumekln/faster-whisper')
336
  parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
337
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
338
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
 
1
  import gradio as gr
2
  import os
3
  import argparse
 
4
 
5
  from modules.whisper_Inference import WhisperInference
6
  from modules.faster_whisper_inference import FasterWhisperInference
 
15
  def __init__(self, args):
16
  self.args = args
17
  self.app = gr.Blocks(css=CSS, theme=self.args.theme)
18
+ self.whisper_inf = self.init_whisper()
19
+ print(f"Use \"{self.args.whisper_type}\" implementation")
20
  print(f"Device \"{self.whisper_inf.device}\" is detected")
21
  self.nllb_inf = NLLBInference()
22
  self.deepl_api = DeepLAPI()
23
 
24
+ def init_whisper(self):
25
+ whisper_type = self.args.whisper_type.lower().strip()
26
+
27
+ if whisper_type in ["faster_whisper", "faster-whisper"]:
28
+ whisper_inf = FasterWhisperInference()
29
+ whisper_inf.model_dir = self.args.faster_whisper_model_dir
30
+ if whisper_type in ["whisper"]:
31
+ whisper_inf = WhisperInference()
32
+ whisper_inf.model_dir = self.args.whisper_model_dir
33
+ else:
34
+ whisper_inf = FasterWhisperInference()
35
+ whisper_inf.model_dir = self.args.faster_whisper_model_dir
36
+ return whisper_inf
37
+
38
  @staticmethod
39
  def open_folder(folder_path: str):
40
  if os.path.exists(folder_path):
 
69
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
70
  with gr.Row():
71
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
72
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
73
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
74
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
75
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
 
143
  with gr.Row():
144
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
145
  interactive=True)
146
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
147
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
148
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
149
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
 
209
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
210
  with gr.Row():
211
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
212
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
213
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
214
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
215
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
 
297
 
298
  with gr.TabItem("NLLB"): # sub tab2
299
  with gr.Row():
300
+ dd_nllb_model = gr.Dropdown(label="Model", value="facebook/nllb-200-1.3B",
301
  choices=self.nllb_inf.available_models)
302
  dd_nllb_sourcelang = gr.Dropdown(label="Source Language",
303
  choices=self.nllb_inf.available_source_langs)
 
340
 
341
  # Create the parser for command-line arguments
342
  parser = argparse.ArgumentParser()
343
+ parser.add_argument('--whisper_type', type=str, default="faster-whisper", help='A type of the whisper implementation between: ["whisper", "faster-whisper"]')
344
  parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
345
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
346
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
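For reference, here is a minimal, self-contained sketch of how the new `--whisper_type` flag selects a backend. This is not the project's code: the two stand-in classes below are hypothetical placeholders for `WhisperInference` and `FasterWhisperInference`, and only the selection logic mirrors the diff above.

```python
# Illustrative only: backend selection from --whisper_type, using hypothetical
# stand-in classes instead of the real WhisperInference / FasterWhisperInference.
import argparse


class FasterWhisperInference:          # stand-in, not the project class
    def __init__(self):
        self.model_dir = None


class WhisperInference:                # stand-in, not the project class
    def __init__(self):
        self.model_dir = None


def init_whisper(whisper_type: str, whisper_model_dir: str, faster_whisper_model_dir: str):
    whisper_type = whisper_type.lower().strip()
    if whisper_type == "whisper":
        inf = WhisperInference()
        inf.model_dir = whisper_model_dir
    else:
        # "faster_whisper", "faster-whisper", or anything unrecognized
        # falls back to the faster-whisper backend, as in the diff above.
        inf = FasterWhisperInference()
        inf.model_dir = faster_whisper_model_dir
    return inf


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--whisper_type", type=str, default="faster-whisper")
    args = parser.parse_args()
    backend = init_whisper(args.whisper_type, "models/Whisper", "models/Whisper/faster-whisper")
    print(f'Use "{args.whisper_type}" implementation: {type(backend).__name__}')
```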
modules/base_interface.py DELETED
@@ -1,23 +0,0 @@
1
- import os
2
- import torch
3
- from typing import List
4
-
5
-
6
- class BaseInterface:
7
- def __init__(self):
8
- pass
9
-
10
- @staticmethod
11
- def release_cuda_memory():
12
- if torch.cuda.is_available():
13
- torch.cuda.empty_cache()
14
- torch.cuda.reset_max_memory_allocated()
15
-
16
- @staticmethod
17
- def remove_input_files(file_paths: List[str]):
18
- if not file_paths:
19
- return
20
-
21
- for file_path in file_paths:
22
- if file_path and os.path.exists(file_path):
23
- os.remove(file_path)
modules/faster_whisper_inference.py CHANGED
@@ -2,233 +2,29 @@ import os
2
  import time
3
  import numpy as np
4
  from typing import BinaryIO, Union, Tuple, List
5
- from datetime import datetime
6
 
7
  import faster_whisper
8
  from faster_whisper.vad import VadOptions
9
  import ctranslate2
10
  import whisper
11
- import torch
12
  import gradio as gr
13
 
14
- from .base_interface import BaseInterface
15
- from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
16
- from modules.youtube_manager import get_ytdata, get_ytaudio
17
  from modules.whisper_parameter import *
 
18
 
19
  # Temporal fix of the issue : https://github.com/jhj0517/Whisper-WebUI/issues/144
20
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
21
 
22
 
23
- class FasterWhisperInference(BaseInterface):
24
  def __init__(self):
25
- super().__init__()
26
- self.model_dir = os.path.join("models", "Whisper", "faster-whisper")
27
- os.makedirs(self.model_dir, exist_ok=True)
28
- self.current_model_size = None
29
- self.model = None
30
  self.model_paths = self.get_model_paths()
31
  self.available_models = self.model_paths.keys()
32
- self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
33
- self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
34
- if torch.cuda.is_available():
35
- self.device = "cuda"
36
- elif torch.backends.mps.is_available():
37
- self.device = "mps"
38
- else:
39
- self.device = "cpu"
40
  self.available_compute_types = ctranslate2.get_supported_compute_types(
41
  "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
42
- self.current_compute_type = "float16" if self.device == "cuda" else "float32"
43
-
44
- def transcribe_file(self,
45
- files: list,
46
- file_format: str,
47
- add_timestamp: bool,
48
- progress=gr.Progress(),
49
- *whisper_params,
50
- ) -> list:
51
- """
52
- Write subtitle file from Files
53
-
54
- Parameters
55
- ----------
56
- files: list
57
- List of files to transcribe from gr.Files()
58
- file_format: str
59
- Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
60
- add_timestamp: bool
61
- Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
62
- progress: gr.Progress
63
- Indicator to show progress directly in gradio.
64
- *whisper_params: tuple
65
- Gradio components related to Whisper. see whisper_data_class.py for details.
66
-
67
- Returns
68
- ----------
69
- result_str:
70
- Result of transcription to return to gr.Textbox()
71
- result_file_path:
72
- Output file path to return to gr.Files()
73
- """
74
- try:
75
- files_info = {}
76
- for file in files:
77
- transcribed_segments, time_for_task = self.transcribe(
78
- file.name,
79
- progress,
80
- *whisper_params,
81
- )
82
-
83
- file_name, file_ext = os.path.splitext(os.path.basename(file.name))
84
- file_name = safe_filename(file_name)
85
- subtitle, file_path = self.generate_and_write_file(
86
- file_name=file_name,
87
- transcribed_segments=transcribed_segments,
88
- add_timestamp=add_timestamp,
89
- file_format=file_format
90
- )
91
- files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
92
-
93
- total_result = ''
94
- total_time = 0
95
- for file_name, info in files_info.items():
96
- total_result += '------------------------------------\n'
97
- total_result += f'{file_name}\n\n'
98
- total_result += f'{info["subtitle"]}'
99
- total_time += info["time_for_task"]
100
-
101
- result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
102
- result_file_path = [info['path'] for info in files_info.values()]
103
-
104
- return [result_str, result_file_path]
105
-
106
- except Exception as e:
107
- print(f"Error transcribing file: {e}")
108
- finally:
109
- self.release_cuda_memory()
110
- if not files:
111
- self.remove_input_files([file.name for file in files])
112
-
113
- def transcribe_youtube(self,
114
- youtube_link: str,
115
- file_format: str,
116
- add_timestamp: bool,
117
- progress=gr.Progress(),
118
- *whisper_params,
119
- ) -> list:
120
- """
121
- Write subtitle file from Youtube
122
-
123
- Parameters
124
- ----------
125
- youtube_link: str
126
- URL of the Youtube video to transcribe from gr.Textbox()
127
- file_format: str
128
- Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
129
- add_timestamp: bool
130
- Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
131
- progress: gr.Progress
132
- Indicator to show progress directly in gradio.
133
- *whisper_params: tuple
134
- Gradio components related to Whisper. see whisper_data_class.py for details.
135
-
136
- Returns
137
- ----------
138
- result_str:
139
- Result of transcription to return to gr.Textbox()
140
- result_file_path:
141
- Output file path to return to gr.Files()
142
- """
143
- try:
144
- progress(0, desc="Loading Audio from Youtube..")
145
- yt = get_ytdata(youtube_link)
146
- audio = get_ytaudio(yt)
147
-
148
- transcribed_segments, time_for_task = self.transcribe(
149
- audio,
150
- progress,
151
- *whisper_params,
152
- )
153
-
154
- progress(1, desc="Completed!")
155
-
156
- file_name = safe_filename(yt.title)
157
- subtitle, result_file_path = self.generate_and_write_file(
158
- file_name=file_name,
159
- transcribed_segments=transcribed_segments,
160
- add_timestamp=add_timestamp,
161
- file_format=file_format
162
- )
163
- result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
164
-
165
- return [result_str, result_file_path]
166
-
167
- except Exception as e:
168
- print(f"Error transcribing file: {e}")
169
- finally:
170
- try:
171
- if 'yt' not in locals():
172
- yt = get_ytdata(youtube_link)
173
- file_path = get_ytaudio(yt)
174
- else:
175
- file_path = get_ytaudio(yt)
176
-
177
- self.release_cuda_memory()
178
- self.remove_input_files([file_path])
179
- except Exception as cleanup_error:
180
- pass
181
-
182
- def transcribe_mic(self,
183
- mic_audio: str,
184
- file_format: str,
185
- progress=gr.Progress(),
186
- *whisper_params,
187
- ) -> list:
188
- """
189
- Write subtitle file from microphone
190
-
191
- Parameters
192
- ----------
193
- mic_audio: str
194
- Audio file path from gr.Microphone()
195
- file_format: str
196
- Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
197
- progress: gr.Progress
198
- Indicator to show progress directly in gradio.
199
- *whisper_params: tuple
200
- Gradio components related to Whisper. see whisper_data_class.py for details.
201
-
202
- Returns
203
- ----------
204
- result_str:
205
- Result of transcription to return to gr.Textbox()
206
- result_file_path:
207
- Output file path to return to gr.Files()
208
- """
209
- try:
210
- progress(0, desc="Loading Audio..")
211
- transcribed_segments, time_for_task = self.transcribe(
212
- mic_audio,
213
- progress,
214
- *whisper_params,
215
- )
216
- progress(1, desc="Completed!")
217
-
218
- subtitle, result_file_path = self.generate_and_write_file(
219
- file_name="Mic",
220
- transcribed_segments=transcribed_segments,
221
- add_timestamp=True,
222
- file_format=file_format
223
- )
224
-
225
- result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
226
- return [result_str, result_file_path]
227
- except Exception as e:
228
- print(f"Error transcribing file: {e}")
229
- finally:
230
- self.release_cuda_memory()
231
- self.remove_input_files([mic_audio])
232
 
233
  def transcribe(self,
234
  audio: Union[str, BinaryIO, np.ndarray],
@@ -356,79 +152,3 @@ class FasterWhisperInference(BaseInterface):
356
  if model_name not in whisper.available_models():
357
  model_paths[model_name] = os.path.join(webui_dir, self.model_dir, model_name)
358
  return model_paths
359
-
360
- @staticmethod
361
- def generate_and_write_file(file_name: str,
362
- transcribed_segments: list,
363
- add_timestamp: bool,
364
- file_format: str,
365
- ) -> str:
366
- """
367
- Writes subtitle file
368
-
369
- Parameters
370
- ----------
371
- file_name: str
372
- Output file name
373
- transcribed_segments: list
374
- Text segments transcribed from audio
375
- add_timestamp: bool
376
- Determines whether to add a timestamp to the end of the filename.
377
- file_format: str
378
- File format to write. Supported formats: [SRT, WebVTT, txt]
379
-
380
- Returns
381
- ----------
382
- content: str
383
- Result of the transcription
384
- output_path: str
385
- output file path
386
- """
387
- timestamp = datetime.now().strftime("%m%d%H%M%S")
388
- if add_timestamp:
389
- output_path = os.path.join("outputs", f"{file_name}-{timestamp}")
390
- else:
391
- output_path = os.path.join("outputs", f"{file_name}")
392
-
393
- if file_format == "SRT":
394
- content = get_srt(transcribed_segments)
395
- output_path += '.srt'
396
- write_file(content, output_path)
397
-
398
- elif file_format == "WebVTT":
399
- content = get_vtt(transcribed_segments)
400
- output_path += '.vtt'
401
- write_file(content, output_path)
402
-
403
- elif file_format == "txt":
404
- content = get_txt(transcribed_segments)
405
- output_path += '.txt'
406
- write_file(content, output_path)
407
- return content, output_path
408
-
409
- @staticmethod
410
- def format_time(elapsed_time: float) -> str:
411
- """
412
- Get {hours} {minutes} {seconds} time format string
413
-
414
- Parameters
415
- ----------
416
- elapsed_time: str
417
- Elapsed time for transcription
418
-
419
- Returns
420
- ----------
421
- Time format string
422
- """
423
- hours, rem = divmod(elapsed_time, 3600)
424
- minutes, seconds = divmod(rem, 60)
425
-
426
- time_str = ""
427
- if hours:
428
- time_str += f"{hours} hours "
429
- if minutes:
430
- time_str += f"{minutes} minutes "
431
- seconds = round(seconds)
432
- time_str += f"{seconds} seconds"
433
-
434
- return time_str.strip()
 
2
  import time
3
  import numpy as np
4
  from typing import BinaryIO, Union, Tuple, List
 
5
 
6
  import faster_whisper
7
  from faster_whisper.vad import VadOptions
8
  import ctranslate2
9
  import whisper
 
10
  import gradio as gr
11
 
 
 
 
12
  from modules.whisper_parameter import *
13
+ from modules.whisper_base import WhisperBase
14
 
15
  # Temporal fix of the issue : https://github.com/jhj0517/Whisper-WebUI/issues/144
16
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
17
 
18
 
19
+ class FasterWhisperInference(WhisperBase):
20
  def __init__(self):
21
+ super().__init__(
22
+ model_dir=os.path.join("models", "Whisper", "faster-whisper")
23
+ )
 
 
24
  self.model_paths = self.get_model_paths()
25
  self.available_models = self.model_paths.keys()
26
  self.available_compute_types = ctranslate2.get_supported_compute_types(
27
  "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
 
28
 
29
  def transcribe(self,
30
  audio: Union[str, BinaryIO, np.ndarray],
 
152
  if model_name not in whisper.available_models():
153
  model_paths[model_name] = os.path.join(webui_dir, self.model_dir, model_name)
154
  return model_paths
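As a usage sketch, the Silero VAD controls surfaced in the "VAD Options" accordions map onto faster-whisper roughly as below. This assumes `faster-whisper` is installed; `"sample.wav"` is a hypothetical file name used only for illustration.

```python
# Illustrative only: mapping the accordion's VAD controls onto faster-whisper.
from faster_whisper import WhisperModel
from faster_whisper.vad import VadOptions

model = WhisperModel("tiny", device="cpu", compute_type="float32")
vad_options = VadOptions(threshold=0.5, min_speech_duration_ms=250)

segments, info = model.transcribe(
    "sample.wav",
    vad_filter=True,            # corresponds to the "Enable Silero VAD Filter" checkbox
    vad_parameters=vad_options,
)
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")
```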
modules/nllb_inference.py CHANGED
@@ -1,141 +1,49 @@
1
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
  import gradio as gr
3
- import torch
4
  import os
5
- from datetime import datetime
6
 
7
- from .base_interface import BaseInterface
8
- from modules.subtitle_manager import *
9
 
10
- DEFAULT_MODEL_SIZE = "facebook/nllb-200-1.3B"
11
- NLLB_MODELS = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
12
 
13
-
14
- class NLLBInference(BaseInterface):
15
  def __init__(self):
16
- super().__init__()
17
- self.default_model_size = DEFAULT_MODEL_SIZE
18
- self.current_model_size = None
19
- self.model = None
20
  self.tokenizer = None
21
- self.available_models = NLLB_MODELS
22
  self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
23
  self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
24
- self.device = 0 if torch.cuda.is_available() else -1
25
  self.pipeline = None
26
 
27
- def translate_text(self, text):
 
 
28
  result = self.pipeline(text)
29
  return result[0]['translation_text']
30
 
31
- def translate_file(self,
32
- fileobjs: list,
33
- model_size: str,
34
- src_lang: str,
35
- tgt_lang: str,
36
- add_timestamp: bool,
37
- progress=gr.Progress()) -> list:
38
- """
39
- Translate subtitle file from source language to target language
40
-
41
- Parameters
42
- ----------
43
- fileobjs: list
44
- List of files to transcribe from gr.Files()
45
- model_size: str
46
- Whisper model size from gr.Dropdown()
47
- src_lang: str
48
- Source language of the file to translate from gr.Dropdown()
49
- tgt_lang: str
50
- Target language of the file to translate from gr.Dropdown()
51
- add_timestamp: bool
52
- Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
53
- progress: gr.Progress
54
- Indicator to show progress directly in gradio.
55
- I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
56
-
57
- Returns
58
- ----------
59
- A List of
60
- String to return to gr.Textbox()
61
- Files to return to gr.Files()
62
- """
63
- try:
64
- if model_size != self.current_model_size or self.model is None:
65
- print("\nInitializing NLLB Model..\n")
66
- progress(0, desc="Initializing NLLB Model..")
67
- self.current_model_size = model_size
68
- self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
69
- cache_dir=os.path.join("models", "NLLB"))
70
- self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
71
- cache_dir=os.path.join("models", "NLLB", "tokenizers"))
72
-
73
- src_lang = NLLB_AVAILABLE_LANGS[src_lang]
74
- tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
75
-
76
- self.pipeline = pipeline("translation",
77
- model=self.model,
78
- tokenizer=self.tokenizer,
79
- src_lang=src_lang,
80
- tgt_lang=tgt_lang,
81
- device=self.device)
82
-
83
- files_info = {}
84
- for fileobj in fileobjs:
85
- file_path = fileobj.name
86
- file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
87
- if file_ext == ".srt":
88
- parsed_dicts = parse_srt(file_path=file_path)
89
- total_progress = len(parsed_dicts)
90
- for index, dic in enumerate(parsed_dicts):
91
- progress(index / total_progress, desc="Translating..")
92
- translated_text = self.translate_text(dic["sentence"])
93
- dic["sentence"] = translated_text
94
- subtitle = get_serialized_srt(parsed_dicts)
95
-
96
- timestamp = datetime.now().strftime("%m%d%H%M%S")
97
- if add_timestamp:
98
- output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}")
99
- else:
100
- output_path = os.path.join("outputs", "translations", f"{file_name}")
101
- output_path += '.srt'
102
-
103
- write_file(subtitle, output_path)
104
-
105
- elif file_ext == ".vtt":
106
- parsed_dicts = parse_vtt(file_path=file_path)
107
- total_progress = len(parsed_dicts)
108
- for index, dic in enumerate(parsed_dicts):
109
- progress(index / total_progress, desc="Translating..")
110
- translated_text = self.translate_text(dic["sentence"])
111
- dic["sentence"] = translated_text
112
- subtitle = get_serialized_vtt(parsed_dicts)
113
-
114
- timestamp = datetime.now().strftime("%m%d%H%M%S")
115
- if add_timestamp:
116
- output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}")
117
- else:
118
- output_path = os.path.join("outputs", "translations", f"{file_name}")
119
- output_path += '.vtt'
120
-
121
- write_file(subtitle, output_path)
122
-
123
- files_info[file_name] = subtitle
124
-
125
- total_result = ''
126
- for file_name, subtitle in files_info.items():
127
- total_result += '------------------------------------\n'
128
- total_result += f'{file_name}\n\n'
129
- total_result += f'{subtitle}'
130
-
131
- gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
132
- return [gr_str, output_path]
133
- except Exception as e:
134
- print(f"Error: {str(e)}")
135
- finally:
136
- self.release_cuda_memory()
137
- self.remove_input_files([fileobj.name for fileobj in fileobjs])
138
-
139
 
140
  NLLB_AVAILABLE_LANGS = {
141
  "Acehnese (Arabic script)": "ace_Arab",
 
1
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
  import gradio as gr
 
3
  import os
 
4
 
5
+ from modules.translation_base import TranslationBase
 
6
 
 
 
7
 
8
+ class NLLBInference(TranslationBase):
 
9
  def __init__(self):
10
+ super().__init__(
11
+ model_dir=os.path.join("models", "NLLB")
12
+ )
 
13
  self.tokenizer = None
14
+ self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
15
  self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
16
  self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
 
17
  self.pipeline = None
18
 
19
+ def translate(self,
20
+ text: str
21
+ ):
22
  result = self.pipeline(text)
23
  return result[0]['translation_text']
24
 
25
+ def update_model(self,
26
+ model_size: str,
27
+ src_lang: str,
28
+ tgt_lang: str,
29
+ progress: gr.Progress
30
+ ):
31
+ if model_size != self.current_model_size or self.model is None:
32
+ print("\nInitializing NLLB Model..\n")
33
+ progress(0, desc="Initializing NLLB Model..")
34
+ self.current_model_size = model_size
35
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
36
+ cache_dir=self.model_dir)
37
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
38
+ cache_dir=os.path.join(self.model_dir, "tokenizers"))
39
+ src_lang = NLLB_AVAILABLE_LANGS[src_lang]
40
+ tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
41
+ self.pipeline = pipeline("translation",
42
+ model=self.model,
43
+ tokenizer=self.tokenizer,
44
+ src_lang=src_lang,
45
+ tgt_lang=tgt_lang,
46
+ device=self.device)
47
 
48
  NLLB_AVAILABLE_LANGS = {
49
  "Acehnese (Arabic script)": "ace_Arab",
modules/translation_base.py ADDED
@@ -0,0 +1,148 @@
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ from abc import ABC, abstractmethod
5
+ from typing import List
6
+ from datetime import datetime
7
+
8
+ from modules.whisper_parameter import *
9
+ from modules.subtitle_manager import *
10
+
11
+
12
+ class TranslationBase(ABC):
13
+ def __init__(self,
14
+ model_dir: str):
15
+ super().__init__()
16
+ self.model = None
17
+ self.model_dir = model_dir
18
+ os.makedirs(self.model_dir, exist_ok=True)
19
+ self.current_model_size = None
20
+ self.device = self.get_device()
21
+
22
+ @abstractmethod
23
+ def translate(self,
24
+ text: str
25
+ ):
26
+ pass
27
+
28
+ @abstractmethod
29
+ def update_model(self,
30
+ model_size: str,
31
+ src_lang: str,
32
+ tgt_lang: str,
33
+ progress: gr.Progress
34
+ ):
35
+ pass
36
+
37
+ def translate_file(self,
38
+ fileobjs: list,
39
+ model_size: str,
40
+ src_lang: str,
41
+ tgt_lang: str,
42
+ add_timestamp: bool,
43
+ progress=gr.Progress()) -> list:
44
+ """
45
+ Translate subtitle file from source language to target language
46
+
47
+ Parameters
48
+ ----------
49
+ fileobjs: list
50
+ List of files to transcribe from gr.Files()
51
+ model_size: str
52
+ Whisper model size from gr.Dropdown()
53
+ src_lang: str
54
+ Source language of the file to translate from gr.Dropdown()
55
+ tgt_lang: str
56
+ Target language of the file to translate from gr.Dropdown()
57
+ add_timestamp: bool
58
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
59
+ progress: gr.Progress
60
+ Indicator to show progress directly in gradio.
61
+ I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
62
+
63
+ Returns
64
+ ----------
65
+ A List of
66
+ String to return to gr.Textbox()
67
+ Files to return to gr.Files()
68
+ """
69
+ try:
70
+ self.update_model(model_size=model_size,
71
+ src_lang=src_lang,
72
+ tgt_lang=tgt_lang,
73
+ progress=progress)
74
+
75
+ files_info = {}
76
+ for fileobj in fileobjs:
77
+ file_path = fileobj.name
78
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
79
+ if file_ext == ".srt":
80
+ parsed_dicts = parse_srt(file_path=file_path)
81
+ total_progress = len(parsed_dicts)
82
+ for index, dic in enumerate(parsed_dicts):
83
+ progress(index / total_progress, desc="Translating..")
84
+ translated_text = self.translate(dic["sentence"])
85
+ dic["sentence"] = translated_text
86
+ subtitle = get_serialized_srt(parsed_dicts)
87
+
88
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
89
+ if add_timestamp:
90
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}")
91
+ else:
92
+ output_path = os.path.join("outputs", "translations", f"{file_name}.srt")
93
+
94
+ elif file_ext == ".vtt":
95
+ parsed_dicts = parse_vtt(file_path=file_path)
96
+ total_progress = len(parsed_dicts)
97
+ for index, dic in enumerate(parsed_dicts):
98
+ progress(index / total_progress, desc="Translating..")
99
+ translated_text = self.translate(dic["sentence"])
100
+ dic["sentence"] = translated_text
101
+ subtitle = get_serialized_vtt(parsed_dicts)
102
+
103
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
104
+ if add_timestamp:
105
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}")
106
+ else:
107
+ output_path = os.path.join("outputs", "translations", f"{file_name}.vtt")
108
+
109
+ write_file(subtitle, output_path)
110
+ files_info[file_name] = subtitle
111
+
112
+ total_result = ''
113
+ for file_name, subtitle in files_info.items():
114
+ total_result += '------------------------------------\n'
115
+ total_result += f'{file_name}\n\n'
116
+ total_result += f'{subtitle}'
117
+
118
+ gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
119
+ return [gr_str, output_path]
120
+ except Exception as e:
121
+ print(f"Error: {str(e)}")
122
+ finally:
123
+ self.release_cuda_memory()
124
+ self.remove_input_files([fileobj.name for fileobj in fileobjs])
125
+
126
+ @staticmethod
127
+ def get_device():
128
+ if torch.cuda.is_available():
129
+ return "cuda"
130
+ elif torch.backends.mps.is_available():
131
+ return "mps"
132
+ else:
133
+ return "cpu"
134
+
135
+ @staticmethod
136
+ def release_cuda_memory():
137
+ if torch.cuda.is_available():
138
+ torch.cuda.empty_cache()
139
+ torch.cuda.reset_max_memory_allocated()
140
+
141
+ @staticmethod
142
+ def remove_input_files(file_paths: List[str]):
143
+ if not file_paths:
144
+ return
145
+
146
+ for file_path in file_paths:
147
+ if file_path and os.path.exists(file_path):
148
+ os.remove(file_path)
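A minimal sketch of what a new translator has to implement on top of `TranslationBase`, assuming this repository is on the Python path. `UpperCaseTranslation` is a hypothetical toy, not part of the codebase; it exists only to show the two abstract hooks that `translate_file()` drives.

```python
# Illustrative only: a hypothetical toy subclass showing TranslationBase's contract.
import gradio as gr
from modules.translation_base import TranslationBase


class UpperCaseTranslation(TranslationBase):
    """Toy 'translator' that upper-cases text; exists only to show the contract."""

    def __init__(self):
        super().__init__(model_dir="models/Dummy")

    def update_model(self, model_size: str, src_lang: str, tgt_lang: str, progress: gr.Progress):
        # A real subclass would load weights here; nothing to load for the toy.
        self.current_model_size = model_size

    def translate(self, text: str):
        return text.upper()
```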
modules/whisper_Inference.py CHANGED
@@ -4,218 +4,17 @@ import time
4
  import os
5
  from typing import BinaryIO, Union, Tuple, List
6
  import numpy as np
7
- from datetime import datetime
8
  import torch
9
 
10
- from .base_interface import BaseInterface
11
- from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
12
- from modules.youtube_manager import get_ytdata, get_ytaudio
13
  from modules.whisper_parameter import *
14
 
15
- DEFAULT_MODEL_SIZE = "large-v3"
16
 
17
-
18
- class WhisperInference(BaseInterface):
19
  def __init__(self):
20
- super().__init__()
21
- self.current_model_size = None
22
- self.model = None
23
- self.available_models = whisper.available_models()
24
- self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
25
- self.translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
26
- if torch.cuda.is_available():
27
- self.device = "cuda"
28
- elif torch.backends.mps.is_available():
29
- self.device = "mps"
30
- else:
31
- self.device = "cpu"
32
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
33
- self.available_compute_types = ["float16", "float32"]
34
- self.current_compute_type = "float16" if self.device == "cuda" else "float32"
35
- self.model_dir = os.path.join("models", "Whisper")
36
-
37
- def transcribe_file(self,
38
- files: list,
39
- file_format: str,
40
- add_timestamp: bool,
41
- progress=gr.Progress(),
42
- *whisper_params
43
- ) -> list:
44
- """
45
- Write subtitle file from Files
46
-
47
- Parameters
48
- ----------
49
- files: list
50
- List of files to transcribe from gr.Files()
51
- file_format: str
52
- Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
53
- add_timestamp: bool
54
- Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
55
- progress: gr.Progress
56
- Indicator to show progress directly in gradio.
57
- *whisper_params: tuple
58
- Gradio components related to Whisper. see whisper_data_class.py for details.
59
-
60
- Returns
61
- ----------
62
- result_str:
63
- Result of transcription to return to gr.Textbox()
64
- result_file_path:
65
- Output file path to return to gr.Files()
66
- """
67
- try:
68
- files_info = {}
69
- for file in files:
70
- progress(0, desc="Loading Audio..")
71
- audio = whisper.load_audio(file.name)
72
-
73
- result, elapsed_time = self.transcribe(audio,
74
- progress,
75
- *whisper_params)
76
- progress(1, desc="Completed!")
77
-
78
- file_name, file_ext = os.path.splitext(os.path.basename(file.name))
79
- file_name = safe_filename(file_name)
80
- subtitle, file_path = self.generate_and_write_file(
81
- file_name=file_name,
82
- transcribed_segments=result,
83
- add_timestamp=add_timestamp,
84
- file_format=file_format
85
- )
86
- files_info[file_name] = {"subtitle": subtitle, "elapsed_time": elapsed_time, "path": file_path}
87
-
88
- total_result = ''
89
- total_time = 0
90
- for file_name, info in files_info.items():
91
- total_result += '------------------------------------\n'
92
- total_result += f'{file_name}\n\n'
93
- total_result += f"{info['subtitle']}"
94
- total_time += info["elapsed_time"]
95
-
96
- result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
97
- result_file_path = [info['path'] for info in files_info.values()]
98
-
99
- return [result_str, result_file_path]
100
- except Exception as e:
101
- print(f"Error transcribing file: {str(e)}")
102
- finally:
103
- self.release_cuda_memory()
104
- self.remove_input_files([file.name for file in files])
105
-
106
- def transcribe_youtube(self,
107
- youtube_link: str,
108
- file_format: str,
109
- add_timestamp: bool,
110
- progress=gr.Progress(),
111
- *whisper_params) -> list:
112
- """
113
- Write subtitle file from Youtube
114
-
115
- Parameters
116
- ----------
117
- youtube_link: str
118
- URL of the Youtube video to transcribe from gr.Textbox()
119
- file_format: str
120
- Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
121
- add_timestamp: bool
122
- Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
123
- progress: gr.Progress
124
- Indicator to show progress directly in gradio.
125
- *whisper_params: tuple
126
- Gradio components related to Whisper. see whisper_data_class.py for details.
127
-
128
- Returns
129
- ----------
130
- result_str:
131
- Result of transcription to return to gr.Textbox()
132
- result_file_path:
133
- Output file path to return to gr.Files()
134
- """
135
- try:
136
- progress(0, desc="Loading Audio from Youtube..")
137
- yt = get_ytdata(youtube_link)
138
- audio = whisper.load_audio(get_ytaudio(yt))
139
-
140
- result, elapsed_time = self.transcribe(audio,
141
- progress,
142
- *whisper_params)
143
- progress(1, desc="Completed!")
144
-
145
- file_name = safe_filename(yt.title)
146
- subtitle, result_file_path = self.generate_and_write_file(
147
- file_name=file_name,
148
- transcribed_segments=result,
149
- add_timestamp=add_timestamp,
150
- file_format=file_format
151
- )
152
-
153
- result_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
154
- return [result_str, result_file_path]
155
- except Exception as e:
156
- print(f"Error transcribing youtube video: {str(e)}")
157
- finally:
158
- try:
159
- if 'yt' not in locals():
160
- yt = get_ytdata(youtube_link)
161
- file_path = get_ytaudio(yt)
162
- else:
163
- file_path = get_ytaudio(yt)
164
-
165
- self.release_cuda_memory()
166
- self.remove_input_files([file_path])
167
- except Exception as cleanup_error:
168
- pass
169
-
170
- def transcribe_mic(self,
171
- mic_audio: str,
172
- file_format: str,
173
- progress=gr.Progress(),
174
- *whisper_params) -> list:
175
- """
176
- Write subtitle file from microphone
177
-
178
- Parameters
179
- ----------
180
- mic_audio: str
181
- Audio file path from gr.Microphone()
182
- file_format: str
183
- Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
184
- progress: gr.Progress
185
- Indicator to show progress directly in gradio.
186
- *whisper_params: tuple
187
- Gradio components related to Whisper. see whisper_data_class.py for details.
188
-
189
- Returns
190
- ----------
191
- result_str:
192
- Result of transcription to return to gr.Textbox()
193
- result_file_path:
194
- Output file path to return to gr.Files()
195
- """
196
- try:
197
- progress(0, desc="Loading Audio..")
198
- result, elapsed_time = self.transcribe(
199
- mic_audio,
200
- progress,
201
- *whisper_params,
202
- )
203
- progress(1, desc="Completed!")
204
-
205
- subtitle, result_file_path = self.generate_and_write_file(
206
- file_name="Mic",
207
- transcribed_segments=result,
208
- add_timestamp=True,
209
- file_format=file_format
210
- )
211
-
212
- result_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
213
- return [result_str, result_file_path]
214
- except Exception as e:
215
- print(f"Error transcribing mic: {str(e)}")
216
- finally:
217
- self.release_cuda_memory()
218
- self.remove_input_files([mic_audio])
219
 
220
  def transcribe(self,
221
  audio: Union[str, np.ndarray, torch.Tensor],
@@ -259,7 +58,7 @@ class WhisperInference(BaseInterface):
259
  beam_size=params.beam_size,
260
  logprob_threshold=params.log_prob_threshold,
261
  no_speech_threshold=params.no_speech_threshold,
262
- task="translate" if params.is_translate and self.current_model_size in self.translatable_model else "transcribe",
263
  fp16=True if params.compute_type == "float16" else False,
264
  best_of=params.best_of,
265
  patience=params.patience,
@@ -295,80 +94,4 @@ class WhisperInference(BaseInterface):
295
  name=model_size,
296
  device=self.device,
297
  download_root=self.model_dir
298
- )
299
-
300
- @staticmethod
301
- def generate_and_write_file(file_name: str,
302
- transcribed_segments: list,
303
- add_timestamp: bool,
304
- file_format: str,
305
- ) -> str:
306
- """
307
- Writes subtitle file
308
-
309
- Parameters
310
- ----------
311
- file_name: str
312
- Output file name
313
- transcribed_segments: list
314
- Text segments transcribed from audio
315
- add_timestamp: bool
316
- Determines whether to add a timestamp to the end of the filename.
317
- file_format: str
318
- File format to write. Supported formats: [SRT, WebVTT, txt]
319
-
320
- Returns
321
- ----------
322
- content: str
323
- Result of the transcription
324
- output_path: str
325
- output file path
326
- """
327
- timestamp = datetime.now().strftime("%m%d%H%M%S")
328
- if add_timestamp:
329
- output_path = os.path.join("outputs", f"{file_name}-{timestamp}")
330
- else:
331
- output_path = os.path.join("outputs", f"{file_name}")
332
-
333
- if file_format == "SRT":
334
- content = get_srt(transcribed_segments)
335
- output_path += '.srt'
336
- write_file(content, output_path)
337
-
338
- elif file_format == "WebVTT":
339
- content = get_vtt(transcribed_segments)
340
- output_path += '.vtt'
341
- write_file(content, output_path)
342
-
343
- elif file_format == "txt":
344
- content = get_txt(transcribed_segments)
345
- output_path += '.txt'
346
- write_file(content, output_path)
347
- return content, output_path
348
-
349
- @staticmethod
350
- def format_time(elapsed_time: float) -> str:
351
- """
352
- Get {hours} {minutes} {seconds} time format string
353
-
354
- Parameters
355
- ----------
356
- elapsed_time: str
357
- Elapsed time for transcription
358
-
359
- Returns
360
- ----------
361
- Time format string
362
- """
363
- hours, rem = divmod(elapsed_time, 3600)
364
- minutes, seconds = divmod(rem, 60)
365
-
366
- time_str = ""
367
- if hours:
368
- time_str += f"{hours} hours "
369
- if minutes:
370
- time_str += f"{minutes} minutes "
371
- seconds = round(seconds)
372
- time_str += f"{seconds} seconds"
373
-
374
- return time_str.strip()
 
4
  import os
5
  from typing import BinaryIO, Union, Tuple, List
6
  import numpy as np
 
7
  import torch
8
 
9
+ from modules.whisper_base import WhisperBase
 
 
10
  from modules.whisper_parameter import *
11
 
 
12
 
13
+ class WhisperInference(WhisperBase):
 
14
  def __init__(self):
15
+ super().__init__(
16
+ model_dir=os.path.join("models", "Whisper")
17
+ )
18
 
19
  def transcribe(self,
20
  audio: Union[str, np.ndarray, torch.Tensor],
 
58
  beam_size=params.beam_size,
59
  logprob_threshold=params.log_prob_threshold,
60
  no_speech_threshold=params.no_speech_threshold,
61
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
62
  fp16=True if params.compute_type == "float16" else False,
63
  best_of=params.best_of,
64
  patience=params.patience,
 
94
  name=model_size,
95
  device=self.device,
96
  download_root=self.model_dir
97
+ )
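For reference, a hedged sketch of the openai-whisper call pattern used by `transcribe()` above. It assumes the `openai-whisper` package is installed; `"sample.wav"` is a hypothetical local audio file, and the parameter values are illustrative rather than the WebUI defaults.

```python
# Illustrative only: the whisper.load_model / model.transcribe pattern from the hunk above.
import whisper

model = whisper.load_model("base", device="cpu", download_root="models/Whisper")
result = model.transcribe(
    "sample.wav",
    task="transcribe",            # "translate" is only meaningful for the large* models
    beam_size=5,
    logprob_threshold=-1.0,
    no_speech_threshold=0.6,
    fp16=False,                   # float32 on CPU
)
print(result["text"])
```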
modules/whisper_base.py ADDED
@@ -0,0 +1,333 @@
1
+ import os
2
+ import torch
3
+ from typing import List
4
+ import whisper
5
+ import gradio as gr
6
+ from abc import ABC, abstractmethod
7
+ from typing import BinaryIO, Union, Tuple, List
8
+ import numpy as np
9
+ from datetime import datetime
10
+
11
+ from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
12
+ from modules.youtube_manager import get_ytdata, get_ytaudio
13
+ from modules.whisper_parameter import *
14
+
15
+
16
+ class WhisperBase(ABC):
17
+ def __init__(self,
18
+ model_dir: str):
19
+ self.model = None
20
+ self.current_model_size = None
21
+ self.model_dir = model_dir
22
+ os.makedirs(self.model_dir, exist_ok=True)
23
+ self.available_models = whisper.available_models()
24
+ self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
25
+ self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
26
+ self.device = self.get_device()
27
+ self.available_compute_types = ["float16", "float32"]
28
+ self.current_compute_type = "float16" if self.device == "cuda" else "float32"
29
+
30
+ @abstractmethod
31
+ def transcribe(self,
32
+ audio: Union[str, BinaryIO, np.ndarray],
33
+ progress: gr.Progress,
34
+ *whisper_params,
35
+ ):
36
+ pass
37
+
38
+ @abstractmethod
39
+ def update_model(self,
40
+ model_size: str,
41
+ compute_type: str,
42
+ progress: gr.Progress
43
+ ):
44
+ pass
45
+
46
+ def transcribe_file(self,
47
+ files: list,
48
+ file_format: str,
49
+ add_timestamp: bool,
50
+ progress=gr.Progress(),
51
+ *whisper_params,
52
+ ) -> list:
53
+ """
54
+ Write subtitle file from Files
55
+
56
+ Parameters
57
+ ----------
58
+ files: list
59
+ List of files to transcribe from gr.Files()
60
+ file_format: str
61
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
62
+ add_timestamp: bool
63
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
64
+ progress: gr.Progress
65
+ Indicator to show progress directly in gradio.
66
+ *whisper_params: tuple
67
+ Gradio components related to Whisper. see whisper_data_class.py for details.
68
+
69
+ Returns
70
+ ----------
71
+ result_str:
72
+ Result of transcription to return to gr.Textbox()
73
+ result_file_path:
74
+ Output file path to return to gr.Files()
75
+ """
76
+ try:
77
+ files_info = {}
78
+ for file in files:
79
+ transcribed_segments, time_for_task = self.transcribe(
80
+ file.name,
81
+ progress,
82
+ *whisper_params,
83
+ )
84
+
85
+ file_name, file_ext = os.path.splitext(os.path.basename(file.name))
86
+ file_name = safe_filename(file_name)
87
+ subtitle, file_path = self.generate_and_write_file(
88
+ file_name=file_name,
89
+ transcribed_segments=transcribed_segments,
90
+ add_timestamp=add_timestamp,
91
+ file_format=file_format
92
+ )
93
+ files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
94
+
95
+ total_result = ''
96
+ total_time = 0
97
+ for file_name, info in files_info.items():
98
+ total_result += '------------------------------------\n'
99
+ total_result += f'{file_name}\n\n'
100
+ total_result += f'{info["subtitle"]}'
101
+ total_time += info["time_for_task"]
102
+
103
+ result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
104
+ result_file_path = [info['path'] for info in files_info.values()]
105
+
106
+ return [result_str, result_file_path]
107
+
108
+ except Exception as e:
109
+ print(f"Error transcribing file: {e}")
110
+ finally:
111
+ self.release_cuda_memory()
112
+ if not files:
113
+ self.remove_input_files([file.name for file in files])
114
+
115
+ def transcribe_mic(self,
116
+ mic_audio: str,
117
+ file_format: str,
118
+ progress=gr.Progress(),
119
+ *whisper_params,
120
+ ) -> list:
121
+ """
122
+ Write subtitle file from microphone
123
+
124
+ Parameters
125
+ ----------
126
+ mic_audio: str
127
+ Audio file path from gr.Microphone()
128
+ file_format: str
129
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
130
+ progress: gr.Progress
131
+ Indicator to show progress directly in gradio.
132
+ *whisper_params: tuple
133
+ Gradio components related to Whisper. see whisper_data_class.py for details.
134
+
135
+ Returns
136
+ ----------
137
+ result_str:
138
+ Result of transcription to return to gr.Textbox()
139
+ result_file_path:
140
+ Output file path to return to gr.Files()
141
+ """
142
+ try:
143
+ progress(0, desc="Loading Audio..")
144
+ transcribed_segments, time_for_task = self.transcribe(
145
+ mic_audio,
146
+ progress,
147
+ *whisper_params,
148
+ )
149
+ progress(1, desc="Completed!")
150
+
151
+ subtitle, result_file_path = self.generate_and_write_file(
152
+ file_name="Mic",
153
+ transcribed_segments=transcribed_segments,
154
+ add_timestamp=True,
155
+ file_format=file_format
156
+ )
157
+
158
+ result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
159
+ return [result_str, result_file_path]
160
+ except Exception as e:
161
+ print(f"Error transcribing file: {e}")
162
+ finally:
163
+ self.release_cuda_memory()
164
+ self.remove_input_files([mic_audio])
165
+
166
+ def transcribe_youtube(self,
167
+ youtube_link: str,
168
+ file_format: str,
169
+ add_timestamp: bool,
170
+ progress=gr.Progress(),
171
+ *whisper_params,
172
+ ) -> list:
173
+ """
174
+ Write subtitle file from Youtube
175
+
176
+ Parameters
177
+ ----------
178
+ youtube_link: str
179
+ URL of the Youtube video to transcribe from gr.Textbox()
180
+ file_format: str
181
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
182
+ add_timestamp: bool
183
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
184
+ progress: gr.Progress
185
+ Indicator to show progress directly in gradio.
186
+ *whisper_params: tuple
187
+ Gradio components related to Whisper. see whisper_data_class.py for details.
188
+
189
+ Returns
190
+ ----------
191
+ result_str:
192
+ Result of transcription to return to gr.Textbox()
193
+ result_file_path:
194
+ Output file path to return to gr.Files()
195
+ """
196
+ try:
197
+ progress(0, desc="Loading Audio from Youtube..")
198
+ yt = get_ytdata(youtube_link)
199
+ audio = get_ytaudio(yt)
200
+
201
+ transcribed_segments, time_for_task = self.transcribe(
202
+ audio,
203
+ progress,
204
+ *whisper_params,
205
+ )
206
+
207
+ progress(1, desc="Completed!")
208
+
209
+ file_name = safe_filename(yt.title)
210
+ subtitle, result_file_path = self.generate_and_write_file(
211
+ file_name=file_name,
212
+ transcribed_segments=transcribed_segments,
213
+ add_timestamp=add_timestamp,
214
+ file_format=file_format
215
+ )
216
+ result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
217
+
218
+ return [result_str, result_file_path]
219
+
220
+ except Exception as e:
221
+ print(f"Error transcribing file: {e}")
222
+ finally:
223
+ try:
224
+ if 'yt' not in locals():
225
+ yt = get_ytdata(youtube_link)
226
+ file_path = get_ytaudio(yt)
227
+ else:
228
+ file_path = get_ytaudio(yt)
229
+
230
+ self.release_cuda_memory()
231
+ self.remove_input_files([file_path])
232
+ except Exception as cleanup_error:
233
+ pass
234
+
235
+ @staticmethod
236
+ def generate_and_write_file(file_name: str,
237
+ transcribed_segments: list,
238
+ add_timestamp: bool,
239
+ file_format: str,
240
+ ) -> str:
241
+ """
242
+ Writes subtitle file
243
+
244
+ Parameters
245
+ ----------
246
+ file_name: str
247
+ Output file name
248
+ transcribed_segments: list
249
+ Text segments transcribed from audio
250
+ add_timestamp: bool
251
+ Determines whether to add a timestamp to the end of the filename.
252
+ file_format: str
253
+ File format to write. Supported formats: [SRT, WebVTT, txt]
254
+
255
+ Returns
256
+ ----------
257
+ content: str
258
+ Result of the transcription
259
+ output_path: str
260
+ output file path
261
+ """
262
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
263
+ if add_timestamp:
264
+ output_path = os.path.join("outputs", f"{file_name}-{timestamp}")
265
+ else:
266
+ output_path = os.path.join("outputs", f"{file_name}")
267
+
268
+ if file_format == "SRT":
269
+ content = get_srt(transcribed_segments)
270
+ output_path += '.srt'
271
+ write_file(content, output_path)
272
+
273
+ elif file_format == "WebVTT":
274
+ content = get_vtt(transcribed_segments)
275
+ output_path += '.vtt'
276
+ write_file(content, output_path)
277
+
278
+ elif file_format == "txt":
279
+ content = get_txt(transcribed_segments)
280
+ output_path += '.txt'
281
+ write_file(content, output_path)
282
+ return content, output_path
283
+
284
+ @staticmethod
285
+ def format_time(elapsed_time: float) -> str:
286
+ """
287
+ Get {hours} {minutes} {seconds} time format string
288
+
289
+ Parameters
290
+ ----------
291
+ elapsed_time: str
292
+ Elapsed time for transcription
293
+
294
+ Returns
295
+ ----------
296
+ Time format string
297
+ """
298
+ hours, rem = divmod(elapsed_time, 3600)
299
+ minutes, seconds = divmod(rem, 60)
300
+
301
+ time_str = ""
302
+ if hours:
303
+ time_str += f"{hours} hours "
304
+ if minutes:
305
+ time_str += f"{minutes} minutes "
306
+ seconds = round(seconds)
307
+ time_str += f"{seconds} seconds"
308
+
309
+ return time_str.strip()
310
+
311
+ @staticmethod
312
+ def get_device():
313
+ if torch.cuda.is_available():
314
+ return "cuda"
315
+ elif torch.backends.mps.is_available():
316
+ return "mps"
317
+ else:
318
+ return "cpu"
319
+
320
+ @staticmethod
321
+ def release_cuda_memory():
322
+ if torch.cuda.is_available():
323
+ torch.cuda.empty_cache()
324
+ torch.cuda.reset_max_memory_allocated()
325
+
326
+ @staticmethod
327
+ def remove_input_files(file_paths: List[str]):
328
+ if not file_paths:
329
+ return
330
+
331
+ for file_path in file_paths:
332
+ if file_path and os.path.exists(file_path):
333
+ os.remove(file_path)
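To close out the refactor, a minimal sketch of the contract a new backend must satisfy on top of `WhisperBase`, assuming this repository is on the Python path. `EchoInference` is hypothetical and its segment dict format is a guess for illustration; the point is the two abstract hooks and the `(segments, elapsed_seconds)` return shape that `transcribe_file()` expects.

```python
# Illustrative only: a hypothetical backend showing the WhisperBase contract.
import time
import gradio as gr
from modules.whisper_base import WhisperBase


class EchoInference(WhisperBase):
    """Toy backend returning one fake segment, to show the two abstract hooks."""

    def __init__(self):
        super().__init__(model_dir="models/Whisper/echo")

    def update_model(self, model_size: str, compute_type: str, progress: gr.Progress):
        self.current_model_size = model_size
        self.current_compute_type = compute_type

    def transcribe(self, audio, progress: gr.Progress, *whisper_params):
        start = time.time()
        segments = [{"start": 0.0, "end": 1.0, "text": "hello"}]  # format is a guess
        return segments, time.time() - start                      # (segments, elapsed_seconds)
```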
user-start-webui.bat CHANGED
@@ -8,8 +8,8 @@ set USERNAME=
8
  set PASSWORD=
9
  set SHARE=
10
  set THEME=
11
- set DISABLE_FASTER_WHISPER=
12
  set API_OPEN=
 
13
  set WHISPER_MODEL_DIR=
14
  set FASTER_WHISPER_MODEL_DIR=
15
 
@@ -38,6 +38,9 @@ if /I "%DISABLE_FASTER_WHISPER%"=="true" (
38
  if /I "%API_OPEN%"=="true" (
39
  set API_OPEN=--api_open
40
  )
 
 
 
41
  if not "%WHISPER_MODEL_DIR%"=="" (
42
  set WHISPER_MODEL_DIR_ARG=--whisper_model_dir "%WHISPER_MODEL_DIR%"
43
  )
@@ -46,5 +49,5 @@ if not "%FASTER_WHISPER_MODEL_DIR%"=="" (
46
  )
47
 
48
  :: Call the original .bat script with optional arguments
49
- start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %DISABLE_FASTER_WHISPER_ARG% %API_OPEN% %WHISPER_MODEL_DIR_ARG% %FASTER_WHISPER_MODEL_DIR_ARG%
50
  pause
 
8
  set PASSWORD=
9
  set SHARE=
10
  set THEME=
 
11
  set API_OPEN=
12
+ set WHISPER_TYPE=
13
  set WHISPER_MODEL_DIR=
14
  set FASTER_WHISPER_MODEL_DIR=
15
 
 
38
  if /I "%API_OPEN%"=="true" (
39
  set API_OPEN=--api_open
40
  )
41
+ if not "%WHISPER_TYPE%"=="" (
42
+ set WHISPER_TYPE_ARG=--whisper_type %WHISPER_TYPE%
43
+ )
44
  if not "%WHISPER_MODEL_DIR%"=="" (
45
  set WHISPER_MODEL_DIR_ARG=--whisper_model_dir "%WHISPER_MODEL_DIR%"
46
  )
 
49
  )
50
 
51
  :: Call the original .bat script with optional arguments
52
+ start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %API_OPEN% %WHISPER_TYPE_ARG% %WHISPER_MODEL_DIR_ARG% %FASTER_WHISPER_MODEL_DIR_ARG%
53
  pause