jhj0517 committed on
Commit
3764662
·
1 Parent(s): f3ecc7a

Add info text to clarify the use of submodels

Browse files
Files changed (1) hide show
  1. app.py +23 -14
app.py CHANGED
@@ -59,6 +59,7 @@ class App:
59
  with gr.Row():
60
  cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename",
61
  interactive=True)
 
62
  with gr.Accordion("Advanced Parameters", open=False):
63
  nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
64
  info="Beam size to use for decoding.")
@@ -129,30 +130,35 @@ class App:
129
  with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
130
  nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
131
 
132
- with gr.Accordion("BGM Separation", open=False):
133
- cb_bgm_separation = gr.Checkbox(label="Enable BGM Separation Filter", value=uvr_params["is_separate_bgm"],
134
- interactive=True)
 
 
135
  dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
136
  choices=self.whisper_inf.music_separator.available_devices)
137
  dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
138
  choices=self.whisper_inf.music_separator.available_models)
139
  nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
140
  cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
141
- cb_uvr_enable_offload = gr.Checkbox(label="Offload UVR model after separation",
142
  value=uvr_params["enable_offload"])
143
 
144
- with gr.Accordion("VAD", open=False):
145
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
146
- interactive=True)
147
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=vad_params["threshold"],
 
 
148
  info="Lower it to be more sensitive to small sounds.")
149
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=vad_params["min_speech_duration_ms"],
 
150
  info="Final speech chunks shorter than this time are thrown out")
151
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=vad_params["max_speech_duration_s"],
152
- info="Maximum duration of speech chunks in \"seconds\". Chunks longer"
153
- " than this time will be split at the timestamp of the last silence that"
154
- " lasts more than 100ms (if any), to prevent aggressive cutting.")
155
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=vad_params["min_silence_duration_ms"],
156
  info="In the end of each speech chunk wait for this time"
157
  " before separating it")
158
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
@@ -161,7 +167,10 @@ class App:
161
  with gr.Accordion("Diarization", open=False):
162
  cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
163
  tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
164
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
 
 
 
165
  dd_diarization_device = gr.Dropdown(label="Device",
166
  choices=self.whisper_inf.diarizer.get_available_device(),
167
  value=self.whisper_inf.diarizer.get_device())
 
59
  with gr.Row():
60
  cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename",
61
  interactive=True)
62
+
63
  with gr.Accordion("Advanced Parameters", open=False):
64
  nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
65
  info="Beam size to use for decoding.")
 
130
  with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
131
  nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
132
 
133
+ with gr.Accordion("Background Music Remover Filter", open=False):
134
+ cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
135
+ interactive=True,
136
+ info="Enabling this will remove background music by submodel before"
137
+ " transcribing ")
138
  dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
139
  choices=self.whisper_inf.music_separator.available_devices)
140
  dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
141
  choices=self.whisper_inf.music_separator.available_models)
142
  nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
143
  cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
144
+ cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
145
  value=uvr_params["enable_offload"])
146
 
147
+ with gr.Accordion("Voice Detection Filter", open=False):
148
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
149
+ interactive=True,
150
+ info="Enable this to transcribe only detected voice parts by submodel.")
151
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
152
+ value=vad_params["threshold"],
153
  info="Lower it to be more sensitive to small sounds.")
154
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
155
+ value=vad_params["min_speech_duration_ms"],
156
  info="Final speech chunks shorter than this time are thrown out")
157
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
158
+ value=vad_params["max_speech_duration_s"],
159
+ info="Maximum duration of speech chunks in \"seconds\".")
160
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
161
+ value=vad_params["min_silence_duration_ms"],
162
  info="In the end of each speech chunk wait for this time"
163
  " before separating it")
164
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
 
167
  with gr.Accordion("Diarization", open=False):
168
  cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
169
  tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
170
+ info="This is only needed the first time you download the model. If you already have"
171
+ " models, you don't need to enter. To download the model, you must manually go "
172
+ "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to"
173
+ " their requirement.")
174
  dd_diarization_device = gr.Dropdown(label="Device",
175
  choices=self.whisper_inf.diarizer.get_available_device(),
176
  value=self.whisper_inf.diarizer.get_device())