jhj0517 committed on
Commit
3d6a684
·
1 Parent(s): f4c648c

Remove duplicate parameter and add defaults

Browse files
Files changed (1) hide show
  1. modules/whisper/whisper_parameter.py +47 -54
modules/whisper/whisper_parameter.py CHANGED
@@ -26,7 +26,6 @@ class WhisperParameters:
26
  max_speech_duration_s: gr.Number
27
  min_silence_duration_ms: gr.Number
28
  speech_pad_ms: gr.Number
29
- chunk_length_s: gr.Number
30
  batch_size: gr.Number
31
  is_diarize: gr.Checkbox
32
  hf_token: gr.Textbox
@@ -136,10 +135,6 @@ class WhisperParameters:
136
  speech_pad_ms: gr.Number
137
  This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side
138
 
139
- chunk_length_s: gr.Number
140
- This parameter is related with insanely-fast-whisper pipe.
141
- Maximum length of each chunk
142
-
143
  batch_size: gr.Number
144
  This parameter is related to the insanely-fast-whisper pipe. Batch size to pass to the pipe
145
 
@@ -193,8 +188,8 @@ class WhisperParameters:
193
  the maximum will be set by the default max_length.
194
 
195
  chunk_length: gr.Number
196
- This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
197
- default chunk_length of the FeatureExtractor.
198
 
199
  hallucination_silence_threshold: gr.Number
200
  This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
@@ -252,52 +247,51 @@ class WhisperParameters:
252
 
253
  @dataclass
254
  class WhisperValues:
255
- model_size: str
256
- lang: str
257
- is_translate: bool
258
- beam_size: int
259
- log_prob_threshold: float
260
- no_speech_threshold: float
261
- compute_type: str
262
- best_of: int
263
- patience: float
264
- condition_on_previous_text: bool
265
- prompt_reset_on_temperature: float
266
- initial_prompt: Optional[str]
267
- temperature: float
268
- compression_ratio_threshold: float
269
- vad_filter: bool
270
- threshold: float
271
- min_speech_duration_ms: int
272
- max_speech_duration_s: float
273
- min_silence_duration_ms: int
274
- speech_pad_ms: int
275
- chunk_length_s: int
276
- batch_size: int
277
- is_diarize: bool
278
- hf_token: str
279
- diarization_device: str
280
- length_penalty: float
281
- repetition_penalty: float
282
- no_repeat_ngram_size: int
283
- prefix: Optional[str]
284
- suppress_blank: bool
285
- suppress_tokens: Optional[str]
286
- max_initial_timestamp: float
287
- word_timestamps: bool
288
- prepend_punctuations: Optional[str]
289
- append_punctuations: Optional[str]
290
- max_new_tokens: Optional[int]
291
- chunk_length: Optional[int]
292
- hallucination_silence_threshold: Optional[float]
293
- hotwords: Optional[str]
294
- language_detection_threshold: Optional[float]
295
- language_detection_segments: int
296
- is_bgm_separate: bool
297
- uvr_model_size: str
298
- uvr_device: str
299
- uvr_segment_size: int
300
- uvr_save_file: bool
301
  """
302
  A data class to use Whisper parameters.
303
  """
@@ -318,7 +312,6 @@ class WhisperValues:
318
  "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
319
  "temperature": self.temperature,
320
  "compression_ratio_threshold": self.compression_ratio_threshold,
321
- "chunk_length_s": None if self.chunk_length_s is None else self.chunk_length_s,
322
  "batch_size": self.batch_size,
323
  "length_penalty": self.length_penalty,
324
  "repetition_penalty": self.repetition_penalty,
 
26
  max_speech_duration_s: gr.Number
27
  min_silence_duration_ms: gr.Number
28
  speech_pad_ms: gr.Number
 
29
  batch_size: gr.Number
30
  is_diarize: gr.Checkbox
31
  hf_token: gr.Textbox
 
135
  speech_pad_ms: gr.Number
136
  This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side
137
 
 
 
 
 
138
  batch_size: gr.Number
139
  This parameter is related to the insanely-fast-whisper pipe. Batch size to pass to the pipe
140
 
 
188
  the maximum will be set by the default max_length.
189
 
190
  chunk_length: gr.Number
191
+ This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
192
+ If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.
193
 
194
  hallucination_silence_threshold: gr.Number
195
  This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
 
247
 
248
  @dataclass
249
  class WhisperValues:
250
+ model_size: str = "large-v2"
251
+ lang: Optional[str] = None
252
+ is_translate: bool = False
253
+ beam_size: int = 5
254
+ log_prob_threshold: float = -1.0
255
+ no_speech_threshold: float = 0.6
256
+ compute_type: str = "float16"
257
+ best_of: int = 5
258
+ patience: float = 1.0
259
+ condition_on_previous_text: bool = True
260
+ prompt_reset_on_temperature: float = 0.5
261
+ initial_prompt: Optional[str] = None
262
+ temperature: float = 0.0
263
+ compression_ratio_threshold: float = 2.4
264
+ vad_filter: bool = False
265
+ threshold: float = 0.5
266
+ min_speech_duration_ms: int = 250
267
+ max_speech_duration_s: float = float("inf")
268
+ min_silence_duration_ms: int = 2000
269
+ speech_pad_ms: int = 400
270
+ batch_size: int = 24
271
+ is_diarize: bool = False
272
+ hf_token: str = ""
273
+ diarization_device: str = "cuda"
274
+ length_penalty: float = 1.0
275
+ repetition_penalty: float = 1.0
276
+ no_repeat_ngram_size: int = 0.0
277
+ prefix: Optional[str] = None
278
+ suppress_blank: bool = True
279
+ suppress_tokens: Optional[str] = "[-1]"
280
+ max_initial_timestamp: float = 0.0
281
+ word_timestamps: bool = False
282
+ prepend_punctuations: Optional[str] = "\"'“¿([{-"
283
+ append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
284
+ max_new_tokens: Optional[int] = None
285
+ chunk_length: Optional[int] = 30
286
+ hallucination_silence_threshold: Optional[float] = None
287
+ hotwords: Optional[str] = None
288
+ language_detection_threshold: Optional[float] = None
289
+ language_detection_segments: int = 1
290
+ is_bgm_separate: bool = False
291
+ uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
292
+ uvr_device: str = "cuda"
293
+ uvr_segment_size: int = 256
294
+ uvr_save_file: bool = False
 
295
  """
296
  A data class to use Whisper parameters.
297
  """
 
312
  "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
313
  "temperature": self.temperature,
314
  "compression_ratio_threshold": self.compression_ratio_threshold,
 
315
  "batch_size": self.batch_size,
316
  "length_penalty": self.length_penalty,
317
  "repetition_penalty": self.repetition_penalty,