Spaces:
Configuration error
Configuration error
import enum | |
from pydantic import BaseModel, Field | |
from pydantic_settings import BaseSettings, SettingsConfigDict | |
from speaches.server_models import ResponseFormat | |
SAMPLES_PER_SECOND = 16000 | |
BYTES_PER_SAMPLE = 2 | |
BYTES_PER_SECOND = SAMPLES_PER_SECOND * BYTES_PER_SAMPLE | |
# 2 BYTES = 16 BITS = 1 SAMPLE | |
# 1 SECOND OF AUDIO = 32000 BYTES = 16000 SAMPLES | |
# https://huggingface.co/Systran | |
class Model(enum.StrEnum): | |
TINY_EN = "tiny.en" | |
TINY = "tiny" | |
BASE_EN = "base.en" | |
BASE = "base" | |
SMALL_EN = "small.en" | |
SMALL = "small" | |
MEDIUM_EN = "medium.en" | |
MEDIUM = "medium" | |
LARGE = "large" | |
LARGE_V1 = "large-v1" | |
LARGE_V2 = "large-v2" | |
LARGE_V3 = "large-v3" | |
DISTIL_SMALL_EN = "distil-small.en" | |
DISTIL_MEDIUM_EN = "distil-medium.en" | |
DISTIL_LARGE_V2 = "distil-large-v2" | |
DISTIL_LARGE_V3 = "distil-large-v3" | |
class Device(enum.StrEnum): | |
CPU = "cpu" | |
CUDA = "cuda" | |
AUTO = "auto" | |
# https://github.com/OpenNMT/CTranslate2/blob/master/docs/quantization.md | |
# NOTE: `Precision` might be a better name | |
class Quantization(enum.StrEnum): | |
INT8 = "int8" | |
INT8_FLOAT16 = "int8_float16" | |
INT8_BFLOAT16 = "int8_bfloat16" | |
INT8_FLOAT32 = "int8_float32" | |
INT16 = "int16" | |
FLOAT16 = "float16" | |
BFLOAT16 = "bfloat16" | |
FLOAT32 = "float32" | |
DEFAULT = "default" | |
class Language(enum.StrEnum): | |
AF = "af" | |
AM = "am" | |
AR = "ar" | |
AS = "as" | |
AZ = "az" | |
BA = "ba" | |
BE = "be" | |
BG = "bg" | |
BN = "bn" | |
BO = "bo" | |
BR = "br" | |
BS = "bs" | |
CA = "ca" | |
CS = "cs" | |
CY = "cy" | |
DA = "da" | |
DE = "de" | |
EL = "el" | |
EN = "en" | |
ES = "es" | |
ET = "et" | |
EU = "eu" | |
FA = "fa" | |
FI = "fi" | |
FO = "fo" | |
FR = "fr" | |
GL = "gl" | |
GU = "gu" | |
HA = "ha" | |
HAW = "haw" | |
HE = "he" | |
HI = "hi" | |
HR = "hr" | |
HT = "ht" | |
HU = "hu" | |
HY = "hy" | |
ID = "id" | |
IS = "is" | |
IT = "it" | |
JA = "ja" | |
JW = "jw" | |
KA = "ka" | |
KK = "kk" | |
KM = "km" | |
KN = "kn" | |
KO = "ko" | |
LA = "la" | |
LB = "lb" | |
LN = "ln" | |
LO = "lo" | |
LT = "lt" | |
LV = "lv" | |
MG = "mg" | |
MI = "mi" | |
MK = "mk" | |
ML = "ml" | |
MN = "mn" | |
MR = "mr" | |
MS = "ms" | |
MT = "mt" | |
MY = "my" | |
NE = "ne" | |
NL = "nl" | |
NN = "nn" | |
NO = "no" | |
OC = "oc" | |
PA = "pa" | |
PL = "pl" | |
PS = "ps" | |
PT = "pt" | |
RO = "ro" | |
RU = "ru" | |
SA = "sa" | |
SD = "sd" | |
SI = "si" | |
SK = "sk" | |
SL = "sl" | |
SN = "sn" | |
SO = "so" | |
SQ = "sq" | |
SR = "sr" | |
SU = "su" | |
SV = "sv" | |
SW = "sw" | |
TA = "ta" | |
TE = "te" | |
TG = "tg" | |
TH = "th" | |
TK = "tk" | |
TL = "tl" | |
TR = "tr" | |
TT = "tt" | |
UK = "uk" | |
UR = "ur" | |
UZ = "uz" | |
VI = "vi" | |
YI = "yi" | |
YO = "yo" | |
YUE = "yue" | |
ZH = "zh" | |
class WhisperConfig(BaseModel): | |
model: Model = Field(default=Model.DISTIL_MEDIUM_EN) # ENV: WHISPER_MODEL | |
inference_device: Device = Field( | |
default=Device.AUTO | |
) # ENV: WHISPER_INFERENCE_DEVICE | |
compute_type: Quantization = Field( | |
default=Quantization.DEFAULT | |
) # ENV: WHISPER_COMPUTE_TYPE | |
class Config(BaseSettings): | |
model_config = SettingsConfigDict(env_nested_delimiter="_") | |
log_level: str = "info" # ENV: LOG_LEVEL | |
default_language: Language | None = None # ENV: DEFAULT_LANGUAGE | |
default_response_format: ResponseFormat = ( | |
ResponseFormat.JSON | |
) # ENV: DEFAULT_RESPONSE_FORMAT | |
whisper: WhisperConfig = WhisperConfig() # ENV: WHISPER_* | |
""" | |
Max duration to for the next audio chunk before transcription is finilized and connection is closed. | |
""" | |
max_no_data_seconds: float = 1.0 # ENV: MAX_NO_DATA_SECONDS | |
min_duration: float = 1.0 # ENV: MIN_DURATION | |
word_timestamp_error_margin: float = 0.2 # ENV: WORD_TIMESTAMP_ERROR_MARGIN | |
""" | |
Max allowed audio duration without any speech being detected before transcription is finilized and connection is closed. | |
""" | |
max_inactivity_seconds: float = 2.0 # ENV: MAX_INACTIVITY_SECONDS | |
""" | |
Controls how many latest seconds of audio are being passed through VAD. | |
Should be greater than `max_inactivity_seconds` | |
""" | |
inactivity_window_seconds: float = 3.0 # ENV: INACTIVITY_WINDOW_SECONDS | |
config = Config() | |