import enum

from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict

from speaches.server_models import ResponseFormat

SAMPLES_PER_SECOND = 16000
BYTES_PER_SAMPLE = 2
BYTES_PER_SECOND = SAMPLES_PER_SECOND * BYTES_PER_SAMPLE
# 2 BYTES = 16 BITS = 1 SAMPLE
# 1 SECOND OF AUDIO = 32000 BYTES = 16000 SAMPLES
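#
# For example (illustrative only, not used by this module): the duration of a
# raw PCM chunk follows directly from its byte length.
#
#   chunk = bytes(2 * BYTES_PER_SECOND)        # 64000 bytes of 16 kHz s16le audio
#   duration = len(chunk) / BYTES_PER_SECOND   # -> 2.0 seconds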


# https://huggingface.co/Systran
class Model(enum.StrEnum):
    TINY_EN = "tiny.en"
    TINY = "tiny"
    BASE_EN = "base.en"
    BASE = "base"
    SMALL_EN = "small.en"
    SMALL = "small"
    MEDIUM_EN = "medium.en"
    MEDIUM = "medium"
    LARGE = "large"
    LARGE_V1 = "large-v1"
    LARGE_V2 = "large-v2"
    LARGE_V3 = "large-v3"
    DISTIL_SMALL_EN = "distil-small.en"
    DISTIL_MEDIUM_EN = "distil-medium.en"
    DISTIL_LARGE_V2 = "distil-large-v2"
    DISTIL_LARGE_V3 = "distil-large-v3"
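

# Illustrative mapping (the repo naming is an assumption, not defined in this
# module): these values correspond to Systran's CTranslate2 conversions on the
# Hugging Face Hub, e.g.
#
#   Model.TINY_EN          -> "Systran/faster-whisper-tiny.en"
#   Model.DISTIL_LARGE_V2  -> "Systran/faster-distil-whisper-large-v2"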


class Device(enum.StrEnum):
    CPU = "cpu"
    CUDA = "cuda"
    AUTO = "auto"


# https://github.com/OpenNMT/CTranslate2/blob/master/docs/quantization.md
# NOTE: `Precision` might be a better name
class Quantization(enum.StrEnum):
    INT8 = "int8"
    INT8_FLOAT16 = "int8_float16"
    INT8_BFLOAT16 = "int8_bfloat16"
    INT8_FLOAT32 = "int8_float32"
    INT16 = "int16"
    FLOAT16 = "float16"
    BFLOAT16 = "bfloat16"
    FLOAT32 = "float32"
    DEFAULT = "default"
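

# Example (illustrative; assumes the faster-whisper API these values are
# ultimately passed to as `compute_type`):
#
#   from faster_whisper import WhisperModel
#
#   model = WhisperModel("distil-medium.en", device="auto", compute_type="int8")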


class Language(enum.StrEnum):
    AF = "af"
    AM = "am"
    AR = "ar"
    AS = "as"
    AZ = "az"
    BA = "ba"
    BE = "be"
    BG = "bg"
    BN = "bn"
    BO = "bo"
    BR = "br"
    BS = "bs"
    CA = "ca"
    CS = "cs"
    CY = "cy"
    DA = "da"
    DE = "de"
    EL = "el"
    EN = "en"
    ES = "es"
    ET = "et"
    EU = "eu"
    FA = "fa"
    FI = "fi"
    FO = "fo"
    FR = "fr"
    GL = "gl"
    GU = "gu"
    HA = "ha"
    HAW = "haw"
    HE = "he"
    HI = "hi"
    HR = "hr"
    HT = "ht"
    HU = "hu"
    HY = "hy"
    ID = "id"
    IS = "is"
    IT = "it"
    JA = "ja"
    JW = "jw"
    KA = "ka"
    KK = "kk"
    KM = "km"
    KN = "kn"
    KO = "ko"
    LA = "la"
    LB = "lb"
    LN = "ln"
    LO = "lo"
    LT = "lt"
    LV = "lv"
    MG = "mg"
    MI = "mi"
    MK = "mk"
    ML = "ml"
    MN = "mn"
    MR = "mr"
    MS = "ms"
    MT = "mt"
    MY = "my"
    NE = "ne"
    NL = "nl"
    NN = "nn"
    NO = "no"
    OC = "oc"
    PA = "pa"
    PL = "pl"
    PS = "ps"
    PT = "pt"
    RO = "ro"
    RU = "ru"
    SA = "sa"
    SD = "sd"
    SI = "si"
    SK = "sk"
    SL = "sl"
    SN = "sn"
    SO = "so"
    SQ = "sq"
    SR = "sr"
    SU = "su"
    SV = "sv"
    SW = "sw"
    TA = "ta"
    TE = "te"
    TG = "tg"
    TH = "th"
    TK = "tk"
    TL = "tl"
    TR = "tr"
    TT = "tt"
    UK = "uk"
    UR = "ur"
    UZ = "uz"
    VI = "vi"
    YI = "yi"
    YO = "yo"
    YUE = "yue"
    ZH = "zh"


class WhisperConfig(BaseModel):
    model: Model = Field(default=Model.DISTIL_MEDIUM_EN)  # ENV: WHISPER_MODEL
    inference_device: Device = Field(
        default=Device.AUTO
    )  # ENV: WHISPER_INFERENCE_DEVICE
    compute_type: Quantization = Field(
        default=Quantization.DEFAULT
    )  # ENV: WHISPER_COMPUTE_TYPE


class Config(BaseSettings):
    model_config = SettingsConfigDict(env_nested_delimiter="_")

    log_level: str = "info"  # ENV: LOG_LEVEL
    default_language: Language | None = None  # ENV: DEFAULT_LANGUAGE
    default_response_format: ResponseFormat = (
        ResponseFormat.JSON
    )  # ENV: DEFAULT_RESPONSE_FORMAT
    whisper: WhisperConfig = WhisperConfig()  # ENV: WHISPER_*
    """
    Max duration to wait for the next audio chunk before transcription is finalized and the connection is closed.
    """
    max_no_data_seconds: float = 1.0  # ENV: MAX_NO_DATA_SECONDS
    min_duration: float = 1.0  # ENV: MIN_DURATION
    word_timestamp_error_margin: float = 0.2  # ENV: WORD_TIMESTAMP_ERROR_MARGIN
    """
    Max allowed audio duration without any speech being detected before transcription is finalized and the connection is closed.
    """
    max_inactivity_seconds: float = 2.0  # ENV: MAX_INACTIVITY_SECONDS
    """
    Controls how many of the latest seconds of audio are passed through VAD.
    Should be greater than `max_inactivity_seconds`.
    """
    inactivity_window_seconds: float = 3.0  # ENV: INACTIVITY_WINDOW_SECONDS


config = Config()
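
# Example (illustrative): with `env_nested_delimiter="_"`, nested fields are set
# via the environment variables named in the `ENV:` comments above, e.g.
#
#   import os
#   os.environ["WHISPER_MODEL"] = "large-v3"
#   os.environ["WHISPER_COMPUTE_TYPE"] = "float16"
#   assert Config().whisper.model == Model.LARGE_V3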