Fedir Zadniprovskyi committed

Commit db7bf9a · 1 Parent(s): af21424

feat: improve openai compatibility
Files changed (2):
  1. speaches/main.py          +9 -18
  2. speaches/server_models.py +40 -8
speaches/main.py CHANGED

@@ -7,14 +7,8 @@ from contextlib import asynccontextmanager
 from io import BytesIO
 from typing import Annotated
 
-from fastapi import (
-    Depends,
-    FastAPI,
-    Response,
-    UploadFile,
-    WebSocket,
-    WebSocketDisconnect,
-)
+from fastapi import (Depends, FastAPI, Response, UploadFile, WebSocket,
+                     WebSocketDisconnect)
 from fastapi.websockets import WebSocketState
 from faster_whisper import WhisperModel
 from faster_whisper.vad import VadOptions, get_speech_timestamps
@@ -24,11 +18,8 @@ from speaches.audio import AudioStream, audio_samples_from_file
 from speaches.config import SAMPLES_PER_SECOND, Language, config
 from speaches.core import Transcription
 from speaches.logger import logger
-from speaches.server_models import (
-    ResponseFormat,
-    TranscriptionResponse,
-    TranscriptionVerboseResponse,
-)
+from speaches.server_models import (ResponseFormat, TranscriptionJsonResponse,
+                                    TranscriptionVerboseJsonResponse)
 from speaches.transcriber import audio_transcriber
 
 whisper: WhisperModel = None  # type: ignore
@@ -132,12 +123,12 @@ def format_transcription(
     if response_format == ResponseFormat.TEXT:
         return transcription.text
     elif response_format == ResponseFormat.JSON:
-        return TranscriptionResponse(text=transcription.text).model_dump_json()
+        return TranscriptionJsonResponse.from_transcription(
+            transcription
+        ).model_dump_json()
     elif response_format == ResponseFormat.VERBOSE_JSON:
-        return TranscriptionVerboseResponse(
-            duration=transcription.duration,
-            text=transcription.text,
-            words=transcription.words,
+        return TranscriptionVerboseJsonResponse.from_transcription(
+            transcription
         ).model_dump_json()
 
 
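Note: the net effect in main.py is that format_transcription no longer assembles response payloads field by field; each branch hands the Transcription to the matching response model. A minimal runnable sketch of that dispatch, with Transcription stubbed out (the real class lives in speaches.core; only the fields used in this diff are assumed):

import enum
from dataclasses import dataclass

from pydantic import BaseModel


class ResponseFormat(enum.StrEnum):
    TEXT = "text"
    JSON = "json"


@dataclass
class Transcription:  # stub for speaches.core.Transcription; fields assumed from the diff
    text: str


class TranscriptionJsonResponse(BaseModel):
    text: str

    @classmethod
    def from_transcription(cls, transcription: Transcription) -> "TranscriptionJsonResponse":
        return cls(text=transcription.text)


def format_transcription(transcription: Transcription, response_format: ResponseFormat) -> str:
    # Each format delegates serialization to its response model.
    if response_format == ResponseFormat.TEXT:
        return transcription.text
    return TranscriptionJsonResponse.from_transcription(transcription).model_dump_json()


print(format_transcription(Transcription(text="hello"), ResponseFormat.JSON))
# {"text":"hello"}
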
speaches/server_models.py CHANGED

@@ -1,26 +1,58 @@
+from __future__ import annotations
+
 import enum
 
+from faster_whisper.transcribe import Segment, Word
 from pydantic import BaseModel
 
-from speaches.core import Word
+from speaches.core import Transcription
 
 
+# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-response_format
 class ResponseFormat(enum.StrEnum):
-    JSON = "json"
     TEXT = "text"
+    JSON = "json"
     VERBOSE_JSON = "verbose_json"
+    # VTT = "vtt"
+    # SRT = "srt"
 
 
 # https://platform.openai.com/docs/api-reference/audio/json-object
-class TranscriptionResponse(BaseModel):
+class TranscriptionJsonResponse(BaseModel):
     text: str
 
+    @classmethod
+    def from_transcription(
+        cls, transcription: Transcription
+    ) -> TranscriptionJsonResponse:
+        return cls(text=transcription.text)
+
 
-# Subset of https://platform.openai.com/docs/api-reference/audio/verbose-json-object
-class TranscriptionVerboseResponse(BaseModel):
+# https://platform.openai.com/docs/api-reference/audio/verbose-json-object
+class TranscriptionVerboseJsonResponse(BaseModel):
     task: str = "transcribe"
+    language: str
     duration: float
     text: str
-    words: list[
-        Word
-    ]  # Different from OpenAI's `words`. `Word.text` instead of `Word.word`
+    words: list[Word]
+    segments: list[Segment]
+
+    @classmethod
+    def from_transcription(
+        cls, transcription: Transcription
+    ) -> TranscriptionVerboseJsonResponse:
+        return cls(
+            language="english",  # FIX: hardcoded
+            duration=transcription.duration,
+            text=transcription.text,
+            words=[
+                Word(
+                    start=word.start,
+                    end=word.end,
+                    word=word.text,
+                    probability=word.probability,
+                )
+                for word in transcription.words
+            ],
+            segments=[],  # FIX: hardcoded
+        )
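
Note: the removed comment on words flagged that speaches.core.Word exposes the token as .text, while OpenAI's verbose_json schema (and faster_whisper's Word) call it .word; the new from_transcription performs that rename explicitly. A small sketch of just that conversion, with speaches.core.Word stubbed from the fields visible in this diff:

from dataclasses import dataclass

from faster_whisper.transcribe import Word  # fields: start, end, word, probability (as constructed above)


@dataclass
class CoreWord:  # stand-in for speaches.core.Word; fields assumed from the diff
    start: float
    end: float
    text: str
    probability: float


core = CoreWord(start=0.0, end=0.4, text="hello", probability=0.98)
# `.text` on the core model maps onto `.word` in the OpenAI-style schema.
converted = Word(start=core.start, end=core.end, word=core.text, probability=core.probability)
print(converted)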