Fedir Zadniprovskyi committed
Commit e01d72d · 1 parent: 48ce933

feat: add streaming responses

Files changed:
- speaches/main.py +61 -24
- speaches/server_models.py +16 -0
speaches/main.py CHANGED

@@ -9,6 +9,7 @@ from typing import Annotated, Literal
 
 from fastapi import (FastAPI, Form, Query, Response, UploadFile, WebSocket,
                      WebSocketDisconnect)
+from fastapi.responses import StreamingResponse
 from fastapi.websockets import WebSocketState
 from faster_whisper import WhisperModel
 from faster_whisper.vad import VadOptions, get_speech_timestamps
@@ -56,6 +57,7 @@ async def translate_file(
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = ResponseFormat.JSON,
     temperature: Annotated[float, Form()] = 0.0,
+    stream: Annotated[bool, Form()] = False,
 ):
     assert (
         model == config.whisper.model
@@ -68,19 +70,36 @@ async def translate_file(
         temperature=temperature,
         vad_filter=True,
     )
-    segments = list(segments)
-    end = time.perf_counter()
-    logger.info(
-        f"Translated {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {end - start:.2f} seconds"
-    )
-    if response_format == ResponseFormat.TEXT:
-        return utils.segments_text(segments)
-    elif response_format == ResponseFormat.JSON:
-        return TranscriptionJsonResponse.from_segments(segments)
-    elif response_format == ResponseFormat.VERBOSE_JSON:
-        return TranscriptionVerboseJsonResponse.from_segments(
-            segments, transcription_info
+
+    def segment_responses():
+        for segment in segments:
+            if response_format == ResponseFormat.TEXT:
+                yield segment.text
+            elif response_format == ResponseFormat.JSON:
+                yield TranscriptionJsonResponse.from_segments(
+                    [segment]
+                ).model_dump_json()
+            elif response_format == ResponseFormat.VERBOSE_JSON:
+                yield TranscriptionVerboseJsonResponse.from_segment(
+                    segment, transcription_info
+                ).model_dump_json()
+
+    if not stream:
+        segments = list(segments)
+        end = time.perf_counter()
+        logger.info(
+            f"Translated {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {end - start:.2f} seconds"
         )
+        if response_format == ResponseFormat.TEXT:
+            return utils.segments_text(segments)
+        elif response_format == ResponseFormat.JSON:
+            return TranscriptionJsonResponse.from_segments(segments)
+        elif response_format == ResponseFormat.VERBOSE_JSON:
+            return TranscriptionVerboseJsonResponse.from_segments(
+                segments, transcription_info
+            )
+    else:
+        return StreamingResponse(segment_responses(), media_type="text/event-stream")
 
 
 # https://platform.openai.com/docs/api-reference/audio/createTranscription
@@ -97,6 +116,7 @@ async def transcribe_file(
         list[Literal["segments"] | Literal["words"]],
         Form(alias="timestamp_granularities[]"),
     ] = ["segments"],
+    stream: Annotated[bool, Form()] = False,
 ):
     assert (
         model == config.whisper.model
@@ -111,19 +131,36 @@ async def transcribe_file(
         temperature=temperature,
         vad_filter=True,
     )
-    segments = list(segments)
-    end = time.perf_counter()
-    logger.info(
-        f"Transcribed {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {end - start:.2f} seconds"
-    )
-    if response_format == ResponseFormat.TEXT:
-        return utils.segments_text(segments)
-    elif response_format == ResponseFormat.JSON:
-        return TranscriptionJsonResponse.from_segments(segments)
-    elif response_format == ResponseFormat.VERBOSE_JSON:
-        return TranscriptionVerboseJsonResponse.from_segments(
-            segments, transcription_info
+
+    def segment_responses():
+        for segment in segments:
+            if response_format == ResponseFormat.TEXT:
+                yield segment.text
+            elif response_format == ResponseFormat.JSON:
+                yield TranscriptionJsonResponse.from_segments(
+                    [segment]
+                ).model_dump_json()
+            elif response_format == ResponseFormat.VERBOSE_JSON:
+                yield TranscriptionVerboseJsonResponse.from_segment(
+                    segment, transcription_info
+                ).model_dump_json()
+
+    if not stream:
+        segments = list(segments)
+        end = time.perf_counter()
+        logger.info(
+            f"Transcribed {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {end - start:.2f} seconds"
         )
+        if response_format == ResponseFormat.TEXT:
+            return utils.segments_text(segments)
+        elif response_format == ResponseFormat.JSON:
+            return TranscriptionJsonResponse.from_segments(segments)
+        elif response_format == ResponseFormat.VERBOSE_JSON:
+            return TranscriptionVerboseJsonResponse.from_segments(
+                segments, transcription_info
+            )
+    else:
+        return StreamingResponse(segment_responses(), media_type="text/event-stream")
 
 
 async def audio_receiver(ws: WebSocket, audio_stream: AudioStream) -> None:
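For an end-to-end picture, here is a minimal client sketch for the new streaming mode. It is not part of the commit, and it leans on assumptions: the host/port and the OpenAI-style /v1/audio/transcriptions route are guesses (the diff shows only the handler bodies; the createTranscription comment suggests the route), and audio.wav is a placeholder file.

# Client sketch for the new streaming mode (assumptions: host, port, and
# route path -- the diff shows only the handler bodies).
import httpx

with open("audio.wav", "rb") as f:
    with httpx.stream(
        "POST",
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={"response_format": "json", "stream": "true"},
        timeout=None,  # transcription can take far longer than the default
    ) as response:
        # With stream=true the handler returns a StreamingResponse, so each
        # chunk is one segment, serialized as soon as it is transcribed.
        for chunk in response.iter_text():
            print(chunk)

One caveat visible in the diff: although the response is labeled media_type="text/event-stream", the generator yields bare text or JSON without SSE "data:" framing, so clients should read raw chunks as above rather than using an SSE parser.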
speaches/server_models.py CHANGED

@@ -86,6 +86,22 @@ class TranscriptionVerboseJsonResponse(BaseModel):
     words: list[WordObject]
     segments: list[SegmentObject]
 
+    @classmethod
+    def from_segment(
+        cls, segment: Segment, transcription_info: TranscriptionInfo
+    ) -> TranscriptionVerboseJsonResponse:
+        return cls(
+            language=transcription_info.language,
+            duration=segment.end - segment.start,
+            text=segment.text,
+            words=(
+                [WordObject.from_word(word) for word in segment.words]
+                if type(segment.words) == list
+                else []
+            ),
+            segments=[SegmentObject.from_segment(segment)],
+        )
+
     @classmethod
     def from_segments(
         cls, segments: list[Segment], transcription_info: TranscriptionInfo
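A note on the new from_segment classmethod: unlike from_segments, which describes a whole transcription, it reports the duration of just its one segment (segment.end - segment.start) and wraps a single SegmentObject, so every streamed chunk is a self-contained verbose response. A sketch of one chunk's shape, showing only the fields from_segment populates, with hypothetical values:

# Approximate shape of one streamed VERBOSE_JSON chunk (values hypothetical;
# the model may serialize additional fields not shown in this diff).
chunk = {
    "language": "en",        # from transcription_info.language
    "duration": 2.5,         # segment.end - segment.start, per chunk
    "text": " Hello world.",
    "words": [],             # empty when no word timestamps are available
    "segments": [...],       # exactly one SegmentObject for this segment
}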