Fedir Zadniprovskyi committed
Commit e01d72d · 1 Parent(s): 48ce933

feat: add streaming responses

Files changed (2)
  1. speaches/main.py +61 -24
  2. speaches/server_models.py +16 -0
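In short: the commit adds an optional `stream` form field to both the translation and transcription endpoints. When it is set, segments are sent to the client one at a time through a `StreamingResponse` as they are produced, instead of being collected into a single response once transcription finishes; `server_models.py` gains a `from_segment` constructor so each streamed chunk can carry the full verbose-JSON envelope. A usage sketch follows each file's diff below.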
speaches/main.py CHANGED
@@ -9,6 +9,7 @@ from typing import Annotated, Literal
 
 from fastapi import (FastAPI, Form, Query, Response, UploadFile, WebSocket,
                      WebSocketDisconnect)
+from fastapi.responses import StreamingResponse
 from fastapi.websockets import WebSocketState
 from faster_whisper import WhisperModel
 from faster_whisper.vad import VadOptions, get_speech_timestamps
@@ -56,6 +57,7 @@ async def translate_file(
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = ResponseFormat.JSON,
     temperature: Annotated[float, Form()] = 0.0,
+    stream: Annotated[bool, Form()] = False,
 ):
     assert (
         model == config.whisper.model
@@ -68,19 +70,36 @@ async def translate_file(
         temperature=temperature,
         vad_filter=True,
     )
-    segments = list(segments)
-    end = time.perf_counter()
-    logger.info(
-        f"Translated {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {end - start:.2f} seconds"
-    )
-    if response_format == ResponseFormat.TEXT:
-        return utils.segments_text(segments)
-    elif response_format == ResponseFormat.JSON:
-        return TranscriptionJsonResponse.from_segments(segments)
-    elif response_format == ResponseFormat.VERBOSE_JSON:
-        return TranscriptionVerboseJsonResponse.from_segments(
-            segments, transcription_info
+
+    def segment_responses():
+        for segment in segments:
+            if response_format == ResponseFormat.TEXT:
+                yield segment.text
+            elif response_format == ResponseFormat.JSON:
+                yield TranscriptionJsonResponse.from_segments(
+                    [segment]
+                ).model_dump_json()
+            elif response_format == ResponseFormat.VERBOSE_JSON:
+                yield TranscriptionVerboseJsonResponse.from_segment(
+                    segment, transcription_info
+                ).model_dump_json()
+
+    if not stream:
+        segments = list(segments)
+        end = time.perf_counter()
+        logger.info(
+            f"Translated {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {end - start:.2f} seconds"
         )
+        if response_format == ResponseFormat.TEXT:
+            return utils.segments_text(segments)
+        elif response_format == ResponseFormat.JSON:
+            return TranscriptionJsonResponse.from_segments(segments)
+        elif response_format == ResponseFormat.VERBOSE_JSON:
+            return TranscriptionVerboseJsonResponse.from_segments(
+                segments, transcription_info
+            )
+    else:
+        return StreamingResponse(segment_responses(), media_type="text/event-stream")
 
 
 # https://platform.openai.com/docs/api-reference/audio/createTranscription
@@ -97,6 +116,7 @@ async def transcribe_file(
         list[Literal["segments"] | Literal["words"]],
         Form(alias="timestamp_granularities[]"),
     ] = ["segments"],
+    stream: Annotated[bool, Form()] = False,
 ):
     assert (
         model == config.whisper.model
@@ -111,19 +131,36 @@ async def transcribe_file(
         temperature=temperature,
         vad_filter=True,
     )
-    segments = list(segments)
-    end = time.perf_counter()
-    logger.info(
-        f"Transcribed {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {end - start:.2f} seconds"
-    )
-    if response_format == ResponseFormat.TEXT:
-        return utils.segments_text(segments)
-    elif response_format == ResponseFormat.JSON:
-        return TranscriptionJsonResponse.from_segments(segments)
-    elif response_format == ResponseFormat.VERBOSE_JSON:
-        return TranscriptionVerboseJsonResponse.from_segments(
-            segments, transcription_info
+
+    def segment_responses():
+        for segment in segments:
+            if response_format == ResponseFormat.TEXT:
+                yield segment.text
+            elif response_format == ResponseFormat.JSON:
+                yield TranscriptionJsonResponse.from_segments(
+                    [segment]
+                ).model_dump_json()
+            elif response_format == ResponseFormat.VERBOSE_JSON:
+                yield TranscriptionVerboseJsonResponse.from_segment(
+                    segment, transcription_info
+                ).model_dump_json()
+
+    if not stream:
+        segments = list(segments)
+        end = time.perf_counter()
+        logger.info(
+            f"Transcribed {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {end - start:.2f} seconds"
         )
+        if response_format == ResponseFormat.TEXT:
+            return utils.segments_text(segments)
+        elif response_format == ResponseFormat.JSON:
+            return TranscriptionJsonResponse.from_segments(segments)
+        elif response_format == ResponseFormat.VERBOSE_JSON:
+            return TranscriptionVerboseJsonResponse.from_segments(
+                segments, transcription_info
+            )
+    else:
+        return StreamingResponse(segment_responses(), media_type="text/event-stream")
 
 
 async def audio_receiver(ws: WebSocket, audio_stream: AudioStream) -> None:
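A minimal sketch of calling the new streaming mode from a client. The endpoint path, the local server address, and the "file" upload field name are assumptions for illustration (the route decorators are not part of this diff; the path follows the OpenAI API the code comment references); only the stream and response_format form fields come from the diff above.

# Hypothetical client for the streaming endpoint -- a sketch, not the
# project's own client.
import httpx

with open("audio.wav", "rb") as audio_file:
    with httpx.stream(
        "POST",
        "http://localhost:8000/v1/audio/transcriptions",  # assumed route
        files={"file": audio_file},  # assumed field name
        data={"response_format": "text", "stream": "true"},
        timeout=None,  # transcription can outlast the default timeout
    ) as response:
        # With stream=true the body is a sequence of segment payloads
        # written as they are transcribed, so print them incrementally.
        for chunk in response.iter_text():
            print(chunk, end="", flush=True)

Note that the generator yields bare strings rather than data:-framed events, so despite the text/event-stream media type a client should read the body as a plain chunked stream, as above, rather than with an SSE parser.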
speaches/server_models.py CHANGED
@@ -86,6 +86,22 @@ class TranscriptionVerboseJsonResponse(BaseModel):
     words: list[WordObject]
     segments: list[SegmentObject]
 
+    @classmethod
+    def from_segment(
+        cls, segment: Segment, transcription_info: TranscriptionInfo
+    ) -> TranscriptionVerboseJsonResponse:
+        return cls(
+            language=transcription_info.language,
+            duration=segment.end - segment.start,
+            text=segment.text,
+            words=(
+                [WordObject.from_word(word) for word in segment.words]
+                if type(segment.words) == list
+                else []
+            ),
+            segments=[SegmentObject.from_segment(segment)],
+        )
+
     @classmethod
     def from_segments(
         cls, segments: list[Segment], transcription_info: TranscriptionInfo
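The new from_segment constructor wraps a single segment in the same envelope that from_segments builds for a whole transcript, so every streamed verbose_json chunk is a complete, independently parseable JSON document. A sketch of handling one such chunk, reading only the fields assigned in the constructor above (the handle_chunk helper itself is hypothetical):

import json

def handle_chunk(chunk: str) -> None:
    # `chunk` is one string yielded by the stream, i.e. the output of
    # TranscriptionVerboseJsonResponse.from_segment(...).model_dump_json().
    payload = json.loads(chunk)
    # duration is end - start for this segment alone, not the whole file,
    # and segments always holds exactly one entry.
    print(f"[{payload['language']}, {payload['duration']:.2f}s] {payload['text']}")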