Spaces:
Configuration error
Configuration error
Fedir Zadniprovskyi
committed on
Commit
·
2a79f48
1
Parent(s):
125092f
refactor
Browse files- faster_whisper_server/asr.py +3 -19
- faster_whisper_server/core.py +103 -52
- faster_whisper_server/main.py +5 -3
- faster_whisper_server/server_models.py +11 -65
- faster_whisper_server/transcriber.py +4 -23
- faster_whisper_server/utils.py +0 -14
faster_whisper_server/asr.py
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
import asyncio
|
2 |
-
from collections.abc import Iterable
|
3 |
import time
|
4 |
|
5 |
from faster_whisper import transcribe
|
6 |
|
7 |
from faster_whisper_server.audio import Audio
|
8 |
-
from faster_whisper_server.core import Transcription, Word
|
9 |
from faster_whisper_server.logger import logger
|
10 |
|
11 |
|
@@ -30,7 +29,8 @@ class FasterWhisperASR:
|
|
30 |
word_timestamps=True,
|
31 |
**self.transcribe_opts,
|
32 |
)
|
33 |
-
|
|
|
34 |
for word in words:
|
35 |
word.offset(audio.start)
|
36 |
transcription = Transcription(words)
|
@@ -54,19 +54,3 @@ class FasterWhisperASR:
|
|
54 |
audio,
|
55 |
prompt,
|
56 |
)
|
57 |
-
|
58 |
-
|
59 |
-
def words_from_whisper_segments(segments: Iterable[transcribe.Segment]) -> list[Word]:
|
60 |
-
words: list[Word] = []
|
61 |
-
for segment in segments:
|
62 |
-
assert segment.words is not None
|
63 |
-
words.extend(
|
64 |
-
Word(
|
65 |
-
start=word.start,
|
66 |
-
end=word.end,
|
67 |
-
text=word.word,
|
68 |
-
probability=word.probability,
|
69 |
-
)
|
70 |
-
for word in segment.words
|
71 |
-
)
|
72 |
-
return words
|
|
|
1 |
import asyncio
|
|
|
2 |
import time
|
3 |
|
4 |
from faster_whisper import transcribe
|
5 |
|
6 |
from faster_whisper_server.audio import Audio
|
7 |
+
from faster_whisper_server.core import Segment, Transcription, Word
|
8 |
from faster_whisper_server.logger import logger
|
9 |
|
10 |
|
|
|
29 |
word_timestamps=True,
|
30 |
**self.transcribe_opts,
|
31 |
)
|
32 |
+
segments = Segment.from_faster_whisper_segments(segments)
|
33 |
+
words = Word.from_segments(segments)
|
34 |
for word in words:
|
35 |
word.offset(audio.start)
|
36 |
transcription = Transcription(words)
|
|
|
54 |
audio,
|
55 |
prompt,
|
56 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
faster_whisper_server/core.py
CHANGED
@@ -1,43 +1,85 @@
|
|
1 |
-
# TODO: rename module
|
2 |
from __future__ import annotations
|
3 |
|
4 |
-
from dataclasses import dataclass
|
5 |
import re
|
|
|
|
|
|
|
6 |
|
7 |
from faster_whisper_server.config import config
|
8 |
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
@dataclass
|
12 |
-
class Segment:
|
13 |
-
text: str
|
14 |
-
start: float = 0.0
|
15 |
-
end: float = 0.0
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def offset(self, seconds: float) -> None:
|
24 |
self.start += seconds
|
25 |
self.end += seconds
|
26 |
|
27 |
-
|
28 |
-
# TODO: use the `Word` from `faster-whisper.transcribe` instead
|
29 |
-
@dataclass
|
30 |
-
class Word(Segment):
|
31 |
-
probability: float = 0.0
|
32 |
-
|
33 |
@classmethod
|
34 |
def common_prefix(cls, a: list[Word], b: list[Word]) -> list[Word]:
|
35 |
i = 0
|
36 |
-
while i < len(a) and i < len(b) and canonicalize_word(a[i].
|
37 |
i += 1
|
38 |
return a[:i]
|
39 |
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
class Transcription:
|
42 |
def __init__(self, words: list[Word] = []) -> None:
|
43 |
self.words: list[Word] = []
|
@@ -45,7 +87,7 @@ class Transcription:
|
|
45 |
|
46 |
@property
|
47 |
def text(self) -> str:
|
48 |
-
return " ".join(word.
|
49 |
|
50 |
@property
|
51 |
def start(self) -> float:
|
@@ -77,48 +119,57 @@ class Transcription:
|
|
77 |
raise ValueError(f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}")
|
78 |
|
79 |
|
80 |
-
def
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
assert not
|
88 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
|
91 |
-
def to_full_sentences(words: list[Word]) -> list[
|
92 |
-
sentences: list[
|
93 |
for word in words:
|
94 |
-
sentences[-1]
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
)
|
99 |
-
if word.is_eos:
|
100 |
-
sentences.append(Segment(""))
|
101 |
-
if len(sentences) > 0 and not sentences[-1].is_eos:
|
102 |
sentences.pop()
|
103 |
return sentences
|
104 |
|
105 |
|
106 |
def tests_to_full_sentences() -> None:
|
|
|
|
|
|
|
107 |
assert to_full_sentences([]) == []
|
108 |
-
assert to_full_sentences([
|
109 |
-
assert to_full_sentences([
|
110 |
-
assert to_full_sentences([
|
111 |
-
assert to_full_sentences([
|
112 |
-
|
113 |
]
|
114 |
|
115 |
|
116 |
-
def
|
117 |
-
return "".join(word.
|
|
|
|
|
|
|
|
|
118 |
|
119 |
|
120 |
-
def
|
121 |
-
return "".join(
|
122 |
|
123 |
|
124 |
def canonicalize_word(text: str) -> str:
|
@@ -136,14 +187,14 @@ def test_canonicalize_word() -> None:
|
|
136 |
|
137 |
def common_prefix(a: list[Word], b: list[Word]) -> list[Word]:
|
138 |
i = 0
|
139 |
-
while i < len(a) and i < len(b) and canonicalize_word(a[i].
|
140 |
i += 1
|
141 |
return a[:i]
|
142 |
|
143 |
|
144 |
def test_common_prefix() -> None:
|
145 |
def word(text: str) -> Word:
|
146 |
-
return Word(
|
147 |
|
148 |
a = [word("a"), word("b"), word("c")]
|
149 |
b = [word("a"), word("b"), word("c")]
|
@@ -176,7 +227,7 @@ def test_common_prefix() -> None:
|
|
176 |
|
177 |
def test_common_prefix_and_canonicalization() -> None:
|
178 |
def word(text: str) -> Word:
|
179 |
-
return Word(
|
180 |
|
181 |
a = [word("A...")]
|
182 |
b = [word("a?"), word("b"), word("c")]
|
|
|
|
|
1 |
from __future__ import annotations
|
2 |
|
|
|
3 |
import re
|
4 |
+
from typing import TYPE_CHECKING
|
5 |
+
|
6 |
+
from pydantic import BaseModel
|
7 |
|
8 |
from faster_whisper_server.config import config
|
9 |
|
10 |
+
if TYPE_CHECKING:
|
11 |
+
from collections.abc import Iterable
|
12 |
|
13 |
+
import faster_whisper.transcribe
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
|
16 |
+
class Word(BaseModel):
    """A single transcribed word together with its timing and probability."""

    # Start of the word; `offset` shifts it by a number of seconds.
    start: float
    # End of the word; `offset` shifts it by a number of seconds.
    end: float
    # Text of the word exactly as copied from the source word object.
    word: str
    # Probability value copied from the source word object.
    probability: float

    @classmethod
    def from_segments(cls, segments: Iterable[Segment]) -> list[Word]:
        """Flatten the words of every segment into a single list, in order.

        Segments whose ``words`` is ``None`` contribute nothing. This used to
        be an ``assert``, which is stripped under ``python -O`` and otherwise
        turned a segment without word-level data into a crash.

        Args:
            segments: Segments to collect words from; iterated exactly once.

        Returns:
            All words of all segments, in segment order.
        """
        words: list[Word] = []
        for segment in segments:
            if segment.words is None:
                continue
            words.extend(segment.words)
        return words

    def offset(self, seconds: float) -> None:
        """Shift both ``start`` and ``end`` by ``seconds``, in place."""
        self.start += seconds
        self.end += seconds

    @classmethod
    def common_prefix(cls, a: list[Word], b: list[Word]) -> list[Word]:
        """Return the leading words of ``a`` that match ``b`` position by position.

        Two words match when their ``word`` texts are equal after
        ``canonicalize_word``. The result is a slice of ``a``.
        """
        i = 0
        while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
            i += 1
        return a[:i]
|
40 |
|
41 |
|
42 |
+
class Segment(BaseModel):
    """Pydantic copy of a `faster_whisper.transcribe.Segment`.

    Every field is copied one-to-one by `from_faster_whisper_segments`.
    """

    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float
    # `None` when the source segment carries no word-level data.
    words: list[Word] | None

    @classmethod
    def from_faster_whisper_segments(cls, segments: Iterable[faster_whisper.transcribe.Segment]) -> Iterable[Segment]:
        """Lazily convert faster-whisper segments into `Segment` models.

        This is a generator: nothing is converted until the result is
        iterated, and the result can be iterated only once.
        """
        for source in segments:
            if source.words is None:
                converted_words = None
            else:
                converted_words = [
                    Word(
                        start=item.start,
                        end=item.end,
                        word=item.word,
                        probability=item.probability,
                    )
                    for item in source.words
                ]
            yield cls(
                id=source.id,
                seek=source.seek,
                start=source.start,
                end=source.end,
                text=source.text,
                tokens=source.tokens,
                temperature=source.temperature,
                avg_logprob=source.avg_logprob,
                compression_ratio=source.compression_ratio,
                no_speech_prob=source.no_speech_prob,
                words=converted_words,
            )
|
81 |
+
|
82 |
+
|
83 |
class Transcription:
|
84 |
def __init__(self, words: list[Word] = []) -> None:
|
85 |
self.words: list[Word] = []
|
|
|
87 |
|
88 |
@property
|
89 |
def text(self) -> str:
|
90 |
+
return " ".join(word.word for word in self.words).strip()
|
91 |
|
92 |
@property
|
93 |
def start(self) -> float:
|
|
|
119 |
raise ValueError(f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}")
|
120 |
|
121 |
|
122 |
+
def is_eos(text: str) -> bool:
    """Return whether ``text`` ends with end-of-sentence punctuation.

    A trailing ``.``, ``?`` or ``!`` counts as the end of a sentence, except
    that a trailing ellipsis (``...``) does not.

    Args:
        text: The text to inspect; only its ending is examined.

    Returns:
        ``True`` if ``text`` ends a sentence, ``False`` otherwise (including
        for the empty string).
    """
    # The ellipsis check must come first: "..." also ends with ".".
    if text.endswith("..."):
        return False
    # `str.endswith` accepts a tuple of suffixes, so one call covers all three.
    return text.endswith((".", "?", "!"))
|
126 |
+
|
127 |
+
|
128 |
+
def test_is_eos() -> None:
    """Check `is_eos` on terminated, unterminated and ellipsis-ended texts."""
    ends_sentence = ("Hello.", "Hello!", "Hello?", "Hello. Yo.")
    does_not_end_sentence = ("Hello", "Hello...", "Hello. Yo", "Hello. Yo...")

    for sample in ends_sentence:
        assert is_eos(sample)
    for sample in does_not_end_sentence:
        assert not is_eos(sample)
|
137 |
|
138 |
|
139 |
+
def to_full_sentences(words: list[Word]) -> list[list[Word]]:
    """Group ``words`` into complete sentences.

    A sentence is closed by a word for which `is_eos` is true. Trailing
    words that are not followed by such a word are left out of the result.
    """
    finished: list[list[Word]] = []
    pending: list[Word] = []
    for current in words:
        pending.append(current)
        if is_eos(current.word):
            # The sentence is complete; start collecting the next one.
            finished.append(pending)
            pending = []
    # Whatever is left in `pending` is an unterminated sentence and is dropped.
    return finished
|
148 |
|
149 |
|
150 |
def tests_to_full_sentences() -> None:
    """Check that `to_full_sentences` returns only terminated sentences."""

    def make(text: str) -> Word:
        # Timing and probability do not matter for sentence splitting.
        return Word(word=text, start=0.0, end=0.0, probability=0.0)

    full_sentence = [make("Hello..."), make(" world.")]

    no_words = to_full_sentences([])
    assert no_words == []

    unterminated_single = to_full_sentences([make("Hello")])
    assert unterminated_single == []

    unterminated_pair = to_full_sentences([make("Hello..."), make(" world")])
    assert unterminated_pair == []

    exactly_one = to_full_sentences([make("Hello..."), make(" world.")])
    assert exactly_one == [full_sentence]

    one_plus_tail = to_full_sentences([make("Hello..."), make(" world."), make(" How")])
    assert one_plus_tail == [full_sentence]
|
161 |
|
162 |
|
163 |
+
def word_to_text(words: list[Word]) -> str:
    """Concatenate the ``word`` text of every item, with no separator added."""
    pieces = [entry.word for entry in words]
    return "".join(pieces)
|
165 |
+
|
166 |
+
|
167 |
+
def words_to_text_w_ts(words: list[Word]) -> str:
    """Concatenate words, each followed by its ``(start-end)`` times.

    Times are formatted with two decimal places; no separator is added
    between consecutive entries.
    """
    rendered = []
    for entry in words:
        rendered.append(f"{entry.word}({entry.start:.2f}-{entry.end:.2f})")
    return "".join(rendered)
|
169 |
|
170 |
|
171 |
+
def segments_to_text(segments: Iterable[Segment]) -> str:
    """Concatenate the ``text`` of every segment and strip the outer whitespace."""
    combined = "".join([part.text for part in segments])
    return combined.strip()
|
173 |
|
174 |
|
175 |
def canonicalize_word(text: str) -> str:
|
|
|
187 |
|
188 |
def common_prefix(a: list[Word], b: list[Word]) -> list[Word]:
    """Return the leading words of ``a`` that match ``b`` position by position.

    Words are compared by their ``word`` text after `canonicalize_word`.
    The result is a slice of ``a`` that stops at the first mismatch or at
    the end of the shorter list.
    """
    matched = 0
    for left, right in zip(a, b):
        if canonicalize_word(left.word) != canonicalize_word(right.word):
            break
        matched += 1
    return a[:matched]
|
193 |
|
194 |
|
195 |
def test_common_prefix() -> None:
|
196 |
def word(text: str) -> Word:
|
197 |
+
return Word(word=text, start=0.0, end=0.0, probability=0.0)
|
198 |
|
199 |
a = [word("a"), word("b"), word("c")]
|
200 |
b = [word("a"), word("b"), word("c")]
|
|
|
227 |
|
228 |
def test_common_prefix_and_canonicalization() -> None:
|
229 |
def word(text: str) -> Word:
|
230 |
+
return Word(word=text, start=0.0, end=0.0, probability=0.0)
|
231 |
|
232 |
a = [word("A...")]
|
233 |
b = [word("a?"), word("b"), word("c")]
|
faster_whisper_server/main.py
CHANGED
@@ -24,7 +24,6 @@ from faster_whisper.vad import VadOptions, get_speech_timestamps
|
|
24 |
import huggingface_hub
|
25 |
from pydantic import AfterValidator
|
26 |
|
27 |
-
from faster_whisper_server import utils
|
28 |
from faster_whisper_server.asr import FasterWhisperASR
|
29 |
from faster_whisper_server.audio import AudioStream, audio_samples_from_file
|
30 |
from faster_whisper_server.config import (
|
@@ -34,6 +33,7 @@ from faster_whisper_server.config import (
|
|
34 |
Task,
|
35 |
config,
|
36 |
)
|
|
|
37 |
from faster_whisper_server.logger import logger
|
38 |
from faster_whisper_server.server_models import (
|
39 |
ModelListResponse,
|
@@ -46,7 +46,7 @@ from faster_whisper_server.transcriber import audio_transcriber
|
|
46 |
if TYPE_CHECKING:
|
47 |
from collections.abc import Generator, Iterable
|
48 |
|
49 |
-
from faster_whisper.transcribe import
|
50 |
from huggingface_hub.hf_api import ModelInfo
|
51 |
|
52 |
loaded_models: OrderedDict[str, WhisperModel] = OrderedDict()
|
@@ -157,7 +157,7 @@ def segments_to_response(
|
|
157 |
) -> str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse:
|
158 |
segments = list(segments)
|
159 |
if response_format == ResponseFormat.TEXT: # noqa: RET503
|
160 |
-
return
|
161 |
elif response_format == ResponseFormat.JSON:
|
162 |
return TranscriptionJsonResponse.from_segments(segments)
|
163 |
elif response_format == ResponseFormat.VERBOSE_JSON:
|
@@ -220,6 +220,7 @@ def translate_file(
|
|
220 |
temperature=temperature,
|
221 |
vad_filter=True,
|
222 |
)
|
|
|
223 |
|
224 |
if stream:
|
225 |
return segments_to_streaming_response(segments, transcription_info, response_format)
|
@@ -258,6 +259,7 @@ def transcribe_file(
|
|
258 |
vad_filter=True,
|
259 |
hotwords=hotwords,
|
260 |
)
|
|
|
261 |
|
262 |
if stream:
|
263 |
return segments_to_streaming_response(segments, transcription_info, response_format)
|
|
|
24 |
import huggingface_hub
|
25 |
from pydantic import AfterValidator
|
26 |
|
|
|
27 |
from faster_whisper_server.asr import FasterWhisperASR
|
28 |
from faster_whisper_server.audio import AudioStream, audio_samples_from_file
|
29 |
from faster_whisper_server.config import (
|
|
|
33 |
Task,
|
34 |
config,
|
35 |
)
|
36 |
+
from faster_whisper_server.core import Segment, segments_to_text
|
37 |
from faster_whisper_server.logger import logger
|
38 |
from faster_whisper_server.server_models import (
|
39 |
ModelListResponse,
|
|
|
46 |
if TYPE_CHECKING:
|
47 |
from collections.abc import Generator, Iterable
|
48 |
|
49 |
+
from faster_whisper.transcribe import TranscriptionInfo
|
50 |
from huggingface_hub.hf_api import ModelInfo
|
51 |
|
52 |
loaded_models: OrderedDict[str, WhisperModel] = OrderedDict()
|
|
|
157 |
) -> str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse:
|
158 |
segments = list(segments)
|
159 |
if response_format == ResponseFormat.TEXT: # noqa: RET503
|
160 |
+
return segments_to_text(segments)
|
161 |
elif response_format == ResponseFormat.JSON:
|
162 |
return TranscriptionJsonResponse.from_segments(segments)
|
163 |
elif response_format == ResponseFormat.VERBOSE_JSON:
|
|
|
220 |
temperature=temperature,
|
221 |
vad_filter=True,
|
222 |
)
|
223 |
+
segments = Segment.from_faster_whisper_segments(segments)
|
224 |
|
225 |
if stream:
|
226 |
return segments_to_streaming_response(segments, transcription_info, response_format)
|
|
|
259 |
vad_filter=True,
|
260 |
hotwords=hotwords,
|
261 |
)
|
262 |
+
segments = Segment.from_faster_whisper_segments(segments)
|
263 |
|
264 |
if stream:
|
265 |
return segments_to_streaming_response(segments, transcription_info, response_format)
|
faster_whisper_server/server_models.py
CHANGED
@@ -4,12 +4,10 @@ from typing import TYPE_CHECKING, Literal
|
|
4 |
|
5 |
from pydantic import BaseModel, ConfigDict, Field
|
6 |
|
7 |
-
from faster_whisper_server import
|
8 |
|
9 |
if TYPE_CHECKING:
|
10 |
-
from faster_whisper.transcribe import
|
11 |
-
|
12 |
-
from faster_whisper_server.core import Transcription
|
13 |
|
14 |
|
15 |
# https://platform.openai.com/docs/api-reference/audio/json-object
|
@@ -18,65 +16,21 @@ class TranscriptionJsonResponse(BaseModel):
|
|
18 |
|
19 |
@classmethod
|
20 |
def from_segments(cls, segments: list[Segment]) -> TranscriptionJsonResponse:
|
21 |
-
return cls(text=
|
22 |
|
23 |
@classmethod
|
24 |
def from_transcription(cls, transcription: Transcription) -> TranscriptionJsonResponse:
|
25 |
return cls(text=transcription.text)
|
26 |
|
27 |
|
28 |
-
class WordObject(BaseModel):
|
29 |
-
start: float
|
30 |
-
end: float
|
31 |
-
word: str
|
32 |
-
probability: float
|
33 |
-
|
34 |
-
@classmethod
|
35 |
-
def from_word(cls, word: Word) -> WordObject:
|
36 |
-
return cls(
|
37 |
-
start=word.start,
|
38 |
-
end=word.end,
|
39 |
-
word=word.word,
|
40 |
-
probability=word.probability,
|
41 |
-
)
|
42 |
-
|
43 |
-
|
44 |
-
class SegmentObject(BaseModel):
|
45 |
-
id: int
|
46 |
-
seek: int
|
47 |
-
start: float
|
48 |
-
end: float
|
49 |
-
text: str
|
50 |
-
tokens: list[int]
|
51 |
-
temperature: float
|
52 |
-
avg_logprob: float
|
53 |
-
compression_ratio: float
|
54 |
-
no_speech_prob: float
|
55 |
-
|
56 |
-
@classmethod
|
57 |
-
def from_segment(cls, segment: Segment) -> SegmentObject:
|
58 |
-
return cls(
|
59 |
-
id=segment.id,
|
60 |
-
seek=segment.seek,
|
61 |
-
start=segment.start,
|
62 |
-
end=segment.end,
|
63 |
-
text=segment.text,
|
64 |
-
tokens=segment.tokens,
|
65 |
-
temperature=segment.temperature,
|
66 |
-
avg_logprob=segment.avg_logprob,
|
67 |
-
compression_ratio=segment.compression_ratio,
|
68 |
-
no_speech_prob=segment.no_speech_prob,
|
69 |
-
)
|
70 |
-
|
71 |
-
|
72 |
# https://platform.openai.com/docs/api-reference/audio/verbose-json-object
|
73 |
class TranscriptionVerboseJsonResponse(BaseModel):
|
74 |
task: str = "transcribe"
|
75 |
language: str
|
76 |
duration: float
|
77 |
text: str
|
78 |
-
words: list[
|
79 |
-
segments: list[
|
80 |
|
81 |
@classmethod
|
82 |
def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -> TranscriptionVerboseJsonResponse:
|
@@ -84,8 +38,8 @@ class TranscriptionVerboseJsonResponse(BaseModel):
|
|
84 |
language=transcription_info.language,
|
85 |
duration=segment.end - segment.start,
|
86 |
text=segment.text,
|
87 |
-
words=(
|
88 |
-
segments=[
|
89 |
)
|
90 |
|
91 |
@classmethod
|
@@ -95,9 +49,9 @@ class TranscriptionVerboseJsonResponse(BaseModel):
|
|
95 |
return cls(
|
96 |
language=transcription_info.language,
|
97 |
duration=transcription_info.duration,
|
98 |
-
text=
|
99 |
-
segments=
|
100 |
-
words=
|
101 |
)
|
102 |
|
103 |
@classmethod
|
@@ -106,15 +60,7 @@ class TranscriptionVerboseJsonResponse(BaseModel):
|
|
106 |
language="english", # FIX: hardcoded
|
107 |
duration=transcription.duration,
|
108 |
text=transcription.text,
|
109 |
-
words=
|
110 |
-
WordObject(
|
111 |
-
start=word.start,
|
112 |
-
end=word.end,
|
113 |
-
word=word.text,
|
114 |
-
probability=word.probability,
|
115 |
-
)
|
116 |
-
for word in transcription.words
|
117 |
-
],
|
118 |
segments=[], # FIX: hardcoded
|
119 |
)
|
120 |
|
|
|
4 |
|
5 |
from pydantic import BaseModel, ConfigDict, Field
|
6 |
|
7 |
+
from faster_whisper_server.core import Segment, Transcription, Word, segments_to_text
|
8 |
|
9 |
if TYPE_CHECKING:
|
10 |
+
from faster_whisper.transcribe import TranscriptionInfo
|
|
|
|
|
11 |
|
12 |
|
13 |
# https://platform.openai.com/docs/api-reference/audio/json-object
|
|
|
16 |
|
17 |
@classmethod
|
18 |
def from_segments(cls, segments: list[Segment]) -> TranscriptionJsonResponse:
|
19 |
+
return cls(text=segments_to_text(segments))
|
20 |
|
21 |
@classmethod
|
22 |
def from_transcription(cls, transcription: Transcription) -> TranscriptionJsonResponse:
|
23 |
return cls(text=transcription.text)
|
24 |
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
# https://platform.openai.com/docs/api-reference/audio/verbose-json-object
|
27 |
class TranscriptionVerboseJsonResponse(BaseModel):
|
28 |
task: str = "transcribe"
|
29 |
language: str
|
30 |
duration: float
|
31 |
text: str
|
32 |
+
words: list[Word]
|
33 |
+
segments: list[Segment]
|
34 |
|
35 |
@classmethod
|
36 |
def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -> TranscriptionVerboseJsonResponse:
|
|
|
38 |
language=transcription_info.language,
|
39 |
duration=segment.end - segment.start,
|
40 |
text=segment.text,
|
41 |
+
words=(segment.words if isinstance(segment.words, list) else []),
|
42 |
+
segments=[segment],
|
43 |
)
|
44 |
|
45 |
@classmethod
|
|
|
49 |
return cls(
|
50 |
language=transcription_info.language,
|
51 |
duration=transcription_info.duration,
|
52 |
+
text=segments_to_text(segments),
|
53 |
+
segments=segments,
|
54 |
+
words=Word.from_segments(segments),
|
55 |
)
|
56 |
|
57 |
@classmethod
|
|
|
60 |
language="english", # FIX: hardcoded
|
61 |
duration=transcription.duration,
|
62 |
text=transcription.text,
|
63 |
+
words=transcription.words,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
segments=[], # FIX: hardcoded
|
65 |
)
|
66 |
|
faster_whisper_server/transcriber.py
CHANGED
@@ -4,12 +4,7 @@ from typing import TYPE_CHECKING
|
|
4 |
|
5 |
from faster_whisper_server.audio import Audio, AudioStream
|
6 |
from faster_whisper_server.config import config
|
7 |
-
from faster_whisper_server.core import
|
8 |
-
Transcription,
|
9 |
-
Word,
|
10 |
-
common_prefix,
|
11 |
-
to_full_sentences,
|
12 |
-
)
|
13 |
from faster_whisper_server.logger import logger
|
14 |
|
15 |
if TYPE_CHECKING:
|
@@ -37,30 +32,16 @@ class LocalAgreement:
|
|
37 |
|
38 |
return prefix
|
39 |
|
40 |
-
@classmethod
|
41 |
-
def prompt(cls, confirmed: Transcription) -> str | None:
|
42 |
-
sentences = to_full_sentences(confirmed.words)
|
43 |
-
if len(sentences) == 0:
|
44 |
-
return None
|
45 |
-
return sentences[-1].text
|
46 |
-
|
47 |
-
# TODO: better name
|
48 |
-
@classmethod
|
49 |
-
def needs_audio_after(cls, confirmed: Transcription) -> float:
|
50 |
-
full_sentences = to_full_sentences(confirmed.words)
|
51 |
-
return full_sentences[-1].end if len(full_sentences) > 0 else 0.0
|
52 |
-
|
53 |
|
|
|
54 |
def needs_audio_after(confirmed: Transcription) -> float:
|
55 |
full_sentences = to_full_sentences(confirmed.words)
|
56 |
-
return full_sentences[-1].end if len(full_sentences) > 0 else 0.0
|
57 |
|
58 |
|
59 |
def prompt(confirmed: Transcription) -> str | None:
|
60 |
sentences = to_full_sentences(confirmed.words)
|
61 |
-
if len(sentences)
|
62 |
-
return None
|
63 |
-
return sentences[-1].text
|
64 |
|
65 |
|
66 |
async def audio_transcriber(
|
|
|
4 |
|
5 |
from faster_whisper_server.audio import Audio, AudioStream
|
6 |
from faster_whisper_server.config import config
|
7 |
+
from faster_whisper_server.core import Transcription, Word, common_prefix, to_full_sentences, word_to_text
|
|
|
|
|
|
|
|
|
|
|
8 |
from faster_whisper_server.logger import logger
|
9 |
|
10 |
if TYPE_CHECKING:
|
|
|
32 |
|
33 |
return prefix
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
+
# TODO: needs a better name
|
37 |
def needs_audio_after(confirmed: Transcription) -> float:
    """Return the end time of the last complete sentence in ``confirmed``.

    The value is the ``end`` of the final word of the last sentence returned
    by `to_full_sentences`, or ``0.0`` when there is no complete sentence.
    """
    complete = to_full_sentences(confirmed.words)
    if not complete:
        return 0.0
    closing_word = complete[-1][-1]
    return closing_word.end
|
40 |
|
41 |
|
42 |
def prompt(confirmed: Transcription) -> str | None:
    """Return the text of the last complete sentence in ``confirmed``.

    Returns ``None`` when `to_full_sentences` finds no complete sentence.
    """
    complete = to_full_sentences(confirmed.words)
    if not complete:
        return None
    return word_to_text(complete[-1])
|
|
|
|
|
45 |
|
46 |
|
47 |
async def audio_transcriber(
|
faster_whisper_server/utils.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
from faster_whisper.transcribe import Segment, Word
|
2 |
-
|
3 |
-
|
4 |
-
def segments_text(segments: list[Segment]) -> str:
|
5 |
-
return "".join(segment.text for segment in segments).strip()
|
6 |
-
|
7 |
-
|
8 |
-
def words_from_segments(segments: list[Segment]) -> list[Word]:
|
9 |
-
words = []
|
10 |
-
for segment in segments:
|
11 |
-
if segment.words is None:
|
12 |
-
continue
|
13 |
-
words.extend(segment.words)
|
14 |
-
return words
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|