File size: 1,605 Bytes
db7bf9a
 
313814b
 
db7bf9a
313814b
 
db7bf9a
313814b
 
db7bf9a
313814b
 
db7bf9a
313814b
db7bf9a
 
313814b
 
 
db7bf9a
313814b
 
db7bf9a
 
 
 
 
 
313814b
db7bf9a
 
313814b
db7bf9a
313814b
 
db7bf9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from __future__ import annotations

import enum

from faster_whisper.transcribe import Segment, Word
from pydantic import BaseModel

from speaches.core import Transcription


# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-response_format
class ResponseFormat(enum.StrEnum):
    TEXT = "text"
    JSON = "json"
    VERBOSE_JSON = "verbose_json"
    # VTT = "vtt"
    # SRT = "srt"


# https://platform.openai.com/docs/api-reference/audio/json-object
class TranscriptionJsonResponse(BaseModel):
    text: str

    @classmethod
    def from_transcription(
        cls, transcription: Transcription
    ) -> TranscriptionJsonResponse:
        return cls(text=transcription.text)


# https://platform.openai.com/docs/api-reference/audio/verbose-json-object
class TranscriptionVerboseJsonResponse(BaseModel):
    task: str = "transcribe"
    language: str
    duration: float
    text: str
    words: list[Word]
    segments: list[Segment]

    @classmethod
    def from_transcription(
        cls, transcription: Transcription
    ) -> TranscriptionVerboseJsonResponse:
        return cls(
            language="english",  # FIX: hardcoded
            duration=transcription.duration,
            text=transcription.text,
            words=[
                Word(
                    start=word.start,
                    end=word.end,
                    word=word.text,
                    probability=word.probability,
                )
                for word in transcription.words
            ],
            segments=[],  # FIX: hardcoded
        )