Spaces:
Configuration error
Configuration error
File size: 3,936 Bytes
ec4d8ae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
from faster_whisper_server.api_models import TranscriptionWord
from faster_whisper_server.text_utils import (
canonicalize_word,
common_prefix,
is_eos,
srt_format_timestamp,
to_full_sentences,
vtt_format_timestamp,
)
def test_is_eos() -> None:
    """Check the truthiness of ``is_eos`` for a fixed set of sample strings.

    Each case pairs an input with whether the test expects ``is_eos`` to
    report it as truthy. Cases run in the order listed.
    """
    cases = [
        ("Hello", False),
        ("Hello...", False),
        ("Hello.", True),
        ("Hello!", True),
        ("Hello?", True),
        ("Hello. Yo", False),
        ("Hello. Yo...", False),
        ("Hello. Yo.", True),
    ]
    for text, expected in cases:
        # Compare on truthiness, which is all a bare `assert` / `assert not` checks.
        assert bool(is_eos(text)) is expected, text
def tests_to_full_sentences() -> None:
    """Check ``to_full_sentences`` against a table of word sequences.

    Every case lists the input word texts and the expected result, written as
    a list of sentences where each sentence is a tuple of word texts. All
    words are built with zeroed timing and probability.
    """

    def make(text: str) -> TranscriptionWord:
        """Build a word whose only meaningful field is its text."""
        return TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0)

    cases = [
        ((), []),
        (("Hello",), []),
        (("Hello...", " world"), []),
        (("Hello...", " world."), [("Hello...", " world.")]),
        (("Hello...", " world.", " How"), [("Hello...", " world.")]),
    ]
    for texts, expected_sentences in cases:
        words = [make(text) for text in texts]
        expected = [[make(text) for text in sentence] for sentence in expected_sentences]
        assert to_full_sentences(words) == expected
def test_srt_format_timestamp() -> None:
    """Check ``srt_format_timestamp`` output for a range of second values.

    The expected strings use the ``HH:MM:SS,mmm`` layout (comma before the
    milliseconds).
    """
    expected_by_seconds = {
        0.0: "00:00:00,000",
        1.0: "00:00:01,000",
        1.234: "00:00:01,234",
        60.0: "00:01:00,000",
        61.0: "00:01:01,000",
        61.234: "00:01:01,234",
        3600.0: "01:00:00,000",
        3601.0: "01:00:01,000",
        3601.234: "01:00:01,234",
        23423.4234: "06:30:23,423",
    }
    # Dicts keep insertion order, so cases run in the order written above.
    for seconds, expected in expected_by_seconds.items():
        assert srt_format_timestamp(seconds) == expected
def test_vtt_format_timestamp() -> None:
    """Check ``vtt_format_timestamp`` output for a range of second values.

    The expected strings use the ``HH:MM:SS.mmm`` layout (dot before the
    milliseconds).
    """
    cases = [
        (0.0, "00:00:00.000"),
        (1.0, "00:00:01.000"),
        (1.234, "00:00:01.234"),
        (60.0, "00:01:00.000"),
        (61.0, "00:01:01.000"),
        (61.234, "00:01:01.234"),
        (3600.0, "01:00:00.000"),
        (3601.0, "01:00:01.000"),
        (3601.234, "01:00:01.234"),
        (23423.4234, "06:30:23.423"),
    ]
    for seconds, expected in cases:
        assert vtt_format_timestamp(seconds) == expected
def test_canonicalize_word() -> None:
    """Check that differently cased and punctuated inputs all canonicalize to ``"abc"``."""
    canonical = "abc"
    for raw in ("ABC", "...ABC?", "... AbC ..."):
        assert canonicalize_word(raw) == canonical
def test_common_prefix() -> None:
    """Check ``common_prefix`` on pairs of word lists.

    Each case gives the word texts of both inputs and of the expected prefix.
    The cases cover identical lists, a mismatch in the last word, one list
    shorter than the other, an empty list on either side, and lists that
    differ from the first word.
    """

    def words(texts):
        """Build fresh words (zeroed timing and probability) from texts."""
        return [TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0) for text in texts]

    cases = [
        (["a", "b", "c"], ["a", "b", "c"], ["a", "b", "c"]),
        (["a", "b", "c"], ["a", "b", "d"], ["a", "b"]),
        (["a", "b", "c"], ["a"], ["a"]),
        (["a"], ["a", "b", "c"], ["a"]),
        (["a"], [], []),
        ([], ["a"], []),
        (["a", "b", "c"], ["b", "c"], []),
    ]
    for first_texts, second_texts, prefix_texts in cases:
        first = words(first_texts)
        second = words(second_texts)
        assert common_prefix(first, second) == words(prefix_texts)
def test_common_prefix_and_canonicalization() -> None:
    """Check ``common_prefix`` when the word texts differ in case, spacing and punctuation.

    In both cases the expected prefix carries the texts of the first argument.
    """

    def words(texts):
        """Build fresh words (zeroed timing and probability) from texts."""
        return [TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0) for text in texts]

    cases = [
        (["A..."], ["a?", "b", "c"], ["A..."]),
        (["A...", "B?", "C,"], ["a??", " b", " ,c"], ["A...", "B?", "C,"]),
    ]
    for first_texts, second_texts, prefix_texts in cases:
        # Expected words are built fresh so the comparison is by value, not identity.
        expected = words(prefix_texts)
        assert common_prefix(words(first_texts), words(second_texts)) == expected
|