File size: 5,765 Bytes
12ab49b
ba81a8e
12ab49b
af95980
 
 
12ab49b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/**
 * Example provided by https://github.com/Gan-Xing in https://github.com/speaches-ai/speaches/issues/26
 */
import 'dotenv/config';
import fs from 'node:fs';
import path from 'node:path';
import WebSocket from 'ws';
import ffmpeg from 'fluent-ffmpeg';

// Point fluent-ffmpeg at the ffmpeg binary; overridable via the FFMPEG_PATH env var.
ffmpeg.setFfmpegPath(process.env.FFMPEG_PATH || '/usr/bin/ffmpeg');

/**
 * Transcribe an audio file using the HTTP endpoint.
 * Supported file types include wav, mp3, webm, and other types supported by the OpenAI API.
 * I have tested with these three types.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 * @returns {Promise<object>} Parsed transcription response body
 * @throws {Error} When the server responds with a non-2xx status
 */
async function transcribeFile(filePath, model, language, responseFormat, temperature) {
    const formData = new FormData();
    // The WHATWG FormData used by global fetch (undici) does not accept Node
    // streams as parts — appending fs.createReadStream() sends a useless
    // stringified object. Wrap the bytes in a Blob and provide a filename.
    formData.append('file', new Blob([fs.readFileSync(filePath)]), path.basename(filePath));
    formData.append('model', model);
    formData.append('language', language);
    formData.append('response_format', responseFormat);
    formData.append('temperature', temperature);

    const response = await fetch(`${process.env.TRANSCRIPTION_API_BASE_URL}/v1/audio/transcriptions`, {
        method: 'POST',
        body: formData,
    });
    if (!response.ok) {
        throw new Error(`Transcription request failed: ${response.status} ${response.statusText}`);
    }

    const transcription = await response.json();
    console.log('Transcription Response:', transcription);
    return transcription;
}

/**
 * Translate an audio file using the HTTP endpoint.
 * Only English is supported for translation.
 * Currently, I am using GLM-4-9b-int8 to translate various voices.
 * I am not sure if the author can add an endpoint for custom API+Key translation.
 * I plan to package my frontend, fast-whisper-server, and vllm+glm-4-9b-int8 into one Docker container for unified deployment.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 * @returns {Promise<object>} Parsed translation response body
 * @throws {Error} When the server responds with a non-2xx status
 */
async function translateFile(filePath, model, responseFormat, temperature) {
    const formData = new FormData();
    // fetch's FormData cannot carry a Node read stream; send the file bytes
    // as a Blob with an explicit filename so the multipart part is valid.
    formData.append('file', new Blob([fs.readFileSync(filePath)]), path.basename(filePath));
    formData.append('model', model);
    formData.append('response_format', responseFormat);
    formData.append('temperature', temperature);

    const response = await fetch(`${process.env.TRANSLATION_API_BASE_URL}/v1/audio/translations`, {
        method: 'POST',
        body: formData,
    });
    if (!response.ok) {
        throw new Error(`Translation request failed: ${response.status} ${response.statusText}`);
    }

    const translation = await response.json();
    console.log('Translation Response:', translation);
    return translation;
}

/**
 * Send audio data over WebSocket for transcription.
 * Currently, the supported file type for transcription is PCM.
 * I am not sure if other types are supported.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 */
async function sendAudioOverWebSocket(filePath, model, language, responseFormat, temperature) {
    // Base URL comes from the environment like the HTTP endpoints do; the
    // previous hard-coded host is kept only as a fallback for compatibility.
    const baseUrl = process.env.TRANSCRIPTION_WS_BASE_URL || 'ws://100.105.162.69:8000';
    // Build the URL with URL/searchParams instead of string concatenation.
    const wsUrl = new URL('/v1/audio/transcriptions', baseUrl);
    wsUrl.searchParams.set('model', model);
    wsUrl.searchParams.set('language', language);
    wsUrl.searchParams.set('response_format', responseFormat);
    wsUrl.searchParams.set('temperature', temperature);
    const ws = new WebSocket(wsUrl.toString());

    ws.on('open', () => {
        // Stream the whole file as a single binary message.
        const audioBuffer = fs.readFileSync(filePath);
        ws.send(audioBuffer);
    });

    ws.on('message', (message) => {
        try {
            const response = JSON.parse(message);
            console.log('WebSocket Response:', response);
        } catch (error) {
            // Surface non-JSON frames instead of crashing the process.
            console.error('WebSocket message was not valid JSON:', error);
        }
    });

    ws.on('close', () => {
        console.log('WebSocket connection closed');
    });

    ws.on('error', (error) => {
        console.error('WebSocket error:', error);
    });
}

/**
 * Convert audio file to 16 kHz mono signed-16-bit-little-endian PCM.
 *
 * @param {string} filePath - Path to the audio file
 * @returns {Promise<string>} - Path to the converted PCM file
 * @throws {Error} When ffmpeg fails to convert the file
 */
async function convertToPcm(filePath) {
    // Swap the extension on the basename only. A naive
    // filePath.replace(path.extname(filePath), '.pcm') replaces the FIRST
    // occurrence of the extension text anywhere in the path (and mangles
    // extension-less paths), so build the output path structurally instead.
    const { dir, name } = path.parse(filePath);
    const pcmFilePath = path.format({ dir, name, ext: '.pcm' });

    await new Promise((resolve, reject) => {
        ffmpeg(filePath)
            .audioChannels(1)          // mono
            .audioFrequency(16000)     // 16 kHz sample rate
            .audioCodec('pcm_s16le')   // raw signed 16-bit little-endian samples
            .toFormat('s16le')         // headerless container
            .on('end', () => {
                console.log(`Audio file successfully converted to PCM: ${pcmFilePath}`);
                resolve(pcmFilePath);
            })
            .on('error', (error) => {
                console.error(`Error converting audio to PCM: ${error.message}`);
                reject(error);
            })
            .save(pcmFilePath);
    });

    return pcmFilePath;
}

/**
 * Demo driver: exercises the HTTP transcription/translation endpoints and the
 * WebSocket transcription endpoint against a single audio file.
 */
async function main() {
    const model = 'Systran/faster-whisper-large-v3';
    const language = 'en';
    const responseFormat = 'json';
    const temperature = '0';
    const filePath = './path/to/your/audio.webm';  // Replace with the actual file path

    // The HTTP endpoints accept container formats (wav/mp3/webm — see the
    // docs on transcribeFile/translateFile), so send the original file there;
    // only the WebSocket endpoint requires raw PCM.
    await transcribeFile(filePath, model, language, responseFormat, temperature);
    await translateFile(filePath, model, responseFormat, temperature);

    // Convert to raw PCM for the WebSocket endpoint and stream it.
    const pcmFilePath = await convertToPcm(filePath);
    await sendAudioOverWebSocket(pcmFilePath, model, language, responseFormat, temperature);
}

// Make sure to use ffmpeg version 7 or above. The default apt-get install only installs version 4.x. Also, Ubuntu 22.04 or above is required to support version 7.x.
main().catch((error) => {
    console.error(error);
});

// Project URL: https://github.com/Gan-Xing/whisper