/**
 * Example provided by https://github.com/Gan-Xing in https://github.com/speaches-ai/speaches/issues/26
 */
import 'dotenv/config';
import fs from 'node:fs';
import path from 'node:path';
import WebSocket from 'ws';
import ffmpeg from 'fluent-ffmpeg';
const ffmpegPath = process.env.FFMPEG_PATH || '/usr/bin/ffmpeg';
ffmpeg.setFfmpegPath(ffmpegPath);
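// Example .env file (illustrative values; point these at your own deployment):
//   TRANSCRIPTION_API_BASE_URL=http://localhost:8000
//   TRANSLATION_API_BASE_URL=http://localhost:8000
//   FFMPEG_PATH=/usr/bin/ffmpeg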
/**
 * Transcribe an audio file using the HTTP endpoint.
 * Supported file types include wav, mp3, webm, and the other formats accepted by
 * the OpenAI API; wav, mp3, and webm are the three I have tested.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 */
async function transcribeFile(filePath, model, language, responseFormat, temperature) {
  const formData = new FormData();
  // The built-in fetch/FormData implementation expects a Blob, not a Node stream.
  const fileBuffer = await fs.promises.readFile(filePath);
  formData.append('file', new Blob([fileBuffer]), path.basename(filePath));
  formData.append('model', model);
  formData.append('language', language);
  formData.append('response_format', responseFormat);
  formData.append('temperature', temperature);
  const response = await fetch(`${process.env.TRANSCRIPTION_API_BASE_URL}/v1/audio/transcriptions`, {
    method: 'POST',
    body: formData,
  });
  const transcription = await response.json();
  console.log('Transcription Response:', transcription);
}
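// For reference, the request above is equivalent to this curl invocation
// (file name and field values are illustrative):
//   curl "$TRANSCRIPTION_API_BASE_URL/v1/audio/transcriptions" \
//     -F "file=@audio.webm" -F "model=Systran/faster-whisper-large-v3" \
//     -F "language=en" -F "response_format=json" -F "temperature=0"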
/**
 * Translate an audio file using the HTTP endpoint.
 * Only English is supported as the translation target.
 * I currently use GLM-4-9b-int8 to translate audio in various languages, and I am
 * not sure whether the author can add an endpoint for translation via a custom
 * API + key. I plan to package my frontend, faster-whisper-server, and
 * vllm + glm-4-9b-int8 into a single Docker image for unified deployment.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 */
async function translateFile(filePath, model, responseFormat, temperature) {
  const formData = new FormData();
  const fileBuffer = await fs.promises.readFile(filePath);
  formData.append('file', new Blob([fileBuffer]), path.basename(filePath));
  formData.append('model', model);
  formData.append('response_format', responseFormat);
  formData.append('temperature', temperature);
  const response = await fetch(`${process.env.TRANSLATION_API_BASE_URL}/v1/audio/translations`, {
    method: 'POST',
    body: formData,
  });
  const translation = await response.json();
  console.log('Translation Response:', translation);
}
/**
 * Send audio data over WebSocket for transcription.
 * Currently the supported input for this endpoint is raw PCM; I am not sure
 * whether other formats are supported.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 */
async function sendAudioOverWebSocket(filePath, model, language, responseFormat, temperature) {
  // Derive the ws:// URL from the HTTP base URL instead of hard-coding a host.
  const wsBaseUrl = process.env.TRANSCRIPTION_API_BASE_URL.replace(/^http/, 'ws');
  const wsUrl = `${wsBaseUrl}/v1/audio/transcriptions?model=${encodeURIComponent(model)}&language=${encodeURIComponent(language)}&response_format=${encodeURIComponent(responseFormat)}&temperature=${encodeURIComponent(temperature)}`;
  // Wrap the socket lifecycle in a promise so callers can actually await completion.
  await new Promise((resolve, reject) => {
    const ws = new WebSocket(wsUrl);
    ws.on('open', () => {
      const audioBuffer = fs.readFileSync(filePath);
      ws.send(audioBuffer);
    });
    ws.on('message', (message) => {
      const response = JSON.parse(message.toString());
      console.log('WebSocket Response:', response);
    });
    ws.on('close', () => {
      console.log('WebSocket connection closed');
      resolve();
    });
    ws.on('error', (error) => {
      console.error('WebSocket error:', error);
      reject(error);
    });
  });
}
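/**
 * Variant of the sender above that streams the PCM file in small chunks instead of
 * one large message. This is only a sketch: it assumes the server accepts audio
 * delivered incrementally, and the 4000-byte chunk size (125 ms of 16 kHz mono
 * s16le audio) is an arbitrary choice, not something the API documents.
 */
async function streamAudioOverWebSocket(filePath, model, language, responseFormat, temperature) {
  const wsBaseUrl = process.env.TRANSCRIPTION_API_BASE_URL.replace(/^http/, 'ws');
  const wsUrl = `${wsBaseUrl}/v1/audio/transcriptions?model=${encodeURIComponent(model)}&language=${encodeURIComponent(language)}&response_format=${encodeURIComponent(responseFormat)}&temperature=${encodeURIComponent(temperature)}`;
  await new Promise((resolve, reject) => {
    const ws = new WebSocket(wsUrl);
    ws.on('open', () => {
      // Read the file as a stream and forward each chunk as its own frame.
      const stream = fs.createReadStream(filePath, { highWaterMark: 4000 });
      stream.on('data', (chunk) => ws.send(chunk));
      // Closing after the last chunk assumes the server flushes any remaining
      // transcription before completing the close handshake.
      stream.on('end', () => ws.close());
      stream.on('error', reject);
    });
    ws.on('message', (message) => console.log('WebSocket Response:', JSON.parse(message.toString())));
    ws.on('close', () => resolve());
    ws.on('error', reject);
  });
}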
/**
 * Convert an audio file to raw 16 kHz mono PCM (s16le).
 *
 * @param {string} filePath - Path to the audio file
 * @returns {Promise<string>} - Path to the converted PCM file
 */
async function convertToPcm(filePath) {
  const pcmFilePath = filePath.replace(path.extname(filePath), '.pcm');
  await new Promise((resolve, reject) => {
    ffmpeg(filePath)
      .audioChannels(1)
      .audioFrequency(16000)
      .audioCodec('pcm_s16le')
      .toFormat('s16le')
      .on('end', () => {
        console.log(`Audio file successfully converted to PCM: ${pcmFilePath}`);
        resolve(pcmFilePath);
      })
      .on('error', (error) => {
        console.error(`Error converting audio to PCM: ${error.message}`);
        reject(error);
      })
      .save(pcmFilePath);
  });
  return pcmFilePath;
}
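// For reference, the fluent-ffmpeg chain above corresponds to this CLI invocation
// (paths are illustrative):
//   ffmpeg -i input.webm -ac 1 -ar 16000 -acodec pcm_s16le -f s16le output.pcm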
async function main() {
  const model = 'Systran/faster-whisper-large-v3';
  const language = 'en';
  const responseFormat = 'json';
  const temperature = '0';
  const filePath = './path/to/your/audio.webm'; // Replace with the actual file path

  // Transcribe the original file using the HTTP endpoint (wav/mp3/webm are supported)
  await transcribeFile(filePath, model, language, responseFormat, temperature);

  // Translate the original file using the HTTP endpoint
  await translateFile(filePath, model, responseFormat, temperature);

  // The WebSocket endpoint expects raw PCM, so convert before sending
  const pcmFilePath = await convertToPcm(filePath);
  await sendAudioOverWebSocket(pcmFilePath, model, language, responseFormat, temperature);
}
// Make sure to use ffmpeg version 7 or above. The default `apt-get install` only provides 4.x, and Ubuntu 22.04 or above is required for 7.x.
main().catch(console.error);
// Project URL: https://github.com/Gan-Xing/whisper