File size: 5,765 Bytes
12ab49b
ba81a8e
12ab49b
af95980
 
 
12ab49b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/**
 * Example provided by https://github.com/Gan-Xing in https://github.com/speaches-ai/speaches/issues/26
 */
import 'dotenv/config';
import fs from 'node:fs';
import path from 'node:path';
import WebSocket from 'ws';
import ffmpeg from 'fluent-ffmpeg';

// Point fluent-ffmpeg at the ffmpeg binary; overridable via the FFMPEG_PATH env var.
ffmpeg.setFfmpegPath(process.env.FFMPEG_PATH || '/usr/bin/ffmpeg');

/**
 * Transcribe an audio file using the HTTP endpoint.
 * Supported file types include wav, mp3, webm, and other types supported by the OpenAI API.
 * I have tested with these three types.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 * @returns {Promise<object>} Parsed transcription response body
 * @throws {Error} When the server responds with a non-2xx status
 */
async function transcribeFile(filePath, model, language, responseFormat, temperature) {
    const formData = new FormData();
    // The WHATWG FormData used by global fetch (undici) does not accept Node
    // streams as parts — appending fs.createReadStream() sends a useless
    // stringified object. Wrap the bytes in a Blob and provide a filename.
    formData.append('file', new Blob([fs.readFileSync(filePath)]), path.basename(filePath));
    formData.append('model', model);
    formData.append('language', language);
    formData.append('response_format', responseFormat);
    formData.append('temperature', temperature);

    const response = await fetch(`${process.env.TRANSCRIPTION_API_BASE_URL}/v1/audio/transcriptions`, {
        method: 'POST',
        body: formData,
    });
    if (!response.ok) {
        throw new Error(`Transcription request failed: ${response.status} ${response.statusText}`);
    }

    const transcription = await response.json();
    console.log('Transcription Response:', transcription);
    return transcription;
}

/**
 * Translate an audio file using the HTTP endpoint.
 * Only English is supported for translation.
 * Currently, I am using GLM-4-9b-int8 to translate various voices.
 * I am not sure if the author can add an endpoint for custom API+Key translation.
 * I plan to package my frontend, fast-whisper-server, and vllm+glm-4-9b-int8 into one Docker container for unified deployment.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 * @returns {Promise<object>} Parsed translation response body
 * @throws {Error} When the server responds with a non-2xx status
 */
async function translateFile(filePath, model, responseFormat, temperature) {
    const formData = new FormData();
    // fetch's FormData cannot carry a Node read stream; send the file bytes
    // as a Blob with an explicit filename so the multipart part is valid.
    formData.append('file', new Blob([fs.readFileSync(filePath)]), path.basename(filePath));
    formData.append('model', model);
    formData.append('response_format', responseFormat);
    formData.append('temperature', temperature);

    const response = await fetch(`${process.env.TRANSLATION_API_BASE_URL}/v1/audio/translations`, {
        method: 'POST',
        body: formData,
    });
    if (!response.ok) {
        throw new Error(`Translation request failed: ${response.status} ${response.statusText}`);
    }

    const translation = await response.json();
    console.log('Translation Response:', translation);
    return translation;
}

/**
 * Send audio data over WebSocket for transcription.
 * Currently, the supported file type for transcription is PCM.
 * I am not sure if other types are supported.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 */
async function sendAudioOverWebSocket(filePath, model, language, responseFormat, temperature) {
    // Base URL comes from the environment like the HTTP endpoints do; the
    // previous hard-coded host is kept only as a fallback for compatibility.
    const baseUrl = process.env.TRANSCRIPTION_WS_BASE_URL || 'ws://100.105.162.69:8000';
    // Build the URL with URL/searchParams instead of string concatenation.
    const wsUrl = new URL('/v1/audio/transcriptions', baseUrl);
    wsUrl.searchParams.set('model', model);
    wsUrl.searchParams.set('language', language);
    wsUrl.searchParams.set('response_format', responseFormat);
    wsUrl.searchParams.set('temperature', temperature);
    const ws = new WebSocket(wsUrl.toString());

    ws.on('open', () => {
        // Stream the whole file as a single binary message.
        const audioBuffer = fs.readFileSync(filePath);
        ws.send(audioBuffer);
    });

    ws.on('message', (message) => {
        try {
            const response = JSON.parse(message);
            console.log('WebSocket Response:', response);
        } catch (error) {
            // Surface non-JSON frames instead of crashing the process.
            console.error('WebSocket message was not valid JSON:', error);
        }
    });

    ws.on('close', () => {
        console.log('WebSocket connection closed');
    });

    ws.on('error', (error) => {
        console.error('WebSocket error:', error);
    });
}

/**
 * Convert audio file to 16 kHz mono signed-16-bit-little-endian PCM.
 *
 * @param {string} filePath - Path to the audio file
 * @returns {Promise<string>} - Path to the converted PCM file
 * @throws {Error} When ffmpeg fails to convert the file
 */
async function convertToPcm(filePath) {
    // Swap the extension on the basename only. A naive
    // filePath.replace(path.extname(filePath), '.pcm') replaces the FIRST
    // occurrence of the extension text anywhere in the path (and mangles
    // extension-less paths), so build the output path structurally instead.
    const { dir, name } = path.parse(filePath);
    const pcmFilePath = path.format({ dir, name, ext: '.pcm' });

    await new Promise((resolve, reject) => {
        ffmpeg(filePath)
            .audioChannels(1)          // mono
            .audioFrequency(16000)     // 16 kHz sample rate
            .audioCodec('pcm_s16le')   // raw signed 16-bit little-endian samples
            .toFormat('s16le')         // headerless container
            .on('end', () => {
                console.log(`Audio file successfully converted to PCM: ${pcmFilePath}`);
                resolve(pcmFilePath);
            })
            .on('error', (error) => {
                console.error(`Error converting audio to PCM: ${error.message}`);
                reject(error);
            })
            .save(pcmFilePath);
    });

    return pcmFilePath;
}

/**
 * Demo driver: exercises the HTTP transcription/translation endpoints and the
 * WebSocket transcription endpoint against a single audio file.
 */
async function main() {
    const model = 'Systran/faster-whisper-large-v3';
    const language = 'en';
    const responseFormat = 'json';
    const temperature = '0';
    const filePath = './path/to/your/audio.webm';  // Replace with the actual file path

    // The HTTP endpoints accept container formats (wav/mp3/webm — see the
    // docs on transcribeFile/translateFile), so send the original file there;
    // only the WebSocket endpoint requires raw PCM.
    await transcribeFile(filePath, model, language, responseFormat, temperature);
    await translateFile(filePath, model, responseFormat, temperature);

    // Convert to raw PCM for the WebSocket endpoint and stream it.
    const pcmFilePath = await convertToPcm(filePath);
    await sendAudioOverWebSocket(pcmFilePath, model, language, responseFormat, temperature);
}

// Make sure to use ffmpeg version 7 or above. The default apt-get install only installs version 4.x. Also, Ubuntu 22.04 or above is required to support version 7.x.
main().catch((error) => {
    console.error(error);
});

// Project URL: https://github.com/Gan-Xing/whisper