/**
 * Example provided by https://github.com/Gan-Xing in https://github.com/speaches-ai/speaches/issues/26
 */
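/**
 * A sample .env for the variables read below. The values are illustrative
 * placeholders (not from the original example); point the base URLs at your
 * own server:
 *
 *   FFMPEG_PATH=/usr/bin/ffmpeg
 *   TRANSCRIPTION_API_BASE_URL=http://localhost:8000
 *   TRANSLATION_API_BASE_URL=http://localhost:8000
 */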
import 'dotenv/config';
import fs from 'node:fs';
import path from 'node:path';
import WebSocket from 'ws';
import ffmpeg from 'fluent-ffmpeg';
const ffmpegPath = process.env.FFMPEG_PATH || '/usr/bin/ffmpeg';
ffmpeg.setFfmpegPath(ffmpegPath);
/**
 * Transcribe an audio file using the HTTP endpoint.
 * Supported file types include wav, mp3, webm, and other types supported by the OpenAI API.
 * I have tested with these three types.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 */
async function transcribeFile(filePath, model, language, responseFormat, temperature) {
  const formData = new FormData();
  // Node's built-in fetch/FormData (undici) does not accept Node streams,
  // so load the file into a Blob before appending it.
  formData.append('file', new Blob([fs.readFileSync(filePath)]), path.basename(filePath));
  formData.append('model', model);
  formData.append('language', language);
  formData.append('response_format', responseFormat);
  formData.append('temperature', temperature);
  const response = await fetch(`${process.env.TRANSCRIPTION_API_BASE_URL}/v1/audio/transcriptions`, {
    method: 'POST',
    body: formData,
  });
  // Assumes response_format is 'json'; use response.text() for the 'text' format.
  const transcription = await response.json();
  console.log('Transcription Response:', transcription);
}
/**
 * Translate an audio file using the HTTP endpoint.
 * Only translation into English is supported.
 * Currently, I am using GLM-4-9b-int8 to translate various voices.
 * I am not sure whether the author could add an endpoint for translation through a custom API + key.
 * I plan to package my frontend, fast-whisper-server, and vllm + glm-4-9b-int8 into one Docker container for unified deployment.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 */
async function translateFile(filePath, model, responseFormat, temperature) {
  const formData = new FormData();
  // Same as above: native FormData needs a Blob, not a stream.
  formData.append('file', new Blob([fs.readFileSync(filePath)]), path.basename(filePath));
  formData.append('model', model);
  formData.append('response_format', responseFormat);
  formData.append('temperature', temperature);
  const response = await fetch(`${process.env.TRANSLATION_API_BASE_URL}/v1/audio/translations`, {
    method: 'POST',
    body: formData,
  });
  const translation = await response.json();
  console.log('Translation Response:', translation);
}
/**
 * Send audio data over WebSocket for transcription.
 * Currently, the supported file type for transcription is PCM.
 * I am not sure if other types are supported.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 */
async function sendAudioOverWebSocket(filePath, model, language, responseFormat, temperature) {
  // Build the ws:// URL from the same base URL the HTTP endpoints use.
  const wsBase = process.env.TRANSCRIPTION_API_BASE_URL.replace(/^http/, 'ws');
  const wsUrl = `${wsBase}/v1/audio/transcriptions?model=${encodeURIComponent(model)}&language=${encodeURIComponent(language)}&response_format=${encodeURIComponent(responseFormat)}&temperature=${encodeURIComponent(temperature)}`;
  const ws = new WebSocket(wsUrl);
  ws.on('open', () => {
    const audioBuffer = fs.readFileSync(filePath);
    ws.send(audioBuffer);
  });
  ws.on('message', (message) => {
    const response = JSON.parse(message);
    console.log('WebSocket Response:', response);
  });
  ws.on('close', () => {
    console.log('WebSocket connection closed');
  });
  ws.on('error', (error) => {
    console.error('WebSocket error:', error);
  });
}
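/**
 * A hypothetical variant, not part of the original example: stream the PCM
 * file in fixed-size chunks instead of a single ws.send(), which is closer to
 * how live microphone audio would arrive. The chunk size is an assumption;
 * adjust it to match your server.
 *
 * @param {string} filePath - Path to the PCM audio file
 * @param {string} wsUrl - Fully-built WebSocket endpoint URL
 */
function streamAudioOverWebSocket(filePath, wsUrl) {
  const ws = new WebSocket(wsUrl);
  ws.on('open', () => {
    // 16-bit mono at 16 kHz => 32,000 bytes per second of audio.
    const stream = fs.createReadStream(filePath, { highWaterMark: 32000 });
    stream.on('data', (chunk) => ws.send(chunk));
    stream.on('end', () => console.log('Finished streaming audio'));
  });
  ws.on('message', (message) => console.log('Partial result:', JSON.parse(message)));
  ws.on('error', (error) => console.error('WebSocket error:', error));
}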
/**
 * Convert an audio file to raw PCM format (16 kHz, mono, 16-bit little-endian).
 *
 * @param {string} filePath - Path to the audio file
 * @returns {Promise<string>} - Path to the converted PCM file
 */
async function convertToPcm(filePath) {
  const pcmFilePath = filePath.replace(path.extname(filePath), '.pcm');
  await new Promise((resolve, reject) => {
    ffmpeg(filePath)
      .audioChannels(1)
      .audioFrequency(16000)
      .audioCodec('pcm_s16le')
      .toFormat('s16le')
      .on('end', () => {
        console.log(`Audio file successfully converted to PCM: ${pcmFilePath}`);
        resolve(pcmFilePath);
      })
      .on('error', (error) => {
        console.error(`Error converting audio to PCM: ${error.message}`);
        reject(error);
      })
      .save(pcmFilePath);
  });
  return pcmFilePath;
}
async function main() {
  const model = 'Systran/faster-whisper-large-v3';
  const language = 'en';
  const responseFormat = 'json';
  const temperature = '0';
  const filePath = './path/to/your/audio.webm'; // Replace with the actual file path
  // Transcribe the original file using the HTTP endpoint (wav/mp3/webm work here)
  await transcribeFile(filePath, model, language, responseFormat, temperature);
  // Translate the original file using the HTTP endpoint
  await translateFile(filePath, model, responseFormat, temperature);
  // Convert the audio file to PCM format, which the WebSocket endpoint expects
  const pcmFilePath = await convertToPcm(filePath);
  // Transcribe the PCM file using the WebSocket endpoint
  await sendAudioOverWebSocket(pcmFilePath, model, language, responseFormat, temperature);
}
// Make sure to use ffmpeg version 7 or above. The default apt-get install only installs version 4.x. Also, Ubuntu 22.04 or above is required to support version 7.x.
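/**
 * Optional sanity check, added here as a sketch (not part of the original
 * example): verify the ffmpeg major version before running. Parsing the
 * `ffmpeg -version` output assumes its first line contains
 * "ffmpeg version <major>...".
 */
import { execSync } from 'node:child_process'; // hoisted to the top of the module
function assertFfmpegVersion(minMajor = 7) {
  const firstLine = execSync(`${ffmpegPath} -version`).toString().split('\n')[0];
  const match = firstLine.match(/ffmpeg version (\d+)/);
  if (!match || Number(match[1]) < minMajor) {
    throw new Error(`ffmpeg >= ${minMajor}.x is required, found: ${firstLine}`);
  }
}
// assertFfmpegVersion(); // uncomment to fail fast on an older ffmpeg build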
main().catch(console.error);
// Project URL: https://github.com/Gan-Xing/whisper