| | const axios = require('axios'); |
| | const { logger } = require('@librechat/data-schemas'); |
| | const { HttpsProxyAgent } = require('https-proxy-agent'); |
| | const { genAzureEndpoint, logAxiosError } = require('@librechat/api'); |
| | const { extractEnvVariable, TTSProviders } = require('librechat-data-provider'); |
| | const { getRandomVoiceId, createChunkProcessor, splitTextIntoChunks } = require('./streamAudio'); |
| | const { getAppConfig } = require('~/server/services/Config'); |
| |
|
| | |
| | |
| | |
| | |
/**
 * Text-to-speech service.
 *
 * Resolves the single TTS provider enabled in the app config, builds the
 * provider-specific HTTP request, and streams the returned audio to the
 * HTTP response.
 */
class TTSService {
  /**
   * Registers one request-builder strategy per supported provider.
   * Each strategy returns a `[url, data, headers]` tuple.
   */
  constructor() {
    this.providerStrategies = {
      [TTSProviders.OPENAI]: this.openAIProvider.bind(this),
      [TTSProviders.AZURE_OPENAI]: this.azureOpenAIProvider.bind(this),
      [TTSProviders.ELEVENLABS]: this.elevenLabsProvider.bind(this),
      [TTSProviders.LOCALAI]: this.localAIProvider.bind(this),
    };
  }

  /**
   * Creates a new service instance.
   * @returns {Promise<TTSService>} A freshly constructed service.
   */
  static async getInstance() {
    return new TTSService();
  }

  /**
   * Returns the name of the only provider that has a non-empty configuration
   * under `appConfig.speech.tts`.
   * @param {object} appConfig - Application config.
   * @returns {string} The configured provider key.
   * @throws {Error} If no TTS schema exists, or if zero or several providers are configured.
   */
  getProvider(appConfig) {
    const ttsSchema = appConfig?.speech?.tts;
    if (!ttsSchema) {
      throw new Error(
        'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?',
      );
    }

    // A provider key left empty in the config may be null/undefined; treat it
    // as "not configured" instead of crashing inside Object.keys().
    const providers = Object.entries(ttsSchema).filter(
      ([, value]) => value != null && Object.keys(value).length > 0,
    );

    if (providers.length !== 1) {
      throw new Error(
        providers.length > 1
          ? 'Multiple providers are set. Please set only one provider.'
          : 'No provider is set. Please set a provider.',
      );
    }
    return providers[0][0];
  }

  /**
   * Chooses the voice to use for a request.
   *
   * The requested voice is kept when it is one of the configured voices;
   * otherwise a voice is drawn with `getRandomVoiceId` from the configured
   * voices (the 'ALL' wildcard entry is never a candidate). When the provider
   * has no `voices` array at all, the requested voice is returned untouched.
   * @param {object} providerSchema - Configuration of the selected provider.
   * @param {string} [requestVoice] - Voice requested by the client.
   * @returns {Promise<string|undefined>} The voice to use.
   */
  async getVoice(providerSchema, requestVoice) {
    const configured = providerSchema?.voices;
    if (!Array.isArray(configured)) {
      return requestVoice;
    }

    const voices = configured.filter(
      (entry) => typeof entry === 'string' && entry !== '' && entry.toUpperCase() !== 'ALL',
    );

    // 'ALL' is filtered out above, so a requested 'ALL' also falls through to
    // the random pick.
    if (!requestVoice || !voices.includes(requestVoice)) {
      return getRandomVoiceId(voices);
    }
    return requestVoice;
  }

  /**
   * Recursively deletes properties whose value is `undefined`, and nested
   * objects that end up empty. Mutates `obj` in place.
   * @param {object} obj - Object to clean.
   */
  removeUndefined(obj) {
    Object.keys(obj).forEach((key) => {
      if (obj[key] && typeof obj[key] === 'object') {
        this.removeUndefined(obj[key]);
        if (Object.keys(obj[key]).length === 0) {
          delete obj[key];
        }
      } else if (obj[key] === undefined) {
        delete obj[key];
      }
    });
  }

  /**
   * Throws when the provider lists voices and `voice` is neither one of them
   * nor covered by the 'ALL' wildcard. Does nothing when no voices are listed.
   * @param {object} ttsSchema - Provider configuration.
   * @param {string} voice - Voice to validate.
   * @throws {Error} If the voice is not available.
   */
  assertVoiceAvailable(ttsSchema, voice) {
    const voices = ttsSchema?.voices;
    if (voices && voices.length > 0 && !voices.includes(voice) && !voices.includes('ALL')) {
      throw new Error(`Voice ${voice} is not available.`);
    }
  }

  /**
   * Builds the request for the OpenAI speech endpoint.
   * @param {object} ttsSchema - Provider configuration.
   * @param {string} input - Text to synthesize.
   * @param {string} voice - Voice to use.
   * @returns {[string, object, object]} `[url, data, headers]`.
   * @throws {Error} If the voice is not available.
   */
  openAIProvider(ttsSchema, input, voice) {
    const url = ttsSchema?.url || 'https://api.openai.com/v1/audio/speech';

    this.assertVoiceAvailable(ttsSchema, voice);

    const hasVoices = Boolean(ttsSchema?.voices && ttsSchema.voices.length > 0);
    const data = {
      input,
      model: ttsSchema?.model,
      voice: hasVoices ? voice : undefined,
      backend: ttsSchema?.backend,
    };

    const headers = {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${extractEnvVariable(ttsSchema?.apiKey)}`,
    };

    return [url, data, headers];
  }

  /**
   * Builds the request for an Azure OpenAI speech deployment.
   * @param {object} ttsSchema - Provider configuration.
   * @param {string} input - Text to synthesize.
   * @param {string} voice - Voice to use.
   * @returns {[string, object, object]} `[url, data, headers]`.
   * @throws {Error} If the voice is not available.
   */
  azureOpenAIProvider(ttsSchema, input, voice) {
    const endpoint = genAzureEndpoint({
      azureOpenAIApiInstanceName: extractEnvVariable(ttsSchema?.instanceName),
      azureOpenAIApiDeploymentName: extractEnvVariable(ttsSchema?.deploymentName),
    });
    const url = `${endpoint}/audio/speech?api-version=${extractEnvVariable(ttsSchema?.apiVersion)}`;

    this.assertVoiceAvailable(ttsSchema, voice);

    const hasVoices = Boolean(ttsSchema?.voices && ttsSchema.voices.length > 0);
    const data = {
      model: extractEnvVariable(ttsSchema?.model),
      input,
      voice: hasVoices ? voice : undefined,
    };

    const headers = {
      'Content-Type': 'application/json',
      'api-key': ttsSchema?.apiKey ? extractEnvVariable(ttsSchema.apiKey) : '',
    };

    return [url, data, headers];
  }

  /**
   * Builds the request for the ElevenLabs text-to-speech endpoint.
   * Unlike the other providers, the voice must always be listed (or 'ALL'
   * must be present), because the default URL embeds the voice.
   * @param {object} ttsSchema - Provider configuration.
   * @param {string} input - Text to synthesize.
   * @param {string} voice - Voice id to use.
   * @param {boolean} [stream] - Appends `/stream` to the default URL when truthy.
   * @returns {[string, object, object]} `[url, data, headers]`.
   * @throws {Error} If the voice is not available.
   */
  elevenLabsProvider(ttsSchema, input, voice, stream) {
    const url =
      ttsSchema?.url ||
      `https://api.elevenlabs.io/v1/text-to-speech/${voice}${stream ? '/stream' : ''}`;

    // `voices?.` keeps a missing voices list on the "not available" error path
    // rather than raising a TypeError.
    const voices = ttsSchema?.voices;
    if (!voices?.includes(voice) && !voices?.includes('ALL')) {
      throw new Error(`Voice ${voice} is not available.`);
    }

    const voiceSettings = ttsSchema?.voice_settings;
    const data = {
      model_id: ttsSchema?.model,
      text: input,
      voice_settings: {
        similarity_boost: voiceSettings?.similarity_boost,
        stability: voiceSettings?.stability,
        style: voiceSettings?.style,
        use_speaker_boost: voiceSettings?.use_speaker_boost,
      },
      pronunciation_dictionary_locators: ttsSchema?.pronunciation_dictionary_locators,
    };

    const headers = {
      'Content-Type': 'application/json',
      'xi-api-key': extractEnvVariable(ttsSchema?.apiKey),
      Accept: 'audio/mpeg',
    };

    return [url, data, headers];
  }

  /**
   * Builds the request for a LocalAI endpoint. The selected voice is sent in
   * the `model` field of the payload.
   * @param {object} ttsSchema - Provider configuration.
   * @param {string} input - Text to synthesize.
   * @param {string} voice - Voice to use.
   * @returns {[string, object, object]} `[url, data, headers]`.
   * @throws {Error} If the voice is not available.
   */
  localAIProvider(ttsSchema, input, voice) {
    const url = ttsSchema?.url;

    this.assertVoiceAvailable(ttsSchema, voice);

    const hasVoices = Boolean(ttsSchema?.voices && ttsSchema.voices.length > 0);
    const data = {
      input,
      model: hasVoices ? voice : undefined,
      backend: ttsSchema?.backend,
    };

    const headers = {
      'Content-Type': 'application/json',
    };

    // Only authenticate when a key actually resolves; an empty or missing key
    // must not produce a bogus `Bearer` header.
    const apiKey = extractEnvVariable(ttsSchema?.apiKey);
    if (apiKey) {
      headers.Authorization = `Bearer ${apiKey}`;
    }

    return [url, data, headers];
  }

  /**
   * Sends the TTS request for the given provider.
   * @param {string} provider - Provider key registered in `providerStrategies`.
   * @param {object} ttsSchema - Provider configuration.
   * @param {object} options - Request options.
   * @param {string} options.input - Text to synthesize.
   * @param {string} [options.voice] - Voice to use.
   * @param {boolean} [options.stream=true] - Request a stream (`true`) or an array buffer (`false`).
   * @returns {Promise<object>} The axios response.
   * @throws {Error} If the provider is unknown or the HTTP request fails.
   */
  async ttsRequest(provider, ttsSchema, { input, voice, stream = true }) {
    const strategy = this.providerStrategies[provider];
    if (!strategy) {
      throw new Error('Invalid provider');
    }

    const [url, data, headers] = strategy.call(this, ttsSchema, input, voice, stream);

    this.removeUndefined(data);
    this.removeUndefined(headers);

    const options = { headers, responseType: stream ? 'stream' : 'arraybuffer' };

    if (process.env.PROXY) {
      options.httpsAgent = new HttpsProxyAgent(process.env.PROXY);
    }

    try {
      return await axios.post(url, data, options);
    } catch (error) {
      logAxiosError({ message: `TTS request failed for provider ${provider}:`, error });
      throw error;
    }
  }

  /**
   * Pipes one audio stream into the response and settles when it is done.
   *
   * Resolves `true` once the source has ended, resolves `false` when the
   * response is (or becomes) closed before the source ends, and rejects when
   * the source emits an error, so a broken upstream can neither crash the
   * process through an unhandled 'error' event nor leave the caller awaiting
   * forever.
   * @param {object} source - Readable audio stream.
   * @param {object} res - Writable HTTP response.
   * @param {boolean} [end] - Forwarded as the `end` option of `pipe()`.
   * @returns {Promise<boolean>} Whether the source was fully delivered.
   */
  pipeAudio(source, res, end) {
    return new Promise((resolve, reject) => {
      if (res.destroyed || res.writableEnded) {
        source.destroy?.();
        resolve(false);
        return;
      }

      const detach = () => {
        source.removeListener('end', onEnd);
        source.removeListener('error', onError);
        res.removeListener('close', onClose);
      };
      const onEnd = () => {
        detach();
        resolve(true);
      };
      const onError = (error) => {
        detach();
        source.unpipe?.(res);
        reject(error);
      };
      const onClose = () => {
        detach();
        // The client is gone: stop pulling audio from the provider.
        source.destroy?.();
        resolve(false);
      };

      source.pipe(res, { end });
      source.once('end', onEnd);
      source.once('error', onError);
      res.once('close', onClose);
    });
  }

  /**
   * Terminates a response after a failure: answers with status 500 when
   * nothing has been sent yet, otherwise just ends the partially written
   * response so the client is not left waiting.
   * @param {object} res - HTTP response.
   * @param {string} [body] - Optional body for the 500 response.
   */
  endFailedResponse(res, body) {
    if (!res.headersSent) {
      const failed = res.status(500);
      if (body === undefined) {
        failed.end();
      } else {
        failed.send(body);
      }
      return;
    }
    if (!res.writableEnded && !res.destroyed) {
      res.end();
    }
  }

  /**
   * Converts `req.body.input` to speech and streams the audio to `res`.
   * Inputs shorter than 4096 characters are sent in one request; longer
   * inputs are split with `splitTextIntoChunks(input, 1000)` and streamed
   * chunk by chunk.
   * @param {object} req - HTTP request; reads `body.input`, `body.voice`, `config`, `user`.
   * @param {object} res - HTTP response.
   * @returns {Promise<void>}
   */
  async processTextToSpeech(req, res) {
    const { input, voice: requestVoice } = req.body ?? {};

    if (!input) {
      return res.status(400).send('Missing text in request body');
    }

    try {
      const appConfig =
        req.config ??
        (await getAppConfig({
          role: req.user?.role,
        }));
      res.setHeader('Content-Type', 'audio/mpeg');
      const provider = this.getProvider(appConfig);
      const ttsSchema = appConfig?.speech?.tts?.[provider];
      const voice = await this.getVoice(ttsSchema, requestVoice);

      if (input.length < 4096) {
        const response = await this.ttsRequest(provider, ttsSchema, { input, voice });
        await this.pipeAudio(response.data, res, true);
        return;
      }

      const textChunks = splitTextIntoChunks(input, 1000);

      for (const chunk of textChunks) {
        try {
          const response = await this.ttsRequest(provider, ttsSchema, {
            voice,
            input: chunk.text,
            stream: true,
          });

          logger.debug(`[textToSpeech] user: ${req?.user?.id} | writing audio stream`);
          const delivered = await this.pipeAudio(response.data, res, chunk.isFinished);

          if (!delivered || chunk.isFinished) {
            break;
          }
        } catch (innerError) {
          logAxiosError({
            message: `[TTS] Error processing manual update for chunk: ${chunk?.text?.substring(0, 50)}...`,
            error: innerError,
          });
          this.endFailedResponse(res);
          return;
        }
      }

      // Check `writableEnded`, not `headersSent`: audio may already have been
      // written without the response having been ended.
      if (!res.writableEnded && !res.destroyed) {
        res.end();
      }
    } catch (error) {
      logAxiosError({ message: '[TTS] Error creating the audio stream:', error });
      this.endFailedResponse(res, 'An error occurred');
    }
  }

  /**
   * Streams audio for a message: repeatedly asks the chunk processor for new
   * text updates, converts each update to speech and pipes it to `res`, until
   * an update is flagged `isFinished` or the client closes the request.
   * @param {object} req - HTTP request; reads `body.voice`, `body.messageId`, `config`, `user`.
   * @param {object} res - HTTP response.
   * @returns {Promise<void>}
   */
  async streamAudio(req, res) {
    let shouldContinue = true;

    try {
      // Setup runs inside the try so that a configuration error yields a 500
      // instead of an unhandled rejection.
      res.setHeader('Content-Type', 'audio/mpeg');
      const appConfig =
        req.config ??
        (await getAppConfig({
          role: req.user?.role,
        }));
      const provider = this.getProvider(appConfig);
      const ttsSchema = appConfig?.speech?.tts?.[provider];
      const voice = await this.getVoice(ttsSchema, req.body?.voice);

      req.on('close', () => {
        logger.warn('[streamAudio] Audio Stream Request closed by client');
        shouldContinue = false;
      });

      const processChunks = createChunkProcessor(req.user.id, req.body.messageId);

      while (shouldContinue) {
        const updates = await processChunks();
        // The chunk processor reports failures as a string.
        if (typeof updates === 'string') {
          logger.error(`Error processing audio stream updates: ${updates}`);
          this.endFailedResponse(res);
          return;
        }

        if (updates.length === 0) {
          // Nothing new yet: wait before polling again.
          await new Promise((resolve) => setTimeout(resolve, 1250));
          continue;
        }

        for (const update of updates) {
          try {
            const response = await this.ttsRequest(provider, ttsSchema, {
              voice,
              input: update.text,
              stream: true,
            });

            if (!shouldContinue) {
              // Client left while the request was in flight; drop the audio.
              response.data?.destroy?.();
              break;
            }

            logger.debug(`[streamAudio] user: ${req?.user?.id} | writing audio stream`);
            const delivered = await this.pipeAudio(response.data, res, update.isFinished);

            if (!delivered || update.isFinished) {
              shouldContinue = false;
              break;
            }
          } catch (innerError) {
            logAxiosError({
              message: `[TTS] Error processing audio stream update: ${update?.text?.substring(0, 50)}...`,
              error: innerError,
            });
            this.endFailedResponse(res);
            return;
          }
        }
      }

      if (!res.writableEnded && !res.destroyed) {
        res.end();
      }
    } catch (error) {
      logAxiosError({ message: '[TTS] Failed to fetch audio:', error });
      this.endFailedResponse(res);
    }
  }
}
| |
|
| | |
| | |
| | |
| | |
| | |
/**
 * Factory for the TTS service; defers to `TTSService.getInstance()`.
 * @returns {Promise<TTSService>} The service instance.
 */
async function createTTSService() {
  const service = await TTSService.getInstance();
  return service;
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Route-level entry point: creates a TTS service and lets it convert the
 * request's text to speech, writing the result to the response.
 * @param {object} req - HTTP request.
 * @param {object} res - HTTP response.
 * @returns {Promise<void>}
 */
async function textToSpeech(req, res) {
  const service = await createTTSService();
  await service.processTextToSpeech(req, res);
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Route-level entry point: creates a TTS service and lets it stream audio
 * for the request to the response.
 * @param {object} req - HTTP request.
 * @param {object} res - HTTP response.
 * @returns {Promise<void>}
 */
async function streamAudio(req, res) {
  const service = await createTTSService();
  await service.streamAudio(req, res);
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Resolves the configured TTS provider by delegating to
 * `TTSService#getProvider`.
 * @param {object} appConfig - Application config.
 * @returns {Promise<string>} The provider key.
 */
async function getProvider(appConfig) {
  const service = await createTTSService();
  return service.getProvider(appConfig);
}
| |
|
// Public API of this module: the two route handlers and the provider lookup.
module.exports = { textToSpeech, streamAudio, getProvider };
| |
|