const express = require('express');
const fs = require('fs');
const fsp = fs.promises;
const path = require('path');
const crypto = require('crypto');
const { spawn } = require('child_process');
const fetch = require('node-fetch');
const { v4: uuidv4 } = require('uuid');
const cors = require('cors');
const { generateImage } = require('./image.js');

const app = express();
app.use(express.json()); // To parse JSON payloads
app.use(cors()); // Enable CORS for all routes

// Load .env before any environment variable is read below.
require('dotenv').config();

const MEDIA_FOLDER = `public/media`;
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;

/**
 * Create `dir` (and any missing parents) if it does not exist yet.
 *
 * @param {string} dir - Directory path to create.
 * @returns {Promise<void>}
 * @throws Any `mkdir` error other than `EEXIST` is re-thrown.
 */
async function ensureDir(dir) {
  try {
    await fsp.mkdir(dir, { recursive: true });
  } catch (err) {
    if (err.code !== 'EEXIST') throw err;
  }
}

// Ensure the MEDIA_FOLDER directory exists. The promise is given an explicit
// rejection handler so a failure is reported instead of becoming an unhandled
// rejection; later file writes will surface the problem per request.
ensureDir(MEDIA_FOLDER).catch((err) => {
  console.error(`Could not create media folder "${MEDIA_FOLDER}":`, err);
});

const audioCache = {}; // { [scriptHash]: audioFilePath }

/**
 * Split a film-script style string into speaker/content segments.
 *
 * Segments are separated by one or more blank lines (LF or CRLF). Within a
 * segment, everything before the first ": " is the speaker name and the rest
 * is the spoken text, so later ": " occurrences stay part of the content.
 * Empty segments are skipped. A segment without any ": " is kept as content
 * attributed to the speaker name 'DEFAULT', so its text is not lost and no
 * segment is produced with empty content.
 *
 * @param {string} script - Script text, e.g. "ALICE: Hi\n\nBOB: Hello".
 * @returns {{speaker_name: string, content: string}[]} Segments in script order.
 * @throws {TypeError} If `script` is not a string (it has no `split` method).
 */
function parseScript(script) {
  const separator = ': ';
  const parsedSegments = [];

  for (const rawSegment of script.split(/\r?\n(?:[ \t]*\r?\n)+/)) {
    const segment = rawSegment.trim();
    if (segment === '') continue;

    const separatorIndex = segment.indexOf(separator);
    if (separatorIndex === -1) {
      parsedSegments.push({ speaker_name: 'DEFAULT', content: segment });
      continue;
    }

    parsedSegments.push({
      speaker_name: segment.slice(0, separatorIndex),
      content: segment.slice(separatorIndex + separator.length),
    });
  }

  return parsedSegments;
}

/**
 * Request speech audio for `text` from the OpenAI speech endpoint and write
 * the response body to `audioFilename`.
 *
 * @param {string} text - Text to synthesize (sent as `input`).
 * @param {string} audioFilename - Path the returned audio bytes are written to.
 * @param {string} voiceId - Voice identifier (sent as `voice`).
 * @param {string} [ttsModel='tts-1'] - Model identifier (sent as `model`).
 * @returns {Promise<void>}
 * @throws {Error} If OPENAI_API_KEY is not set, or if the HTTP response is not
 *   OK (the response text is included in the message).
 */
async function runOpenAITTS(text, audioFilename, voiceId, ttsModel = 'tts-1') {
  if (!OPENAI_API_KEY) {
    throw new Error('OPENAI_API_KEY is not set.');
  }

  const response = await fetch('https://api.openai.com/v1/audio/speech', {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${OPENAI_API_KEY}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: ttsModel,
      voice: voiceId,
      input: text,
    }),
  });

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`OpenAI TTS request failed: ${errorText}`);
  }

  // The body is stored verbatim. Callers name the file *.mp3, which assumes
  // the endpoint's default output format -- NOTE(review): confirm.
  const audioBuffer = Buffer.from(await response.arrayBuffer());
  await fsp.writeFile(audioFilename, audioBuffer);
}

// This supports all OpenAI voices with the tts-1 and tts-1-hd models.
// The voice name can be in OpenAI format or one of the aliases in voiceLookupTable below.
/**
 * Generate one mp3 file for a single utterance.
 *
 * The speaker name is matched case-insensitively, first against the OpenAI
 * voice names and then against the aliases; anything else (including a
 * missing or non-string name) falls back to the DEFAULT voice.
 *
 * @param {string} speakerName - OpenAI voice name or an alias such as 'ALICE'.
 * @param {string} content - Text to speak.
 * @param {string} [ttsModel='tts-1'] - Model passed through to runOpenAITTS.
 * @returns {Promise<string>} Path of the generated file inside MEDIA_FOLDER.
 */
async function generateAudio(speakerName, content, ttsModel = 'tts-1') {
  const voiceLookupTable = {
    DEFAULT: 'alloy',
    ALICE: 'shimmer',
    BOB: 'echo',
    JENNIFER: 'nova',
    PROFESSOR: 'fable',
    MALE_GUEST: 'onyx',
    FEMALE_GUEST: 'alloy',
  };
  const openaiVoices = ['alloy', 'shimmer', 'echo', 'nova', 'fable', 'onyx'];

  const requestedName = typeof speakerName === 'string' ? speakerName.trim() : '';
  const voiceName = requestedName.toLowerCase();
  const aliasName = requestedName.toUpperCase();

  let actualVoiceId = voiceLookupTable.DEFAULT;
  if (openaiVoices.includes(voiceName)) {
    actualVoiceId = voiceName;
  } else if (Object.prototype.hasOwnProperty.call(voiceLookupTable, aliasName)) {
    // Own-property check: a speaker called e.g. "constructor" must not pick up
    // an inherited Object.prototype member as its voice.
    actualVoiceId = voiceLookupTable[aliasName];
  }

  const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
  await runOpenAITTS(content, fileName, actualVoiceId, ttsModel);
  return fileName;
}

/**
 * Best-effort removal of intermediate audio files. Never rejects: a failed
 * unlink is logged and otherwise ignored.
 *
 * @param {string[]} filePaths - Files to delete.
 * @returns {Promise<void>}
 */
async function removeSegmentFiles(filePaths) {
  await Promise.all(
    filePaths.map((filePath) =>
      fsp.unlink(filePath).catch((err) => {
        console.warn(`Could not remove temporary audio file ${filePath}:`, err.message);
      })
    )
  );
}

/**
 * Turn a multi-speaker script into a single mp3 and send it on `res`.
 *
 * Responses: 400 for a missing or non-string script or when no audio was
 * produced, 500 on any unexpected error, otherwise the combined file with
 * Content-Type audio/mpeg. Results are cached in audioCache by the SHA-1 of
 * the script text, so a repeated script is served from the existing file.
 *
 * @param {string} [script] - Film-script style text; when undefined a short
 *   demo script is used.
 * @param {object} res - Express response object.
 * @returns {Promise<void>}
 */
async function generateSpeechFromScript(script = 'ALICE: Hello, world\n\nBOB: Hello, hamster', res) {
  try {
    // TODO: authenticate callers (API key check) before generating audio.

    if (!script) {
      res.status(400).send('Bad Request: Missing payload');
      return;
    }
    if (typeof script !== 'string') {
      // e.g. a JSON object/number in the POST body or a repeated query
      // parameter; hashing it would throw and surface as a 500.
      res.status(400).send('Bad Request: payload must be a string');
      return;
    }

    const scriptHash = crypto.createHash('sha1').update(script).digest('hex');

    if (audioCache[scriptHash]) {
      const filePath = audioCache[scriptHash];
      res.sendFile(path.resolve(filePath), { headers: { 'Content-Type': 'audio/mpeg' } });
      return;
    }

    const parsedSegments = parseScript(script);
    const audioSegments = [];
    let combinedAudioPath;

    try {
      // Segments are synthesized one after another, in script order.
      for (const segment of parsedSegments) {
        // A segment with nothing to say would only produce a failing TTS request.
        if (typeof segment.content !== 'string' || segment.content.trim() === '') {
          continue;
        }
        const audioPath = await generateAudio(segment.speaker_name, segment.content);
        audioSegments.push(audioPath);
      }

      if (audioSegments.length === 0) {
        res.status(400).send('No audio generated');
        return;
      }

      // Concatenate audio files into one using FFmpeg
      combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
      await concatenateAudioFiles(audioSegments, combinedAudioPath);
    } finally {
      // The per-segment files are only inputs to the combined file; remove
      // them whether or not concatenation succeeded so they do not pile up.
      await removeSegmentFiles(audioSegments);
    }

    audioCache[scriptHash] = combinedAudioPath;
    res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
  } catch (error) {
    console.error('Error generating speech:', error);
    res.status(500).send('Internal Server Error');
  }
}

/**
 * Join several audio files into `outputFilePath`.
 *
 * A single input is simply copied. Otherwise ffmpeg is run with the concat
 * protocol and stream copy:
 *   ffmpeg -i "concat:file1.mp3|file2.mp3" -acodec copy output.mp3
 *
 * @param {string[]} audioFiles - Input files, in playback order.
 * @param {string} outputFilePath - Destination file.
 * @returns {Promise<void>} Resolves when the output has been written.
 * @throws {Error} (as a rejection) if the list is empty, the copy fails,
 *   ffmpeg cannot be started, or ffmpeg exits with a non-zero code.
 */
async function concatenateAudioFiles(audioFiles, outputFilePath) {
  if (audioFiles.length === 0) {
    throw new Error('No audio files to concatenate');
  }

  if (audioFiles.length === 1) {
    // If only one audio file, copy it directly (without blocking the event loop)
    await fsp.copyFile(audioFiles[0], outputFilePath);
    return;
  }

  await new Promise((resolve, reject) => {
    const listContent = audioFiles.join('|');
    const ffmpeg = spawn('ffmpeg', [
      '-i',
      `concat:${listContent}`,
      '-acodec',
      'copy',
      outputFilePath,
    ]);

    ffmpeg.stdout.on('data', (data) => {
      console.log(`stdout: ${data}`);
    });

    ffmpeg.stderr.on('data', (data) => {
      console.error(`stderr: ${data}`);
    });

    // Without this listener a spawn failure (e.g. ffmpeg not installed) is an
    // unhandled 'error' event, which crashes the process.
    ffmpeg.on('error', (err) => {
      reject(err);
    });

    ffmpeg.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`FFmpeg failed with exit code ${code}`));
      }
    });
  });
}

// Payload should be film script style: speaker names in all caps and a blank line between them
// ALICE: Hi bob, how are you?
//
// BOB: Shitty. One of my coworkers put my hamster in the microwave thinking it was his lunch
// This is for multi-party TTS... For ordinary TTS call api/generate/utterance
// Route paths need the leading "/" -- without it Express never matches them.
app.get('/api/generate/speech', async (req, res) => {
  const { payload } = req.query;
  await generateSpeechFromScript(payload, res);
});

app.post('/api/generate/speech', async (req, res) => {
  // req.body may be undefined when no body parser handled the request.
  const payload = req.body ? req.body.payload : undefined;
  await generateSpeechFromScript(payload, res);
});

app.get('/api/hello', (req, res) => {
  res.status(200).json({ hello: 'world' });
});

// This is normal TTS: specify voice, text, model. Voices are from OpenAI; use those names or the aliases in the lookup table.
// NOTE(review): the api/generate/utterance route this comment refers to is not defined at this point in the file.

// Image generation parameters
// response_format: image | url
// prompt: the text prompt for generating the image. Length is not capped; however, most models only see the first 50 words (70 tokens)
// model: you can use any model that's on huggingface and has serverless inference enabled.
// Specify the model in vendor/modelId format, e.g. stabilityai/stable-diffusion-3-medium-diffusers
// width: in pixels, optional, defaults to 1024
// height: in pixels, optional, defaults to 1024
// Please note: individual models have different limits for width and height; some older models cap out at 512 or 768. If your image generation requests are failing, try lowering width and height.
// The GET version defaults to an image response format (it returns the binary content of the image directly).
// This enables the creation of HTML, markdown, and other types of hypertext documents with "self-generating images" that call this API declaratively, with no code needed.
// For example:
//