Spaces:
Running
Running
DeFactOfficial
committed on
Commit
•
ae8262b
1
Parent(s):
5a69576
Cleanup TTS API and add utterance route for simple one-party TTS
Browse files
api.js
CHANGED
@@ -76,7 +76,9 @@ async function runOpenAITTS(text, audioFilename, voiceId, ttsModel='tts-1') {
|
|
76 |
await fsp.writeFile(audioFilename, buffer);
|
77 |
}
|
78 |
|
79 |
-
|
|
|
|
|
80 |
const voiceLookupTable = {
|
81 |
DEFAULT: 'alloy',
|
82 |
ALICE: 'shimmer',
|
@@ -86,14 +88,64 @@ async function generateAudio(speakerName, content) {
|
|
86 |
MALE_GUEST: 'onyx',
|
87 |
FEMALE_GUEST: 'alloy',
|
88 |
};
|
89 |
-
|
90 |
-
|
|
|
91 |
const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
|
92 |
|
93 |
-
await runOpenAITTS(content, fileName, actualVoiceId,
|
94 |
return fileName;
|
95 |
}
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
function concatenateAudioFiles(audioFiles, outputFilePath) {
|
98 |
return new Promise((resolve, reject) => {
|
99 |
if (audioFiles.length === 1) {
|
@@ -134,64 +186,44 @@ function concatenateAudioFiles(audioFiles, outputFilePath) {
|
|
134 |
});
|
135 |
}
|
136 |
|
137 |
-
//
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
if (apiKey !== 'their_api_key') {
|
148 |
-
// Replace "their_api_key" with your actual method of managing API keys
|
149 |
-
res.status(401).send('Unauthorized');
|
150 |
-
return;
|
151 |
-
}
|
152 |
|
153 |
-
const script = req.query.payload;
|
154 |
-
if (!script) {
|
155 |
-
res.status(400).send('Bad Request: Missing payload');
|
156 |
-
return;
|
157 |
-
}
|
158 |
|
159 |
-
const hash = crypto.createHash('sha1');
|
160 |
-
hash.update(script);
|
161 |
-
const scriptHash = hash.digest('hex');
|
162 |
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
}
|
168 |
|
169 |
-
|
170 |
-
|
|
|
171 |
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
}
|
176 |
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
}
|
181 |
|
182 |
-
// Concatenate audio files into one using FFmpeg
|
183 |
-
const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
|
184 |
-
await concatenateAudioFiles(audioSegments, combinedAudioPath);
|
185 |
|
186 |
-
audioCache[scriptHash] = combinedAudioPath;
|
187 |
-
res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
|
188 |
-
} catch (error) {
|
189 |
-
console.error('Error generating speech:', error);
|
190 |
-
res.status(500).send('Internal Server Error');
|
191 |
-
}
|
192 |
-
});
|
193 |
|
194 |
-
//
|
|
|
195 |
app.post('/api/generate/speech/stream', async (req, res) => {
|
196 |
try {
|
197 |
const apiKey = req.query.api_key || 'their_api_key';
|
|
|
76 |
await fsp.writeFile(audioFilename, buffer);
|
77 |
}
|
78 |
|
79 |
+
//this supports all openai voices with tts-1 and tts-1-hd models
|
80 |
+
//voice name can be in openai format or one of the aliases in voiceLookupTable below
|
81 |
+
async function generateAudio(speakerName, content, ttsModel="tts-1") {
|
82 |
const voiceLookupTable = {
|
83 |
DEFAULT: 'alloy',
|
84 |
ALICE: 'shimmer',
|
|
|
88 |
MALE_GUEST: 'onyx',
|
89 |
FEMALE_GUEST: 'alloy',
|
90 |
};
|
91 |
+
const openaiVoices = ['alloy', 'shimmer', 'echo', 'nova', 'fable', 'onyx']
|
92 |
+
|
93 |
+
const actualVoiceId = openaiVoices.indexOf(speakerName) > -1 ? speakerName : (voiceLookupTable[speakerName] || voiceLookupTable['DEFAULT']);
|
94 |
const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
|
95 |
|
96 |
+
await runOpenAITTS(content, fileName, actualVoiceId, ttsModel);
|
97 |
return fileName;
|
98 |
}
|
99 |
|
100 |
+
/**
 * Renders a multi-speaker script to one MP3 file and sends it on `res`.
 *
 * The script is split with parseScript(), every segment is voiced with
 * generateAudio(), and the segment files are joined with concatenateAudioFiles().
 * The combined file path is remembered in audioCache under the SHA-1 of the script
 * text, so a repeated request for the same script is answered from that cache.
 *
 * Responses written to `res`:
 *   - 400 'Bad Request: Missing payload' when the script is empty or not a string
 *   - 400 'No audio generated' when no segment produced audio
 *   - 500 'Internal Server Error' when anything throws along the way
 *   - otherwise the MP3 file with Content-Type audio/mpeg
 *
 * @param {string} [script] Script text; when omitted (undefined) a two-line demo
 *   script is used.
 * @param {object} res Express-style response; status(), send() and sendFile() are used.
 * @returns {Promise<void>} Resolves once a response has been issued (or logged as failed).
 */
async function generateSpeechFromScript(script="ALICE: Hello, world\n\nBOB: Hello, hamster", res) {
  try {
    // TODO: authenticate the caller here (respond 401 'Unauthorized') once there is
    // a real mechanism for managing API keys.

    // Reject empty AND non-string payloads up front. hash.update() below throws on
    // values such as arrays (repeated query parameter) or objects (JSON body), which
    // would otherwise be reported to the client as a 500 instead of a 400.
    if (typeof script !== 'string' || script.length === 0) {
      res.status(400).send('Bad Request: Missing payload');
      return;
    }

    const scriptHash = crypto.createHash('sha1').update(script).digest('hex');

    const cachedPath = audioCache[scriptHash];
    if (cachedPath) {
      res.sendFile(path.resolve(cachedPath), { headers: { 'Content-Type': 'audio/mpeg' } });
      return;
    }

    // Segments are generated one after another, in script order, so the
    // concatenated output keeps the order of the dialogue.
    const audioSegments = [];
    for (const segment of parseScript(script)) {
      const audioPath = await generateAudio(segment.speaker_name, segment.content);
      audioSegments.push(audioPath);
    }

    if (audioSegments.length === 0) {
      res.status(400).send('No audio generated');
      return;
    }

    // Concatenate audio files into one using FFmpeg
    const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
    await concatenateAudioFiles(audioSegments, combinedAudioPath);

    audioCache[scriptHash] = combinedAudioPath;
    res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
  } catch (error) {
    console.error('Error generating speech:', error);
    // Only attempt the 500 when there is a response object that has not started
    // sending yet; otherwise this handler would throw a second error of its own.
    if (res && !res.headersSent) {
      res.status(500).send('Internal Server Error');
    }
  }
}
|
148 |
+
|
149 |
function concatenateAudioFiles(audioFiles, outputFilePath) {
|
150 |
return new Promise((resolve, reject) => {
|
151 |
if (audioFiles.length === 1) {
|
|
|
186 |
});
|
187 |
}
|
188 |
|
189 |
+
// Payload should be film-script style: speaker names in all caps, with a blank line between speakers
|
190 |
+
// ALICE: Hi Bob, how are you?
|
191 |
+
//
|
192 |
+
// BOB: Shitty. One of my coworkers put my hamster in the microwave thinking it was his lunch
|
193 |
+
// This is for multi-party TTS. For ordinary single-voice TTS, call /api/generate/utterance
|
194 |
+
// GET variant of multi-party TTS: the script is read from the `payload` query parameter.
// The path needs its leading slash to match incoming request URLs, and `res` must be
// handed over because generateSpeechFromScript writes the response itself.
app.get('/api/generate/speech', async (req, res) => {
  const { payload } = req.query;
  await generateSpeechFromScript(payload, res);
});
|
198 |
|
199 |
+
// POST variant of multi-party TTS: the script is read from the `payload` body field.
// The path needs its leading slash to match incoming request URLs, and `res` must be
// handed over because generateSpeechFromScript writes the response itself.
app.post('/api/generate/speech', async (req, res) => {
  // req.body is undefined when no body parser populated it; fall back to an empty
  // object so the destructure cannot throw.
  // NOTE(review): confirm a JSON/urlencoded body parser is registered on `app`.
  const { payload } = req.body || {};
  await generateSpeechFromScript(payload, res);
});
|
|
|
|
|
|
|
|
|
|
|
203 |
|
|
|
|
|
|
|
|
|
|
|
204 |
|
|
|
|
|
|
|
205 |
|
206 |
+
// This is normal TTS: specify voice, text, model. Voices are from openai, use those names or the aliases in lookup table
|
207 |
+
// GET variant of single-voice TTS. Query parameters:
//   voice - passed to generateAudio as the speaker name
//   text  - the text to speak (required)
//   model - TTS model name; "tts-1" is used when absent
app.get('/api/generate/utterance', async (req, res) => {
  const { voice, text, model } = req.query;

  if (typeof text !== 'string' || text.length === 0) {
    res.status(400).send('Bad Request: Missing text');
    return;
  }

  try {
    const outputFilename = await generateAudio(voice, text, model || "tts-1");

    // We want the browser to cache this response, because there's no reason to TTS
    // the same text-voice-model combination more than once (8640000 s = 100 days).
    res.sendFile(path.resolve(outputFilename), {
      headers: { 'Content-Type': 'audio/mpeg', 'Cache-Control': 'max-age=8640000' },
    });
  } catch (error) {
    // Without this, a rejected TTS call leaves the request without any response.
    console.error('Error generating utterance:', error);
    if (!res.headersSent) {
      res.status(500).send('Internal Server Error');
    }
  }
});
|
214 |
|
215 |
+
// POST variant of single-voice TTS. Body fields:
//   voice - passed to generateAudio as the speaker name
//   text  - the text to speak (required)
//   model - TTS model name; "tts-1" is used when absent
app.post('/api/generate/utterance', async (req, res) => {
  // req.body is undefined when no body parser populated it; fall back to an empty
  // object so the destructure cannot throw.
  // NOTE(review): confirm a JSON/urlencoded body parser is registered on `app`.
  const { voice, text, model } = req.body || {};

  if (typeof text !== 'string' || text.length === 0) {
    res.status(400).send('Bad Request: Missing text');
    return;
  }

  try {
    const outputFilename = await generateAudio(voice, text, model || "tts-1");

    // We want the browser to cache this response, because there's no reason to TTS
    // the same text-voice-model combination more than once (8640000 s = 100 days).
    res.sendFile(path.resolve(outputFilename), {
      headers: { 'Content-Type': 'audio/mpeg', 'Cache-Control': 'max-age=8640000' },
    });
  } catch (error) {
    // Without this, a rejected TTS call leaves the request without any response.
    console.error('Error generating utterance:', error);
    if (!res.headersSent) {
      res.status(500).send('Internal Server Error');
    }
  }
});
|
|
|
222 |
|
|
|
|
|
|
|
223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
|
225 |
+
// This returns a stream of SSE (application/event-stream) similar to a streaming response from an LLM
|
226 |
+
// See example in public/client for how to consume the stream
|
227 |
app.post('/api/generate/speech/stream', async (req, res) => {
|
228 |
try {
|
229 |
const apiKey = req.query.api_key || 'their_api_key';
|