Yhhxhfh committed on
Commit
056b100
verified
1 Parent(s): 0b6568c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -74
app.py CHANGED
@@ -77,7 +77,7 @@ def detect_audio_language(audio_path):
77
  language = detect(f.read())
78
  os.remove(temp_filepath)
79
  return language
80
- except:
81
  return None
82
 
83
  def split_text_into_chunks(text, max_chunk_length=200):
@@ -121,12 +121,11 @@ async def generate_music_with_voice(description, melody_audio, voice_audio, dura
121
 
122
  music_filename = await save_audio_to_storage(wav_music[0].cpu(), "music_" + str(uuid.uuid4()) + ".wav")
123
 
124
-
125
  if language not in supported_languages:
126
  raise ValueError(f"Language {language} not supported")
127
 
128
  if not text_prompt and not voice_audio:
129
- raise ValueError("Text prompt or voice audio is required")
130
 
131
  if text_prompt and len(text_prompt) > 1000:
132
  raise ValueError("Text prompt is too long, please keep it under 1000 characters")
@@ -177,7 +176,8 @@ async def generate_music_with_voice(description, melody_audio, voice_audio, dura
177
 
178
  return music_filename, voice_filename
179
 
180
-
 
181
  except Exception as e:
182
  return str(e), str(e)
183
 
@@ -202,81 +202,19 @@ iface = gr.Interface(
202
 
203
  iface.launch(share=True)
204
 
205
-
206
  app = FastAPI()
207
 
208
  @app.post("/synthesize")
209
  async def api_synthesize(prompt: str, language: str = "en", audio_file: UploadFile = File(...)):
210
  try:
211
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
212
- temp_audio_path = temp_audio_file.name
213
- with open(temp_audio_path, "wb") as f:
214
- f.write(await audio_file.read())
215
-
216
- audio_output_path, metrics_text = await predict(prompt, language, temp_audio_path)
217
- os.remove(temp_audio_path)
218
 
219
- if audio_output_path is None:
220
- return JSONResponse({"error": metrics_text}, status_code=500)
221
-
222
- return FileResponse(audio_output_path, media_type="audio/wav")
223
  except Exception as e:
224
- return JSONResponse({"error": str(e)}, status_code=500)
225
-
226
- async def predict(prompt, language, audio_file_pth):
227
- if language not in supported_languages:
228
- return None, f"Language {language} not supported"
229
-
230
- speaker_wav = audio_file_pth
231
-
232
- if len(prompt) < 2:
233
- return None, "Text prompt is too short"
234
 
235
- if len(prompt) > 1000:
236
- return None, "Text prompt is too long, please keep it under 1000 characters"
237
-
238
- try:
239
- gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
240
-
241
- detected_language = detect_audio_language(audio_file_pth)
242
- if detected_language:
243
- language = detected_language
244
-
245
- emotion = analyze_music_for_emotion(audio_file_pth)
246
-
247
- prosody_strength = 1.0
248
- speaking_rate = 1.0
249
-
250
- if emotion == "energetic":
251
- prosody_strength = 1.2
252
- speaking_rate = 1.1
253
- elif emotion == "sad":
254
- prosody_strength = 0.8
255
- speaking_rate = 0.9
256
- elif emotion == "happy":
257
- prosody_strength = 1.1
258
- speaking_rate = 1.05
259
-
260
- text_chunks = split_text_into_chunks(prompt)
261
- wav_chunks = []
262
- for chunk in tqdm(text_chunks, desc="Synthesizing voice chunks"):
263
- out = xtts_model.inference(
264
- chunk,
265
- language,
266
- gpt_cond_latent,
267
- speaker_embedding,
268
- repetition_penalty=5.0,
269
- temperature=0.75,
270
- enable_text_splitting=True,
271
- prosody_strength=prosody_strength,
272
- speaking_rate=speaking_rate
273
- )
274
- wav_chunks.append(torch.tensor(out["wav"]))
275
-
276
- final_wav = torch.cat(wav_chunks, dim=-1)
277
-
278
- output_filename = await save_audio_to_storage(final_wav, "output_" + str(uuid.uuid4()) + ".wav")
279
- return output_filename, None
280
-
281
- except Exception as e:
282
- return None, str(e)
 
77
  language = detect(f.read())
78
  os.remove(temp_filepath)
79
  return language
80
+ except Exception:
81
  return None
82
 
83
  def split_text_into_chunks(text, max_chunk_length=200):
 
121
 
122
  music_filename = await save_audio_to_storage(wav_music[0].cpu(), "music_" + str(uuid.uuid4()) + ".wav")
123
 
 
124
  if language not in supported_languages:
125
  raise ValueError(f"Language {language} not supported")
126
 
127
  if not text_prompt and not voice_audio:
128
+ raise ValueError("Text prompt or voice audio is required")
129
 
130
  if text_prompt and len(text_prompt) > 1000:
131
  raise ValueError("Text prompt is too long, please keep it under 1000 characters")
 
176
 
177
  return music_filename, voice_filename
178
 
179
+ except IsADirectoryError:
180
+ return "Error: Provided path is a directory, not a file.", "Error: Provided path is a directory, not a file."
181
  except Exception as e:
182
  return str(e), str(e)
183
 
 
202
 
203
  iface.launch(share=True)
204
 
 
205
  app = FastAPI()
206
 
207
  @app.post("/synthesize")
208
  async def api_synthesize(prompt: str, language: str = "en", audio_file: UploadFile = File(...)):
209
  try:
210
+ temp_audio_file = tempfile.NamedTemporaryFile(delete=False)
211
+ temp_audio_file.write(audio_file.file.read())
212
+ temp_audio_file.close()
 
 
 
 
213
 
214
+ music_output, voice_output = await generate_music_with_voice(prompt, None, temp_audio_file.name, None, None, language)
215
+ return JSONResponse(content={"music_output": music_output, "voice_output": voice_output})
 
 
216
  except Exception as e:
217
+ return JSONResponse(content={"error": str(e)})
 
 
 
 
 
 
 
 
 
218
 
219
+ if __name__ == "__main__":
220
+ app.run()