Hugging Face Space (status: sleeping) — commit "Update app.py", showing the diff of app.py (CHANGED).
@@ -77,7 +77,7 @@ def detect_audio_language(audio_path):
|
|
77 |
language = detect(f.read())
|
78 |
os.remove(temp_filepath)
|
79 |
return language
|
80 |
-
except:
|
81 |
return None
|
82 |
|
83 |
def split_text_into_chunks(text, max_chunk_length=200):
|
@@ -121,12 +121,11 @@ async def generate_music_with_voice(description, melody_audio, voice_audio, dura
|
|
121 |
|
122 |
music_filename = await save_audio_to_storage(wav_music[0].cpu(), "music_" + str(uuid.uuid4()) + ".wav")
|
123 |
|
124 |
-
|
125 |
if language not in supported_languages:
|
126 |
raise ValueError(f"Language {language} not supported")
|
127 |
|
128 |
if not text_prompt and not voice_audio:
|
129 |
-
|
130 |
|
131 |
if text_prompt and len(text_prompt) > 1000:
|
132 |
raise ValueError("Text prompt is too long, please keep it under 1000 characters")
|
@@ -177,7 +176,8 @@ async def generate_music_with_voice(description, melody_audio, voice_audio, dura
|
|
177 |
|
178 |
return music_filename, voice_filename
|
179 |
|
180 |
-
|
|
|
181 |
except Exception as e:
|
182 |
return str(e), str(e)
|
183 |
|
@@ -202,81 +202,19 @@ iface = gr.Interface(
|
|
202 |
|
203 |
iface.launch(share=True)
|
204 |
|
205 |
-
|
206 |
app = FastAPI()
|
207 |
|
208 |
@app.post("/synthesize")
|
209 |
async def api_synthesize(prompt: str, language: str = "en", audio_file: UploadFile = File(...)):
|
210 |
try:
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
f.write(await audio_file.read())
|
215 |
-
|
216 |
-
audio_output_path, metrics_text = await predict(prompt, language, temp_audio_path)
|
217 |
-
os.remove(temp_audio_path)
|
218 |
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
return FileResponse(audio_output_path, media_type="audio/wav")
|
223 |
except Exception as e:
|
224 |
-
return JSONResponse({"error": str(e)}
|
225 |
-
|
226 |
-
async def predict(prompt, language, audio_file_pth):
    """Synthesize speech for *prompt* in the voice of the reference audio.

    Clones the speaker from ``audio_file_pth`` with XTTS, adapts prosody to the
    emotion detected in that audio, and synthesizes the prompt chunk by chunk.

    Parameters:
        prompt: Text to synthesize (must be at least 2 characters).
        language: Requested language code; overridden by the language detected
            from the reference audio when detection succeeds.
        audio_file_pth: Path to the reference/speaker audio file.

    Returns:
        ``(output_filename, None)`` on success, or ``(None, error_message)``
        on validation failure or any exception during synthesis.
    """
    # Guard: reject languages the model was not configured for.
    if language not in supported_languages:
        return None, f"Language {language} not supported"

    speaker_wav = audio_file_pth

    # Guard: a 0–1 character prompt cannot be meaningfully synthesized.
    if len(prompt) < 2:
        return None, "Text prompt is too short"

    try:
        # Extract the speaker identity (GPT conditioning latents + embedding)
        # from the reference audio for voice cloning.
        gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)

        # Prefer the language detected from the audio over the caller's value;
        # fall back to the requested language when detection returns a falsy result.
        detected_language = detect_audio_language(audio_file_pth)
        if detected_language:
            language = detected_language

        # NOTE(review): emotion is inferred from the SPEAKER reference audio,
        # not from the prompt text — confirm this is intended.
        emotion = analyze_music_for_emotion(audio_file_pth)

        # Neutral prosody defaults; adjusted below per detected emotion.
        prosody_strength = 1.0
        speaking_rate = 1.0

        if emotion == "energetic":
            prosody_strength = 1.2
            speaking_rate = 1.1
        elif emotion == "sad":
            prosody_strength = 0.8
            speaking_rate = 0.9
        elif emotion == "happy":
            prosody_strength = 1.1
            speaking_rate = 1.05

        # Synthesize chunk-by-chunk (XTTS has a per-call text length limit),
        # then concatenate the waveforms along the time axis.
        text_chunks = split_text_into_chunks(prompt)
        wav_chunks = []
        for chunk in tqdm(text_chunks, desc="Synthesizing voice chunks"):
            out = xtts_model.inference(
                chunk,
                language,
                gpt_cond_latent,
                speaker_embedding,
                repetition_penalty=5.0,
                temperature=0.75,
                enable_text_splitting=True,
                prosody_strength=prosody_strength,
                speaking_rate=speaking_rate
            )
            wav_chunks.append(torch.tensor(out["wav"]))

        final_wav = torch.cat(wav_chunks, dim=-1)

        # Persist the assembled waveform under a collision-free random name.
        output_filename = await save_audio_to_storage(final_wav, "output_" + str(uuid.uuid4()) + ".wav")
        return output_filename, None

    except Exception as e:
        # Errors are reported to the caller as the second tuple element
        # rather than raised (matches this module's error-handling style).
        return None, str(e)
|
|
|
77 |
language = detect(f.read())
|
78 |
os.remove(temp_filepath)
|
79 |
return language
|
80 |
+
except Exception:
|
81 |
return None
|
82 |
|
83 |
def split_text_into_chunks(text, max_chunk_length=200):
|
|
|
121 |
|
122 |
music_filename = await save_audio_to_storage(wav_music[0].cpu(), "music_" + str(uuid.uuid4()) + ".wav")
|
123 |
|
|
|
124 |
if language not in supported_languages:
|
125 |
raise ValueError(f"Language {language} not supported")
|
126 |
|
127 |
if not text_prompt and not voice_audio:
|
128 |
+
raise ValueError("Text prompt or voice audio is required")
|
129 |
|
130 |
if text_prompt and len(text_prompt) > 1000:
|
131 |
raise ValueError("Text prompt is too long, please keep it under 1000 characters")
|
|
|
176 |
|
177 |
return music_filename, voice_filename
|
178 |
|
179 |
+
except IsADirectoryError:
|
180 |
+
return "Error: Provided path is a directory, not a file.", "Error: Provided path is a directory, not a file."
|
181 |
except Exception as e:
|
182 |
return str(e), str(e)
|
183 |
|
|
|
202 |
|
203 |
iface.launch(share=True)
|
204 |
|
|
|
205 |
app = FastAPI()
|
206 |
|
207 |
@app.post("/synthesize")
async def api_synthesize(prompt: str, language: str = "en", audio_file: UploadFile = File(...)):
    """Generate music plus a cloned voice track from *prompt* and an uploaded reference audio.

    Parameters:
        prompt: Text description / prompt passed to the generation pipeline.
        language: Language code for synthesis (defaults to English).
        audio_file: Uploaded reference audio used as the voice sample.

    Returns:
        JSONResponse with ``music_output`` and ``voice_output`` on success,
        or ``{"error": ...}`` when any step fails.
    """
    temp_path = None
    try:
        # Persist the upload to disk: the generation pipeline expects a filesystem path.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            # Fix: use the async read instead of the blocking audio_file.file.read(),
            # which would stall the event loop on large uploads.
            tmp.write(await audio_file.read())
            temp_path = tmp.name

        music_output, voice_output = await generate_music_with_voice(prompt, None, temp_path, None, None, language)
        return JSONResponse(content={"music_output": music_output, "voice_output": voice_output})
    except Exception as e:
        # Broad catch keeps the endpoint from returning a bare 500; the error
        # text is surfaced in the JSON payload (matches the module's style).
        return JSONResponse(content={"error": str(e)})
    finally:
        # Fix: the original created the file with delete=False and never removed it,
        # leaking one temp file per request.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
+
if __name__ == "__main__":
    # NOTE(review): BUG — fastapi.FastAPI has no .run() method, so this line
    # raises AttributeError at startup. FastAPI apps are served by an ASGI
    # server, e.g. `uvicorn.run(app, host="0.0.0.0", port=8000)` in code or
    # `uvicorn app:app` from the CLI. Left unchanged because uvicorn is not
    # imported in this file.
    app.run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|