Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
from TTS.api import TTS
|
| 3 |
-
from fastapi import FastAPI
|
| 4 |
-
from fastapi.responses import FileResponse
|
| 5 |
import uvicorn
|
| 6 |
import time
|
| 7 |
import os
|
|
@@ -10,7 +10,10 @@ import os
|
|
| 10 |
# Configuration
|
| 11 |
# -----------------------------
|
| 12 |
YOURTTS_MODEL = "tts_models/multilingual/multi-dataset/your_tts"
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
| 14 |
OUTPUT_PATH = "output.wav"
|
| 15 |
|
| 16 |
tts = TTS(YOURTTS_MODEL, gpu=False)
|
|
@@ -18,15 +21,16 @@ tts = TTS(YOURTTS_MODEL, gpu=False)
|
|
| 18 |
# -----------------------------
|
| 19 |
# Core synthesis function
|
| 20 |
# -----------------------------
|
| 21 |
-
def synthesize(text: str):
|
| 22 |
-
|
| 23 |
-
if not os.path.exists(
|
| 24 |
-
return None, {"error": f"❌ Speaker file not found: {
|
| 25 |
|
|
|
|
| 26 |
try:
|
| 27 |
tts.tts_to_file(
|
| 28 |
text=text,
|
| 29 |
-
speaker_wav=
|
| 30 |
file_path=OUTPUT_PATH,
|
| 31 |
language="en"
|
| 32 |
)
|
|
@@ -41,7 +45,7 @@ def synthesize(text: str):
|
|
| 41 |
"processing_time_sec": round(total_time, 3),
|
| 42 |
"real_time_factor": rtf,
|
| 43 |
"model_used": YOURTTS_MODEL,
|
| 44 |
-
"speaker_used": os.path.basename(
|
| 45 |
}
|
| 46 |
return OUTPUT_PATH, info
|
| 47 |
|
|
@@ -51,16 +55,17 @@ def synthesize(text: str):
|
|
| 51 |
app = FastAPI(title="YourTTS FastAPI", description="Text-to-Speech API")
|
| 52 |
|
| 53 |
@app.post("/synthesize")
|
| 54 |
-
async def predict(request:
|
| 55 |
-
|
|
|
|
|
|
|
| 56 |
if not text:
|
| 57 |
-
return {"error": "Missing 'text' field"}
|
| 58 |
|
| 59 |
-
audio_path, info = synthesize(text)
|
| 60 |
if audio_path is None:
|
| 61 |
-
return info
|
| 62 |
|
| 63 |
-
# Use headers to send metadata, file as response
|
| 64 |
headers = {f"x-{k}": str(v) for k, v in info.items()}
|
| 65 |
return FileResponse(audio_path, media_type="audio/wav", filename="output.wav", headers=headers)
|
| 66 |
|
|
@@ -69,9 +74,12 @@ async def predict(request: dict):
|
|
| 69 |
# -----------------------------
|
| 70 |
demo = gr.Interface(
|
| 71 |
fn=synthesize,
|
| 72 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
| 73 |
outputs=[gr.Audio(type="filepath"), gr.JSON()],
|
| 74 |
-
title="YourTTS Voice Cloning (English Only,
|
| 75 |
allow_flagging="never"
|
| 76 |
)
|
| 77 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from TTS.api import TTS
|
| 3 |
+
from fastapi import FastAPI, Request
|
| 4 |
+
from fastapi.responses import FileResponse, JSONResponse
|
| 5 |
import uvicorn
|
| 6 |
import time
|
| 7 |
import os
|
|
|
|
| 10 |
# Configuration
|
| 11 |
# -----------------------------
|
| 12 |
YOURTTS_MODEL = "tts_models/multilingual/multi-dataset/your_tts"
|
| 13 |
+
SPEAKERS = {
|
| 14 |
+
"male": "speakers/voice1.wav",
|
| 15 |
+
"female": "speakers/voice2.wav"
|
| 16 |
+
}
|
| 17 |
OUTPUT_PATH = "output.wav"
|
| 18 |
|
| 19 |
tts = TTS(YOURTTS_MODEL, gpu=False)
|
|
|
|
| 21 |
# -----------------------------
|
| 22 |
# Core synthesis function
|
| 23 |
# -----------------------------
|
| 24 |
+
def synthesize(text: str, speaker: str = "female"):
|
| 25 |
+
speaker_path = SPEAKERS.get(speaker.lower())
|
| 26 |
+
if not speaker_path or not os.path.exists(speaker_path):
|
| 27 |
+
return None, {"error": f"❌ Speaker file not found: {speaker_path}"}
|
| 28 |
|
| 29 |
+
start_time = time.time()
|
| 30 |
try:
|
| 31 |
tts.tts_to_file(
|
| 32 |
text=text,
|
| 33 |
+
speaker_wav=speaker_path,
|
| 34 |
file_path=OUTPUT_PATH,
|
| 35 |
language="en"
|
| 36 |
)
|
|
|
|
| 45 |
"processing_time_sec": round(total_time, 3),
|
| 46 |
"real_time_factor": rtf,
|
| 47 |
"model_used": YOURTTS_MODEL,
|
| 48 |
+
"speaker_used": os.path.basename(speaker_path),
|
| 49 |
}
|
| 50 |
return OUTPUT_PATH, info
|
| 51 |
|
|
|
|
| 55 |
app = FastAPI(title="YourTTS FastAPI", description="Text-to-Speech API")
|
| 56 |
|
| 57 |
@app.post("/synthesize")
|
| 58 |
+
async def predict(request: Request):
|
| 59 |
+
data = await request.json()
|
| 60 |
+
text = data.get("text")
|
| 61 |
+
speaker = data.get("speaker", "female")
|
| 62 |
if not text:
|
| 63 |
+
return JSONResponse({"error": "Missing 'text' field"}, status_code=400)
|
| 64 |
|
| 65 |
+
audio_path, info = synthesize(text, speaker)
|
| 66 |
if audio_path is None:
|
| 67 |
+
return JSONResponse(info, status_code=500)
|
| 68 |
|
|
|
|
| 69 |
headers = {f"x-{k}": str(v) for k, v in info.items()}
|
| 70 |
return FileResponse(audio_path, media_type="audio/wav", filename="output.wav", headers=headers)
|
| 71 |
|
|
|
|
| 74 |
# -----------------------------
|
| 75 |
demo = gr.Interface(
|
| 76 |
fn=synthesize,
|
| 77 |
+
inputs=[
|
| 78 |
+
gr.Textbox(label="Text"),
|
| 79 |
+
gr.Dropdown(choices=["male", "female"], value="female", label="Speaker")
|
| 80 |
+
],
|
| 81 |
outputs=[gr.Audio(type="filepath"), gr.JSON()],
|
| 82 |
+
title="YourTTS Voice Cloning (English Only, Select Speaker)",
|
| 83 |
allow_flagging="never"
|
| 84 |
)
|
| 85 |
|