# Source page header: "Update app.py" - commit caae255 (verified), uploaded by eaglelandsonce
import os, io, uuid, re, tempfile, traceback
from typing import List
# ---- Make Spaces happy: force CPU & avoid MPS/CUDA surprises ----
# setdefault() only fills in a value when the host has not already exported
# one, so an explicit deployment setting always wins over these defaults.
# An empty device list is meant to hide every CUDA GPU from the process.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
# Meant to let PyTorch fall back to CPU for ops the MPS backend lacks.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
# NOTE(review): presumably pre-accepts the Coqui model licence prompt so the
# model download never blocks waiting for interactive input - confirm.
os.environ.setdefault("COQUI_TOS_AGREED", "1")
import numpy as np
import gradio as gr
# Lazy flags
# Cached TTS engine; stays None until get_tts() builds it on first use.
_TTS = None
# Sample rate (Hz) used when writing the output WAV; get_tts() may overwrite
# this default once the model has been loaded.
_SR = 24000 # XTTS v2 typical output rate
# ---------- Utilities ----------
# Split after sentence-ending punctuation. chunk_text() collapses all
# whitespace to single spaces before using this pattern, so the "\n"
# alternative only matters for callers that apply it to raw text.
_SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")


def _split_on_words(sentence: str, max_len: int) -> List[str]:
    """Break one over-long sentence into pieces of at most ``max_len`` chars.

    Pieces end on word boundaries whenever possible; only a single word that
    is itself longer than ``max_len`` is sliced at a fixed width.
    """
    pieces: List[str] = []
    current = ""
    for word in sentence.split(" "):
        # No boundary exists inside an oversize word, so hard-slice it.
        while len(word) > max_len:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_len])
            word = word[max_len:]
        if not word:
            continue
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_len:
            current = f"{current} {word}"
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text: str, max_len: int = 480) -> List[str]:
    """Split ``text`` into chunks of at most ``max_len`` characters.

    Whitespace runs are collapsed to single spaces first. Whole sentences are
    then packed greedily into each chunk. A sentence that alone exceeds
    ``max_len`` is broken on word boundaries rather than mid-word, so the
    synthesizer never receives half a word.

    Args:
        text: Input text; may be empty or whitespace only.
        max_len: Maximum chunk length in characters; must be >= 1.

    Returns:
        The chunks in reading order, or an empty list for blank input.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        # A non-positive limit would otherwise drop text or fail obscurely.
        raise ValueError("max_len must be a positive integer")

    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    if len(text) <= max_len:
        return [text]

    sents = [s.strip() for s in _SENT_SPLIT.split(text) if s.strip()]
    chunks: List[str] = []
    buf = ""
    for s in sents:
        # +1 accounts for the joining space.
        if buf and len(buf) + 1 + len(s) <= max_len:
            buf = f"{buf} {s}"
            continue
        if buf:
            chunks.append(buf)
            buf = ""
        if len(s) > max_len:
            chunks.extend(_split_on_words(s, max_len))
        else:
            buf = s
    if buf:
        chunks.append(buf)
    return chunks
def read_text_from_file(file_obj) -> str:
    """Return the text content of an uploaded ``.txt`` or ``.docx`` file.

    Args:
        file_obj: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path through a ``.name`` attribute. Falsy
            values mean "no upload".

    Returns:
        The decoded text, or ``""`` when nothing was uploaded or the path does
        not exist on disk.

    Raises:
        gr.Error: If the extension is not ``.txt`` / ``.docx``, or a ``.docx``
            was uploaded but ``python-docx`` cannot be imported.
    """
    if not file_obj:
        return ""

    # Accept a bare path as well as a file-like wrapper carrying ``.name``;
    # a plain str has no ``.name`` and would otherwise be treated as empty.
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
    else:
        path = getattr(file_obj, "name", None)
    if not path or not os.path.exists(path):
        return ""

    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        with open(path, "rb") as f:
            # "utf-8-sig" strips a leading BOM so it never reaches the
            # synthesizer; undecodable bytes are dropped (best effort).
            return f.read().decode("utf-8-sig", errors="ignore")
    if ext == ".docx":
        try:
            import docx
        except Exception as exc:
            raise gr.Error("python-docx not installed. Check requirements.txt") from exc
        d = docx.Document(path)
        return "\n".join(p.text for p in d.paragraphs).strip()
    raise gr.Error("Unsupported file type. Please upload .txt or .docx")
def get_tts():
    """Return the shared XTTS v2 engine, loading it on first use.

    The engine is cached in the module-level ``_TTS``. On the first call the
    module-level ``_SR`` is also refreshed from the loaded model when a sample
    rate can be found, and otherwise set to 24000.

    Returns:
        The cached ``TTS`` instance.

    Raises:
        gr.Error: If the Coqui ``TTS`` package cannot be imported.
    """
    global _TTS, _SR
    if _TTS is not None:
        return _TTS

    try:
        # Imported lazily so the UI can start before the heavy dependency loads.
        from TTS.api import TTS
    except Exception as e:
        raise gr.Error(
            "Coqui TTS is not installed or failed to import. "
            "Make sure your Space installed requirements.txt.\n\n" + str(e)
        ) from e

    # CPU-safe init
    engine = TTS(
        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
        progress_bar=False,
        gpu=False,
    )

    # NOTE(review): the rate appears to live on ``engine.synthesizer`` rather
    # than on the wrapper itself - confirm against the installed TTS version.
    # Both places are probed before falling back to the XTTS v2 default.
    synthesizer = getattr(engine, "synthesizer", None)
    rate = getattr(synthesizer, "output_sample_rate", None) or getattr(
        engine, "output_sample_rate", None
    )
    _SR = int(rate or 24000)

    # Publish the engine only after setup fully succeeded.
    _TTS = engine
    return _TTS
def safe_concat_wav(chunks_audio: List[np.ndarray], sr: int, out_path: str) -> str:
    """Write every audio segment, in order, into one mono 16-bit PCM WAV.

    Each segment is flattened to float32, has NaN / +-Inf samples replaced by
    silence (0.0), and is clamped to [-1, 1] before being appended.

    Args:
        chunks_audio: Audio segments to concatenate.
        sr: Sample rate of the output file.
        out_path: Destination path of the WAV file.

    Returns:
        ``out_path``, unchanged.
    """
    import soundfile as sf

    def _sanitize(segment) -> np.ndarray:
        # Flatten, zero out non-finite samples, then clamp into range.
        samples = np.asarray(segment).ravel().astype(np.float32)
        samples = np.nan_to_num(samples, nan=0.0, posinf=0.0, neginf=0.0)
        return np.clip(samples, -1.0, 1.0)

    wav_file = sf.SoundFile(
        out_path, mode="w", samplerate=sr, channels=1, subtype="PCM_16"
    )
    with wav_file:
        for segment in chunks_audio:
            wav_file.write(_sanitize(segment))
    return out_path
# ---------- Core pipeline ----------
def synthesize_pipeline(text_input, file_input, language, voice_ref):
    """Turn typed and/or uploaded text into a single WAV file.

    Args:
        text_input: Text typed into the UI; may be ``None`` or empty.
        file_input: Optional uploaded ``.txt`` / ``.docx`` file.
        language: Language code forwarded to the TTS engine.
        voice_ref: Optional voice-cloning reference, either a path string or
            an object exposing the path as ``.name``.

    Returns:
        Path of the generated WAV file in the system temp directory.

    Raises:
        gr.Error: If no text was supplied or no readable text remains.
    """
    # Gather text: typed text first, then file content on a new line.
    user = (text_input or "").strip()
    from_file = read_text_from_file(file_input) if file_input else ""
    final_text = (user + ("\n" if user and from_file else "") + from_file).strip()
    if not final_text:
        raise gr.Error("Please paste/type text or upload a .txt/.docx file.")

    # Limit very long inputs so Spaces don't OOM
    if len(final_text) > 20000:
        final_text = final_text[:20000] + " ..."

    chunks = chunk_text(final_text, max_len=480)
    if not chunks:
        raise gr.Error("No readable text found.")

    tts = get_tts()

    # Optional voice clone: accept a bare path as well as a ``.name`` wrapper.
    speaker_wav = None
    if isinstance(voice_ref, str):
        speaker_wav = voice_ref or None
    elif voice_ref is not None:
        speaker_wav = getattr(voice_ref, "name", None)

    # NOTE(review): XTTS v2 is presumably multi-speaker and rejects a call
    # that supplies neither ``speaker`` nor ``speaker_wav`` - confirm. When no
    # reference is uploaded, fall back to the first built-in voice (if the
    # engine exposes any) so the reference really is optional.
    speaker = None
    if speaker_wav is None:
        names = getattr(tts, "speakers", None)
        if names:
            speaker = next(iter(names))

    # Synthesize chunk by chunk, keeping the original order.
    audios = [
        tts.tts(text=ch, language=language, speaker_wav=speaker_wav, speaker=speaker)
        for ch in chunks
    ]

    # Write single WAV; _SR is read after get_tts() so it reflects the model.
    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
    return safe_concat_wav(audios, _SR, out_path)
# ---------- Gradio UI ----------
# (display label, language code) pairs offered by the UI, in display order.
LANG_OPTIONS = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Turkish", "tr"),
    ("Russian", "ru"),
    ("Dutch", "nl"),
    ("Chinese (Simplified)", "zh-cn"),
    ("Japanese", "ja"),
    ("Korean", "ko"),
    ("Arabic", "ar"),
]
# Build the single-page UI. Event wiring has to happen inside the Blocks
# context, so the callback and the click binding live in this ``with`` body.
with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
    # Built from explicit lines so the rendered Markdown does not depend on
    # how the source file happens to be indented.
    gr.Markdown(
        "# 🔊 High-Quality Text-to-Speech (Coqui XTTS v2)\n"
        "- **Type/paste** text or **upload** `.docx` / `.txt`\n"
        "- Optional: upload a short **.wav** (10–30s) to clone voice\n"
        "- Click **Generate Audio**\n"
    )
    text_in = gr.Textbox(label="Type or paste text", lines=8, placeholder="Paste text here…")
    file_in = gr.File(label="Drag & drop .docx / .txt (optional)", file_types=[".docx", ".txt"])
    with gr.Row():
        voice_ref = gr.File(label="Optional voice reference (.wav, 10–30s)", file_types=[".wav"])
        # (label, value) pairs: the user sees the language name while the
        # callback still receives the language code.
        lang = gr.Dropdown(
            choices=LANG_OPTIONS,
            value="en",
            label="Language",
        )
    run_btn = gr.Button("🎙️ Generate Audio", variant="primary")
    audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
    download = gr.File(label="Download WAV")
    err_box = gr.Markdown("", elem_id="error_box")

    def run(text_input, file_input, language, voice_ref_file):
        """Run synthesis and map the outcome onto the three output widgets.

        Returns ``(audio_path, download_path, error_markdown)``. On success
        both paths are the generated WAV and the error text is empty; on any
        failure both paths are ``None`` and the error text holds the message
        plus the tail of the traceback.
        """
        try:
            path = synthesize_pipeline(text_input, file_input, language, voice_ref_file)
            return path, path, ""  # clear errors
        except Exception as e:
            # UI boundary: report every failure in the page instead of crashing.
            tb = traceback.format_exc()
            # Show a compact, readable error in the UI
            msg = f"**Error:** {e}\n\n```\n{tb[-1500:]}\n```"
            return None, None, msg

    run_btn.click(
        run,
        inputs=[text_in, file_in, lang, voice_ref],
        outputs=[audio_out, download, err_box],
    )

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()