# Hugging Face Spaces app (Space status when this page was captured: Sleeping).
import os, io, uuid, re, tempfile, traceback | |
from typing import List | |
# ---- Make Spaces happy: default to CPU & avoid MPS/CUDA surprises ----
# Each value is applied only when the variable is not already present, so a
# setting supplied by the hosting environment always wins.
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    # An empty device list hides every CUDA device from this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
if "PYTORCH_ENABLE_MPS_FALLBACK" not in os.environ:
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
if "COQUI_TOS_AGREED" not in os.environ:
    # NOTE(review): presumably pre-accepts the Coqui licence prompt so the
    # model download does not block waiting for input -- confirm.
    os.environ["COQUI_TOS_AGREED"] = "1"
import numpy as np | |
import gradio as gr | |
# Lazily-populated module state (filled in by get_tts() on first use).
# Output sample rate in Hz; 24 kHz is the typical XTTS v2 output rate.
_SR = 24_000
# Cached TTS model instance; None until the model has been loaded.
_TTS = None
# ---------- Utilities ---------- | |
# Sentence boundary: whitespace that follows ., !, ?, : or ;.
# The "\n" alternative never fires inside chunk_text, because whitespace is
# collapsed to single spaces before the text is split.
_SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")


def _split_long_sentence(sentence: str, max_len: int) -> List[str]:
    """Break one over-long sentence into pieces of at most *max_len* chars.

    Cuts happen between words; only a single word longer than *max_len* is
    hard-cut in the middle.
    """
    pieces: List[str] = []
    buf = ""
    for word in sentence.split(" "):
        # A lone token longer than the limit cannot be kept whole.
        while len(word) > max_len:
            if buf:
                pieces.append(buf)
                buf = ""
            pieces.append(word[:max_len])
            word = word[max_len:]
        if not word:
            continue
        candidate = f"{buf} {word}" if buf else word
        if len(candidate) <= max_len:
            buf = candidate
        else:
            # buf is non-empty here: a bare word always fits after the loop above.
            pieces.append(buf)
            buf = word
    if buf:
        pieces.append(buf)
    return pieces


def chunk_text(text: str, max_len: int = 480) -> List[str]:
    """Split *text* into chunks of at most *max_len* characters.

    Whitespace runs are collapsed to single spaces first. Text that already
    fits is returned as one chunk. Otherwise sentences are packed greedily
    into chunks; a sentence that is itself longer than *max_len* is broken
    on word boundaries so that no word is cut in half unless the word alone
    exceeds the limit.

    Args:
        text: Raw input text.
        max_len: Maximum number of characters per chunk; must be >= 1.

    Returns:
        The list of chunks in reading order (empty for blank input).

    Raises:
        ValueError: If *max_len* is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    if len(text) <= max_len:
        return [text]

    chunks: List[str] = []
    buf = ""
    for sentence in _SENT_SPLIT.split(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # "+ 1" accounts for the joining space.
        if buf and len(buf) + 1 + len(sentence) <= max_len:
            buf = f"{buf} {sentence}"
            continue
        if buf:
            chunks.append(buf)
        if len(sentence) > max_len:
            chunks.extend(_split_long_sentence(sentence, max_len))
            buf = ""
        else:
            buf = sentence
    if buf:
        chunks.append(buf)
    return chunks
def read_text_from_file(file_obj) -> str:
    """Return the text content of an uploaded ``.txt`` or ``.docx`` file.

    Args:
        file_obj: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as its ``.name`` attribute, which is how
            upload components may hand files over. Falsy values mean
            "no file".

    Returns:
        The file's text, or ``""`` when no file was given or the path does
        not exist.

    Raises:
        gr.Error: If the extension is neither ``.txt`` nor ``.docx``, or if
            ``python-docx`` cannot be imported for a ``.docx`` upload.
    """
    if not file_obj:
        return ""
    # Accept a bare path as well as a file-like wrapper with a `.name` path.
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
    else:
        path = getattr(file_obj, "name", None)
    if not path or not os.path.exists(path):
        return ""

    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        with open(path, "rb") as f:
            # utf-8-sig drops a leading byte-order mark so it is not passed on
            # as text; undecodable bytes are skipped rather than raising.
            return f.read().decode("utf-8-sig", errors="ignore")
    if ext == ".docx":
        try:
            import docx
        except ImportError as exc:
            raise gr.Error("python-docx not installed. Check requirements.txt") from exc
        d = docx.Document(path)
        return "\n".join(p.text for p in d.paragraphs).strip()
    raise gr.Error("Unsupported file type. Please upload .txt or .docx")
def get_tts():
    """Return the shared XTTS v2 model, loading it on first use.

    The model is created once (CPU only) and cached in the module-level
    ``_TTS``; the module-level ``_SR`` is updated with the model's output
    sample rate when one can be found, otherwise it is set to 24000.

    Returns:
        The cached ``TTS`` instance.

    Raises:
        gr.Error: If the Coqui ``TTS`` package cannot be imported.
    """
    global _TTS, _SR
    if _TTS is not None:
        return _TTS

    try:
        from TTS.api import TTS
    except Exception as e:
        # Deliberately broad: the import can fail for reasons other than a
        # missing package, and every such failure gets the same message.
        raise gr.Error(
            "Coqui TTS is not installed or failed to import. "
            "Make sure your Space installed requirements.txt.\n\n" + str(e)
        ) from e

    # CPU-safe init
    model = TTS(
        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
        progress_bar=False,
        gpu=False,
    )

    # NOTE(review): Coqui TTS appears to expose the rate on the wrapped
    # synthesizer rather than on the TTS wrapper itself -- confirm. Both
    # places are checked before falling back to 24 kHz.
    synthesizer = getattr(model, "synthesizer", None)
    rate = (
        getattr(synthesizer, "output_sample_rate", None)
        or getattr(model, "output_sample_rate", None)
        or 24000
    )

    # Publish the globals only once everything above has succeeded.
    _SR = int(rate)
    _TTS = model
    return _TTS
def safe_concat_wav(chunks_audio: List[np.ndarray], sr: int, out_path: str) -> str:
    """Write the audio segments back-to-back into one mono 16-bit PCM WAV.

    Every segment is flattened to a float32 vector, non-finite samples are
    replaced by silence (0.0) and the signal is limited to [-1, 1] before it
    is appended to the file.

    Args:
        chunks_audio: Audio segments in playback order.
        sr: Sample rate of the output file.
        out_path: Destination path of the WAV file.

    Returns:
        ``out_path``.
    """
    import soundfile as sf

    wav_file = sf.SoundFile(
        out_path, mode="w", samplerate=sr, channels=1, subtype="PCM_16"
    )
    with wav_file:
        for segment in chunks_audio:
            samples = np.asarray(segment).flatten().astype("float32")
            # NaN and +/-Inf become silence, then the range is clamped.
            finite = np.nan_to_num(samples, nan=0.0, posinf=0.0, neginf=0.0)
            wav_file.write(np.clip(finite, -1.0, 1.0))
    return out_path
# ---------- Core pipeline ---------- | |
def synthesize_pipeline(text_input, file_input, language, voice_ref):
    """Turn typed and/or uploaded text into a single WAV file.

    Args:
        text_input: Text typed or pasted by the user (may be empty or None).
        file_input: Optional uploaded ``.txt`` / ``.docx`` file.
        language: Language code passed through to the TTS model.
        voice_ref: Optional voice reference clip, given either as a path
            string or as an object exposing the path via ``.name``.

    Returns:
        Path of the generated WAV file in the system temp directory.

    Raises:
        gr.Error: If there is no text to read.
    """
    # Gather text: typed text first, then the file content on a new line.
    user = (text_input or "").strip()
    from_file = read_text_from_file(file_input) if file_input else ""
    final_text = "\n".join(part for part in (user, from_file) if part).strip()
    if not final_text:
        raise gr.Error("Please paste/type text or upload a .txt/.docx file.")

    # Limit very long inputs so Spaces don't OOM
    max_chars = 20000
    if len(final_text) > max_chars:
        final_text = final_text[:max_chars] + " ..."

    chunks = chunk_text(final_text, max_len=480)
    if not chunks:
        raise gr.Error("No readable text found.")

    tts = get_tts()

    # Optional voice clone: accept a bare path as well as a `.name` wrapper,
    # so the reference is not silently ignored when it arrives as a string.
    if isinstance(voice_ref, str):
        speaker_wav = str(voice_ref) or None
    else:
        speaker_wav = getattr(voice_ref, "name", None) or None
    # NOTE(review): when no reference clip is supplied, speaker_wav is None;
    # confirm XTTS v2 accepts that (it may require a `speaker` name instead).

    # Synthesize every chunk in order.
    audios = [
        tts.tts(text=chunk, language=language, speaker_wav=speaker_wav)
        for chunk in chunks
    ]

    # Write single WAV
    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
    return safe_concat_wav(audios, _SR, out_path)
# ---------- Gradio UI ---------- | |
# (display label, language code) pairs offered in the UI, in display order.
LANG_OPTIONS = list(
    {
        "English": "en",
        "Spanish": "es",
        "French": "fr",
        "German": "de",
        "Italian": "it",
        "Portuguese": "pt",
        "Polish": "pl",
        "Turkish": "tr",
        "Russian": "ru",
        "Dutch": "nl",
        "Chinese (Simplified)": "zh-cn",
        "Japanese": "ja",
        "Korean": "ko",
        "Arabic": "ar",
    }.items()
)
# Build the Gradio interface: text/file inputs, optional voice reference,
# language picker, and the audio / download / error outputs.
# NOTE(review): the emoji and the dash/ellipsis in the user-facing strings
# were restored from mis-encoded text -- confirm the intended emoji glyphs.
with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
    gr.Markdown(
        """
# 🔊 High-Quality Text-to-Speech (Coqui XTTS v2)
- **Type/paste** text or **upload** `.docx` / `.txt`
- Optional: upload a short **.wav** (10–30s) to clone voice
- Click **Generate Audio**
"""
    )
    text_in = gr.Textbox(label="Type or paste text", lines=8, placeholder="Paste text here…")
    file_in = gr.File(label="Drag & drop .docx / .txt (optional)", file_types=[".docx", ".txt"])
    with gr.Row():
        voice_ref = gr.File(label="Optional voice reference (.wav, 10–30s)", file_types=[".wav"])
        lang = gr.Dropdown(
            # (label, code) pairs: the label is displayed, the code is what
            # the handler receives.
            choices=LANG_OPTIONS,
            value="en",
            label="Language",
        )
    run_btn = gr.Button("🎙️ Generate Audio", variant="primary")
    audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
    download = gr.File(label="Download WAV")
    err_box = gr.Markdown("", elem_id="error_box")

    def run(text_input, file_input, language, voice_ref_file):
        """Click handler: return (audio path, download path, error markdown).

        Any exception from the pipeline is caught and rendered in the error
        box instead of propagating, so the outputs are cleared on failure.
        """
        try:
            path = synthesize_pipeline(text_input, file_input, language, voice_ref_file)
            return path, path, ""  # clear errors
        except Exception as e:
            tb = traceback.format_exc()
            # Show a compact, readable error in the UI (tail of the traceback only).
            msg = f"**Error:** {e}\n\n```\n{tb[-1500:]}\n```"
            return None, None, msg

    run_btn.click(
        run,
        inputs=[text_in, file_in, lang, voice_ref],
        outputs=[audio_out, download, err_box],
    )

if __name__ == "__main__":
    demo.launch()