"""Hugging Face Space: high-quality multilingual text-to-speech with Coqui XTTS v2.

CPU-only by design (Spaces-safe). Users paste text or upload .txt/.docx,
optionally supply a short .wav reference for voice cloning, and get back a
single concatenated WAV file.
"""

import os
import re
import tempfile
import traceback
import uuid
from typing import List, Optional

# ---- Make Spaces happy: force CPU & avoid MPS/CUDA surprises ----
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
os.environ.setdefault("COQUI_TOS_AGREED", "1")  # auto-accept Coqui model license prompt

import numpy as np
import gradio as gr

# Lazily-initialized model handle and its output sample rate (see get_tts()).
_TTS = None
_SR = 24000  # XTTS v2 typical output rate

# ---------- Utilities ----------

# Split after sentence-ending punctuation (or newline) followed by whitespace.
_SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")


def chunk_text(text: str, max_len: int = 480) -> List[str]:
    """Split *text* into chunks of at most *max_len* characters.

    Prefers sentence boundaries; a single sentence longer than *max_len* is
    hard-split. Whitespace is normalized first. Returns [] for empty input.
    """
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    if len(text) <= max_len:
        return [text]

    sents = [s.strip() for s in _SENT_SPLIT.split(text) if s.strip()]
    chunks, buf = [], ""
    for s in sents:
        if len(buf) + 1 + len(s) <= max_len:
            # Sentence still fits in the current chunk (plus a joining space).
            buf = f"{buf} {s}".strip() if buf else s
        else:
            if buf:
                chunks.append(buf)
            if len(s) > max_len:
                # Very long single sentence: hard-split it.
                for i in range(0, len(s), max_len):
                    chunks.append(s[i:i + max_len])
                buf = ""
            else:
                buf = s
    if buf:
        chunks.append(buf)
    return chunks


def _file_path(file_obj) -> Optional[str]:
    """Return a filesystem path for a gr.File value, or None.

    Gradio v4 may deliver either a plain str path or a TempFile-like object
    exposing ``.name`` — accept both (the original only handled the latter,
    silently ignoring str paths).
    """
    if isinstance(file_obj, str):
        return file_obj
    return getattr(file_obj, "name", None)


def read_text_from_file(file_obj) -> str:
    """Extract text from an uploaded .txt or .docx file.

    Returns "" when no usable file was provided.
    Raises gr.Error for unsupported types or a missing docx parser.
    """
    if not file_obj:
        return ""
    path = _file_path(file_obj)
    if not path or not os.path.exists(path):
        return ""

    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        with open(path, "rb") as f:
            # Tolerate arbitrary encodings rather than crashing on bad bytes.
            return f.read().decode("utf-8", errors="ignore")
    elif ext == ".docx":
        try:
            import docx
        except Exception:
            raise gr.Error("python-docx not installed. Check requirements.txt")
        d = docx.Document(path)
        return "\n".join(p.text for p in d.paragraphs).strip()
    else:
        raise gr.Error("Unsupported file type. Please upload .txt or .docx")


def get_tts():
    """Return the lazily-initialized XTTS v2 model (CPU), caching it globally.

    Also records the model's output sample rate in the module-level _SR.
    """
    global _TTS, _SR
    if _TTS is None:
        try:
            from TTS.api import TTS
        except Exception as e:
            raise gr.Error(
                "Coqui TTS is not installed or failed to import. "
                "Make sure your Space installed requirements.txt.\n\n" + str(e)
            )
        # CPU-safe init
        _TTS = TTS(
            model_name="tts_models/multilingual/multi-dataset/xtts_v2",
            progress_bar=False,
            gpu=False,
        )
        # Use the model's sample rate if exposed; fall back to 24 kHz.
        _SR = int(getattr(_TTS, "output_sample_rate", 24000) or 24000)
    return _TTS


def safe_concat_wav(chunks_audio: List[np.ndarray], sr: int, out_path: str) -> str:
    """Concatenate audio chunks into a single mono 16-bit PCM WAV at *out_path*.

    Each chunk is flattened to float32, NaN/Inf-sanitized, and clamped to
    [-1, 1] before writing. Returns *out_path*.
    """
    import soundfile as sf

    with sf.SoundFile(out_path, mode="w", samplerate=sr, channels=1,
                      subtype="PCM_16") as f:
        for a in chunks_audio:
            a = np.asarray(a).flatten().astype("float32")
            # Guard against NaNs/Infs the model might emit.
            a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0)
            # Clamp to valid PCM range.
            a = np.clip(a, -1.0, 1.0)
            f.write(a)
    return out_path


# ---------- Core pipeline ----------

def synthesize_pipeline(text_input, file_input, language, voice_ref):
    """Full text→speech pipeline: gather text, chunk, synthesize, write WAV.

    Returns the path of the generated WAV file.
    Raises gr.Error when no text could be collected.
    """
    # Gather text from the textbox and/or uploaded file.
    user = (text_input or "").strip()
    from_file = read_text_from_file(file_input) if file_input else ""
    final_text = (user + ("\n" if user and from_file else "") + from_file).strip()
    if not final_text:
        raise gr.Error("Please paste/type text or upload a .txt/.docx file.")

    # Limit very long inputs so Spaces don't OOM.
    if len(final_text) > 20000:
        final_text = final_text[:20000] + " ..."

    chunks = chunk_text(final_text, max_len=480)
    if not chunks:
        raise gr.Error("No readable text found.")

    tts = get_tts()

    # Optional voice clone: accept either a str path or a TempFile object.
    speaker_wav = _file_path(voice_ref) if voice_ref is not None else None

    # Synthesize chunk by chunk, then concatenate into one file.
    audios = []
    for ch in chunks:
        audio = tts.tts(text=ch, language=language, speaker_wav=speaker_wav)
        audios.append(audio)

    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
    return safe_concat_wav(audios, _SR, out_path)


# ---------- Gradio UI ----------

LANG_OPTIONS = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Turkish", "tr"),
    ("Russian", "ru"),
    ("Dutch", "nl"),
    ("Chinese (Simplified)", "zh-cn"),
    ("Japanese", "ja"),
    ("Korean", "ko"),
    ("Arabic", "ar"),
]

with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
    gr.Markdown(
        """
        # 🔊 High-Quality Text-to-Speech (Coqui XTTS v2)
        - **Type/paste** text or **upload** `.docx` / `.txt`
        - Optional: upload a short **.wav** (10–30s) to clone voice
        - Click **Generate Audio**
        """
    )
    text_in = gr.Textbox(label="Type or paste text", lines=8,
                         placeholder="Paste text here…")
    file_in = gr.File(label="Drag & drop .docx / .txt (optional)",
                      file_types=[".docx", ".txt"])
    with gr.Row():
        voice_ref = gr.File(label="Optional voice reference (.wav, 10–30s)",
                            file_types=[".wav"])
        lang = gr.Dropdown(
            # (label, value) pairs: user sees "Chinese (Simplified)",
            # the callback still receives the code "zh-cn".
            choices=LANG_OPTIONS,
            value="en",
            label="Language",
        )
    run_btn = gr.Button("🎙️ Generate Audio", variant="primary")
    audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
    download = gr.File(label="Download WAV")
    err_box = gr.Markdown("", elem_id="error_box")

    def run(text_input, file_input, language, voice_ref_file):
        """Click handler: run the pipeline, surface errors in the UI box."""
        try:
            path = synthesize_pipeline(text_input, file_input, language,
                                       voice_ref_file)
            return path, path, ""  # clear errors
        except Exception as e:
            tb = traceback.format_exc()
            # Show a compact, readable error in the UI (last 1500 chars of tb).
            msg = f"**Error:** {e}\n\n```\n{tb[-1500:]}\n```"
            return None, None, msg

    run_btn.click(
        run,
        inputs=[text_in, file_in, lang, voice_ref],
        outputs=[audio_out, download, err_box],
    )

if __name__ == "__main__":
    demo.launch()