Spaces:

eaglelandsonce
/

Text_Audio_Reader

Sleeping

App Files Files Community

Text_Audio_Reader / app.py

eaglelandsonce

Update app.py

caae255 verified 16 days ago

raw

history blame contribute delete

6.36 kB

	import os, io, uuid, re, tempfile, traceback
	from typing import List

	# ---- Make Spaces happy: force CPU & avoid MPS/CUDA surprises ----
	# Each default is applied with setdefault, so a value that is already
	# present in the environment is left untouched. This runs before the
	# numpy/gradio imports further down the file.
	for _env_name, _env_default in (
	    ("CUDA_VISIBLE_DEVICES", ""),
	    ("PYTORCH_ENABLE_MPS_FALLBACK", "1"),
	    # NOTE(review): presumably pre-accepts the Coqui model licence prompt
	    # so the model download does not block on input — confirm.
	    ("COQUI_TOS_AGREED", "1"),
	):
	    os.environ.setdefault(_env_name, _env_default)
	del _env_name, _env_default

	import numpy as np
	import gradio as gr

	# Lazy flags
	# Cached TTS model instance: stays None until get_tts() builds it on first use.
	_TTS = None
	# Sample rate (Hz) passed to the WAV writer; get_tts() may overwrite it
	# once the model has been loaded.
	_SR = 24000 # XTTS v2 typical output rate

	# ---------- Utilities ----------
	# Split point: whitespace that follows sentence-ending punctuation.
	# NOTE: chunk_text() collapses all whitespace to single spaces before it
	# splits, so the "\n" alternative in the look-behind never matches there.
	_SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")

	def chunk_text(text: str, max_len: int = 480) -> List[str]:
	    """Split *text* into chunks of at most *max_len* characters.

	    Whitespace runs are collapsed to single spaces first. Sentences are then
	    packed greedily, joined by one space, into chunks that never exceed
	    *max_len*. A single sentence longer than *max_len* is wrapped at word
	    boundaries instead of being cut at fixed offsets, so words are only
	    broken when one word alone is longer than *max_len*.

	    Args:
	        text: Raw input text.
	        max_len: Maximum chunk length in characters; must be >= 1.

	    Returns:
	        The chunks in reading order; an empty list for blank input.

	    Raises:
	        ValueError: If *max_len* is smaller than 1.
	    """
	    import textwrap

	    if max_len < 1:
	        raise ValueError("max_len must be a positive integer")

	    text = re.sub(r"\s+", " ", text).strip()
	    if not text:
	        return []
	    if len(text) <= max_len:
	        return [text]

	    chunks: List[str] = []
	    buf = ""
	    for sent in _SENT_SPLIT.split(text):
	        sent = sent.strip()
	        if not sent:
	            continue
	        candidate = f"{buf} {sent}" if buf else sent
	        if len(candidate) <= max_len:
	            buf = candidate
	            continue
	        # The sentence does not fit: flush what has been collected so far.
	        if buf:
	            chunks.append(buf)
	            buf = ""
	        if len(sent) <= max_len:
	            buf = sent
	        else:
	            # Very long single sentence: wrap on spaces so the TTS engine is
	            # not handed half-words; hyphenated words are kept whole.
	            chunks.extend(
	                textwrap.wrap(sent, width=max_len, break_on_hyphens=False)
	            )
	    if buf:
	        chunks.append(buf)
	    return chunks

	def read_text_from_file(file_obj) -> str:
	    """Return the text content of an uploaded ``.txt`` or ``.docx`` file.

	    Args:
	        file_obj: Either a filesystem path (``str`` / ``os.PathLike``) or an
	            object exposing the path as ``.name`` (the shape the original
	            code expected from ``gr.File``).

	    Returns:
	        The extracted text, or ``""`` when nothing was uploaded or the path
	        does not exist.

	    Raises:
	        gr.Error: For an unsupported extension, or when a ``.docx`` is
	            uploaded but python-docx cannot be imported.
	    """
	    if not file_obj:
	        return ""

	    # Accept a bare path as well as a file-like wrapper carrying `.name`.
	    if isinstance(file_obj, (str, os.PathLike)):
	        path = os.fspath(file_obj)
	    else:
	        path = getattr(file_obj, "name", None)
	    if not path or not os.path.exists(path):
	        return ""

	    ext = os.path.splitext(path)[1].lower()
	    if ext == ".txt":
	        with open(path, "rb") as f:
	            # "utf-8-sig" drops a leading byte-order mark so it does not end
	            # up in the text handed to the synthesizer; undecodable bytes
	            # are still skipped as before.
	            return f.read().decode("utf-8-sig", errors="ignore")
	    if ext == ".docx":
	        try:
	            import docx
	        except ImportError as exc:
	            raise gr.Error("python-docx not installed. Check requirements.txt") from exc
	        document = docx.Document(path)
	        return "\n".join(p.text for p in document.paragraphs).strip()
	    raise gr.Error("Unsupported file type. Please upload .txt or .docx")

	def get_tts():
	    """Return the shared XTTS v2 model, loading it on the first call.

	    The model is cached in the module-level ``_TTS``; ``_SR`` is updated
	    with the model's output sample rate when one can be found, and falls
	    back to 24000 otherwise.

	    Returns:
	        The cached ``TTS`` instance.

	    Raises:
	        gr.Error: If the Coqui ``TTS`` package cannot be imported.
	    """
	    global _TTS, _SR
	    if _TTS is not None:
	        return _TTS

	    try:
	        from TTS.api import TTS
	    except Exception as e:
	        raise gr.Error(
	            "Coqui TTS is not installed or failed to import. "
	            "Make sure your Space installed requirements.txt.\n\n" + str(e)
	        ) from e

	    # CPU-safe init
	    _TTS = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False, gpu=False)

	    # NOTE(review): the Coqui API object keeps the rate on its `.synthesizer`
	    # rather than on itself, so look there first; the direct attribute is
	    # still consulted for compatibility — confirm against the installed
	    # TTS version.
	    synthesizer = getattr(_TTS, "synthesizer", None)
	    rate = (
	        getattr(synthesizer, "output_sample_rate", None)
	        or getattr(_TTS, "output_sample_rate", None)
	        or 24000
	    )
	    _SR = int(rate)
	    return _TTS

	def safe_concat_wav(chunks_audio: List[np.ndarray], sr: int, out_path: str) -> str:
	    """Write the audio segments back to back into one mono 16-bit PCM WAV.

	    Each segment is flattened, cast to float32, has NaN/Inf samples replaced
	    by 0.0 and is clipped to [-1, 1] before it is appended to the file.

	    Args:
	        chunks_audio: Audio segments in playback order.
	        sr: Sample rate written to the WAV header.
	        out_path: Destination file path.

	    Returns:
	        ``out_path``, unchanged.
	    """
	    import soundfile as sf

	    def _sanitize(samples):
	        # Flatten to 1-D float32, zero out non-finite values, then clamp.
	        flat = np.asarray(samples).flatten().astype("float32")
	        finite = np.nan_to_num(flat, nan=0.0, posinf=0.0, neginf=0.0)
	        return np.clip(finite, -1.0, 1.0)

	    with sf.SoundFile(
	        out_path,
	        mode="w",
	        samplerate=sr,
	        channels=1,
	        subtype="PCM_16",
	    ) as writer:
	        for segment in chunks_audio:
	            writer.write(_sanitize(segment))
	    return out_path

	# ---------- Core pipeline ----------
	def synthesize_pipeline(text_input, file_input, language, voice_ref):
	    """Turn typed and/or uploaded text into a single WAV file.

	    Args:
	        text_input: Text typed into the UI (may be ``None`` or empty).
	        file_input: Optional uploaded ``.txt`` / ``.docx`` file.
	        language: Language code forwarded to the TTS model.
	        voice_ref: Optional reference recording used for voice cloning;
	            either a path or an object exposing the path as ``.name``.

	    Returns:
	        Path of the generated WAV file in the system temp directory.

	    Raises:
	        gr.Error: If no text is available after combining both inputs.
	    """
	    # Gather text: typed text first, then file text, separated by a newline.
	    typed = (text_input or "").strip()
	    uploaded = read_text_from_file(file_input) if file_input else ""
	    final_text = "\n".join(part for part in (typed, uploaded) if part).strip()

	    if not final_text:
	        raise gr.Error("Please paste/type text or upload a .txt/.docx file.")

	    # Limit very long inputs so Spaces don't OOM
	    if len(final_text) > 20000:
	        final_text = final_text[:20000] + " ..."

	    chunks = chunk_text(final_text, max_len=480)
	    if not chunks:
	        raise gr.Error("No readable text found.")

	    tts = get_tts()

	    # Optional voice clone
	    speaker_wav = None
	    if voice_ref is not None:
	        if isinstance(voice_ref, (str, os.PathLike)):
	            speaker_wav = os.fspath(voice_ref)
	        else:
	            speaker_wav = getattr(voice_ref, "name", None)

	    # NOTE(review): XTTS v2 is a multi-speaker model and the Coqui API
	    # rejects a call that supplies neither `speaker` nor `speaker_wav`.
	    # Without a reference recording, fall back to the first built-in
	    # speaker the model reports (if any) so the reference really is
	    # optional — confirm against the installed TTS version.
	    speaker = None
	    if not speaker_wav:
	        speaker = next(iter(getattr(tts, "speakers", None) or ()), None)

	    # Synthesize chunk by chunk, keeping the original order.
	    audios = [
	        tts.tts(text=ch, language=language, speaker=speaker, speaker_wav=speaker_wav)
	        for ch in chunks
	    ]

	    # Write single WAV
	    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
	    return safe_concat_wav(audios, _SR, out_path)

	# ---------- Gradio UI ----------
	# (display label, language code) pairs, in the order offered in the UI.
	LANG_OPTIONS = [
	    ("English", "en"),
	    ("Spanish", "es"),
	    ("French", "fr"),
	    ("German", "de"),
	    ("Italian", "it"),
	    ("Portuguese", "pt"),
	    ("Polish", "pl"),
	    ("Turkish", "tr"),
	    ("Russian", "ru"),
	    ("Dutch", "nl"),
	    ("Chinese (Simplified)", "zh-cn"),
	    ("Japanese", "ja"),
	    ("Korean", "ko"),
	    ("Arabic", "ar"),
	]

	with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
	    # Static usage instructions shown at the top of the page.
	    gr.Markdown(
	        """
	# 🔊 High-Quality Text-to-Speech (Coqui XTTS v2)
	- Type/paste text or upload `.docx` / `.txt`
	- Optional: upload a short .wav (10–30s) to clone voice
	- Click Generate Audio
	"""
	    )

	    # Inputs
	    text_in = gr.Textbox(label="Type or paste text", lines=8, placeholder="Paste text here…")
	    file_in = gr.File(label="Drag & drop .docx / .txt (optional)", file_types=[".docx", ".txt"])
	    with gr.Row():
	        voice_ref = gr.File(label="Optional voice reference (.wav, 10–30s)", file_types=[".wav"])
	        lang = gr.Dropdown(
	            # (label, code) pairs: the dropdown displays the language name
	            # while the callback still receives the language code.
	            choices=LANG_OPTIONS,
	            value="en",
	            label="Language",
	        )
	    run_btn = gr.Button("🎙️ Generate Audio", variant="primary")

	    # Outputs: the same WAV is offered for playback and for download.
	    audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
	    download = gr.File(label="Download WAV")
	    err_box = gr.Markdown("", elem_id="error_box")

	    def run(text_input, file_input, language, voice_ref_file):
	        """Button callback: synthesize audio and route errors to the page.

	        Returns a ``(audio_path, download_path, error_markdown)`` triple.
	        On success both paths are the generated WAV and the error text is
	        empty; on any exception both paths are ``None`` and the error text
	        holds the message plus the last 1500 characters of the traceback.
	        """
	        try:
	            path = synthesize_pipeline(text_input, file_input, language, voice_ref_file)
	        except Exception as e:
	            tb = traceback.format_exc()
	            # Show a compact, readable error in the UI
	            msg = f"Error: {e}\n\n```\n{tb[-1500:]}\n```"
	            return None, None, msg
	        return path, path, ""  # clear errors

	    run_btn.click(
	        run,
	        inputs=[text_in, file_in, lang, voice_ref],
	        outputs=[audio_out, download, err_box],
	    )

	# Launch the Gradio app only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	    demo.launch()