LucaCappelletti94 commited on
Commit
19933fe
·
1 Parent(s): 618f5ab

Initial deployment with LFS

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.wav filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

# Install system dependencies:
# - sox/libsox-dev: audio processing required by qwen-tts (see pyproject note)
# - git: needed for VCS-based package installs
# The apt list cleanup keeps the image layer small.
RUN apt-get update && apt-get install -y \
    sox \
    libsox-dev \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install uv (used below to sync the project's dependencies)
RUN pip install uv

# Create non-root user for HF Spaces (UID 1000 is the Spaces convention)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy project files (owned by the non-root user so uv can write .venv)
COPY --chown=user . .

# Install dependencies
RUN uv sync --no-dev

# Expose port 7860 (HF Spaces default)
EXPOSE 7860

# Run the app
CMD ["uv", "run", "talking-snake", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,17 @@
1
  ---
2
  title: Talking Snake
3
- emoji: 😻
4
- colorFrom: purple
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
- short_description: 'Just a talking snake that reads PDFs and web pages aloud. '
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
  title: Talking Snake
3
+ emoji: 🐍
4
+ colorFrom: green
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
+ app_port: 7860
10
+ suggested_hardware: l4x1
11
  ---
12
 
13
+ # Talking Snake
14
+
15
+ PDF and web page to speech using Qwen3-TTS.
16
+
17
+ Click "Duplicate this Space" to deploy your own instance (L4 or A100 recommended for speed).
pyproject.toml ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "talking-snake"
3
+ version = "0.1.0"
4
+ description = "Just a talking snake that reads PDFs and web pages aloud."
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ requires-python = ">=3.11"
8
+ authors = [{ name = "Luca" }]
9
+ keywords = ["tts", "pdf", "speech", "audiobook", "text-to-speech", "listening"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: End Users/Desktop",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: POSIX :: Linux",
15
+ "Programming Language :: Python :: 3.11",
16
+ "Programming Language :: Python :: 3.12",
17
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
18
+ ]
19
+
20
+ # System dependencies (not installable via pip):
21
+ # - sox: Audio processing tool required by qwen-tts
22
+ # Ubuntu/Debian: sudo apt-get install sox libsox-dev
23
+ # macOS: brew install sox
24
+ # Fedora: sudo dnf install sox sox-devel
25
+
26
+ dependencies = [
27
+ "fastapi>=0.115.0",
28
+ "uvicorn[standard]>=0.32.0",
29
+ "qwen-tts>=0.1.1",
30
+ "torch>=2.5.0",
31
+ "pdfminer.six>=20260107",
32
+ "python-multipart>=0.0.12",
33
+ "jinja2>=3.1.4",
34
+ "httpx>=0.27.0",
35
+ "trafilatura>=2.0.0",
36
+ ]
37
+
38
+ [project.optional-dependencies]
39
+ dev = [
40
+ "pytest>=8.3.0",
41
+ "pytest-asyncio>=0.24.0",
42
+ "pytest-cov>=6.0.0",
43
+ "httpx>=0.27.0",
44
+ "ruff>=0.8.0",
45
+ "mypy>=1.14.0",
46
+ "pre-commit>=4.0.0",
47
+ ]
48
+ # Flash Attention for ~2x faster inference (requires CUDA 11.6+)
49
+ # Install separately: pip install flash-attn --no-build-isolation
50
+ fast = [
51
+ "flash-attn>=2.5.0",
52
+ ]
53
+
54
+ [project.scripts]
55
+ talking-snake = "talking_snake.__main__:main"
56
+
57
+ [build-system]
58
+ requires = ["hatchling"]
59
+ build-backend = "hatchling.build"
60
+
61
+ [tool.hatch.build.targets.wheel]
62
+ packages = ["src/talking_snake"]
63
+
64
+ [tool.pytest.ini_options]
65
+ asyncio_mode = "auto"
66
+ testpaths = ["tests"]
67
+ markers = [
68
+ "slow: marks tests as slow (run with --run-slow)",
69
+ ]
70
+
71
+ [tool.ruff]
72
+ line-length = 100
73
+ target-version = "py311"
74
+
75
+ [tool.ruff.lint]
76
+ select = ["E", "F", "I", "N", "W", "UP"]
77
+
78
+ [tool.ruff.lint.per-file-ignores]
79
+ # PDF xref tables require trailing whitespace per spec
80
+ "tests/conftest.py" = ["W291"]
81
+
82
+ [tool.coverage.run]
83
+ source = ["src/talking_snake"]
84
+ branch = true
85
+ omit = [
86
+ "*/tests/*",
87
+ "*/__main__.py",
88
+ ]
89
+
90
+ [tool.coverage.report]
91
+ exclude_lines = [
92
+ "pragma: no cover",
93
+ "if TYPE_CHECKING:",
94
+ "raise NotImplementedError",
95
+ "if __name__ == .__main__.:",
96
+ "class QwenTTSEngine",
97
+ "def _audio_to_wav",
98
+ "def _split_text",
99
+ "import torch",
100
+ "from qwen_tts",
101
+ ]
102
+ show_missing = true
103
+ skip_covered = true
104
+ fail_under = 70
105
+
106
+ [tool.mypy]
107
+ python_version = "3.11"
108
+ warn_return_any = true
109
+ warn_unused_configs = true
110
+ disallow_untyped_defs = true
111
+ ignore_missing_imports = true
src/talking_snake/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """PDF-to-Speech web server using Qwen3-TTS - listen to any content."""
2
+
3
+ __version__ = "0.1.0"
src/talking_snake/__main__.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI entry point for the Reader server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+
9
def main() -> int:
    """Main entry point for the Reader CLI.

    Parses command-line options, loads the Qwen TTS engine, and serves the
    FastAPI app under uvicorn.

    Returns:
        Exit code: 0 on clean shutdown, 1 if the TTS model fails to load.
    """
    parser = argparse.ArgumentParser(
        # Consistency fix: the installed entry point is `talking-snake`
        # (see [project.scripts] in pyproject.toml), not `reader`.
        prog="talking-snake",
        description="PDF-to-Speech web server - listen to any content",
    )
    parser.add_argument(
        "--voice",
        type=str,
        default=None,
        help="Voice name for TTS. Options: Vivian, Serena, Uncle_Fu, Dylan, Eric, "
        "Ryan, Aiden, Ono_Anna, Sohee (default: auto based on language)",
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        choices=["english", "chinese", "japanese", "korean"],
        help="Language for TTS (default: english). Sets default voice if --voice not specified.",
    )
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to bind the server to (default: 0.0.0.0)",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to bind the server to (default: 8000)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device to run the TTS model on (default: cuda)",
    )
    parser.add_argument(
        "--reload",
        action="store_true",
        help="Enable auto-reload for development",
    )

    args = parser.parse_args()

    # BUG FIX: --reload was accepted but silently ignored. uvicorn can only
    # auto-reload when given an app *import string*, not an app instance,
    # so the honest behavior is to warn rather than stay silent.
    if args.reload:
        print(
            "⚠️  --reload is not supported when passing an app instance; ignoring.",
            file=sys.stderr,
        )

    print("🚀 Starting Reader server...")
    print(f"   Language: {args.language}")
    print(f"   Voice: {args.voice or 'auto'}")
    print(f"   Device: {args.device}")
    print(f"   URL: http://{args.host}:{args.port}")
    print()

    # Import here to avoid slow startup for --help
    import uvicorn

    from talking_snake.app import create_app
    from talking_snake.tts import QwenTTSEngine

    # Initialize TTS engine (loads model weights; may take a while)
    print("📦 Loading TTS model (this may take a moment)...")
    try:
        tts_engine = QwenTTSEngine(
            voice=args.voice,
            language=args.language,
            device=args.device,
        )
    except Exception as e:
        print(f"❌ Failed to load TTS model: {e}", file=sys.stderr)
        return 1

    print("✅ TTS model loaded!")
    print()

    # Create app with engine
    app = create_app(tts_engine=tts_engine)

    # Run server (blocks until shutdown)
    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level="info",
    )

    return 0
100
+
101
+
102
+ if __name__ == "__main__":
103
+ sys.exit(main())
src/talking_snake/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (260 Bytes). View file
 
src/talking_snake/__pycache__/__main__.cpython-312.pyc ADDED
Binary file (3.44 kB). View file
 
src/talking_snake/__pycache__/app.cpython-312.pyc ADDED
Binary file (34.7 kB). View file
 
src/talking_snake/__pycache__/extract.cpython-312.pyc ADDED
Binary file (18.6 kB). View file
 
src/talking_snake/__pycache__/tts.cpython-312.pyc ADDED
Binary file (13.1 kB). View file
 
src/talking_snake/app.py ADDED
@@ -0,0 +1,935 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI application for PDF-to-Speech server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import json
7
+ import queue
8
+ import struct
9
+ import threading
10
+ import time
11
+ import uuid
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING
14
+ from urllib.parse import urlparse
15
+
16
+ import httpx
17
+ import trafilatura
18
+ from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
19
+ from fastapi.responses import HTMLResponse, StreamingResponse
20
+ from fastapi.staticfiles import StaticFiles
21
+ from pydantic import BaseModel
22
+
23
+ from talking_snake.extract import clean_text, extract_text, get_page_count
24
+ from talking_snake.tts import (
25
+ DEFAULT_CHUNK_SIZE,
26
+ LANGUAGE_VOICES,
27
+ MockTTSEngine,
28
+ TTSEngineProtocol,
29
+ )
30
+
31
+ if TYPE_CHECKING:
32
+ from collections.abc import Iterator
33
+
34
+
35
+ # Request timeout for fetching URLs (seconds)
36
+ URL_FETCH_TIMEOUT = 60.0
37
+ # Maximum file size to fetch (50MB)
38
+ MAX_FILE_SIZE = 50 * 1024 * 1024
39
+
40
+ # Initial estimate for time calculation before calibration
41
+ # This value is refined after the first chunk is processed
42
+ # RTX 4090 + flash-attn: ~0.001s/char, RTX 4090: ~0.002s/char, RTX 3060: ~0.005s/char
43
+ INITIAL_SECONDS_PER_CHAR = 0.002 # Optimistic GPU estimate, calibrates after first chunk
44
+
45
+ # Job timeout (seconds) - jobs are cleaned up after this time
46
+ JOB_TIMEOUT = 3600 # 1 hour
47
+
48
+
49
class AudioJob:
    """One audio-generation job whose output is streamed through a queue.

    A producer pushes raw audio byte chunks with ``put_audio`` and ends the
    stream with ``finish`` (or ``set_error``); the consumer drains
    ``audio_queue`` until it receives the ``None`` sentinel.
    """

    def __init__(self, job_id: str):
        self.job_id = job_id
        self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
        self.started = time.time()  # creation time, used for stale-job cleanup
        self.completed = False
        self.error: str | None = None
        self.sample_rate = 24000  # Default, will be set by TTS engine
        self.header_sent = False

    def put_audio(self, audio_bytes: bytes) -> None:
        """Enqueue one chunk of audio for the streaming consumer."""
        self.audio_queue.put(audio_bytes)

    def finish(self) -> None:
        """Mark generation complete and push the end-of-stream sentinel."""
        self.completed = True
        self.audio_queue.put(None)

    def set_error(self, error: str) -> None:
        """Record an error message, then terminate the stream."""
        self.error = error
        self.completed = True
        self.audio_queue.put(None)
75
+
76
+
77
class JobManager:
    """Thread-safe registry of in-flight audio generation jobs."""

    def __init__(self) -> None:
        self._jobs: dict[str, AudioJob] = {}
        self._lock = threading.Lock()

    def create_job(self) -> AudioJob:
        """Register and return a fresh job keyed by a random UUID."""
        job = AudioJob(str(uuid.uuid4()))
        with self._lock:
            self._jobs[job.job_id] = job
            # Opportunistic cleanup: evict stale jobs whenever a new one arrives.
            self._cleanup_old_jobs()
        return job

    def get_job(self, job_id: str) -> AudioJob | None:
        """Look up a job by ID; returns None if unknown or already cleaned up."""
        with self._lock:
            return self._jobs.get(job_id)

    def remove_job(self, job_id: str) -> None:
        """Forget a job; unknown IDs are silently ignored."""
        with self._lock:
            self._jobs.pop(job_id, None)

    def _cleanup_old_jobs(self) -> None:
        """Drop jobs older than JOB_TIMEOUT (caller must hold the lock)."""
        cutoff = time.time() - JOB_TIMEOUT
        stale = [jid for jid, job in self._jobs.items() if job.started < cutoff]
        for jid in stale:
            del self._jobs[jid]
109
+
110
+
111
+ # Global job manager
112
+ _job_manager = JobManager()
113
+
114
+
115
class UrlRequest(BaseModel):
    """Request body for URL-based reading.

    Fields:
        url: HTTP/HTTPS address of a PDF or web page to read aloud
            (scheme validation happens in the handler, not here).
        language: TTS language; defaults to "english".
    """

    url: str
    language: str = "english"
120
+
121
+
122
class TextRequest(BaseModel):
    """Request body for direct text reading.

    Fields:
        text: Raw text to synthesize.
        language: TTS language; defaults to "english".
    """

    text: str
    language: str = "english"
127
+
128
+
129
class EstimateResponse(BaseModel):
    """Response for time estimation.

    Fields:
        text_length: Number of characters in the input text.
        chunk_count: Number of TTS chunks the text splits into.
        estimated_seconds: Predicted synthesis time in seconds.
        estimated_minutes: Presumably estimated_seconds / 60 — the producer
            of this model is not visible here; confirm against the handler.
    """

    text_length: int
    chunk_count: int
    estimated_seconds: float
    estimated_minutes: float
136
+
137
+
138
+ # Global TTS engine instance (set during startup)
139
+ _tts_engine: TTSEngineProtocol | None = None
140
+
141
+
142
def create_app(tts_engine: TTSEngineProtocol | None = None) -> FastAPI:
    """Create and configure the FastAPI application.

    Args:
        tts_engine: TTS engine to use. If None, uses MockTTSEngine.

    Returns:
        Configured FastAPI application.
    """
    global _tts_engine
    _tts_engine = tts_engine or MockTTSEngine()

    app = FastAPI(
        title="Reader",
        description="PDF-to-Speech web server - listen to any content",
        version="0.1.0",
    )

    # Serve bundled static assets when the directory ships with the package.
    static_dir = Path(__file__).parent / "static"
    if static_dir.exists():
        app.mount("/static", StaticFiles(directory=static_dir), name="static")

    # Root page
    app.add_api_route("/", index, methods=["GET"], response_class=HTMLResponse)

    # POST endpoints, registered from a small route table.
    for path, handler in (
        ("/api/read", read_pdf),
        ("/api/read-url", read_url),
        ("/api/read-stream", read_pdf_stream),
        ("/api/read-url-stream", read_url_stream),
        ("/api/read-text-stream", read_text_stream),
    ):
        app.add_api_route(path, handler, methods=["POST"])

    # GET endpoints
    app.add_api_route("/api/audio/{job_id}", stream_audio, methods=["GET"])
    app.add_api_route("/api/languages", get_languages, methods=["GET"])
    app.add_api_route("/api/device-info-stream", stream_device_info, methods=["GET"])
    app.add_api_route("/api/health", health_check, methods=["GET"])

    return app
178
+
179
+
180
async def index(request: Request) -> HTMLResponse:
    """Serve the single-page UI, or a minimal fallback if assets are missing.

    Args:
        request: The incoming request (unused; required by the route signature).

    Returns:
        HTML response with the main page.
    """
    index_file = Path(__file__).parent / "static" / "index.html"

    if index_file.exists():
        return HTMLResponse(content=index_file.read_text())

    # Static bundle not shipped — degrade gracefully instead of erroring.
    return HTMLResponse(
        content="<h1>Reader</h1><p>Static files not found.</p>",
        status_code=200,
    )
199
+
200
+
201
async def read_pdf(file: UploadFile = File(...)) -> StreamingResponse:
    """Read a PDF and return synthesized speech.

    Args:
        file: Uploaded PDF file.

    Returns:
        Streaming WAV audio response.

    Raises:
        HTTPException: If file is not a PDF, is empty, or extraction fails.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Validate file type by extension
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # Read file content
    try:
        pdf_bytes = await file.read()
    except Exception as e:
        # FIX: chain the cause (PEP 3134) so logs show the underlying error.
        raise HTTPException(status_code=400, detail=f"Failed to read file: {e}") from e

    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Empty file")

    # Extract text
    try:
        text = extract_text(pdf_bytes)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to extract text: {e}") from e

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text found in PDF")

    # Stream TTS audio lazily so the response starts before synthesis finishes
    def generate_audio() -> Iterator[bytes]:
        assert _tts_engine is not None
        yield from _tts_engine.synthesize(text)

    return StreamingResponse(
        generate_audio(),
        media_type="audio/wav",
        headers={
            "Content-Disposition": f'inline; filename="{Path(file.filename).stem}.wav"',
        },
    )
250
+
251
+
252
async def read_url(request: UrlRequest) -> StreamingResponse:
    """Read content from a URL (PDF or web page) and return synthesized speech.

    For PDFs: extracts text and removes headers/footers/page numbers.
    For web pages: extracts main article content, removing navigation,
    sidebars, footers, ads, and other boilerplate.

    Args:
        request: Request containing the URL to fetch.

    Returns:
        Streaming WAV audio response.

    Raises:
        HTTPException: If URL is invalid, fetch fails, or extraction fails.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Validate URL
    url = request.url.strip()
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="Only HTTP/HTTPS URLs are supported")

    # Determine if this is a PDF or web page (refined via content-type below)
    is_pdf = parsed.path.lower().endswith(".pdf")

    # Fetch the content
    try:
        async with httpx.AsyncClient(timeout=URL_FETCH_TIMEOUT, follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()

            # Reject oversized downloads early when the server declares a length
            content_length = response.headers.get("content-length")
            if content_length and int(content_length) > MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {MAX_FILE_SIZE // 1024 // 1024}MB",
                )

            content = response.content

            # Re-check after download: content-length may be absent or wrong
            if len(content) > MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {MAX_FILE_SIZE // 1024 // 1024}MB",
                )

            # Also check content-type header to detect PDFs served without .pdf extension
            content_type = response.headers.get("content-type", "").lower()
            if "application/pdf" in content_type:
                is_pdf = True

    except httpx.TimeoutException as e:
        # FIX: chain causes (PEP 3134) so the root error is visible in logs.
        raise HTTPException(status_code=408, detail="Request timed out while fetching URL") from e
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to fetch URL: HTTP {e.response.status_code}",
        ) from e
    except httpx.RequestError as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {e}") from e

    if not content:
        raise HTTPException(status_code=400, detail="Empty content at URL")

    # Extract text based on content type
    if is_pdf:
        try:
            text = extract_text(content)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract PDF text: {e}") from e
    else:
        # Use trafilatura to extract main content from HTML
        # (removes navigation, sidebars, footers, ads, etc.)
        # NOTE(review): `content` is bytes here — trafilatura appears to accept
        # raw bytes, but confirm encoding handling for non-UTF-8 pages.
        try:
            extracted = trafilatura.extract(
                content,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
                favor_precision=True,
            )
            # Apply additional cleaning for TTS when extraction succeeded
            text = clean_text(extracted) if extracted else ""
        except Exception as e:
            raise HTTPException(
                status_code=400, detail=f"Failed to extract page content: {e}"
            ) from e

    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="No readable content found at URL")

    # Derive a download filename from the URL
    filename = Path(parsed.path).stem or parsed.netloc or "document"

    # Stream TTS audio
    def generate_audio() -> Iterator[bytes]:
        assert _tts_engine is not None
        yield from _tts_engine.synthesize(text)

    return StreamingResponse(
        generate_audio(),
        media_type="audio/wav",
        headers={
            # BUG FIX: `filename` was computed but never used — the header
            # previously sent a literal placeholder string instead.
            "Content-Disposition": f'inline; filename="{filename}.wav"',
        },
    )
366
+
367
+
368
async def health_check() -> dict[str, str]:
    """Liveness probe.

    Returns:
        A static status payload; the endpoint responding at all is the signal.
    """
    return {"status": "ok"}
375
+
376
+
377
async def get_languages() -> dict[str, list[str]]:
    """List the languages the TTS layer exposes.

    Returns:
        Mapping with a single "languages" key holding the language names.
    """
    return {"languages": [*LANGUAGE_VOICES]}
384
+
385
+
386
def _get_device_info() -> dict:
    """Snapshot device and model information with real-time memory stats.

    Returns:
        Dict with device type, memory usage, and engine batch/chunk sizes.
    """
    import torch

    info: dict = {
        "device": "cpu",
        "device_name": "CPU",
        "memory_used_gb": 0,
        "memory_total_gb": 0,
        "memory_percent": 0,
        "batch_size": 1,
    }

    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        reserved = torch.cuda.memory_reserved(0)
        allocated = torch.cuda.memory_allocated(0)
        total = props.total_memory

        # Reserved includes PyTorch's cache and better reflects real GPU
        # pressure; fall back to allocated if it is somehow larger.
        used = max(reserved, allocated)

        info.update(
            device="cuda",
            device_name=props.name,
            memory_used_gb=round(used / 1024**3, 1),
            memory_total_gb=round(total / 1024**3, 1),
            memory_percent=round((used / total) * 100, 1) if total > 0 else 0,
            # Allocated is also exposed to make cache-vs-live debugging easy.
            memory_allocated_gb=round(allocated / 1024**3, 1),
        )

    if _tts_engine is not None:
        info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
        info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)

    return info
426
+
427
+
428
async def stream_device_info() -> StreamingResponse:
    """Stream device info updates via SSE.

    Returns:
        SSE stream with device info updates every 3 seconds.
    """
    import asyncio
    from collections.abc import AsyncIterator

    async def generate_events() -> AsyncIterator[str]:
        """Yield one SSE data frame every 3 seconds until the client disconnects."""
        # FIX: use get_running_loop (get_event_loop is deprecated inside a
        # coroutine) and the loop's default executor. The original created a
        # fresh ThreadPoolExecutor per request and never shut it down — a
        # thread/resource leak under repeated connections.
        loop = asyncio.get_running_loop()
        while True:
            try:
                # Run the torch queries off the event loop to avoid blocking it.
                info = await loop.run_in_executor(None, _get_device_info)
                yield f"data: {json.dumps(info)}\n\n"
            except Exception as e:
                # Report the failure in-band and keep the stream alive.
                yield f'data: {{"error": "{e!s}"}}\n\n'
            await asyncio.sleep(3)

    return StreamingResponse(
        generate_events(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
462
+
463
+
464
def _estimate_time(
    text: str, seconds_per_char: float = INITIAL_SECONDS_PER_CHAR
) -> tuple[int, float]:
    """Estimate processing time for text.

    Args:
        text: Text to process.
        seconds_per_char: Calibrated rate (defaults to initial estimate).

    Returns:
        Tuple of (chunk_count, estimated_seconds).
    """
    # Chunks are roughly 500 chars each; ceil-divide but never report
    # fewer than one chunk (even for empty text).
    length = len(text)
    chunk_count = max(1, (length + 499) // 500)
    return chunk_count, length * seconds_per_char
480
+
481
+
482
+ def _create_wav_header(sample_rate: int = 24000, bits_per_sample: int = 16) -> bytes:
483
+ """Create a WAV header for streaming (unknown length).
484
+
485
+ Uses maximum possible file size since we don't know the final length.
486
+
487
+ Args:
488
+ sample_rate: Audio sample rate.
489
+ bits_per_sample: Bits per sample.
490
+
491
+ Returns:
492
+ WAV header bytes.
493
+ """
494
+ channels = 1
495
+ byte_rate = sample_rate * channels * bits_per_sample // 8
496
+ block_align = channels * bits_per_sample // 8
497
+
498
+ # Use maximum size for streaming (will be truncated on close)
499
+ max_size = 0x7FFFFFFF
500
+
501
+ header = io.BytesIO()
502
+ header.write(b"RIFF")
503
+ header.write(struct.pack("<I", max_size))
504
+ header.write(b"WAVE")
505
+ header.write(b"fmt ")
506
+ header.write(struct.pack("<I", 16)) # fmt chunk size
507
+ header.write(struct.pack("<H", 1)) # PCM format
508
+ header.write(struct.pack("<H", channels))
509
+ header.write(struct.pack("<I", sample_rate))
510
+ header.write(struct.pack("<I", byte_rate))
511
+ header.write(struct.pack("<H", block_align))
512
+ header.write(struct.pack("<H", bits_per_sample))
513
+ header.write(b"data")
514
+ header.write(struct.pack("<I", max_size - 36))
515
+
516
+ return header.getvalue()
517
+
518
+
519
def _generate_audio_to_job(
    job: AudioJob,
    text: str,
    tts_engine: TTSEngineProtocol,
    language: str = "english",
    doc_name: str = "document",
    doc_type: str = "text",
    page_count: int | None = None,
) -> Iterator[bytes]:
    """Generate audio with progress events via SSE, streaming audio to job queue.

    This function sends progress events via SSE while simultaneously writing
    audio data to the job's queue for streaming by another endpoint.
    Supports batched GPU inference for faster processing.

    Args:
        job: AudioJob to write audio data to.
        text: Text to synthesize.
        tts_engine: TTS engine to use.
        language: Language for TTS (english, chinese, japanese, korean).
        doc_name: Name of the document being processed.
        doc_type: Type of document (pdf, url, text).
        page_count: Number of pages (for PDFs).

    Yields:
        SSE event frames (bytes): start, progress, error, complete.
    """
    import re

    # Apply language if the engine supports it
    if hasattr(tts_engine, "set_language"):
        tts_engine.set_language(language)

    # Mirror the engine's chunking parameters so progress estimates line up
    chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
    batch_size = getattr(tts_engine, "batch_size", 1)

    # Split text into sentence-packed chunks (same logic as the TTS engine).
    # NOTE(review): this duplicates the engine's splitter; if the engine
    # changes its chunking, the counts here will drift — confirm.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks: list[str] = []
    current_chunk: list[str] = []
    current_length = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if current_length + len(sentence) > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += len(sentence) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    total_chunks = len(chunks) if chunks else 1
    total_chars = sum(len(c) for c in chunks)

    # Use initial estimate before calibration
    seconds_per_char = INITIAL_SECONDS_PER_CHAR
    estimated_total = total_chars * seconds_per_char

    # Send initial progress event with job_id and batch info
    progress_data = {
        "type": "start",
        "job_id": job.job_id,
        "current": 0,
        "total": total_chunks,
        "percent": 0,
        "estimated_remaining": estimated_total,
        "batch_size": batch_size,
        "doc_name": doc_name,
        "doc_type": doc_type,
        "page_count": page_count,
        "total_chars": total_chars,
        "status": f"Starting (batch size: {batch_size})...",
    }
    yield f"event: start\ndata: {json.dumps(progress_data)}\n\n".encode()

    # Generate audio - the TTS engine handles batching internally
    start_time = time.time()
    chunks_processed = 0

    try:
        for audio_bytes in tts_engine.synthesize(text):
            # Write audio to job queue for streaming
            job.put_audio(audio_bytes)
            chunks_processed += 1

            # Calibrate the remaining-time estimate from observed throughput.
            # (chunks_processed >= 1 here, so the division is always safe.)
            elapsed = time.time() - start_time
            time_per_chunk = elapsed / chunks_processed
            remaining = (total_chunks - chunks_processed) * time_per_chunk

            progress_data = {
                "type": "progress",
                "current": chunks_processed,
                "total": total_chunks,
                # BUG FIX: the engine may emit more chunks than our local
                # estimate; clamp so percent never exceeds 100.
                "percent": min(100, int((chunks_processed / total_chunks) * 100)),
                "estimated_remaining": round(max(0, remaining), 1),
                "chars_processed": sum(
                    len(chunks[i]) for i in range(min(chunks_processed, len(chunks)))
                ),
                "total_chars": total_chars,
                "status": f"Processing chunk {chunks_processed}/{total_chunks}",
            }
            yield f"event: progress\ndata: {json.dumps(progress_data)}\n\n".encode()

    except Exception as e:
        error_msg = f"TTS generation failed: {e!s}"
        error_data = {
            "type": "error",
            "message": error_msg,
            "chunk": chunks_processed + 1,
            "total_chunks": total_chunks,
        }
        # Unblock the audio consumer before reporting the error over SSE
        job.set_error(error_msg)
        yield f"event: error\ndata: {json.dumps(error_data)}\n\n".encode()
        return

    # Signal audio generation complete (pushes the queue sentinel)
    job.finish()

    # Send completion event
    total_time = time.time() - start_time
    complete_data = {
        "type": "complete",
        "total_time": round(total_time, 1),
        "chunks_processed": chunks_processed,
        "batch_size": batch_size,
    }
    yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
660
+
661
+
662
async def stream_audio(job_id: str) -> StreamingResponse:
    """Stream the WAV audio generated for a job.

    A single WAV header is sent first; raw PCM chunks are then relayed from
    the job's queue as the TTS worker produces them, so the browser can begin
    playback before generation has finished.

    Args:
        job_id: The job ID to stream audio for.

    Returns:
        Streaming WAV audio response.
    """
    job = _job_manager.get_job(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")

    def pcm_chunks() -> Iterator[bytes]:
        # Exactly one WAV header up front; everything after is raw PCM.
        yield _create_wav_header(sample_rate=24000)

        while True:
            try:
                # Block until the worker hands over data (5 minute timeout).
                chunk = job.audio_queue.get(timeout=300)
            except queue.Empty:
                # Timed out waiting for data - stop streaming.
                break
            if chunk is None:
                # Sentinel: generation finished normally.
                break
            # Individual chunks may arrive as complete WAV files; strip the
            # standard 44-byte PCM header so only one header is ever emitted.
            yield chunk[44:] if chunk[:4] == b"RIFF" else chunk

        # Streaming is over (complete or timed out) - release the job.
        _job_manager.remove_job(job_id)

    return StreamingResponse(
        pcm_chunks(),
        media_type="audio/wav",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
712
+
713
+
714
async def read_pdf_stream(
    file: UploadFile = File(...),
    language: str = Form("english"),
) -> StreamingResponse:
    """Read a PDF with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        file: Uploaded PDF file.
        language: Language for TTS (english, chinese, japanese, korean).

    Returns:
        Streaming response with progress events including job_id.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Unknown languages silently fall back to English.
    if language not in LANGUAGE_VOICES:
        language = "english"

    name = file.filename
    if not name or not name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        pdf_bytes = await file.read()
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")

    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Empty file")

    try:
        text = extract_text(pdf_bytes)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to extract text: {e}")

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text found in PDF")

    # The page count is purely informational for the progress UI;
    # failures here must not abort the request.
    try:
        page_count = get_page_count(pdf_bytes)
    except Exception:
        page_count = None

    job = _job_manager.create_job()

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        _generate_audio_to_job(
            job,
            text,
            _tts_engine,
            language,
            doc_name=name or "document.pdf",
            doc_type="pdf",
            page_count=page_count,
        ),
        media_type="text/event-stream",
        headers=sse_headers,
    )
781
+
782
+
783
async def read_text_stream(request: TextRequest) -> StreamingResponse:
    """Read pasted text with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        request: Text request containing the text to read and language.

    Returns:
        Streaming response with progress events including job_id.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Unknown languages silently fall back to English.
    language = request.language if request.language in LANGUAGE_VOICES else "english"

    raw = request.text.strip()
    if not raw:
        raise HTTPException(status_code=400, detail="Text is required")
    if len(raw) > 500000:  # ~500KB ceiling for pasted text
        raise HTTPException(status_code=400, detail="Text too long (max 500,000 characters)")

    # Normalization can strip everything (e.g. input that is all URLs/markup),
    # so re-validate afterwards.
    text = clean_text(raw)
    if not text.strip():
        raise HTTPException(status_code=400, detail="No readable text provided")

    job = _job_manager.create_job()

    return StreamingResponse(
        _generate_audio_to_job(
            job,
            text,
            _tts_engine,
            language,
            doc_name="Pasted Text",
            doc_type="text",
        ),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
831
+
832
+
833
async def read_url_stream(request: UrlRequest) -> StreamingResponse:
    """Read content from URL with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        request: URL request containing the URL to fetch and language.

    Returns:
        Streaming response with progress events including job_id.

    Raises:
        HTTPException: If the engine is unavailable, the URL is invalid or
            unfetchable, the payload is too large, or no readable content
            is found.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    url = request.url.strip()
    # Unknown languages silently fall back to English.
    language = request.language if request.language in LANGUAGE_VOICES else "english"

    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    # BUGFIX: the scheme check used to live inside this try block, so the
    # HTTPException it raised was swallowed by `except Exception` and
    # re-reported as "Invalid URL: 400: URL must use HTTP or HTTPS".
    # Keeping only urlparse() in the try preserves each error's own message.
    try:
        parsed = urlparse(url)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid URL: {e}")
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="URL must use HTTP or HTTPS")

    # Guess PDF-ness from the extension; the Content-Type header below can
    # still promote an extensionless URL to PDF handling.
    is_pdf = url.lower().endswith(".pdf")

    try:
        async with httpx.AsyncClient(timeout=URL_FETCH_TIMEOUT, follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()

            content_type = response.headers.get("content-type", "").lower()
            if "application/pdf" in content_type:
                is_pdf = True

            if len(response.content) > MAX_FILE_SIZE:
                raise HTTPException(status_code=400, detail="File too large (max 50MB)")

            content = response.content

    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=400, detail=f"Failed to fetch URL: HTTP {e.response.status_code}"
        )
    except httpx.RequestError as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {e}")

    if is_pdf:
        try:
            text = extract_text(content)
            page_count = get_page_count(content)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract PDF text: {e}")
    else:
        page_count = None
        try:
            extracted = trafilatura.extract(
                content,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
                favor_precision=True,
            )
            # trafilatura returns None when it cannot locate main content.
            if extracted:
                text = clean_text(extracted)
            else:
                text = ""
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract page content: {e}")

    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="No readable content found at URL")

    # Derive a display name from the last URL path segment, falling back to
    # the host when the path is empty or just "/".
    url_path = urlparse(url).path
    doc_name = url_path.split("/")[-1] if url_path else url
    if not doc_name or doc_name == "/":
        doc_name = urlparse(url).netloc

    # Create a job for this request
    job = _job_manager.create_job()

    return StreamingResponse(
        _generate_audio_to_job(
            job,
            text,
            _tts_engine,
            language,
            doc_name=doc_name,
            doc_type="pdf" if is_pdf else "url",
            page_count=page_count,
        ),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
src/talking_snake/extract.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF text extraction and cleaning for TTS processing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import re
7
+ from collections import Counter
8
+ from dataclasses import dataclass
9
+
10
+ from pdfminer.high_level import extract_pages
11
+ from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBoxHorizontal, LTTextLineHorizontal
12
+
13
+
14
@dataclass
class TextBlock:
    """A block of text extracted from one PDF page, with layout metadata.

    Produced by ``extract_text_blocks`` and filtered by ``clean_text_blocks``,
    which uses the layout fields to spot headers, footers and footnotes.
    """

    text: str  # block text, stripped of surrounding whitespace
    y_ratio: float  # vertical position of the block's bottom edge: 0.0 = bottom, 1.0 = top
    font_size: float  # average character size over the block (pdfminer LTChar.size)
    page_num: int  # 1-based page number the block appears on
22
+
23
+
24
def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
    """Extract text blocks from PDF with positional information.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        List of TextBlock objects with text and metadata.
    """
    laparams = LAParams(
        line_margin=0.5,
        word_margin=0.1,
        char_margin=2.0,
        boxes_flow=0.5,
    )

    result: list[TextBlock] = []
    stream = io.BytesIO(pdf_bytes)

    for page_index, layout in enumerate(extract_pages(stream, laparams=laparams), start=1):
        if not isinstance(layout, LTPage):
            continue

        height = layout.height

        for box in layout:
            if not isinstance(box, LTTextBoxHorizontal):
                continue

            content = box.get_text().strip()
            if not content:
                continue

            # Only LTChar elements carry a font size; average them over the
            # whole box (fall back to 10.0 when no characters are present).
            sizes = [
                ch.size
                for row in box
                if isinstance(row, LTTextLineHorizontal)
                for ch in row
                if isinstance(ch, LTChar)
            ]
            mean_size = sum(sizes) / len(sizes) if sizes else 10.0

            result.append(
                TextBlock(
                    text=content,
                    # Bottom edge of the box as a ratio: 0 = bottom, 1 = top.
                    y_ratio=box.y0 / height if height > 0 else 0.5,
                    font_size=mean_size,
                    page_num=page_index,
                )
            )

    return result
80
+
81
+
82
def get_page_count(pdf_bytes: bytes) -> int:
    """Get the number of pages in a PDF.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Number of pages in the PDF.
    """
    # Walking the page generator is enough; the layouts themselves are unused.
    pages = extract_pages(io.BytesIO(pdf_bytes), laparams=LAParams())
    return sum(1 for _ in pages)
95
+
96
+
97
def extract_text(pdf_bytes: bytes) -> str:
    """Extract and clean text from a PDF file.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Cleaned text suitable for TTS.
    """
    raw_blocks = extract_text_blocks(pdf_bytes)
    if not raw_blocks:
        return ""

    # Drop headers/footers/page numbers, then join remaining blocks
    # as paragraphs and run the TTS-specific normalization pass.
    kept = clean_text_blocks(raw_blocks)
    joined = "\n\n".join(b.text for b in kept)
    return normalize_for_tts(joined)
115
+
116
+
117
def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
    """Remove headers, footers, page numbers, and other artifacts.

    Applies multiple heuristics:
    1. Remove blocks in top/bottom margins (likely headers/footers)
    2. Remove repeated text across pages (likely running headers)
    3. Remove standalone page numbers
    4. Remove very short lines that look like artifacts

    Args:
        blocks: List of TextBlock objects.

    Returns:
        Filtered list of TextBlock objects.
    """
    if not blocks:
        return []

    # Text recurring on at least half the pages (minimum twice) is treated
    # as a running header/footer.
    occurrences = Counter(b.text for b in blocks)
    page_total = max(b.page_num for b in blocks)
    min_repeats = max(2, page_total // 2)
    boilerplate = {t for t, n in occurrences.items() if n >= min_repeats}

    # Median font size anchors the "small print" heuristic below.
    sorted_sizes = sorted(b.font_size for b in blocks)
    median_size = sorted_sizes[len(sorted_sizes) // 2] if sorted_sizes else 10.0

    def keep(b: TextBlock) -> bool:
        # Header zone: top 10% of the page.
        if b.y_ratio > 0.90:
            return False
        # Footer zone: bottom 10% of the page.
        if b.y_ratio < 0.10:
            return False
        # Repeated running header/footer text.
        if b.text in boilerplate:
            return False
        # Standalone page numbers.
        if is_page_number(b.text):
            return False
        # Short text in notably small font: likely caption/footnote noise.
        if len(b.text) < 20 and b.font_size < median_size * 0.8:
            return False
        return True

    return [b for b in blocks if keep(b)]
171
+
172
+
173
def is_page_number(text: str) -> bool:
    """Check if text is likely a standalone page number.

    Recognizes plain digits, well-formed Roman numerals, "Page N" /
    "N of M" patterns, and decorated numbers such as "- 7 -".

    Args:
        text: Text to check.

    Returns:
        True if text appears to be a page number.
    """
    text = text.strip()
    if not text:
        return False

    # Pure number, e.g. "12"
    if text.isdigit():
        return True

    # Well-formed Roman numerals only. BUGFIX: the previous check matched
    # any run of the letters i/v/x/l/c/d/m, so ordinary words such as
    # "civil", "mild" or "did" were misclassified as page numbers and
    # silently dropped from the document. A strict Roman-numeral grammar
    # (standard subtractive notation) avoids those false positives.
    roman = r"m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})"
    if re.fullmatch(roman, text, flags=re.IGNORECASE):
        return True

    # "Page N" or "N of M" patterns
    if re.match(r"^(page\s*)?\d+(\s*(of|/)\s*\d+)?$", text, re.IGNORECASE):
        return True

    # "- N -" pattern
    if re.match(r"^[-–—]\s*\d+\s*[-–—]$", text):
        return True

    return False
201
+
202
+
203
def clean_text(text: str) -> str:
    """Clean raw text for TTS processing.

    This is a simpler function for cleaning already-extracted text,
    without the positional information.

    Args:
        text: Raw text to clean.

    Returns:
        Cleaned text suitable for TTS.
    """
    # Keep only substantive lines: non-empty, not a bare page number,
    # and at least 3 characters long (shorter lines are artifacts).
    kept = [
        stripped
        for stripped in (line.strip() for line in text.split("\n"))
        if stripped and not is_page_number(stripped) and len(stripped) >= 3
    ]
    result = "\n".join(kept)

    # --- Fix hyphenated / split words (common in PDFs and web content) ---
    # "word-\nword" and "word-\n  word": rejoin across the line break.
    result = re.sub(r"(\w)-\n\s*(\w)", r"\1\2", result)
    result = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", result)
    # "word- word" (hyphen + space, often from copy-paste).
    result = re.sub(r"(\w)- (\w)", r"\1\2", result)
    # Trailing hyphen followed by a lowercase continuation.
    result = re.sub(r"-\n([a-z])", r"\1", result)

    # --- Fix line-break artifacts ---
    # Text wrapped at a fixed width: turn single newlines into spaces while
    # keeping blank lines as paragraph separators.
    result = re.sub(r"(?<![.!?:;\n])\n(?!\n)", " ", result)

    # Collapse runs of blank lines and horizontal whitespace.
    result = re.sub(r"\n{3,}", "\n\n", result)
    result = re.sub(r"[ \t]+", " ", result)

    # Final TTS-oriented normalization pass.
    return normalize_for_tts(result).strip()
269
+
270
+
271
def normalize_for_tts(text: str) -> str:
    """Normalize text for natural TTS pronunciation.

    Handles special characters, punctuation, and formatting that can
    cause TTS models to slow down or mispronounce. The order of the
    passes matters: technical noise (URLs, hashes, markup) is removed
    first, then operators/symbols are verbalized, then spacing is fixed.

    Args:
        text: Text to normalize.

    Returns:
        Normalized text optimized for TTS.
    """
    # === REMOVE URLS AND TECHNICAL STRINGS FIRST ===
    # URLs (various formats) - remove completely
    text = re.sub(r"https?://[^\s<>\"')\]]+", "", text)
    text = re.sub(r"www\.[^\s<>\"')\]]+", "", text)
    text = re.sub(r"ftp://[^\s<>\"')\]]+", "", text)

    # UUIDs (with or without dashes) - must come before git hash pattern
    uuid_pattern = (
        r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-" r"[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
    )
    text = re.sub(uuid_pattern, "", text)

    # Git commit hashes (7-40 hex chars standalone)
    text = re.sub(r"(?<![a-zA-Z0-9])[0-9a-f]{7,40}(?![a-zA-Z0-9])", "", text, flags=re.IGNORECASE)

    # Hex color codes (#fff, #ffffff)
    text = re.sub(r"#[0-9a-fA-F]{3,8}\b", "", text)

    # Long hex/base64 strings (likely encoded data)
    text = re.sub(r"\b[A-Za-z0-9+/]{20,}={0,2}\b", "", text)

    # File paths (Unix and Windows style)
    text = re.sub(r"[/\\][\w./\\-]+\.\w+", "", text)

    # IP addresses
    text = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "", text)

    # Port numbers after colon
    text = re.sub(r":\d{2,5}\b", "", text)

    # Remove email addresses
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", text)

    # SHA/MD5 style hashes with prefix
    text = re.sub(r"\b(sha\d*|md5|hash)[:\s]*[0-9a-f]+\b", "", text, flags=re.IGNORECASE)

    # CamelCase: split into words (e.g., "getUserName" -> "get User Name")
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)

    # snake_case: replace underscores with spaces
    text = re.sub(r"(\w)_(\w)", r"\1 \2", text)

    # Function calls: "func()" -> "func"
    text = re.sub(r"(\w+)\(\)", r"\1", text)

    # Arrow functions/operators: -> and =>
    text = text.replace("->", " returns ")
    text = text.replace("=>", " arrow ")

    # Common code operators spoken naturally.
    # BUGFIX: longest operators must be replaced first. Previously "!=" and
    # "==" ran before "!==" and "===", so the strict forms could never match
    # ("===" had already become " equals =").
    text = text.replace("!==", " strictly not equals ")
    text = text.replace("===", " strictly equals ")
    text = text.replace("!=", " not equals ")
    text = text.replace("==", " equals ")
    text = text.replace("&&", " and ")
    text = text.replace("||", " or ")
    text = text.replace("++", " increment ")
    text = text.replace("--", " decrement ")

    # File extensions: ".py" -> " dot py" (only for common extensions)
    ext_pattern = r"\.(py|js|ts|html|css|json|xml|md|txt|csv|pdf)\b"
    text = re.sub(ext_pattern, r" dot \1", text, flags=re.IGNORECASE)

    # Remove standalone hashes/pound signs (not hashtags)
    text = re.sub(r"(?<!\w)#(?!\w)", "", text)

    # Backticks (often used in markdown for code)
    text = text.replace("`", "")

    # Triple quotes
    text = text.replace('"""', "")
    text = text.replace("'''", "")

    # === UNICODE NORMALIZATION ===
    # Convert smart quotes to plain ASCII quotes. BUGFIX: written with \u
    # escapes because a previous revision's literal curly-quote characters
    # were mangled by copy/paste, leaving a syntax error ("""-style) and
    # no-op replacements ('' -> '').
    text = text.replace("\u201c", '"').replace("\u201d", '"')  # left/right double quote
    text = text.replace("\u2018", "'").replace("\u2019", "'")  # left/right single quote
    text = text.replace("\u201e", '"').replace("\u201f", '"')  # low-9 / reversed double quote

    # Normalize dashes to standard hyphen or remove
    text = text.replace("\u2013", "-")  # en-dash
    text = text.replace("\u2014", " - ")  # em-dash (add spaces for pause)
    text = text.replace("\u2015", " - ")  # horizontal bar
    text = text.replace("\u2010", "-")  # Unicode hyphen
    text = text.replace("\u2011", "-")  # non-breaking hyphen
    text = text.replace("\u2043", "-")  # hyphen bullet
    text = text.replace("\u2212", "-")  # minus sign

    # Normalize ellipsis
    text = text.replace("\u2026", "...")
    text = re.sub(r"\.{4,}", "...", text)  # Limit to 3 dots

    # Normalize other Unicode punctuation
    text = text.replace("•", ",")  # Bullet points
    text = text.replace("·", " ")  # Middle dot
    text = text.replace("‧", " ")  # Hyphenation point
    text = text.replace("※", " ")  # Reference mark
    text = text.replace("†", "")  # Dagger (footnote)
    text = text.replace("‡", "")  # Double dagger
    text = text.replace("§", "section ")
    text = text.replace("¶", "")  # Pilcrow
    text = text.replace("©", "copyright ")
    text = text.replace("®", " registered ")
    text = text.replace("™", " trademark ")
    text = text.replace("°", " degrees ")

    # === SPACING AROUND PUNCTUATION ===
    # Ensure proper spacing around dashes used as separators
    text = re.sub(r"\s*-\s*-\s*", " - ", text)  # Double dash
    text = re.sub(r"(\w)\s*-\s*(\w)", r"\1 - \2", text)  # Word-dash-word with spaces

    # Fix missing space after punctuation
    text = re.sub(r"([.!?])([A-Z])", r"\1 \2", text)
    text = re.sub(r",([A-Za-z])", r", \1", text)

    # Fix multiple punctuation marks
    text = re.sub(r"[,]{2,}", ",", text)
    text = re.sub(r"[;]{2,}", ";", text)
    text = re.sub(r"[:]{2,}", ":", text)
    text = re.sub(r"[!]{2,}", "!", text)
    text = re.sub(r"[?]{2,}", "?", text)

    # === NUMBERS AND SPECIAL NOTATIONS ===
    # Convert common fractions
    text = text.replace("½", " one half ")
    text = text.replace("⅓", " one third ")
    text = text.replace("⅔", " two thirds ")
    text = text.replace("¼", " one quarter ")
    text = text.replace("¾", " three quarters ")
    text = text.replace("⅕", " one fifth ")
    text = text.replace("⅖", " two fifths ")
    text = text.replace("⅗", " three fifths ")
    text = text.replace("⅘", " four fifths ")
    text = text.replace("⅙", " one sixth ")
    text = text.replace("⅚", " five sixths ")
    text = text.replace("⅛", " one eighth ")
    text = text.replace("⅜", " three eighths ")
    text = text.replace("⅝", " five eighths ")
    text = text.replace("⅞", " seven eighths ")

    # Handle percentage and math symbols.
    # BUGFIX: strip leftover markup tags *before* verbalizing "<" and ">".
    # The tag-removal regex previously ran in the cleanup section, after
    # every "<" had already become " less than ", so it could never match.
    text = re.sub(r"<[^>]+>", "", text)
    text = text.replace("%", " percent")
    text = text.replace("&", " and ")
    text = text.replace("+", " plus ")
    text = text.replace("=", " equals ")
    text = text.replace("<", " less than ")
    text = text.replace(">", " greater than ")
    text = text.replace("≤", " less than or equal to ")
    text = text.replace("≥", " greater than or equal to ")
    text = text.replace("≠", " not equal to ")
    text = text.replace("±", " plus or minus ")
    text = text.replace("×", " times ")
    text = text.replace("÷", " divided by ")

    # === ABBREVIATIONS AND SPECIAL CASES ===
    # Common abbreviations that might cause issues
    text = re.sub(r"\be\.g\.", "for example", text, flags=re.IGNORECASE)
    text = re.sub(r"\bi\.e\.", "that is", text, flags=re.IGNORECASE)
    text = re.sub(r"\betc\.", "etcetera", text, flags=re.IGNORECASE)
    text = re.sub(r"\bvs\.", "versus", text, flags=re.IGNORECASE)
    text = re.sub(r"\bDr\.", "Doctor", text)
    text = re.sub(r"\bMr\.", "Mister", text)
    text = re.sub(r"\bMrs\.", "Missus", text)
    text = re.sub(r"\bMs\.", "Miss", text)
    text = re.sub(r"\bProf\.", "Professor", text)
    text = re.sub(r"\bSt\.", "Saint", text)
    text = re.sub(r"\bNo\.\s*(\d)", r"Number \1", text)
    text = re.sub(r"\bFig\.", "Figure", text, flags=re.IGNORECASE)
    text = re.sub(r"\bVol\.", "Volume", text, flags=re.IGNORECASE)
    text = re.sub(r"\bpp\.", "pages", text, flags=re.IGNORECASE)
    text = re.sub(r"\bp\.\s*(\d)", r"page \1", text, flags=re.IGNORECASE)

    # === BRACKETS AND PARENTHESES ===
    # Remove or simplify brackets that might cause pauses
    text = re.sub(r"\[([^\]]+)\]", r"(\1)", text)  # Square to round
    text = re.sub(r"\{([^}]+)\}", r"(\1)", text)  # Curly to round

    # Remove citation numbers like [1], [2,3], [1-5]
    # (square citations were converted to round above, so the second
    # pattern removes both forms)
    text = re.sub(r"\[\d+(?:[-,]\d+)*\]", "", text)
    text = re.sub(r"\(\d+(?:[-,]\d+)*\)", "", text)

    # === CLEANUP ===
    # Remove standalone special characters
    text = re.sub(r"\s+[#@*^~`|\\]+\s+", " ", text)

    # Normalize multiple spaces
    text = re.sub(r"[ \t]+", " ", text)

    # Remove spaces before punctuation
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)

    # Ensure space after punctuation (but not before another punctuation)
    text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)

    # Remove leading/trailing whitespace from lines
    text = "\n".join(line.strip() for line in text.split("\n"))

    # Remove empty lines that resulted from cleaning
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text
src/talking_snake/static/app.js ADDED
@@ -0,0 +1,773 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Talking Snake - Main Application Script
 * Handles file upload, URL submission, and audio streaming
 */

// DOM Elements, looked up once at load time. Each of these IDs/classes is
// expected to exist in the page markup.
const dropZone = document.getElementById("dropZone");
const fileInput = document.getElementById("fileInput");
const urlInput = document.getElementById("urlInput");
const urlSubmit = document.getElementById("urlSubmit");
const textInput = document.getElementById("textInput");
const textSubmit = document.getElementById("textSubmit");
const status = document.getElementById("status");
const player = document.getElementById("player");
const audio = document.getElementById("audio");
const filename = document.getElementById("filename");
const tabs = document.querySelectorAll(".tab");
const tabContents = document.querySelectorAll(".tab-content");
const inputSection = document.getElementById("inputSection");
const processingSection = document.getElementById("processingSection");
const stopBtn = document.getElementById("stopBtn");
const pauseBtn = document.getElementById("pauseBtn");
const deviceInfo = document.getElementById("deviceInfo");
const docInfo = document.getElementById("docInfo");
const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
const processingProgressBar = document.getElementById("processingProgressBar");

// Custom player elements
const playerPlayBtn = document.getElementById("playerPlayBtn");
const progressBar = document.getElementById("progressBar");
const progressSlider = document.getElementById("progressSlider");
const timeDisplay = document.getElementById("timeDisplay");
const volumeBtn = document.getElementById("volumeBtn");
const downloadBtn = document.getElementById("downloadBtn");

// Constants
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB

// Mutable application state shared by the handlers below.
let currentAbortController = null; // presumably aborts the in-flight request — confirm against handlers
let selectedLanguage = "english"; // currently selected TTS language
let isPaused = false;
let estimatedDuration = 0; // Estimated total duration from server
let isMuted = false; // mirrored onto audio.muted (see toggleMute)
let currentAudioBlob = null; // Store audio blob for download
let currentDocName = ""; // Store document name for download filename
47
+
48
/**
 * Format a duration in seconds as M:SS (e.g. 65 -> "1:05").
 * Negative, NaN and infinite inputs render as "0:00".
 */
function formatTime(seconds) {
    if (!isFinite(seconds) || seconds < 0) {
        return "0:00";
    }
    const wholeSeconds = Math.floor(seconds);
    const mins = Math.floor(wholeSeconds / 60);
    const secs = wholeSeconds % 60;
    return mins + ":" + String(secs).padStart(2, "0");
}
59
+
60
/**
 * Format a number in human-readable form: 1500 -> "1.5K", 3400000 -> "3.4M".
 * Values below 1000 are returned unchanged as a string.
 */
function formatNumber(num) {
    // One decimal place, with a trailing ".0" trimmed ("1.0K" -> "1K").
    const scaled = (value, suffix) => value.toFixed(1).replace(/\.0$/, "") + suffix;

    if (num >= 1000000) {
        return scaled(num / 1000000, "M");
    }
    if (num >= 1000) {
        return scaled(num / 1000, "K");
    }
    return num.toString();
}
72
+
73
/**
 * Map a document type to its Font Awesome icon class.
 * Unknown types fall back to the generic file icon.
 */
function getDocTypeIcon(docType) {
    const icons = {
        pdf: "fa-file-pdf",
        url: "fa-link",
        text: "fa-file-lines",
    };
    // hasOwnProperty guards against prototype keys ("constructor", etc.).
    return Object.prototype.hasOwnProperty.call(icons, docType)
        ? icons[docType]
        : "fa-file";
}
84
+
85
/**
 * Render the document info bar (name, page count, character count).
 *
 * SECURITY FIX: the document name originates from user input (uploaded
 * filename, fetched URL, or pasted text) and was previously interpolated
 * into innerHTML unescaped, allowing markup/script injection. It is now
 * HTML-escaped before insertion; the numeric fields are used as-is.
 */
function updateDocInfo(data) {
  // Minimal HTML escaper, safe for both text and double-quoted attribute contexts.
  const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => ({
      "&": "&amp;",
      "<": "&lt;",
      ">": "&gt;",
      '"': "&quot;",
      "'": "&#39;",
    }[ch]));

  const icon = getDocTypeIcon(data.doc_type);
  const docName = escapeHtml(data.doc_name || "Document");
  const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
  const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";

  docInfo.innerHTML = `
    <span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
    ${pageInfo}
    ${charInfo}
  `;
}
100
+
101
/**
 * Sync the custom player's progress bar, slider, and time readout with the
 * <audio> element. While streaming, the browser may report a nonsensical
 * duration, so fall back to the server's estimate in that case.
 */
function updatePlayerProgress() {
  const elapsed = audio.currentTime || 0;
  let total = audio.duration;
  const durationUnusable = !isFinite(total) || total > 36000 || total <= 0;
  if (durationUnusable) {
    total = estimatedDuration || elapsed + 60; // Fallback
  }

  const percent = total > 0 ? (elapsed / total) * 100 : 0;
  progressBar.style.width = `${Math.min(percent, 100)}%`;
  progressSlider.value = percent;
  timeDisplay.textContent = `${formatTime(elapsed)} / ${formatTime(total)}`;
}
117
+
118
/**
 * Seek the audio element to the position chosen on the progress slider.
 * @param {Event} e - Slider "input" event; target.value is a 0-100 percent.
 */
function handleSeek(e) {
  const percent = parseFloat(e.target.value);
  // Mirror updatePlayerProgress: distrust absurd streaming durations.
  let total = audio.duration;
  if (!isFinite(total) || total > 36000) {
    total = estimatedDuration || 60;
  }
  audio.currentTime = (percent / 100) * total;
  updatePlayerProgress();
}
130
+
131
/**
 * Toggle playback from the custom player's play/pause button.
 */
function togglePlayerPlay() {
  if (audio.paused) {
    audio.play().catch(() => {});
    return;
  }
  audio.pause();
}

/**
 * Reflect the audio element's paused state on the play button icon.
 */
function updatePlayButton() {
  const icon = playerPlayBtn.querySelector("i");
  icon.className = audio.paused ? "fa-solid fa-play" : "fa-solid fa-pause";
}

/**
 * Toggle the muted state and swap the volume icon to match.
 */
function toggleMute() {
  isMuted = !isMuted;
  audio.muted = isMuted;
  volumeBtn.querySelector("i").className = isMuted
    ? "fa-solid fa-volume-xmark"
    : "fa-solid fa-volume-high";
}
163
+
164
/**
 * Render the device info panel from an SSE payload.
 * @param {Object} info - Device info object (device, device_name,
 *   memory_*_gb, memory_percent, batch_size).
 */
function updateDeviceInfo(info) {
  const onGpu = info.device === "cuda";
  const icon = onGpu ? "fa-microchip" : "fa-server";
  const memoryInfo = onGpu
    ? `${info.memory_used_gb}GB / ${info.memory_total_gb}GB (${info.memory_percent}%)`
    : "CPU mode";
  deviceInfo.innerHTML = `
    <i class="fa-solid ${icon}"></i>
    <span>${info.device_name}</span>
    <span class="device-memory">${memoryInfo}</span>
    <span class="device-batch">Batch: ${info.batch_size}</span>
  `;
  deviceInfo.classList.add("visible");
}
181
+
182
/**
 * Open the device-info SSE stream and keep the panel updated.
 * On connection errors the source is closed and re-opened after 5s.
 */
function initDeviceInfoStream() {
  const source = new EventSource("/api/device-info-stream");

  source.onmessage = (event) => {
    try {
      updateDeviceInfo(JSON.parse(event.data));
    } catch {
      // Device info is best-effort; ignore malformed payloads.
    }
  };

  source.onerror = () => {
    // Close this source and retry with a fresh connection after a delay.
    source.close();
    setTimeout(initDeviceInfoStream, 5000);
  };
}
203
+
204
// Begin streaming live device info.
initDeviceInfoStream();

// Wire up the custom player controls.
playerPlayBtn.addEventListener("click", togglePlayerPlay);
progressSlider.addEventListener("input", handleSeek);
volumeBtn.addEventListener("click", toggleMute);
["play", "pause"].forEach((evt) => audio.addEventListener(evt, updatePlayButton));
audio.addEventListener("timeupdate", updatePlayerProgress);
audio.addEventListener("ended", () => {
  updatePlayButton();
  progressBar.style.width = "100%";
});
// Reveal the pause control only once playback has actually begun.
audio.addEventListener("playing", () => {
  pauseBtn.classList.remove("hidden");
});
222
+
223
/**
 * Fetch the finished audio as a Blob so it can be downloaded later.
 * On success the download button is revealed; failures are logged only,
 * since downloading is an optional nicety.
 * @param {string} jobId - The job ID for the audio.
 */
async function fetchAudioBlob(jobId) {
  try {
    const response = await fetch(`/api/audio/${jobId}`);
    if (!response.ok) {
      return;
    }
    currentAudioBlob = await response.blob();
    downloadBtn.classList.remove("hidden");
  } catch (error) {
    console.error("Failed to fetch audio for download:", error);
  }
}
239
+
240
/**
 * Trigger a browser download of the current audio as a .wav file.
 * No-op when no audio blob has been fetched yet.
 */
function downloadAudio() {
  if (!currentAudioBlob) {
    return;
  }

  // Derive the download name from the document, swapping its extension for .wav.
  const baseName = (currentDocName || "audio").replace(/\.[^.]+$/, "");
  const objectUrl = URL.createObjectURL(currentAudioBlob);

  const link = document.createElement("a");
  link.href = objectUrl;
  link.download = `${baseName}.wav`;

  // The anchor must be in the DOM for click() to work in all browsers.
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  URL.revokeObjectURL(objectUrl);
}
263
+
264
/**
 * @returns {string} The currently selected language name.
 */
function getSelectedLanguage() {
  return selectedLanguage;
}

/**
 * Switch the UI back to the input view.
 */
function showInputSection() {
  inputSection.classList.remove("hidden");
  processingSection.classList.remove("visible");
}

/**
 * Switch the UI to the processing view with a fresh progress bar and
 * the pause control hidden until playback starts.
 */
function showProcessingSection() {
  inputSection.classList.add("hidden");
  processingSection.classList.add("visible");
  processingProgressBar.style.width = "0%";
  pauseBtn.classList.add("hidden");
}

/**
 * Display a status message to the user.
 * @param {string} message - HTML message to display.
 * @param {string} type - Status type: 'loading', 'error', or 'success'.
 */
function showStatus(message, type) {
  status.innerHTML = message;
  status.className = `status visible ${type}`;
}
300
+
301
/**
 * Abort the in-flight generation request, tear down audio playback, and
 * return the UI to the input view.
 */
function stopGeneration() {
  // Cancel the streaming fetch, if one is active.
  if (currentAbortController) {
    currentAbortController.abort();
    currentAbortController = null;
  }

  // Halt playback and release the audio element's resources.
  audio.pause();
  audio.currentTime = 0;
  audio.src = "";
  audio.load(); // Force release of audio resources

  // Reset pause state and controls.
  isPaused = false;
  updatePauseButton();
  downloadBtn.classList.add("hidden");
  pauseBtn.classList.add("hidden");
  currentAudioBlob = null;

  // Reset the processing progress bar.
  processingProgressBar.style.width = "0%";

  showStatus('<i class="fa-solid fa-ban"></i> Generation stopped', "error");
  showInputSection();
}
332
+
333
// Stop playback when the page is closed, navigated away, or hidden.
// "pagehide" is registered as well because it fires more reliably on
// mobile browsers and during back/forward navigation.
for (const evt of ["beforeunload", "pagehide"]) {
  window.addEventListener(evt, () => {
    audio.pause();
    audio.src = "";
  });
}
344
+
345
/**
 * Pause or resume playback from the processing controls.
 */
function togglePause() {
  if (audio.paused) {
    audio.play().catch(() => {});
    isPaused = false;
  } else {
    audio.pause();
    isPaused = true;
  }
  updatePauseButton();
}

/**
 * Sync the pause button's icon and tooltip with the playback state.
 */
function updatePauseButton() {
  const icon = pauseBtn.querySelector("i");
  const showResume = isPaused || audio.paused;
  icon.className = showResume ? "fa-solid fa-play" : "fa-solid fa-pause";
  pauseBtn.title = showResume ? "Resume" : "Pause";
}
372
+
373
/**
 * Format remaining time for display.
 * @param {number} seconds - Remaining time in seconds.
 * @returns {string} "~N min remaining" above a minute, else "~Ns remaining".
 */
function formatTimeRemaining(seconds) {
  return seconds > 60
    ? `~${Math.ceil(seconds / 60)} min remaining`
    : `~${Math.ceil(seconds)}s remaining`;
}
384
+
385
/**
 * Process the server's SSE stream of progress updates for a generation job.
 * Starts audio playback as soon as the "start" event provides a job_id,
 * drives the progress UI from "progress" events, and finalizes the player
 * on "complete".
 *
 * Fix: SSE lines can be split across network chunks. The previous version
 * split each decoded chunk by "\n" independently, silently dropping any
 * "data: " line that straddled a read boundary, and distinguished real
 * errors from JSON parse errors by inspecting the error message. Incomplete
 * lines are now buffered between reads, and parsing is separated from
 * event dispatch so no message sniffing is needed.
 *
 * @param {Response} response - Fetch response with SSE stream.
 * @param {string} docName - Document name for display.
 * @returns {Promise<void>}
 * @throws {Error} If the stream contains an error event or fails.
 */
async function processStream(response, docName) {
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let lastStatus = "";
  let jobId = null;
  let audioStarted = false;
  let buffer = ""; // Holds a partial SSE line between chunk reads

  // Reset estimated duration
  estimatedDuration = 0;

  // Dispatch a single parsed SSE event. Throws on server-reported errors.
  const handleEvent = (data) => {
    if (data.type === "error") {
      throw new Error(data.message || "TTS generation failed");
    } else if (data.type === "start" && data.job_id) {
      // Got job ID - start audio stream immediately
      jobId = data.job_id;
      // Capture initial duration estimate
      if (data.estimated_remaining) {
        estimatedDuration = data.estimated_remaining;
      }
      // Display document info
      updateDocInfo(data);
      if (!audioStarted) {
        audioStarted = true;
        // Set audio source to the stream endpoint; the browser starts
        // playing as data arrives.
        audio.src = `/api/audio/${jobId}`;
        audio.load();
        // Try to play (may need user interaction first time)
        audio.play().catch(() => {
          // Autoplay blocked - will play when user clicks
        });
        updatePlayButton();
        // Pause button will be shown by the 'playing' event listener
      }
      const timeStr = formatTimeRemaining(data.estimated_remaining);
      showStatus(
        `<span class="spinner"></span>ETA ${timeStr}`,
        "loading"
      );
      processingProgressBar.style.width = "5%";
    } else if (data.type === "progress") {
      lastStatus = data.status;
      const timeStr = formatTimeRemaining(data.estimated_remaining);
      showStatus(
        `<span class="spinner"></span>${data.percent}% • ETA ${timeStr}`,
        "loading"
      );
      processingProgressBar.style.width = `${data.percent}%`;
    } else if (data.type === "complete") {
      // Generation complete - show the player.
      if (data.total_time) {
        // Keep a rough lower bound on duration once generation finishes.
        estimatedDuration = Math.max(estimatedDuration, audio.currentTime + 10);
      }
      filename.textContent = docName;
      currentDocName = docName;
      player.classList.add("visible");
      processingProgressBar.style.width = "100%";
      showInputSection();
      showStatus(
        `<i class="fa-solid fa-circle-check"></i> Done in ${data.total_time}s`,
        "success"
      );
      updatePlayerProgress();

      // Fetch audio blob for download capability
      if (jobId) {
        fetchAudioBlob(jobId);
      }
    }
  };

  // Parse one raw SSE line; returns the payload object or null for
  // non-data lines and malformed/partial JSON.
  const parseLine = (line) => {
    if (!line.startsWith("data: ")) {
      return null;
    }
    try {
      return JSON.parse(line.slice(6));
    } catch {
      return null; // Ignore malformed payloads
    }
  };

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        break;
      }

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      // The last element may be an incomplete line; keep it for next read.
      buffer = lines.pop();

      for (const line of lines) {
        const data = parseLine(line);
        if (data) {
          handleEvent(data);
        }
      }
    }

    // Flush a final complete line left in the buffer (stream ended
    // without a trailing newline).
    const tail = parseLine(buffer);
    if (tail) {
      handleEvent(tail);
    }
  } catch (streamError) {
    // Re-throw with more context and preserve the original cause
    const context = lastStatus ? ` (during: ${lastStatus})` : "";
    throw new Error(`Stream error${context}: ${streamError.message}`, { cause: streamError });
  }
}
499
+
500
/**
 * Handle file upload and TTS conversion.
 * Validates type and size, streams progress via SSE, and starts playback.
 * @param {File} file - The uploaded file.
 */
async function handleFile(file) {
  // Validate file type
  if (!file.name.toLowerCase().endsWith(".pdf")) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please select a PDF file', "error");
    return;
  }

  // Validate file size
  if (file.size > MAX_FILE_SIZE) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> File too large. Maximum size is 50MB.', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Extracting text...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;

  const formData = new FormData();
  formData.append("file", file);
  formData.append("language", getSelectedLanguage());

  // Abort controller lets the Stop button cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-stream", {
      method: "POST",
      body: formData,
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      // Fix: the error body is usually JSON with a "detail" field, but a
      // proxy or crash can return non-JSON; don't let response.json()
      // throw a confusing secondary error.
      let detail = "Failed to process document";
      try {
        const error = await response.json();
        if (error.detail) {
          detail = error.detail;
        }
      } catch {
        // Keep the generic message.
      }
      throw new Error(detail);
    }

    // Process stream handles both progress SSE and starting audio playback
    await processStream(response, file.name);
  } catch (error) {
    if (error.name === "AbortError") {
      // User cancelled - already handled in stopGeneration
      return;
    }
    showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
    showInputSection();
  } finally {
    currentAbortController = null;
  }
}
555
+
556
/**
 * Handle URL submission and TTS conversion.
 * Validates the URL, streams progress via SSE, and starts playback.
 * @param {string} url - The URL to process.
 */
async function handleUrl(url) {
  url = url.trim();

  if (!url) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter a URL', "error");
    return;
  }

  // Validate URL format
  try {
    new URL(url);
  } catch {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter a valid URL', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Fetching content...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;
  urlSubmit.disabled = true;

  // Abort controller lets the Stop button cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-url-stream", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url,
        language: getSelectedLanguage()
      }),
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      // Fix: the error body is usually JSON with a "detail" field, but a
      // proxy or crash can return non-JSON; don't let response.json()
      // throw a confusing secondary error.
      let detail = "Failed to process document";
      try {
        const error = await response.json();
        if (error.detail) {
          detail = error.detail;
        }
      } catch {
        // Keep the generic message.
      }
      throw new Error(detail);
    }

    // Derive a display name from the final path segment of the URL.
    const urlPath = new URL(url).pathname;
    const docName = urlPath.split("/").pop() || "document";

    // Process stream handles both progress SSE and starting audio playback
    await processStream(response, docName);
  } catch (error) {
    if (error.name === "AbortError") {
      // User cancelled - already handled in stopGeneration
      return;
    }
    showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
    showInputSection();
  } finally {
    urlSubmit.disabled = false;
    currentAbortController = null;
  }
}
622
+
623
/**
 * Handle text submission and TTS conversion.
 * Validates length, streams progress via SSE, and starts playback.
 * @param {string} text - The text to process.
 */
async function handleText(text) {
  text = text.trim();

  if (!text) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter some text', "error");
    return;
  }

  if (text.length > 500000) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Text too long (max 500,000 characters)', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Processing text...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;
  textSubmit.disabled = true;

  // Abort controller lets the Stop button cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-text-stream", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        text,
        language: getSelectedLanguage()
      }),
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      // Fix: the error body is usually JSON with a "detail" field, but a
      // proxy or crash can return non-JSON; don't let response.json()
      // throw a confusing secondary error.
      let detail = "Failed to process text";
      try {
        const error = await response.json();
        if (error.detail) {
          detail = error.detail;
        }
      } catch {
        // Keep the generic message.
      }
      throw new Error(detail);
    }

    // Process stream handles both progress SSE and starting audio playback
    await processStream(response, "Pasted Text");
  } catch (error) {
    if (error.name === "AbortError") {
      // User cancelled - already handled in stopGeneration
      return;
    }
    showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
    showInputSection();
  } finally {
    textSubmit.disabled = false;
    currentAbortController = null;
  }
}
682
+
683
// --- Tab switching -------------------------------------------------------
tabs.forEach((tab) => {
  tab.addEventListener("click", () => {
    tabs.forEach((other) => other.classList.remove("active"));
    tabContents.forEach((content) => content.classList.remove("active"));
    tab.classList.add("active");
    document.getElementById(`${tab.dataset.tab}-tab`).classList.add("active");
  });
});

// --- Drag & drop upload --------------------------------------------------
dropZone.addEventListener("dragover", (event) => {
  event.preventDefault();
  dropZone.classList.add("dragover");
});

dropZone.addEventListener("dragleave", () => {
  dropZone.classList.remove("dragover");
});

dropZone.addEventListener("drop", (event) => {
  event.preventDefault();
  dropZone.classList.remove("dragover");
  const dropped = event.dataTransfer.files;
  if (dropped.length > 0) {
    handleFile(dropped[0]);
  }
});

// Clicking anywhere on the drop zone (except the input or its label)
// opens the file picker.
dropZone.addEventListener("click", (event) => {
  if (event.target !== fileInput && !event.target.classList.contains("file-label")) {
    fileInput.click();
  }
});

fileInput.addEventListener("change", () => {
  if (fileInput.files.length > 0) {
    handleFile(fileInput.files[0]);
  }
});

// --- URL submission (click or Enter) -------------------------------------
urlSubmit.addEventListener("click", () => handleUrl(urlInput.value));
urlInput.addEventListener("keypress", (event) => {
  if (event.key === "Enter") {
    handleUrl(urlInput.value);
  }
});

// --- Text submission (click or Ctrl/Cmd+Enter) ---------------------------
textSubmit.addEventListener("click", () => handleText(textInput.value));
textInput.addEventListener("keydown", (event) => {
  if (event.key === "Enter" && (event.ctrlKey || event.metaKey)) {
    handleText(textInput.value);
  }
});

// --- Generation / playback controls --------------------------------------
stopBtn.addEventListener("click", stopGeneration);
pauseBtn.addEventListener("click", togglePause);
downloadBtn.addEventListener("click", downloadAudio);

// Keep the pause button in sync with the audio element's state.
["play", "pause"].forEach((evt) => audio.addEventListener(evt, updatePauseButton));
audio.addEventListener("ended", () => {
  isPaused = false;
  updatePauseButton();
});

// --- Language selection --------------------------------------------------
languageButtons.forEach((btn) => {
  btn.addEventListener("click", () => {
    languageButtons.forEach((other) => other.classList.remove("active"));
    btn.classList.add("active");
    selectedLanguage = btn.dataset.language;
  });
});
src/talking_snake/static/apple-touch-icon.png ADDED

Git LFS Details

  • SHA256: 989b16d28890ccb0f7448a3b9908ccd8d32f6e21905d5a19911e91c3f20b3321
  • Pointer size: 130 Bytes
  • Size of remote file: 16.2 kB
src/talking_snake/static/favicon.png ADDED

Git LFS Details

  • SHA256: b5e1394d12d7e68102bbc5c840d08e2273813b8c217e7672e5eead49021c699f
  • Pointer size: 131 Bytes
  • Size of remote file: 253 kB
src/talking_snake/static/icon-192.png ADDED

Git LFS Details

  • SHA256: f092b628061742568c93d060f3549b356b4e2d32816f30412960e34da3b18c9e
  • Pointer size: 130 Bytes
  • Size of remote file: 18.4 kB
src/talking_snake/static/icon-512.png ADDED

Git LFS Details

  • SHA256: 8622da6773cd5becce47e2b1e4a2ff5b5eb25d9335a472808cde51dce1f33ddc
  • Pointer size: 131 Bytes
  • Size of remote file: 107 kB
src/talking_snake/static/index.html ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Talking Snake - PDF & Web to Speech</title>
7
+
8
+ <!-- PWA / Mobile App Configuration -->
9
+ <meta name="application-name" content="Talking Snake">
10
+ <meta name="theme-color" content="#1a1a2e">
11
+ <meta name="mobile-web-app-capable" content="yes">
12
+ <link rel="manifest" href="/static/manifest.json">
13
+
14
+ <!-- iOS PWA Configuration -->
15
+ <meta name="apple-mobile-web-app-capable" content="yes">
16
+ <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
17
+ <meta name="apple-mobile-web-app-title" content="Talking Snake">
18
+ <link rel="apple-touch-icon" href="/static/apple-touch-icon.png">
19
+ <link rel="apple-touch-icon" sizes="180x180" href="/static/apple-touch-icon.png">
20
+ <link rel="apple-touch-icon" sizes="152x152" href="/static/apple-touch-icon.png">
21
+ <link rel="apple-touch-icon" sizes="120x120" href="/static/apple-touch-icon.png">
22
+
23
+ <!-- Standard favicon -->
24
+ <link rel="icon" type="image/png" href="/static/favicon.png">
25
+ <link rel="icon" type="image/png" sizes="192x192" href="/static/icon-192.png">
26
+ <link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">
27
+
28
+ <link rel="stylesheet" href="/static/styles.css">
29
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
30
+ <script src="https://unpkg.com/htmx.org@2.0.4"></script>
31
+ </head>
32
+ <body>
33
+ <div class="main-content">
34
+ <img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
35
+ <h1>Talking Snake</h1>
36
+ <p class="subtitle">Transform PDFs & Web into Audio</p>
37
+
38
+ <div class="container">
39
+ <div class="input-section" id="inputSection">
40
+ <div class="options-row">
41
+ <div class="language-selector">
42
+ <span class="style-label">Language:</span>
43
+ <div class="style-buttons" id="languageButtons">
44
+ <button class="style-btn lang-btn active" data-language="english" title="English">
45
+ 🇬🇧
46
+ </button>
47
+ <button class="style-btn lang-btn" data-language="chinese" title="Chinese">
48
+ 🇨🇳
49
+ </button>
50
+ <button class="style-btn lang-btn" data-language="japanese" title="Japanese">
51
+ 🇯🇵
52
+ </button>
53
+ <button class="style-btn lang-btn" data-language="korean" title="Korean">
54
+ 🇰🇷
55
+ </button>
56
+ </div>
57
+ </div>
58
+ </div>
59
+
60
+ <div class="tabs">
61
+ <button class="tab active" data-tab="upload"><i class="fa-solid fa-upload"></i> Upload File</button>
62
+ <button class="tab" data-tab="url"><i class="fa-solid fa-link"></i> From URL</button>
63
+ <button class="tab" data-tab="text"><i class="fa-solid fa-keyboard"></i> Paste Text</button>
64
+ </div>
65
+
66
+ <div class="tab-content active" id="upload-tab">
67
+ <div class="drop-zone" id="dropZone">
68
+ <i class="fa-solid fa-file-pdf drop-icon"></i>
69
+ <p>Drag & drop a PDF here</p>
70
+ <label class="file-label">
71
+ <i class="fa-solid fa-folder-open"></i> Choose File
72
+ <input type="file" id="fileInput" accept=".pdf">
73
+ </label>
74
+ <p class="hint">Supports PDF documents up to 50MB</p>
75
+ </div>
76
+ </div>
77
+
78
+ <div class="tab-content" id="url-tab">
79
+ <div class="url-form">
80
+ <input type="url" id="urlInput" placeholder="https://example.com/article or .pdf">
81
+ <button class="submit-btn" id="urlSubmit"><i class="fa-solid fa-microphone"></i> Read Content</button>
82
+ <p class="hint">Enter a link to a PDF or web page (articles, docs, blogs)</p>
83
+ </div>
84
+ </div>
85
+
86
+ <div class="tab-content" id="text-tab">
87
+ <div class="text-form">
88
+ <textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
89
+ <button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
90
+ <p class="hint">Paste any text you want to hear read aloud</p>
91
+ </div>
92
+ </div>
93
+ </div>
94
+
95
+ <div class="processing-section" id="processingSection">
96
+ <div class="processing-row-1">
97
+ <div class="doc-info" id="docInfo"></div>
98
+ </div>
99
+ <div class="processing-row-2">
100
+ <div class="status" id="status"></div>
101
+ <div class="processing-progress-container" id="processingProgressContainer">
102
+ <div class="processing-progress-bar" id="processingProgressBar"></div>
103
+ </div>
104
+ <div class="control-buttons">
105
+ <button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
106
+ <button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
107
+ </div>
108
+ </div>
109
+ </div>
110
+
111
+ <div class="device-info" id="deviceInfo"></div>
112
+
113
+ <div class="player" id="player">
114
+ <div class="filename" id="filename"></div>
115
+ <div class="custom-player">
116
+ <button class="player-btn play-btn" id="playerPlayBtn" title="Play/Pause">
117
+ <i class="fa-solid fa-play"></i>
118
+ </button>
119
+ <div class="progress-container" id="progressContainer">
120
+ <div class="progress-bar" id="progressBar"></div>
121
+ <input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
122
+ </div>
123
+ <span class="time-display" id="timeDisplay">0:00 / 0:00</span>
124
+ <button class="player-btn volume-btn" id="volumeBtn" title="Mute/Unmute">
125
+ <i class="fa-solid fa-volume-high"></i>
126
+ </button>
127
+ <button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
128
+ <i class="fa-solid fa-download"></i>
129
+ </button>
130
+ </div>
131
+ <audio id="audio" preload="auto"></audio>
132
+ </div>
133
+ </div>
134
+ </div>
135
+
136
+ <footer>
137
+ <p>Built with <i class="fa-solid fa-heart"></i> for listeners everywhere | <a href="https://github.com/LucaCappelletti94/talking-snake" target="_blank" rel="noopener noreferrer"><i class="fa-brands fa-github"></i> GitHub</a></p>
138
+ </footer>
139
+
140
+ <script src="/static/app.js"></script>
141
+ </body>
142
+ </html>
src/talking_snake/static/manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Talking Snake",
3
+ "short_name": "Talking Snake",
4
+ "description": "Transform PDFs & Web into Audio",
5
+ "start_url": "/",
6
+ "display": "standalone",
7
+ "background_color": "#1a1a2e",
8
+ "theme_color": "#1a1a2e",
9
+ "orientation": "portrait-primary",
10
+ "icons": [
11
+ {
12
+ "src": "/static/favicon.png",
13
+ "sizes": "64x64",
14
+ "type": "image/png"
15
+ },
16
+ {
17
+ "src": "/static/icon-192.png",
18
+ "sizes": "192x192",
19
+ "type": "image/png",
20
+ "purpose": "any maskable"
21
+ },
22
+ {
23
+ "src": "/static/icon-512.png",
24
+ "sizes": "512x512",
25
+ "type": "image/png",
26
+ "purpose": "any maskable"
27
+ },
28
+ {
29
+ "src": "/static/apple-touch-icon.png",
30
+ "sizes": "180x180",
31
+ "type": "image/png"
32
+ }
33
+ ],
34
+ "categories": ["utilities", "productivity"],
35
+ "lang": "en"
36
+ }
src/talking_snake/static/sample.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:131aea479804ac10ad86674780fca80134775ef547e808339f66408eb90ffadb
3
+ size 291884
src/talking_snake/static/styles.css ADDED
@@ -0,0 +1,848 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/**
 * Talking Snake - Main Stylesheet
 * A warm, accessible color scheme inspired by the talking snake logo
 */

:root {
  /* Warm, friendly palette inspired by the talking snake logo */
  --bg: #fff7e9; /* Warm cream background */
  --surface: #fff; /* Clean white cards */
  --primary: #d4763a; /* Warm orange - friendly & energetic */
  --primary-hover: #c06830; /* Darker orange for hover */
  --secondary: #5a8f5a; /* Soft green - snake accent */
  --text: #3d3425; /* Warm dark brown - easy on eyes */
  --text-muted: #7a6f5f; /* Muted brown */
  --border: #e5d9c8; /* Warm border */
  --success: #5a8f5a; /* Green for success states */
  --error: #c45a4a; /* Soft red for errors */
}

* {
  box-sizing: border-box;
}

body {
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, sans-serif;
  background: var(--bg);
  color: var(--text);
  min-height: 100vh;
  margin: 0;
  padding: 1.5rem;
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  line-height: 1.4;
}

.main-content {
  display: flex;
  flex-direction: column;
  align-items: center;
  flex: 1;
  justify-content: center;
  width: 100%;
}

h1 {
  font-size: 1.75rem;
  margin: 0 0 0.25rem;
  color: var(--primary);
}

.subtitle {
  color: var(--text-muted);
  margin: 0 0 1rem;
  font-size: 0.9rem;
}

.container {
  max-width: 500px;
  width: 100%;
}

/* Options Row - Style and Language selectors */
.options-row {
  display: flex;
  justify-content: center;
  gap: 1.5rem;
  margin-bottom: 1rem;
  flex-wrap: wrap;
}

/* Style Selector */
.style-selector,
.language-selector {
  display: flex;
  align-items: center;
  gap: 0.5rem;
  flex-wrap: wrap;
}

.style-label {
  font-size: 0.85rem;
  color: var(--text-muted);
}

.style-buttons {
  display: flex;
  gap: 0.35rem;
}

.style-btn {
  width: 38px;
  height: 38px;
  border: 1px solid var(--border);
  border-radius: 6px;
  background: var(--surface);
  color: var(--text-muted);
  cursor: pointer;
  font-size: 0.95rem;
  transition: all 0.15s ease;
  display: flex;
  align-items: center;
  justify-content: center;
}

/* Language buttons use emoji flags */
.style-btn.lang-btn {
  font-size: 1.2rem;
}

.style-btn:hover {
  border-color: var(--primary);
  color: var(--text);
}

.style-btn.active {
  /* FIX: comma-form rgb() with a fourth alpha argument is invalid per the
     CSS Color spec; use the modern slash-alpha syntax instead. */
  background: rgb(212 118 58 / 0.15);
  border-color: var(--primary);
  color: var(--primary);
}

/* Input Section - hidden during processing */
.input-section.hidden {
  display: none;
}

/* Processing Section - two row layout */
.processing-section {
  display: none;
  flex-direction: column;
  gap: 0.75rem;
  padding: 1rem 1.25rem;
  background: var(--surface);
  border-radius: 10px;
  border: 1px solid var(--border);
  width: 100%;
}

.processing-section.visible {
  display: flex;
}

/* Row 1: Document info */
.processing-row-1 {
  display: flex;
  align-items: center;
  width: 100%;
}

/* Row 2: Status, progress, buttons */
.processing-row-2 {
  display: flex;
  align-items: center;
  gap: 0.75rem;
}

/* Document Info - fills first row */
.doc-info {
  display: flex;
  align-items: center;
  gap: 0.75rem;
  font-size: 0.85rem;
  color: var(--text);
  width: 100%;
  min-width: 0;
}

.doc-info:empty {
  display: none;
}

.doc-info .doc-name {
  font-weight: 600;
  display: flex;
  align-items: center;
  gap: 0.4rem;
  flex: 1;
  min-width: 0;
}

.doc-info .doc-name i {
  color: var(--primary);
  flex-shrink: 0;
}

.doc-info .doc-name-text {
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}

.doc-info .doc-pages,
.doc-info .doc-chars {
  color: var(--text-muted);
  font-size: 0.75rem;
  display: flex;
  align-items: center;
  gap: 0.25rem;
  white-space: nowrap;
  flex-shrink: 0;
}

.doc-info .doc-pages i,
.doc-info .doc-chars i {
  font-size: 0.7rem;
  opacity: 0.6;
}

/* Status in processing */
.processing-section .status {
  padding: 0;
  background: none;
  font-size: 0.8rem;
  white-space: nowrap;
  flex-shrink: 0;
}

/* Processing progress bar */
.processing-progress-container {
  flex: 1;
  height: 6px;
  background: var(--bg);
  border-radius: 3px;
  overflow: hidden;
  min-width: 60px;
}

.processing-progress-bar {
  height: 100%;
  background: linear-gradient(90deg, var(--primary) 0%, #c06030 100%);
  border-radius: 3px;
  width: 0%;
  transition: width 0.3s ease;
}

/* Control buttons row */
.control-buttons {
  display: flex;
  gap: 0.5rem;
  flex-shrink: 0;
}

.control-btn {
  width: 36px;
  height: 36px;
  padding: 0;
  color: white;
  border: none;
  border-radius: 8px;
  cursor: pointer;
  font-size: 0.9rem;
  transition: all 0.15s ease;
  display: flex;
  align-items: center;
  justify-content: center;
}

.control-btn.hidden {
  display: none;
}

.control-btn:hover {
  filter: brightness(1.1);
}

.pause-btn {
  background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
}

.pause-btn:hover {
  animation: gradient-shift 0.8s ease infinite;
}

.stop-btn {
  background: linear-gradient(135deg, var(--error), #8b3a30, var(--error));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
}

.stop-btn:hover {
  animation: gradient-shift 0.8s ease infinite;
}

@keyframes gradient-idle {
  0%, 100% { background-position: 0% 50%; }
  50% { background-position: 100% 50%; }
}

@keyframes gradient-shift {
  0% { background-position: 0% 50%; }
  50% { background-position: 100% 50%; }
  100% { background-position: 0% 50%; }
}

/* Drop Zone */
.drop-zone {
  border: 2px dashed var(--border);
  border-radius: 8px;
  padding: 1.5rem 1rem;
  text-align: center;
  transition: all 0.2s ease;
  cursor: pointer;
  background: var(--surface);
}

.drop-zone:hover,
.drop-zone.dragover {
  border-color: var(--primary);
  /* FIX: slash-alpha syntax (was invalid comma-form rgb() with 4 args) */
  background: rgb(212 118 58 / 0.08);
}

.drop-zone p {
  margin: 0 0 0.75rem;
  font-size: 0.95rem;
}

.drop-zone .hint {
  color: var(--text-muted);
  font-size: 0.8rem;
}

.drop-icon {
  font-size: 2.5rem;
  color: var(--primary);
  margin-bottom: 0.75rem;
  display: block;
}

/* Tabs */
.tabs {
  display: flex;
  gap: 0.25rem;
  margin-bottom: 0.75rem;
}

.tab {
  flex: 1;
  padding: 0.5rem 0.75rem;
  background: var(--surface);
  border: 1px solid var(--border);
  border-radius: 6px;
  color: var(--text-muted);
  cursor: pointer;
  font-size: 0.85rem;
  transition: all 0.15s ease;
}

.tab:hover {
  border-color: var(--primary);
  color: var(--text);
}

.tab.active {
  /* FIX: slash-alpha syntax (was invalid comma-form rgb() with 4 args) */
  background: rgb(212 118 58 / 0.12);
  border-color: var(--primary);
  color: var(--primary);
}

.tab-content {
  display: none;
}

.tab-content.active {
  display: block;
}

/* URL Form */
.url-form {
  background: var(--surface);
  border-radius: 8px;
  padding: 1rem;
}

.url-form input[type="url"] {
  width: 100%;
  padding: 0.6rem 0.75rem;
  background: var(--bg);
  border: 1px solid var(--border);
  border-radius: 6px;
  color: var(--text);
  font-size: 0.9rem;
  margin-bottom: 0.75rem;
  transition: border-color 0.15s ease;
}

.url-form input[type="url"]:focus {
  outline: none;
  border-color: var(--primary);
}

.url-form input[type="url"]::placeholder {
  color: var(--text-muted);
}

/* Text Form */
.text-form {
  background: var(--surface);
  border-radius: 8px;
  padding: 1rem;
}

.text-form textarea {
  width: 100%;
  padding: 0.6rem 0.75rem;
  background: var(--bg);
  border: 1px solid var(--border);
  border-radius: 6px;
  color: var(--text);
  font-size: 0.9rem;
  margin-bottom: 0.75rem;
  transition: border-color 0.15s ease;
  resize: vertical;
  min-height: 120px;
  font-family: inherit;
  line-height: 1.5;
}

.text-form textarea:focus {
  outline: none;
  border-color: var(--primary);
}

.text-form textarea::placeholder {
  color: var(--text-muted);
}

.text-form .hint {
  color: var(--text-muted);
  font-size: 0.8rem;
  text-align: center;
  margin: 0;
}

/* Buttons */
.submit-btn {
  width: 100%;
  padding: 0.6rem 1rem;
  background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: white;
  border: none;
  border-radius: 8px;
  cursor: pointer;
  font-size: 0.9rem;
  font-weight: 500;
  transition: filter 0.15s ease;
  margin-bottom: 0.5rem;
}

.submit-btn:hover {
  filter: brightness(1.1);
  animation: gradient-shift 0.8s ease infinite;
}

.submit-btn:disabled {
  opacity: 0.6;
  cursor: not-allowed;
  filter: none;
  animation: none;
}

.url-form .hint {
  color: var(--text-muted);
  font-size: 0.8rem;
  text-align: center;
  margin: 0;
}

input[type="file"] {
  display: none;
}

.file-label {
  display: inline-block;
  padding: 0.5rem 1rem;
  background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: white;
  border-radius: 8px;
  cursor: pointer;
  font-weight: 500;
  font-size: 0.9rem;
  transition: filter 0.15s ease;
}

.file-label:hover {
  filter: brightness(1.1);
  animation: gradient-shift 0.8s ease infinite;
}

/* Device Info - Subtle footer-like display */
.device-info {
  display: none;
  justify-content: center;
  align-items: center;
  gap: 1rem;
  padding: 0.75rem 1rem;
  font-size: 0.7rem;
  color: var(--text-muted);
  margin-top: 0.5rem;
  opacity: 0.7;
}

.device-info.visible {
  display: flex;
  flex-wrap: wrap;
}

.device-info i {
  color: var(--primary);
  opacity: 0.8;
}

.device-memory {
  opacity: 0.9;
}

.device-batch {
  background: var(--surface);
  padding: 0.2rem 0.5rem;
  border-radius: 4px;
  font-size: 0.65rem;
}

/* Icon spacing in buttons and tabs */
.tab i,
.submit-btn i,
.file-label i {
  margin-right: 0.4rem;
}

/* Status Messages */
.status {
  font-size: 0.85rem;
  display: none;
}

.status.visible {
  display: block;
}

.status i {
  margin-right: 0.4rem;
}

.status.loading {
  color: var(--text-muted);
}

.status.error {
  color: var(--error);
}

.status.success {
  color: var(--success);
}

/* Audio Player */
.player {
  margin-top: 1.5rem;
  width: 100%;
  display: none;
  padding: 1.25rem;
  background: var(--surface);
  border-radius: 12px;
  border: 1px solid var(--border);
}

.player.visible {
  display: block;
}

/* Hidden audio element */
#audio {
  display: none;
}

/* Custom Audio Player */
.custom-player {
  display: flex;
  align-items: center;
  gap: 0.75rem;
}

.player-btn {
  width: 36px;
  height: 36px;
  border: none;
  border-radius: 8px;
  background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: white;
  cursor: pointer;
  display: flex;
  align-items: center;
  justify-content: center;
  font-size: 0.85rem;
  transition: filter 0.15s ease;
  flex-shrink: 0;
}

.player-btn:hover {
  filter: brightness(1.1);
  animation: gradient-shift 0.8s ease infinite;
}

.player-btn.volume-btn {
  /* NOTE(review): #252540 is a dark navy blended with the light cream --bg;
     this looks like a leftover from a dark theme — confirm intended look. */
  background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: var(--text-muted);
  width: 32px;
  height: 32px;
  font-size: 0.8rem;
}

.player-btn.volume-btn:hover {
  color: var(--text);
  animation: gradient-shift 0.8s ease infinite;
}

.player-btn.download-btn {
  /* NOTE(review): same #252540 dark-theme leftover as .volume-btn above. */
  background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: var(--text-muted);
  width: 32px;
  height: 32px;
  font-size: 0.8rem;
}

.player-btn.download-btn:hover {
  color: var(--primary);
  animation: gradient-shift 0.8s ease infinite;
}

.progress-container {
  flex: 1;
  height: 6px;
  background: var(--bg);
  border-radius: 3px;
  position: relative;
  cursor: pointer;
}

.progress-bar {
  height: 100%;
  background: var(--primary);
  border-radius: 3px;
  width: 0%;
  transition: width 0.1s ease;
  pointer-events: none;
}

.progress-slider {
  position: absolute;
  top: 0;
  left: 0;
  width: 100%;
  height: 100%;
  opacity: 0;
  cursor: pointer;
  margin: 0;
  appearance: none;
}

.progress-slider::-webkit-slider-thumb {
  appearance: none;
  width: 14px;
  height: 14px;
  background: var(--primary);
  border-radius: 50%;
  cursor: pointer;
}

.progress-slider::-moz-range-thumb {
  width: 14px;
  height: 14px;
  background: var(--primary);
  border-radius: 50%;
  cursor: pointer;
  border: none;
}

.time-display {
  font-size: 0.75rem;
  color: var(--text-muted);
  min-width: 80px;
  text-align: center;
  font-variant-numeric: tabular-nums;
}

.filename {
  margin-bottom: 0.75rem;
  font-size: 0.85rem;
  font-weight: 500;
  color: var(--text);
  word-break: break-all;
}

/* Spinner Animation */
.spinner {
  display: inline-block;
  width: 14px;
  height: 14px;
  border: 2px solid var(--text-muted);
  border-top-color: var(--primary);
  border-radius: 50%;
  animation: spin 1s linear infinite;
  margin-right: 0.4rem;
  vertical-align: middle;
}

@keyframes spin {
  to {
    transform: rotate(360deg);
  }
}

/* Footer */
footer {
  margin-top: auto;
  padding-top: 1.5rem;
  color: var(--text-muted);
  font-size: 0.75rem;
  flex-shrink: 0;
}

footer a {
  color: var(--primary);
  text-decoration: none;
}

footer a:hover {
  text-decoration: underline;
}

footer i.fa-heart {
  color: var(--error);
}

footer i.fa-github {
  margin-right: 0.2rem;
}

/* Logo */
.logo {
  width: 250px;
  height: auto;
  margin-bottom: 0.75rem;
}

/* Tablet styles */
@media (width <= 768px) {
  body {
    padding: 1rem;
  }

  h1 {
    font-size: 1.5rem;
  }

  .logo {
    width: 200px;
  }

  .drop-zone {
    padding: 1.25rem 1rem;
  }

  .tabs {
    flex-direction: column;
  }

  .tab {
    width: 100%;
  }
}

/* Mobile styles */
@media (width <= 480px) {
  body {
    padding: 0.75rem;
  }

  h1 {
    font-size: 1.35rem;
  }

  .subtitle {
    font-size: 0.8rem;
  }

  .logo {
    width: 160px;
  }

  .drop-zone {
    padding: 1rem;
  }

  .drop-zone p {
    font-size: 0.9rem;
  }

  .url-form {
    padding: 0.75rem;
  }

  .url-form input[type="url"] {
    padding: 0.5rem;
    font-size: 0.85rem;
  }

  .submit-btn,
  .file-label {
    padding: 0.5rem 0.75rem;
    font-size: 0.85rem;
  }

  .filename {
    font-size: 0.75rem;
    padding: 0.4rem 0.5rem;
  }

  footer {
    font-size: 0.7rem;
    text-align: center;
  }
}

/* Ensure touch targets are large enough */
@media (pointer: coarse) {
  .tab,
  .submit-btn,
  .file-label {
    min-height: 44px;
    display: flex;
    align-items: center;
    justify-content: center;
  }
}
src/talking_snake/static/talking_snake.png ADDED

Git LFS Details

  • SHA256: 2fcd44f86e8dd2a1a7e04e3275cf46f94a7f16d24c0bae0e7debab29e1aa6305
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
src/talking_snake/tts.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TTS engine wrapper for Qwen3-TTS."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import wave
7
+ from abc import ABC, abstractmethod
8
+ from collections.abc import Iterator
9
+ from typing import TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ import numpy as np
13
+ import numpy.typing as npt
14
+
15
+
16
class TTSEngineProtocol(ABC):
    """Abstract base class describing the TTS engine contract.

    Implementations can be injected into callers, which also makes
    mocking straightforward in tests.

    NOTE(review): despite the name, this is an ``abc.ABC`` rather than a
    ``typing.Protocol``.
    """

    @abstractmethod
    def synthesize(self, text: str) -> Iterator[bytes]:
        """Convert ``text`` into audio.

        Args:
            text: Text to synthesize.

        Yields:
            WAV audio data chunks.
        """
        ...

    @property
    @abstractmethod
    def sample_rate(self) -> int:
        """Sample rate (Hz) of the audio this engine produces."""
        ...

    @property
    def batch_size(self) -> int:
        """Number of chunks processed in parallel; subclasses may override (default: 1)."""
        return 1
41
+
42
+
43
# Professional narration style prompt.
# Passed as the `instruct` argument to the model; it asks for a clear,
# audiobook-like delivery rather than casual speech.
PROFESSIONAL_STYLE = (
    "Read this as a professional narrator with clear enunciation, "
    "measured pacing, and an authoritative yet warm tone. "
    "Speak naturally as if presenting an audiobook or documentary. "
    "Avoid sounding robotic or monotone. Emphasize key points and maintain a steady rhythm. "
    "Use appropriate intonation to convey meaning and keep the listener engaged. "
    "This is not casual conversation, but a polished narration style. "
    "Use proper diction, read correctly acronyms, and pronounce all words clearly."
)

# Language (lowercase) -> default voice name, used when the caller does not
# select an explicit voice. Unknown languages fall back to "Ryan" at the
# lookup site in QwenTTSEngine.__init__.
LANGUAGE_VOICES: dict[str, str] = {
    "english": "Ryan",
    "chinese": "Vivian",
    "japanese": "Ono_Anna",
    "korean": "Sohee",
}

# Default chunk size (characters) for streaming synthesis.
# Larger chunks = more stable voice, fewer artifacts at chunk boundaries.
# Smaller chunks = faster first audio but potential voice instability.
# 1200 chars provides a good balance for natural speech flow.
DEFAULT_CHUNK_SIZE = 1200
68
+
69
+
70
class QwenTTSEngine(TTSEngineProtocol):
    """TTS engine using the Qwen3-TTS model.

    Lazily imports heavyweight dependencies (torch, qwen_tts) inside
    methods so the module can be imported without them installed.
    """

    # Available voices for CustomVoice model:
    # Chinese: Vivian, Serena, Uncle_Fu, Dylan (Beijing), Eric (Sichuan)
    # English: Ryan, Aiden
    # Japanese: Ono_Anna
    # Korean: Sohee
    AVAILABLE_VOICES = [
        "Vivian",
        "Serena",
        "Uncle_Fu",
        "Dylan",
        "Eric",
        "Ryan",
        "Aiden",
        "Ono_Anna",
        "Sohee",
    ]

    def __init__(
        self,
        voice: str | None = None,
        language: str = "english",
        device: str = "cuda",
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        model_name: str = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    ) -> None:
        """Initialize the TTS engine.

        Args:
            voice: Voice name to use for synthesis. If None, uses default for language.
                Available voices:
                    Chinese: Vivian, Serena, Uncle_Fu, Dylan, Eric
                    English: Ryan, Aiden
                    Japanese: Ono_Anna
                    Korean: Sohee
            language: Language for TTS. One of: english, chinese, japanese, korean.
                Sets default voice if voice is None.
            device: Device to run the model on ('cuda' or 'cpu').
            chunk_size: Maximum characters per chunk (smaller = faster streaming start).
            model_name: HuggingFace model identifier.
        """
        import logging
        import warnings

        import torch
        from qwen_tts import Qwen3TTSModel

        # Suppress the pad_token_id warning from transformers
        logging.getLogger("transformers.generation.utils").setLevel(logging.ERROR)
        warnings.filterwarnings("ignore", message=".*pad_token_id.*")

        self.language = language.lower()
        # Unknown languages fall back to the English default voice.
        self.voice = voice or LANGUAGE_VOICES.get(self.language, "Ryan")
        self.device = device
        self.chunk_size = chunk_size
        # NOTE(review): sample rate is hard-coded to 24 kHz here but
        # synthesize() uses the rate returned by the model — confirm they match.
        self._sample_rate = 24000
        self._batch_size = 1  # Will be calculated after model loads

        # Determine dtype based on device
        dtype = torch.bfloat16 if device == "cuda" else torch.float32

        # Try to use flash attention on CUDA
        attn_impl = "flash_attention_2" if device == "cuda" else "eager"

        try:
            self.model = Qwen3TTSModel.from_pretrained(
                model_name,
                device_map=device,
                dtype=dtype,
                attn_implementation=attn_impl,
            )
        except Exception:
            # Fallback without flash attention
            self.model = Qwen3TTSModel.from_pretrained(
                model_name,
                device_map=device,
                dtype=dtype,
            )

        # Calculate optimal batch size based on available VRAM
        if device == "cuda":
            self._batch_size = self._calculate_batch_size()
            print(f" Batch size: {self._batch_size} (based on available VRAM)")

    def _calculate_batch_size(self) -> int:
        """Calculate optimal batch size based on available GPU memory.

        Returns:
            Recommended batch size for parallel chunk processing.
        """
        import torch

        if not torch.cuda.is_available():
            return 1

        try:
            # Get GPU memory info
            # NOTE(review): always queries device index 0; if self.device is
            # e.g. "cuda:1" this reads the wrong card — confirm.
            gpu_mem = torch.cuda.get_device_properties(0).total_memory
            allocated = torch.cuda.memory_allocated(0)
            reserved = torch.cuda.memory_reserved(0)

            # Available memory (conservative estimate)
            available = gpu_mem - max(allocated, reserved)

            # Model uses ~6GB, each batch item needs ~2-3GB for generation
            # Use conservative 3GB per batch item estimate
            mem_per_batch = 3 * 1024 * 1024 * 1024  # 3GB

            # Calculate batch size, minimum 1, cap at 8
            batch_size = max(1, min(8, int(available / mem_per_batch)))

            return batch_size
        except Exception:
            # Best-effort: any CUDA introspection failure falls back to serial.
            return 1

    @property
    def sample_rate(self) -> int:
        """Return the sample rate of generated audio."""
        return self._sample_rate

    @property
    def batch_size(self) -> int:
        """Return the current batch size."""
        return self._batch_size

    def synthesize(self, text: str) -> Iterator[bytes]:
        """Synthesize text to WAV audio using batched GPU inference.

        Args:
            text: Text to synthesize.

        Yields:
            WAV audio data chunks.
        """
        if not text.strip():
            return

        # Split text into chunks for streaming
        chunks = self._split_text(text)

        # First chunk includes WAV header.
        # NOTE(review): the header's declared data length covers only the first
        # chunk; subsequent yields are raw PCM. Consumers must tolerate a WAV
        # stream whose RIFF sizes understate the payload — confirm with callers.
        first_chunk = True

        # Process chunks in batches for GPU efficiency
        batch_size = self._batch_size

        for i in range(0, len(chunks), batch_size):
            batch = chunks[i : i + batch_size]

            # Filter empty chunks
            batch = [c for c in batch if c.strip()]
            if not batch:
                continue

            # Always use batched call for consistent GPU memory allocation
            # Use professional narration style for clear, authoritative delivery
            # NOTE(review): scalar vs. list arguments assume generate_custom_voice
            # accepts both forms symmetrically — verify against qwen_tts API.
            batch_instruct = (
                [PROFESSIONAL_STYLE] * len(batch) if len(batch) > 1 else PROFESSIONAL_STYLE
            )
            audios, sr = self.model.generate_custom_voice(
                text=batch if len(batch) > 1 else batch[0],
                speaker=[self.voice] * len(batch) if len(batch) > 1 else self.voice,
                instruct=batch_instruct,
                # Use lower temperature for more stable, consistent voice
                temperature=0.7,
                repetition_penalty=1.1,
            )

            # Ensure audios is a list for consistent iteration
            if len(batch) == 1:
                audios = [audios]

            # Yield each audio chunk in order
            for audio in audios:
                wav_bytes = self._audio_to_wav(audio, sr, include_header=first_chunk)
                first_chunk = False
                yield wav_bytes

    def _split_text(self, text: str, max_chars: int | None = None) -> list[str]:
        """Split text into chunks suitable for TTS.

        Splits on sentence boundaries when possible.

        Args:
            text: Text to split.
            max_chars: Maximum characters per chunk. Uses self.chunk_size if None.

        Returns:
            List of text chunks.
        """
        import re

        if max_chars is None:
            max_chars = self.chunk_size

        # Split on sentence boundaries (after ., ! or ? followed by whitespace)
        sentences = re.split(r"(?<=[.!?])\s+", text)

        chunks: list[str] = []
        current_chunk: list[str] = []
        current_length = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Flush the current chunk before it would exceed max_chars.
            # NOTE(review): a single sentence longer than max_chars is kept
            # intact, so chunks can exceed the limit — confirm acceptable.
            if current_length + len(sentence) > max_chars and current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            current_chunk.append(sentence)
            current_length += len(sentence) + 1  # +1 for the joining space

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def _audio_to_wav(
        self,
        audio: npt.NDArray[np.float32] | list[float],
        sample_rate: int,
        include_header: bool = True,
    ) -> bytes:
        """Convert audio array to WAV bytes.

        Args:
            audio: Audio data as numpy array or list.
            sample_rate: Sample rate of the audio.
            include_header: Whether to include WAV header.

        Returns:
            WAV audio data as bytes (or raw 16-bit PCM when include_header=False).
        """
        import numpy as np

        # Convert to numpy array if needed
        if isinstance(audio, list):
            audio = np.array(audio, dtype=np.float32)

        # Ensure audio is 1D (mono)
        if audio.ndim > 1:
            audio = audio.flatten()

        # Clip to [-1, 1] and convert to 16-bit PCM
        audio = np.clip(audio, -1.0, 1.0)
        audio_int16 = (audio * 32767).astype(np.int16)

        if include_header:
            # Write full WAV file (mono, 16-bit)
            buffer = io.BytesIO()
            with wave.open(buffer, "wb") as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)  # 16-bit
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(audio_int16.tobytes())
            result: bytes = buffer.getvalue()
            return result
        else:
            # Return raw PCM data (headerless continuation of the stream)
            pcm_data: bytes = audio_int16.tobytes()
            return pcm_data
336
+
337
+
338
class MockTTSEngine(TTSEngineProtocol):
    """Mock TTS engine that emits silence, for use in tests."""

    def __init__(self, sample_rate: int = 24000) -> None:
        """Create a mock engine.

        Args:
            sample_rate: Sample rate (Hz) of the silent audio it emits.
        """
        self._sample_rate = sample_rate

    @property
    def sample_rate(self) -> int:
        """Sample rate (Hz) of the generated audio."""
        return self._sample_rate

    def synthesize(self, text: str) -> Iterator[bytes]:
        """Yield one silent WAV whose length scales with the word count.

        Args:
            text: Text to "synthesize"; only its word count matters.

        Yields:
            A single complete WAV payload containing silence.
        """
        if not text.strip():
            return

        # ~0.1 seconds of silence per whitespace-separated word.
        word_count = len(text.split())
        frame_count = int(self._sample_rate * 0.1 * max(1, word_count))

        # 16-bit mono PCM silence: two zero bytes per frame.
        pcm = b"\x00\x00" * frame_count

        out = io.BytesIO()
        with wave.open(out, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(self._sample_rate)
            wav_file.writeframes(pcm)

        yield out.getvalue()
uv.lock ADDED
The diff for this file is too large to render. See raw diff