LucaCappelletti94 commited on
Commit
19933fe
·
1 Parent(s): 618f5ab

Initial deployment with LFS

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.wav filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

# Install system dependencies:
# - sox/libsox-dev: audio processing required by qwen-tts (see pyproject note)
# - git: needed for VCS-based package installs
# The apt list cleanup keeps the image layer small.
RUN apt-get update && apt-get install -y \
    sox \
    libsox-dev \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install uv (used below to sync the project's dependencies)
RUN pip install uv

# Create non-root user for HF Spaces (UID 1000 is the Spaces convention)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy project files (owned by the non-root user so uv can write .venv)
COPY --chown=user . .

# Install dependencies
RUN uv sync --no-dev

# Expose port 7860 (HF Spaces default)
EXPOSE 7860

# Run the app
CMD ["uv", "run", "talking-snake", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,17 @@
1
  ---
2
  title: Talking Snake
3
- emoji: 😻
4
- colorFrom: purple
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
- short_description: 'Just a talking snake that reads PDFs and web pages aloud. '
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
  title: Talking Snake
3
+ emoji: 🐍
4
+ colorFrom: green
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
+ app_port: 7860
10
+ suggested_hardware: l4x1
11
  ---
12
 
13
+ # Talking Snake
14
+
15
+ PDF and web page to speech using Qwen3-TTS.
16
+
17
+ Click "Duplicate this Space" to deploy your own instance (L4 or A100 recommended for speed).
pyproject.toml ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "talking-snake"
3
+ version = "0.1.0"
4
+ description = "Just a talking snake that reads PDFs and web pages aloud."
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ requires-python = ">=3.11"
8
+ authors = [{ name = "Luca" }]
9
+ keywords = ["tts", "pdf", "speech", "audiobook", "text-to-speech", "listening"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: End Users/Desktop",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: POSIX :: Linux",
15
+ "Programming Language :: Python :: 3.11",
16
+ "Programming Language :: Python :: 3.12",
17
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
18
+ ]
19
+
20
+ # System dependencies (not installable via pip):
21
+ # - sox: Audio processing tool required by qwen-tts
22
+ # Ubuntu/Debian: sudo apt-get install sox libsox-dev
23
+ # macOS: brew install sox
24
+ # Fedora: sudo dnf install sox sox-devel
25
+
26
+ dependencies = [
27
+ "fastapi>=0.115.0",
28
+ "uvicorn[standard]>=0.32.0",
29
+ "qwen-tts>=0.1.1",
30
+ "torch>=2.5.0",
31
+ "pdfminer.six>=20260107",
32
+ "python-multipart>=0.0.12",
33
+ "jinja2>=3.1.4",
34
+ "httpx>=0.27.0",
35
+ "trafilatura>=2.0.0",
36
+ ]
37
+
38
+ [project.optional-dependencies]
39
+ dev = [
40
+ "pytest>=8.3.0",
41
+ "pytest-asyncio>=0.24.0",
42
+ "pytest-cov>=6.0.0",
43
+ "httpx>=0.27.0",
44
+ "ruff>=0.8.0",
45
+ "mypy>=1.14.0",
46
+ "pre-commit>=4.0.0",
47
+ ]
48
+ # Flash Attention for ~2x faster inference (requires CUDA 11.6+)
49
+ # Install separately: pip install flash-attn --no-build-isolation
50
+ fast = [
51
+ "flash-attn>=2.5.0",
52
+ ]
53
+
54
+ [project.scripts]
55
+ talking-snake = "talking_snake.__main__:main"
56
+
57
+ [build-system]
58
+ requires = ["hatchling"]
59
+ build-backend = "hatchling.build"
60
+
61
+ [tool.hatch.build.targets.wheel]
62
+ packages = ["src/talking_snake"]
63
+
64
+ [tool.pytest.ini_options]
65
+ asyncio_mode = "auto"
66
+ testpaths = ["tests"]
67
+ markers = [
68
+ "slow: marks tests as slow (run with --run-slow)",
69
+ ]
70
+
71
+ [tool.ruff]
72
+ line-length = 100
73
+ target-version = "py311"
74
+
75
+ [tool.ruff.lint]
76
+ select = ["E", "F", "I", "N", "W", "UP"]
77
+
78
+ [tool.ruff.lint.per-file-ignores]
79
+ # PDF xref tables require trailing whitespace per spec
80
+ "tests/conftest.py" = ["W291"]
81
+
82
+ [tool.coverage.run]
83
+ source = ["src/talking_snake"]
84
+ branch = true
85
+ omit = [
86
+ "*/tests/*",
87
+ "*/__main__.py",
88
+ ]
89
+
90
+ [tool.coverage.report]
91
+ exclude_lines = [
92
+ "pragma: no cover",
93
+ "if TYPE_CHECKING:",
94
+ "raise NotImplementedError",
95
+ "if __name__ == .__main__.:",
96
+ "class QwenTTSEngine",
97
+ "def _audio_to_wav",
98
+ "def _split_text",
99
+ "import torch",
100
+ "from qwen_tts",
101
+ ]
102
+ show_missing = true
103
+ skip_covered = true
104
+ fail_under = 70
105
+
106
+ [tool.mypy]
107
+ python_version = "3.11"
108
+ warn_return_any = true
109
+ warn_unused_configs = true
110
+ disallow_untyped_defs = true
111
+ ignore_missing_imports = true
src/talking_snake/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """PDF-to-Speech web server using Qwen3-TTS - listen to any content."""
2
+
3
+ __version__ = "0.1.0"
src/talking_snake/__main__.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI entry point for the Reader server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+
9
def main() -> int:
    """Main entry point for the Reader CLI.

    Parses command-line options, loads the Qwen TTS engine, and serves the
    FastAPI app under uvicorn.

    Returns:
        Exit code: 0 on clean shutdown, 1 if the TTS model fails to load.
    """
    parser = argparse.ArgumentParser(
        # Consistency fix: the installed entry point is `talking-snake`
        # (see [project.scripts] in pyproject.toml), not `reader`.
        prog="talking-snake",
        description="PDF-to-Speech web server - listen to any content",
    )
    parser.add_argument(
        "--voice",
        type=str,
        default=None,
        help="Voice name for TTS. Options: Vivian, Serena, Uncle_Fu, Dylan, Eric, "
        "Ryan, Aiden, Ono_Anna, Sohee (default: auto based on language)",
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        choices=["english", "chinese", "japanese", "korean"],
        help="Language for TTS (default: english). Sets default voice if --voice not specified.",
    )
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to bind the server to (default: 0.0.0.0)",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to bind the server to (default: 8000)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device to run the TTS model on (default: cuda)",
    )
    parser.add_argument(
        "--reload",
        action="store_true",
        help="Enable auto-reload for development",
    )

    args = parser.parse_args()

    # BUG FIX: --reload was accepted but silently ignored. uvicorn can only
    # auto-reload when given an app *import string*, not an app instance,
    # so the honest behavior is to warn rather than stay silent.
    if args.reload:
        print(
            "⚠️  --reload is not supported when passing an app instance; ignoring.",
            file=sys.stderr,
        )

    print("🚀 Starting Reader server...")
    print(f"   Language: {args.language}")
    print(f"   Voice: {args.voice or 'auto'}")
    print(f"   Device: {args.device}")
    print(f"   URL: http://{args.host}:{args.port}")
    print()

    # Import here to avoid slow startup for --help
    import uvicorn

    from talking_snake.app import create_app
    from talking_snake.tts import QwenTTSEngine

    # Initialize TTS engine (loads model weights; may take a while)
    print("📦 Loading TTS model (this may take a moment)...")
    try:
        tts_engine = QwenTTSEngine(
            voice=args.voice,
            language=args.language,
            device=args.device,
        )
    except Exception as e:
        print(f"❌ Failed to load TTS model: {e}", file=sys.stderr)
        return 1

    print("✅ TTS model loaded!")
    print()

    # Create app with engine
    app = create_app(tts_engine=tts_engine)

    # Run server (blocks until shutdown)
    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level="info",
    )

    return 0
100
+
101
+
102
+ if __name__ == "__main__":
103
+ sys.exit(main())
src/talking_snake/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (260 Bytes). View file
 
src/talking_snake/__pycache__/__main__.cpython-312.pyc ADDED
Binary file (3.44 kB). View file
 
src/talking_snake/__pycache__/app.cpython-312.pyc ADDED
Binary file (34.7 kB). View file
 
src/talking_snake/__pycache__/extract.cpython-312.pyc ADDED
Binary file (18.6 kB). View file
 
src/talking_snake/__pycache__/tts.cpython-312.pyc ADDED
Binary file (13.1 kB). View file
 
src/talking_snake/app.py ADDED
@@ -0,0 +1,935 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI application for PDF-to-Speech server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import json
7
+ import queue
8
+ import struct
9
+ import threading
10
+ import time
11
+ import uuid
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING
14
+ from urllib.parse import urlparse
15
+
16
+ import httpx
17
+ import trafilatura
18
+ from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
19
+ from fastapi.responses import HTMLResponse, StreamingResponse
20
+ from fastapi.staticfiles import StaticFiles
21
+ from pydantic import BaseModel
22
+
23
+ from talking_snake.extract import clean_text, extract_text, get_page_count
24
+ from talking_snake.tts import (
25
+ DEFAULT_CHUNK_SIZE,
26
+ LANGUAGE_VOICES,
27
+ MockTTSEngine,
28
+ TTSEngineProtocol,
29
+ )
30
+
31
+ if TYPE_CHECKING:
32
+ from collections.abc import Iterator
33
+
34
+
35
+ # Request timeout for fetching URLs (seconds)
36
+ URL_FETCH_TIMEOUT = 60.0
37
+ # Maximum file size to fetch (50MB)
38
+ MAX_FILE_SIZE = 50 * 1024 * 1024
39
+
40
+ # Initial estimate for time calculation before calibration
41
+ # This value is refined after the first chunk is processed
42
+ # RTX 4090 + flash-attn: ~0.001s/char, RTX 4090: ~0.002s/char, RTX 3060: ~0.005s/char
43
+ INITIAL_SECONDS_PER_CHAR = 0.002 # Optimistic GPU estimate, calibrates after first chunk
44
+
45
+ # Job timeout (seconds) - jobs are cleaned up after this time
46
+ JOB_TIMEOUT = 3600 # 1 hour
47
+
48
+
49
class AudioJob:
    """One audio-generation job whose output is streamed through a queue.

    A producer pushes raw audio byte chunks with ``put_audio`` and ends the
    stream with ``finish`` (or ``set_error``); the consumer drains
    ``audio_queue`` until it receives the ``None`` sentinel.
    """

    def __init__(self, job_id: str):
        self.job_id = job_id
        self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
        self.started = time.time()  # creation time, used for stale-job cleanup
        self.completed = False
        self.error: str | None = None
        self.sample_rate = 24000  # Default, will be set by TTS engine
        self.header_sent = False

    def put_audio(self, audio_bytes: bytes) -> None:
        """Enqueue one chunk of audio for the streaming consumer."""
        self.audio_queue.put(audio_bytes)

    def finish(self) -> None:
        """Mark generation complete and push the end-of-stream sentinel."""
        self.completed = True
        self.audio_queue.put(None)

    def set_error(self, error: str) -> None:
        """Record an error message, then terminate the stream."""
        self.error = error
        self.completed = True
        self.audio_queue.put(None)
75
+
76
+
77
class JobManager:
    """Thread-safe registry of in-flight audio generation jobs."""

    def __init__(self) -> None:
        self._jobs: dict[str, AudioJob] = {}
        self._lock = threading.Lock()

    def create_job(self) -> AudioJob:
        """Register and return a fresh job keyed by a random UUID."""
        job = AudioJob(str(uuid.uuid4()))
        with self._lock:
            self._jobs[job.job_id] = job
            # Opportunistic cleanup: evict stale jobs whenever a new one arrives.
            self._cleanup_old_jobs()
        return job

    def get_job(self, job_id: str) -> AudioJob | None:
        """Look up a job by ID; returns None if unknown or already cleaned up."""
        with self._lock:
            return self._jobs.get(job_id)

    def remove_job(self, job_id: str) -> None:
        """Forget a job; unknown IDs are silently ignored."""
        with self._lock:
            self._jobs.pop(job_id, None)

    def _cleanup_old_jobs(self) -> None:
        """Drop jobs older than JOB_TIMEOUT (caller must hold the lock)."""
        cutoff = time.time() - JOB_TIMEOUT
        stale = [jid for jid, job in self._jobs.items() if job.started < cutoff]
        for jid in stale:
            del self._jobs[jid]
109
+
110
+
111
+ # Global job manager
112
+ _job_manager = JobManager()
113
+
114
+
115
class UrlRequest(BaseModel):
    """Request body for URL-based reading.

    Fields:
        url: HTTP/HTTPS address of a PDF or web page to read aloud
            (scheme validation happens in the handler, not here).
        language: TTS language; defaults to "english".
    """

    url: str
    language: str = "english"
120
+
121
+
122
class TextRequest(BaseModel):
    """Request body for direct text reading.

    Fields:
        text: Raw text to synthesize.
        language: TTS language; defaults to "english".
    """

    text: str
    language: str = "english"
127
+
128
+
129
class EstimateResponse(BaseModel):
    """Response for time estimation.

    Fields:
        text_length: Number of characters in the input text.
        chunk_count: Number of TTS chunks the text splits into.
        estimated_seconds: Predicted synthesis time in seconds.
        estimated_minutes: Presumably estimated_seconds / 60 — the producer
            of this model is not visible here; confirm against the handler.
    """

    text_length: int
    chunk_count: int
    estimated_seconds: float
    estimated_minutes: float
136
+
137
+
138
+ # Global TTS engine instance (set during startup)
139
+ _tts_engine: TTSEngineProtocol | None = None
140
+
141
+
142
def create_app(tts_engine: TTSEngineProtocol | None = None) -> FastAPI:
    """Create and configure the FastAPI application.

    Args:
        tts_engine: TTS engine to use. If None, uses MockTTSEngine.

    Returns:
        Configured FastAPI application.
    """
    global _tts_engine
    _tts_engine = tts_engine or MockTTSEngine()

    app = FastAPI(
        title="Reader",
        description="PDF-to-Speech web server - listen to any content",
        version="0.1.0",
    )

    # Serve bundled static assets when the directory ships with the package.
    static_dir = Path(__file__).parent / "static"
    if static_dir.exists():
        app.mount("/static", StaticFiles(directory=static_dir), name="static")

    # Root page
    app.add_api_route("/", index, methods=["GET"], response_class=HTMLResponse)

    # POST endpoints, registered from a small route table.
    for path, handler in (
        ("/api/read", read_pdf),
        ("/api/read-url", read_url),
        ("/api/read-stream", read_pdf_stream),
        ("/api/read-url-stream", read_url_stream),
        ("/api/read-text-stream", read_text_stream),
    ):
        app.add_api_route(path, handler, methods=["POST"])

    # GET endpoints
    app.add_api_route("/api/audio/{job_id}", stream_audio, methods=["GET"])
    app.add_api_route("/api/languages", get_languages, methods=["GET"])
    app.add_api_route("/api/device-info-stream", stream_device_info, methods=["GET"])
    app.add_api_route("/api/health", health_check, methods=["GET"])

    return app
178
+
179
+
180
async def index(request: Request) -> HTMLResponse:
    """Serve the single-page UI, or a minimal fallback if assets are missing.

    Args:
        request: The incoming request (unused; required by the route signature).

    Returns:
        HTML response with the main page.
    """
    index_file = Path(__file__).parent / "static" / "index.html"

    if index_file.exists():
        return HTMLResponse(content=index_file.read_text())

    # Static bundle not shipped — degrade gracefully instead of erroring.
    return HTMLResponse(
        content="<h1>Reader</h1><p>Static files not found.</p>",
        status_code=200,
    )
199
+
200
+
201
async def read_pdf(file: UploadFile = File(...)) -> StreamingResponse:
    """Read a PDF and return synthesized speech.

    Args:
        file: Uploaded PDF file.

    Returns:
        Streaming WAV audio response.

    Raises:
        HTTPException: If file is not a PDF, is empty, or extraction fails.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Validate file type by extension
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # Read file content
    try:
        pdf_bytes = await file.read()
    except Exception as e:
        # FIX: chain the cause (PEP 3134) so logs show the underlying error.
        raise HTTPException(status_code=400, detail=f"Failed to read file: {e}") from e

    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Empty file")

    # Extract text
    try:
        text = extract_text(pdf_bytes)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to extract text: {e}") from e

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text found in PDF")

    # Stream TTS audio lazily so the response starts before synthesis finishes
    def generate_audio() -> Iterator[bytes]:
        assert _tts_engine is not None
        yield from _tts_engine.synthesize(text)

    return StreamingResponse(
        generate_audio(),
        media_type="audio/wav",
        headers={
            "Content-Disposition": f'inline; filename="{Path(file.filename).stem}.wav"',
        },
    )
250
+
251
+
252
async def read_url(request: UrlRequest) -> StreamingResponse:
    """Read content from a URL (PDF or web page) and return synthesized speech.

    For PDFs: extracts text and removes headers/footers/page numbers.
    For web pages: extracts main article content, removing navigation,
    sidebars, footers, ads, and other boilerplate.

    Args:
        request: Request containing the URL to fetch.

    Returns:
        Streaming WAV audio response.

    Raises:
        HTTPException: If URL is invalid, fetch fails, or extraction fails.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Validate URL
    url = request.url.strip()
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="Only HTTP/HTTPS URLs are supported")

    # Determine if this is a PDF or web page (refined via content-type below)
    is_pdf = parsed.path.lower().endswith(".pdf")

    # Fetch the content
    try:
        async with httpx.AsyncClient(timeout=URL_FETCH_TIMEOUT, follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()

            # Reject oversized downloads early when the server declares a length
            content_length = response.headers.get("content-length")
            if content_length and int(content_length) > MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {MAX_FILE_SIZE // 1024 // 1024}MB",
                )

            content = response.content

            # Re-check after download: content-length may be absent or wrong
            if len(content) > MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {MAX_FILE_SIZE // 1024 // 1024}MB",
                )

            # Also check content-type header to detect PDFs served without .pdf extension
            content_type = response.headers.get("content-type", "").lower()
            if "application/pdf" in content_type:
                is_pdf = True

    except httpx.TimeoutException as e:
        # FIX: chain causes (PEP 3134) so the root error is visible in logs.
        raise HTTPException(status_code=408, detail="Request timed out while fetching URL") from e
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to fetch URL: HTTP {e.response.status_code}",
        ) from e
    except httpx.RequestError as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {e}") from e

    if not content:
        raise HTTPException(status_code=400, detail="Empty content at URL")

    # Extract text based on content type
    if is_pdf:
        try:
            text = extract_text(content)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract PDF text: {e}") from e
    else:
        # Use trafilatura to extract main content from HTML
        # (removes navigation, sidebars, footers, ads, etc.)
        # NOTE(review): `content` is bytes here — trafilatura appears to accept
        # raw bytes, but confirm encoding handling for non-UTF-8 pages.
        try:
            extracted = trafilatura.extract(
                content,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
                favor_precision=True,
            )
            # Apply additional cleaning for TTS when extraction succeeded
            text = clean_text(extracted) if extracted else ""
        except Exception as e:
            raise HTTPException(
                status_code=400, detail=f"Failed to extract page content: {e}"
            ) from e

    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="No readable content found at URL")

    # Derive a download filename from the URL
    filename = Path(parsed.path).stem or parsed.netloc or "document"

    # Stream TTS audio
    def generate_audio() -> Iterator[bytes]:
        assert _tts_engine is not None
        yield from _tts_engine.synthesize(text)

    return StreamingResponse(
        generate_audio(),
        media_type="audio/wav",
        headers={
            # BUG FIX: `filename` was computed but never used — the header
            # previously sent a literal placeholder string instead.
            "Content-Disposition": f'inline; filename="{filename}.wav"',
        },
    )
366
+
367
+
368
async def health_check() -> dict[str, str]:
    """Liveness probe.

    Returns:
        A static status payload; the endpoint responding at all is the signal.
    """
    return {"status": "ok"}
375
+
376
+
377
async def get_languages() -> dict[str, list[str]]:
    """List the languages the TTS layer exposes.

    Returns:
        Mapping with a single "languages" key holding the language names.
    """
    return {"languages": [*LANGUAGE_VOICES]}
384
+
385
+
386
def _get_device_info() -> dict:
    """Snapshot device and model information with real-time memory stats.

    Returns:
        Dict with device type, memory usage, and engine batch/chunk sizes.
    """
    import torch

    info: dict = {
        "device": "cpu",
        "device_name": "CPU",
        "memory_used_gb": 0,
        "memory_total_gb": 0,
        "memory_percent": 0,
        "batch_size": 1,
    }

    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        reserved = torch.cuda.memory_reserved(0)
        allocated = torch.cuda.memory_allocated(0)
        total = props.total_memory

        # Reserved includes PyTorch's cache and better reflects real GPU
        # pressure; fall back to allocated if it is somehow larger.
        used = max(reserved, allocated)

        info.update(
            device="cuda",
            device_name=props.name,
            memory_used_gb=round(used / 1024**3, 1),
            memory_total_gb=round(total / 1024**3, 1),
            memory_percent=round((used / total) * 100, 1) if total > 0 else 0,
            # Allocated is also exposed to make cache-vs-live debugging easy.
            memory_allocated_gb=round(allocated / 1024**3, 1),
        )

    if _tts_engine is not None:
        info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
        info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)

    return info
426
+
427
+
428
async def stream_device_info() -> StreamingResponse:
    """Stream device info updates via SSE.

    Returns:
        SSE stream with device info updates every 3 seconds.
    """
    import asyncio
    from collections.abc import AsyncIterator

    async def generate_events() -> AsyncIterator[str]:
        """Yield one SSE data frame every 3 seconds until the client disconnects."""
        # FIX: use get_running_loop (get_event_loop is deprecated inside a
        # coroutine) and the loop's default executor. The original created a
        # fresh ThreadPoolExecutor per request and never shut it down — a
        # thread/resource leak under repeated connections.
        loop = asyncio.get_running_loop()
        while True:
            try:
                # Run the torch queries off the event loop to avoid blocking it.
                info = await loop.run_in_executor(None, _get_device_info)
                yield f"data: {json.dumps(info)}\n\n"
            except Exception as e:
                # Report the failure in-band and keep the stream alive.
                yield f'data: {{"error": "{e!s}"}}\n\n'
            await asyncio.sleep(3)

    return StreamingResponse(
        generate_events(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
462
+
463
+
464
def _estimate_time(
    text: str, seconds_per_char: float = INITIAL_SECONDS_PER_CHAR
) -> tuple[int, float]:
    """Estimate processing time for text.

    Args:
        text: Text to process.
        seconds_per_char: Calibrated rate (defaults to initial estimate).

    Returns:
        Tuple of (chunk_count, estimated_seconds).
    """
    # Chunks are roughly 500 chars each; ceil-divide but never report
    # fewer than one chunk (even for empty text).
    length = len(text)
    chunk_count = max(1, (length + 499) // 500)
    return chunk_count, length * seconds_per_char
480
+
481
+
482
+ def _create_wav_header(sample_rate: int = 24000, bits_per_sample: int = 16) -> bytes:
483
+ """Create a WAV header for streaming (unknown length).
484
+
485
+ Uses maximum possible file size since we don't know the final length.
486
+
487
+ Args:
488
+ sample_rate: Audio sample rate.
489
+ bits_per_sample: Bits per sample.
490
+
491
+ Returns:
492
+ WAV header bytes.
493
+ """
494
+ channels = 1
495
+ byte_rate = sample_rate * channels * bits_per_sample // 8
496
+ block_align = channels * bits_per_sample // 8
497
+
498
+ # Use maximum size for streaming (will be truncated on close)
499
+ max_size = 0x7FFFFFFF
500
+
501
+ header = io.BytesIO()
502
+ header.write(b"RIFF")
503
+ header.write(struct.pack("<I", max_size))
504
+ header.write(b"WAVE")
505
+ header.write(b"fmt ")
506
+ header.write(struct.pack("<I", 16)) # fmt chunk size
507
+ header.write(struct.pack("<H", 1)) # PCM format
508
+ header.write(struct.pack("<H", channels))
509
+ header.write(struct.pack("<I", sample_rate))
510
+ header.write(struct.pack("<I", byte_rate))
511
+ header.write(struct.pack("<H", block_align))
512
+ header.write(struct.pack("<H", bits_per_sample))
513
+ header.write(b"data")
514
+ header.write(struct.pack("<I", max_size - 36))
515
+
516
+ return header.getvalue()
517
+
518
+
519
def _generate_audio_to_job(
    job: AudioJob,
    text: str,
    tts_engine: TTSEngineProtocol,
    language: str = "english",
    doc_name: str = "document",
    doc_type: str = "text",
    page_count: int | None = None,
) -> Iterator[bytes]:
    """Generate audio with progress events via SSE, streaming audio to job queue.

    This function sends progress events via SSE while simultaneously writing
    audio data to the job's queue for streaming by another endpoint.
    Supports batched GPU inference for faster processing.

    Args:
        job: AudioJob to write audio data to.
        text: Text to synthesize.
        tts_engine: TTS engine to use.
        language: Language for TTS (english, chinese, japanese, korean).
        doc_name: Name of the document being processed.
        doc_type: Type of document (pdf, url, text).
        page_count: Number of pages (for PDFs).

    Yields:
        SSE event frames (bytes): start, progress, error, complete.
    """
    import re

    # Apply language if the engine supports it
    if hasattr(tts_engine, "set_language"):
        tts_engine.set_language(language)

    # Mirror the engine's chunking parameters so progress estimates line up
    chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
    batch_size = getattr(tts_engine, "batch_size", 1)

    # Split text into sentence-packed chunks (same logic as the TTS engine).
    # NOTE(review): this duplicates the engine's splitter; if the engine
    # changes its chunking, the counts here will drift — confirm.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks: list[str] = []
    current_chunk: list[str] = []
    current_length = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if current_length + len(sentence) > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += len(sentence) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    total_chunks = len(chunks) if chunks else 1
    total_chars = sum(len(c) for c in chunks)

    # Use initial estimate before calibration
    seconds_per_char = INITIAL_SECONDS_PER_CHAR
    estimated_total = total_chars * seconds_per_char

    # Send initial progress event with job_id and batch info
    progress_data = {
        "type": "start",
        "job_id": job.job_id,
        "current": 0,
        "total": total_chunks,
        "percent": 0,
        "estimated_remaining": estimated_total,
        "batch_size": batch_size,
        "doc_name": doc_name,
        "doc_type": doc_type,
        "page_count": page_count,
        "total_chars": total_chars,
        "status": f"Starting (batch size: {batch_size})...",
    }
    yield f"event: start\ndata: {json.dumps(progress_data)}\n\n".encode()

    # Generate audio - the TTS engine handles batching internally
    start_time = time.time()
    chunks_processed = 0

    try:
        for audio_bytes in tts_engine.synthesize(text):
            # Write audio to job queue for streaming
            job.put_audio(audio_bytes)
            chunks_processed += 1

            # Calibrate the remaining-time estimate from observed throughput.
            # (chunks_processed >= 1 here, so the division is always safe.)
            elapsed = time.time() - start_time
            time_per_chunk = elapsed / chunks_processed
            remaining = (total_chunks - chunks_processed) * time_per_chunk

            progress_data = {
                "type": "progress",
                "current": chunks_processed,
                "total": total_chunks,
                # BUG FIX: the engine may emit more chunks than our local
                # estimate; clamp so percent never exceeds 100.
                "percent": min(100, int((chunks_processed / total_chunks) * 100)),
                "estimated_remaining": round(max(0, remaining), 1),
                "chars_processed": sum(
                    len(chunks[i]) for i in range(min(chunks_processed, len(chunks)))
                ),
                "total_chars": total_chars,
                "status": f"Processing chunk {chunks_processed}/{total_chunks}",
            }
            yield f"event: progress\ndata: {json.dumps(progress_data)}\n\n".encode()

    except Exception as e:
        error_msg = f"TTS generation failed: {e!s}"
        error_data = {
            "type": "error",
            "message": error_msg,
            "chunk": chunks_processed + 1,
            "total_chunks": total_chunks,
        }
        # Unblock the audio consumer before reporting the error over SSE
        job.set_error(error_msg)
        yield f"event: error\ndata: {json.dumps(error_data)}\n\n".encode()
        return

    # Signal audio generation complete (pushes the queue sentinel)
    job.finish()

    # Send completion event
    total_time = time.time() - start_time
    complete_data = {
        "type": "complete",
        "total_time": round(total_time, 1),
        "chunks_processed": chunks_processed,
        "batch_size": batch_size,
    }
    yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
660
+
661
+
662
async def stream_audio(job_id: str) -> StreamingResponse:
    """Stream the WAV audio generated for a job.

    A single WAV header is sent first; raw PCM chunks are then relayed from
    the job's queue as the TTS worker produces them, so the browser can begin
    playback before generation has finished.

    Args:
        job_id: The job ID to stream audio for.

    Returns:
        Streaming WAV audio response.
    """
    job = _job_manager.get_job(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")

    def pcm_chunks() -> Iterator[bytes]:
        # Exactly one WAV header up front; everything after is raw PCM.
        yield _create_wav_header(sample_rate=24000)

        while True:
            try:
                # Block until the worker hands over data (5 minute timeout).
                chunk = job.audio_queue.get(timeout=300)
            except queue.Empty:
                # Timed out waiting for data - stop streaming.
                break
            if chunk is None:
                # Sentinel: generation finished normally.
                break
            # Individual chunks may arrive as complete WAV files; strip the
            # standard 44-byte PCM header so only one header is ever emitted.
            yield chunk[44:] if chunk[:4] == b"RIFF" else chunk

        # Streaming is over (complete or timed out) - release the job.
        _job_manager.remove_job(job_id)

    return StreamingResponse(
        pcm_chunks(),
        media_type="audio/wav",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
712
+
713
+
714
async def read_pdf_stream(
    file: UploadFile = File(...),
    language: str = Form("english"),
) -> StreamingResponse:
    """Read a PDF with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        file: Uploaded PDF file.
        language: Language for TTS (english, chinese, japanese, korean).

    Returns:
        Streaming response with progress events including job_id.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Unknown languages silently fall back to English.
    if language not in LANGUAGE_VOICES:
        language = "english"

    name = file.filename
    if not name or not name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        pdf_bytes = await file.read()
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")

    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Empty file")

    try:
        text = extract_text(pdf_bytes)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to extract text: {e}")

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text found in PDF")

    # The page count is purely informational for the progress UI;
    # failures here must not abort the request.
    try:
        page_count = get_page_count(pdf_bytes)
    except Exception:
        page_count = None

    job = _job_manager.create_job()

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        _generate_audio_to_job(
            job,
            text,
            _tts_engine,
            language,
            doc_name=name or "document.pdf",
            doc_type="pdf",
            page_count=page_count,
        ),
        media_type="text/event-stream",
        headers=sse_headers,
    )
781
+
782
+
783
async def read_text_stream(request: TextRequest) -> StreamingResponse:
    """Read pasted text with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        request: Text request containing the text to read and language.

    Returns:
        Streaming response with progress events including job_id.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Unknown languages silently fall back to English.
    language = request.language if request.language in LANGUAGE_VOICES else "english"

    raw = request.text.strip()
    if not raw:
        raise HTTPException(status_code=400, detail="Text is required")
    if len(raw) > 500000:  # ~500KB ceiling for pasted text
        raise HTTPException(status_code=400, detail="Text too long (max 500,000 characters)")

    # Normalization can strip everything (e.g. input that is all URLs/markup),
    # so re-validate afterwards.
    text = clean_text(raw)
    if not text.strip():
        raise HTTPException(status_code=400, detail="No readable text provided")

    job = _job_manager.create_job()

    return StreamingResponse(
        _generate_audio_to_job(
            job,
            text,
            _tts_engine,
            language,
            doc_name="Pasted Text",
            doc_type="text",
        ),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
831
+
832
+
833
async def read_url_stream(request: UrlRequest) -> StreamingResponse:
    """Read content from URL with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        request: URL request containing the URL to fetch and language.

    Returns:
        Streaming response with progress events including job_id.

    Raises:
        HTTPException: If the engine is unavailable, the URL is invalid or
            unfetchable, the payload is too large, or no readable content
            is found.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    url = request.url.strip()
    # Unknown languages silently fall back to English.
    language = request.language if request.language in LANGUAGE_VOICES else "english"

    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    # BUGFIX: the scheme check used to live inside this try block, so the
    # HTTPException it raised was swallowed by `except Exception` and
    # re-reported as "Invalid URL: 400: URL must use HTTP or HTTPS".
    # Keeping only urlparse() in the try preserves each error's own message.
    try:
        parsed = urlparse(url)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid URL: {e}")
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="URL must use HTTP or HTTPS")

    # Guess PDF-ness from the extension; the Content-Type header below can
    # still promote an extensionless URL to PDF handling.
    is_pdf = url.lower().endswith(".pdf")

    try:
        async with httpx.AsyncClient(timeout=URL_FETCH_TIMEOUT, follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()

            content_type = response.headers.get("content-type", "").lower()
            if "application/pdf" in content_type:
                is_pdf = True

            if len(response.content) > MAX_FILE_SIZE:
                raise HTTPException(status_code=400, detail="File too large (max 50MB)")

            content = response.content

    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=400, detail=f"Failed to fetch URL: HTTP {e.response.status_code}"
        )
    except httpx.RequestError as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {e}")

    if is_pdf:
        try:
            text = extract_text(content)
            page_count = get_page_count(content)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract PDF text: {e}")
    else:
        page_count = None
        try:
            extracted = trafilatura.extract(
                content,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
                favor_precision=True,
            )
            # trafilatura returns None when it cannot locate main content.
            if extracted:
                text = clean_text(extracted)
            else:
                text = ""
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract page content: {e}")

    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="No readable content found at URL")

    # Derive a display name from the last URL path segment, falling back to
    # the host when the path is empty or just "/".
    url_path = urlparse(url).path
    doc_name = url_path.split("/")[-1] if url_path else url
    if not doc_name or doc_name == "/":
        doc_name = urlparse(url).netloc

    # Create a job for this request
    job = _job_manager.create_job()

    return StreamingResponse(
        _generate_audio_to_job(
            job,
            text,
            _tts_engine,
            language,
            doc_name=doc_name,
            doc_type="pdf" if is_pdf else "url",
            page_count=page_count,
        ),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
src/talking_snake/extract.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF text extraction and cleaning for TTS processing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import re
7
+ from collections import Counter
8
+ from dataclasses import dataclass
9
+
10
+ from pdfminer.high_level import extract_pages
11
+ from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBoxHorizontal, LTTextLineHorizontal
12
+
13
+
14
@dataclass
class TextBlock:
    """A block of text extracted from one PDF page, with layout metadata.

    Produced by ``extract_text_blocks`` and filtered by ``clean_text_blocks``,
    which uses the layout fields to spot headers, footers and footnotes.
    """

    text: str  # block text, stripped of surrounding whitespace
    y_ratio: float  # vertical position of the block's bottom edge: 0.0 = bottom, 1.0 = top
    font_size: float  # average character size over the block (pdfminer LTChar.size)
    page_num: int  # 1-based page number the block appears on
22
+
23
+
24
def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
    """Extract text blocks from PDF with positional information.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        List of TextBlock objects with text and metadata.
    """
    laparams = LAParams(
        line_margin=0.5,
        word_margin=0.1,
        char_margin=2.0,
        boxes_flow=0.5,
    )

    result: list[TextBlock] = []
    stream = io.BytesIO(pdf_bytes)

    for page_index, layout in enumerate(extract_pages(stream, laparams=laparams), start=1):
        if not isinstance(layout, LTPage):
            continue

        height = layout.height

        for box in layout:
            if not isinstance(box, LTTextBoxHorizontal):
                continue

            content = box.get_text().strip()
            if not content:
                continue

            # Only LTChar elements carry a font size; average them over the
            # whole box (fall back to 10.0 when no characters are present).
            sizes = [
                ch.size
                for row in box
                if isinstance(row, LTTextLineHorizontal)
                for ch in row
                if isinstance(ch, LTChar)
            ]
            mean_size = sum(sizes) / len(sizes) if sizes else 10.0

            result.append(
                TextBlock(
                    text=content,
                    # Bottom edge of the box as a ratio: 0 = bottom, 1 = top.
                    y_ratio=box.y0 / height if height > 0 else 0.5,
                    font_size=mean_size,
                    page_num=page_index,
                )
            )

    return result
80
+
81
+
82
def get_page_count(pdf_bytes: bytes) -> int:
    """Get the number of pages in a PDF.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Number of pages in the PDF.
    """
    # Walking the page generator is enough; the layouts themselves are unused.
    pages = extract_pages(io.BytesIO(pdf_bytes), laparams=LAParams())
    return sum(1 for _ in pages)
95
+
96
+
97
def extract_text(pdf_bytes: bytes) -> str:
    """Extract and clean text from a PDF file.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Cleaned text suitable for TTS.
    """
    raw_blocks = extract_text_blocks(pdf_bytes)
    if not raw_blocks:
        return ""

    # Drop headers/footers/page numbers, then join remaining blocks
    # as paragraphs and run the TTS-specific normalization pass.
    kept = clean_text_blocks(raw_blocks)
    joined = "\n\n".join(b.text for b in kept)
    return normalize_for_tts(joined)
115
+
116
+
117
def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
    """Remove headers, footers, page numbers, and other artifacts.

    Applies multiple heuristics:
    1. Remove blocks in top/bottom margins (likely headers/footers)
    2. Remove repeated text across pages (likely running headers)
    3. Remove standalone page numbers
    4. Remove very short lines that look like artifacts

    Args:
        blocks: List of TextBlock objects.

    Returns:
        Filtered list of TextBlock objects.
    """
    if not blocks:
        return []

    # Text recurring on at least half the pages (minimum twice) is treated
    # as a running header/footer.
    occurrences = Counter(b.text for b in blocks)
    page_total = max(b.page_num for b in blocks)
    min_repeats = max(2, page_total // 2)
    boilerplate = {t for t, n in occurrences.items() if n >= min_repeats}

    # Median font size anchors the "small print" heuristic below.
    sorted_sizes = sorted(b.font_size for b in blocks)
    median_size = sorted_sizes[len(sorted_sizes) // 2] if sorted_sizes else 10.0

    def keep(b: TextBlock) -> bool:
        # Header zone: top 10% of the page.
        if b.y_ratio > 0.90:
            return False
        # Footer zone: bottom 10% of the page.
        if b.y_ratio < 0.10:
            return False
        # Repeated running header/footer text.
        if b.text in boilerplate:
            return False
        # Standalone page numbers.
        if is_page_number(b.text):
            return False
        # Short text in notably small font: likely caption/footnote noise.
        if len(b.text) < 20 and b.font_size < median_size * 0.8:
            return False
        return True

    return [b for b in blocks if keep(b)]
171
+
172
+
173
def is_page_number(text: str) -> bool:
    """Check if text is likely a standalone page number.

    Recognizes plain digits, well-formed Roman numerals, "Page N" /
    "N of M" patterns, and decorated numbers such as "- 7 -".

    Args:
        text: Text to check.

    Returns:
        True if text appears to be a page number.
    """
    text = text.strip()
    if not text:
        return False

    # Pure number, e.g. "12"
    if text.isdigit():
        return True

    # Well-formed Roman numerals only. BUGFIX: the previous check matched
    # any run of the letters i/v/x/l/c/d/m, so ordinary words such as
    # "civil", "mild" or "did" were misclassified as page numbers and
    # silently dropped from the document. A strict Roman-numeral grammar
    # (standard subtractive notation) avoids those false positives.
    roman = r"m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})"
    if re.fullmatch(roman, text, flags=re.IGNORECASE):
        return True

    # "Page N" or "N of M" patterns
    if re.match(r"^(page\s*)?\d+(\s*(of|/)\s*\d+)?$", text, re.IGNORECASE):
        return True

    # "- N -" pattern
    if re.match(r"^[-–—]\s*\d+\s*[-–—]$", text):
        return True

    return False
201
+
202
+
203
def clean_text(text: str) -> str:
    """Clean raw text for TTS processing.

    This is a simpler function for cleaning already-extracted text,
    without the positional information.

    Args:
        text: Raw text to clean.

    Returns:
        Cleaned text suitable for TTS.
    """
    # Keep only substantive lines: non-empty, not a bare page number,
    # and at least 3 characters long (shorter lines are artifacts).
    kept = [
        stripped
        for stripped in (line.strip() for line in text.split("\n"))
        if stripped and not is_page_number(stripped) and len(stripped) >= 3
    ]
    result = "\n".join(kept)

    # --- Fix hyphenated / split words (common in PDFs and web content) ---
    # "word-\nword" and "word-\n  word": rejoin across the line break.
    result = re.sub(r"(\w)-\n\s*(\w)", r"\1\2", result)
    result = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", result)
    # "word- word" (hyphen + space, often from copy-paste).
    result = re.sub(r"(\w)- (\w)", r"\1\2", result)
    # Trailing hyphen followed by a lowercase continuation.
    result = re.sub(r"-\n([a-z])", r"\1", result)

    # --- Fix line-break artifacts ---
    # Text wrapped at a fixed width: turn single newlines into spaces while
    # keeping blank lines as paragraph separators.
    result = re.sub(r"(?<![.!?:;\n])\n(?!\n)", " ", result)

    # Collapse runs of blank lines and horizontal whitespace.
    result = re.sub(r"\n{3,}", "\n\n", result)
    result = re.sub(r"[ \t]+", " ", result)

    # Final TTS-oriented normalization pass.
    return normalize_for_tts(result).strip()
269
+
270
+
271
def normalize_for_tts(text: str) -> str:
    """Normalize text for natural TTS pronunciation.

    Handles special characters, punctuation, and formatting that can
    cause TTS models to slow down or mispronounce. The order of the
    passes matters: technical noise (URLs, hashes, markup) is removed
    first, then operators/symbols are verbalized, then spacing is fixed.

    Args:
        text: Text to normalize.

    Returns:
        Normalized text optimized for TTS.
    """
    # === REMOVE URLS AND TECHNICAL STRINGS FIRST ===
    # URLs (various formats) - remove completely
    text = re.sub(r"https?://[^\s<>\"')\]]+", "", text)
    text = re.sub(r"www\.[^\s<>\"')\]]+", "", text)
    text = re.sub(r"ftp://[^\s<>\"')\]]+", "", text)

    # UUIDs (with or without dashes) - must come before git hash pattern
    uuid_pattern = (
        r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-" r"[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
    )
    text = re.sub(uuid_pattern, "", text)

    # Git commit hashes (7-40 hex chars standalone)
    text = re.sub(r"(?<![a-zA-Z0-9])[0-9a-f]{7,40}(?![a-zA-Z0-9])", "", text, flags=re.IGNORECASE)

    # Hex color codes (#fff, #ffffff)
    text = re.sub(r"#[0-9a-fA-F]{3,8}\b", "", text)

    # Long hex/base64 strings (likely encoded data)
    text = re.sub(r"\b[A-Za-z0-9+/]{20,}={0,2}\b", "", text)

    # File paths (Unix and Windows style)
    text = re.sub(r"[/\\][\w./\\-]+\.\w+", "", text)

    # IP addresses
    text = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "", text)

    # Port numbers after colon
    text = re.sub(r":\d{2,5}\b", "", text)

    # Remove email addresses
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", text)

    # SHA/MD5 style hashes with prefix
    text = re.sub(r"\b(sha\d*|md5|hash)[:\s]*[0-9a-f]+\b", "", text, flags=re.IGNORECASE)

    # CamelCase: split into words (e.g., "getUserName" -> "get User Name")
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)

    # snake_case: replace underscores with spaces
    text = re.sub(r"(\w)_(\w)", r"\1 \2", text)

    # Function calls: "func()" -> "func"
    text = re.sub(r"(\w+)\(\)", r"\1", text)

    # Arrow functions/operators: -> and =>
    text = text.replace("->", " returns ")
    text = text.replace("=>", " arrow ")

    # Common code operators spoken naturally.
    # BUGFIX: longest operators must be replaced first. Previously "!=" and
    # "==" ran before "!==" and "===", so the strict forms could never match
    # ("===" had already become " equals =").
    text = text.replace("!==", " strictly not equals ")
    text = text.replace("===", " strictly equals ")
    text = text.replace("!=", " not equals ")
    text = text.replace("==", " equals ")
    text = text.replace("&&", " and ")
    text = text.replace("||", " or ")
    text = text.replace("++", " increment ")
    text = text.replace("--", " decrement ")

    # File extensions: ".py" -> " dot py" (only for common extensions)
    ext_pattern = r"\.(py|js|ts|html|css|json|xml|md|txt|csv|pdf)\b"
    text = re.sub(ext_pattern, r" dot \1", text, flags=re.IGNORECASE)

    # Remove standalone hashes/pound signs (not hashtags)
    text = re.sub(r"(?<!\w)#(?!\w)", "", text)

    # Backticks (often used in markdown for code)
    text = text.replace("`", "")

    # Triple quotes
    text = text.replace('"""', "")
    text = text.replace("'''", "")

    # === UNICODE NORMALIZATION ===
    # Convert smart quotes to plain ASCII quotes. BUGFIX: written with \u
    # escapes because a previous revision's literal curly-quote characters
    # were mangled by copy/paste, leaving a syntax error ("""-style) and
    # no-op replacements ('' -> '').
    text = text.replace("\u201c", '"').replace("\u201d", '"')  # left/right double quote
    text = text.replace("\u2018", "'").replace("\u2019", "'")  # left/right single quote
    text = text.replace("\u201e", '"').replace("\u201f", '"')  # low-9 / reversed double quote

    # Normalize dashes to standard hyphen or remove
    text = text.replace("\u2013", "-")  # en-dash
    text = text.replace("\u2014", " - ")  # em-dash (add spaces for pause)
    text = text.replace("\u2015", " - ")  # horizontal bar
    text = text.replace("\u2010", "-")  # Unicode hyphen
    text = text.replace("\u2011", "-")  # non-breaking hyphen
    text = text.replace("\u2043", "-")  # hyphen bullet
    text = text.replace("\u2212", "-")  # minus sign

    # Normalize ellipsis
    text = text.replace("\u2026", "...")
    text = re.sub(r"\.{4,}", "...", text)  # Limit to 3 dots

    # Normalize other Unicode punctuation
    text = text.replace("•", ",")  # Bullet points
    text = text.replace("·", " ")  # Middle dot
    text = text.replace("‧", " ")  # Hyphenation point
    text = text.replace("※", " ")  # Reference mark
    text = text.replace("†", "")  # Dagger (footnote)
    text = text.replace("‡", "")  # Double dagger
    text = text.replace("§", "section ")
    text = text.replace("¶", "")  # Pilcrow
    text = text.replace("©", "copyright ")
    text = text.replace("®", " registered ")
    text = text.replace("™", " trademark ")
    text = text.replace("°", " degrees ")

    # === SPACING AROUND PUNCTUATION ===
    # Ensure proper spacing around dashes used as separators
    text = re.sub(r"\s*-\s*-\s*", " - ", text)  # Double dash
    text = re.sub(r"(\w)\s*-\s*(\w)", r"\1 - \2", text)  # Word-dash-word with spaces

    # Fix missing space after punctuation
    text = re.sub(r"([.!?])([A-Z])", r"\1 \2", text)
    text = re.sub(r",([A-Za-z])", r", \1", text)

    # Fix multiple punctuation marks
    text = re.sub(r"[,]{2,}", ",", text)
    text = re.sub(r"[;]{2,}", ";", text)
    text = re.sub(r"[:]{2,}", ":", text)
    text = re.sub(r"[!]{2,}", "!", text)
    text = re.sub(r"[?]{2,}", "?", text)

    # === NUMBERS AND SPECIAL NOTATIONS ===
    # Convert common fractions
    text = text.replace("½", " one half ")
    text = text.replace("⅓", " one third ")
    text = text.replace("⅔", " two thirds ")
    text = text.replace("¼", " one quarter ")
    text = text.replace("¾", " three quarters ")
    text = text.replace("⅕", " one fifth ")
    text = text.replace("⅖", " two fifths ")
    text = text.replace("⅗", " three fifths ")
    text = text.replace("⅘", " four fifths ")
    text = text.replace("⅙", " one sixth ")
    text = text.replace("⅚", " five sixths ")
    text = text.replace("⅛", " one eighth ")
    text = text.replace("⅜", " three eighths ")
    text = text.replace("⅝", " five eighths ")
    text = text.replace("⅞", " seven eighths ")

    # Handle percentage and math symbols.
    # BUGFIX: strip leftover markup tags *before* verbalizing "<" and ">".
    # The tag-removal regex previously ran in the cleanup section, after
    # every "<" had already become " less than ", so it could never match.
    text = re.sub(r"<[^>]+>", "", text)
    text = text.replace("%", " percent")
    text = text.replace("&", " and ")
    text = text.replace("+", " plus ")
    text = text.replace("=", " equals ")
    text = text.replace("<", " less than ")
    text = text.replace(">", " greater than ")
    text = text.replace("≤", " less than or equal to ")
    text = text.replace("≥", " greater than or equal to ")
    text = text.replace("≠", " not equal to ")
    text = text.replace("±", " plus or minus ")
    text = text.replace("×", " times ")
    text = text.replace("÷", " divided by ")

    # === ABBREVIATIONS AND SPECIAL CASES ===
    # Common abbreviations that might cause issues
    text = re.sub(r"\be\.g\.", "for example", text, flags=re.IGNORECASE)
    text = re.sub(r"\bi\.e\.", "that is", text, flags=re.IGNORECASE)
    text = re.sub(r"\betc\.", "etcetera", text, flags=re.IGNORECASE)
    text = re.sub(r"\bvs\.", "versus", text, flags=re.IGNORECASE)
    text = re.sub(r"\bDr\.", "Doctor", text)
    text = re.sub(r"\bMr\.", "Mister", text)
    text = re.sub(r"\bMrs\.", "Missus", text)
    text = re.sub(r"\bMs\.", "Miss", text)
    text = re.sub(r"\bProf\.", "Professor", text)
    text = re.sub(r"\bSt\.", "Saint", text)
    text = re.sub(r"\bNo\.\s*(\d)", r"Number \1", text)
    text = re.sub(r"\bFig\.", "Figure", text, flags=re.IGNORECASE)
    text = re.sub(r"\bVol\.", "Volume", text, flags=re.IGNORECASE)
    text = re.sub(r"\bpp\.", "pages", text, flags=re.IGNORECASE)
    text = re.sub(r"\bp\.\s*(\d)", r"page \1", text, flags=re.IGNORECASE)

    # === BRACKETS AND PARENTHESES ===
    # Remove or simplify brackets that might cause pauses
    text = re.sub(r"\[([^\]]+)\]", r"(\1)", text)  # Square to round
    text = re.sub(r"\{([^}]+)\}", r"(\1)", text)  # Curly to round

    # Remove citation numbers like [1], [2,3], [1-5]
    # (square citations were converted to round above, so the second
    # pattern removes both forms)
    text = re.sub(r"\[\d+(?:[-,]\d+)*\]", "", text)
    text = re.sub(r"\(\d+(?:[-,]\d+)*\)", "", text)

    # === CLEANUP ===
    # Remove standalone special characters
    text = re.sub(r"\s+[#@*^~`|\\]+\s+", " ", text)

    # Normalize multiple spaces
    text = re.sub(r"[ \t]+", " ", text)

    # Remove spaces before punctuation
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)

    # Ensure space after punctuation (but not before another punctuation)
    text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)

    # Remove leading/trailing whitespace from lines
    text = "\n".join(line.strip() for line in text.split("\n"))

    # Remove empty lines that resulted from cleaning
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text
src/talking_snake/static/app.js ADDED
@@ -0,0 +1,773 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Talking Snake - Main Application Script
 * Handles file upload, URL submission, and audio streaming
 */

// DOM Elements, looked up once at load time. Each of these IDs/classes is
// expected to exist in the page markup.
const dropZone = document.getElementById("dropZone");
const fileInput = document.getElementById("fileInput");
const urlInput = document.getElementById("urlInput");
const urlSubmit = document.getElementById("urlSubmit");
const textInput = document.getElementById("textInput");
const textSubmit = document.getElementById("textSubmit");
const status = document.getElementById("status");
const player = document.getElementById("player");
const audio = document.getElementById("audio");
const filename = document.getElementById("filename");
const tabs = document.querySelectorAll(".tab");
const tabContents = document.querySelectorAll(".tab-content");
const inputSection = document.getElementById("inputSection");
const processingSection = document.getElementById("processingSection");
const stopBtn = document.getElementById("stopBtn");
const pauseBtn = document.getElementById("pauseBtn");
const deviceInfo = document.getElementById("deviceInfo");
const docInfo = document.getElementById("docInfo");
const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
const processingProgressBar = document.getElementById("processingProgressBar");

// Custom player elements
const playerPlayBtn = document.getElementById("playerPlayBtn");
const progressBar = document.getElementById("progressBar");
const progressSlider = document.getElementById("progressSlider");
const timeDisplay = document.getElementById("timeDisplay");
const volumeBtn = document.getElementById("volumeBtn");
const downloadBtn = document.getElementById("downloadBtn");

// Constants
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB

// Mutable application state shared by the handlers below.
let currentAbortController = null; // presumably aborts the in-flight request — confirm against handlers
let selectedLanguage = "english"; // currently selected TTS language
let isPaused = false;
let estimatedDuration = 0; // Estimated total duration from server
let isMuted = false; // mirrored onto audio.muted (see toggleMute)
let currentAudioBlob = null; // Store audio blob for download
let currentDocName = ""; // Store document name for download filename
47
+
48
/**
 * Format a duration in seconds as M:SS (e.g. 65 -> "1:05").
 * Negative, NaN and infinite inputs render as "0:00".
 */
function formatTime(seconds) {
    if (!isFinite(seconds) || seconds < 0) {
        return "0:00";
    }
    const wholeSeconds = Math.floor(seconds);
    const mins = Math.floor(wholeSeconds / 60);
    const secs = wholeSeconds % 60;
    return mins + ":" + String(secs).padStart(2, "0");
}
59
+
60
/**
 * Format a number in human-readable form: 1500 -> "1.5K", 3400000 -> "3.4M".
 * Values below 1000 are returned unchanged as a string.
 */
function formatNumber(num) {
    // One decimal place, with a trailing ".0" trimmed ("1.0K" -> "1K").
    const scaled = (value, suffix) => value.toFixed(1).replace(/\.0$/, "") + suffix;

    if (num >= 1000000) {
        return scaled(num / 1000000, "M");
    }
    if (num >= 1000) {
        return scaled(num / 1000, "K");
    }
    return num.toString();
}
72
+
73
/**
 * Map a document type to its Font Awesome icon class.
 * Unknown types fall back to the generic file icon.
 */
function getDocTypeIcon(docType) {
    const icons = {
        pdf: "fa-file-pdf",
        url: "fa-link",
        text: "fa-file-lines",
    };
    // hasOwnProperty guards against prototype keys ("constructor", etc.).
    return Object.prototype.hasOwnProperty.call(icons, docType)
        ? icons[docType]
        : "fa-file";
}
84
+
85
/**
 * Render the document info bar (name, page count, character count).
 *
 * SECURITY FIX: the document name originates from user input (uploaded
 * filename, fetched URL, or pasted text) and was previously interpolated
 * into innerHTML unescaped, allowing markup/script injection. It is now
 * HTML-escaped before insertion; the numeric fields are used as-is.
 */
function updateDocInfo(data) {
  // Minimal HTML escaper, safe for both text and double-quoted attribute contexts.
  const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => ({
      "&": "&amp;",
      "<": "&lt;",
      ">": "&gt;",
      '"': "&quot;",
      "'": "&#39;",
    }[ch]));

  const icon = getDocTypeIcon(data.doc_type);
  const docName = escapeHtml(data.doc_name || "Document");
  const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
  const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";

  docInfo.innerHTML = `
    <span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
    ${pageInfo}
    ${charInfo}
  `;
}
100
+
101
/**
 * Sync the custom player's progress bar, slider, and time readout with the
 * <audio> element. While streaming, the browser may report a nonsensical
 * duration, so fall back to the server's estimate in that case.
 */
function updatePlayerProgress() {
  const elapsed = audio.currentTime || 0;
  let total = audio.duration;
  const durationUnusable = !isFinite(total) || total > 36000 || total <= 0;
  if (durationUnusable) {
    total = estimatedDuration || elapsed + 60; // Fallback
  }

  const percent = total > 0 ? (elapsed / total) * 100 : 0;
  progressBar.style.width = `${Math.min(percent, 100)}%`;
  progressSlider.value = percent;
  timeDisplay.textContent = `${formatTime(elapsed)} / ${formatTime(total)}`;
}
117
+
118
/**
 * Seek the audio element to the position chosen on the progress slider.
 * @param {Event} e - Slider "input" event; target.value is a 0-100 percent.
 */
function handleSeek(e) {
  const percent = parseFloat(e.target.value);
  // Mirror updatePlayerProgress: distrust absurd streaming durations.
  let total = audio.duration;
  if (!isFinite(total) || total > 36000) {
    total = estimatedDuration || 60;
  }
  audio.currentTime = (percent / 100) * total;
  updatePlayerProgress();
}
130
+
131
/**
 * Toggle playback from the custom player's play/pause button.
 */
function togglePlayerPlay() {
  if (audio.paused) {
    audio.play().catch(() => {});
    return;
  }
  audio.pause();
}

/**
 * Reflect the audio element's paused state on the play button icon.
 */
function updatePlayButton() {
  const icon = playerPlayBtn.querySelector("i");
  icon.className = audio.paused ? "fa-solid fa-play" : "fa-solid fa-pause";
}

/**
 * Toggle the muted state and swap the volume icon to match.
 */
function toggleMute() {
  isMuted = !isMuted;
  audio.muted = isMuted;
  volumeBtn.querySelector("i").className = isMuted
    ? "fa-solid fa-volume-xmark"
    : "fa-solid fa-volume-high";
}
163
+
164
/**
 * Render the device info panel from an SSE payload.
 * @param {Object} info - Device info object (device, device_name,
 *   memory_*_gb, memory_percent, batch_size).
 */
function updateDeviceInfo(info) {
  const onGpu = info.device === "cuda";
  const icon = onGpu ? "fa-microchip" : "fa-server";
  const memoryInfo = onGpu
    ? `${info.memory_used_gb}GB / ${info.memory_total_gb}GB (${info.memory_percent}%)`
    : "CPU mode";
  deviceInfo.innerHTML = `
    <i class="fa-solid ${icon}"></i>
    <span>${info.device_name}</span>
    <span class="device-memory">${memoryInfo}</span>
    <span class="device-batch">Batch: ${info.batch_size}</span>
  `;
  deviceInfo.classList.add("visible");
}
181
+
182
/**
 * Open the device-info SSE stream and keep the panel updated.
 * On connection errors the source is closed and re-opened after 5s.
 */
function initDeviceInfoStream() {
  const source = new EventSource("/api/device-info-stream");

  source.onmessage = (event) => {
    try {
      updateDeviceInfo(JSON.parse(event.data));
    } catch {
      // Device info is best-effort; ignore malformed payloads.
    }
  };

  source.onerror = () => {
    // Close this source and retry with a fresh connection after a delay.
    source.close();
    setTimeout(initDeviceInfoStream, 5000);
  };
}
203
+
204
// Begin streaming live device info.
initDeviceInfoStream();

// Wire up the custom player controls.
playerPlayBtn.addEventListener("click", togglePlayerPlay);
progressSlider.addEventListener("input", handleSeek);
volumeBtn.addEventListener("click", toggleMute);
["play", "pause"].forEach((evt) => audio.addEventListener(evt, updatePlayButton));
audio.addEventListener("timeupdate", updatePlayerProgress);
audio.addEventListener("ended", () => {
  updatePlayButton();
  progressBar.style.width = "100%";
});
// Reveal the pause control only once playback has actually begun.
audio.addEventListener("playing", () => {
  pauseBtn.classList.remove("hidden");
});
222
+
223
/**
 * Fetch the finished audio as a Blob so it can be downloaded later.
 * On success the download button is revealed; failures are logged only,
 * since downloading is an optional nicety.
 * @param {string} jobId - The job ID for the audio.
 */
async function fetchAudioBlob(jobId) {
  try {
    const response = await fetch(`/api/audio/${jobId}`);
    if (!response.ok) {
      return;
    }
    currentAudioBlob = await response.blob();
    downloadBtn.classList.remove("hidden");
  } catch (error) {
    console.error("Failed to fetch audio for download:", error);
  }
}
239
+
240
/**
 * Trigger a browser download of the current audio as a .wav file.
 * No-op when no audio blob has been fetched yet.
 */
function downloadAudio() {
  if (!currentAudioBlob) {
    return;
  }

  // Derive the download name from the document, swapping its extension for .wav.
  const baseName = (currentDocName || "audio").replace(/\.[^.]+$/, "");
  const objectUrl = URL.createObjectURL(currentAudioBlob);

  const link = document.createElement("a");
  link.href = objectUrl;
  link.download = `${baseName}.wav`;

  // The anchor must be in the DOM for click() to work in all browsers.
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  URL.revokeObjectURL(objectUrl);
}
263
+
264
/**
 * @returns {string} The currently selected language name.
 */
function getSelectedLanguage() {
  return selectedLanguage;
}

/**
 * Switch the UI back to the input view.
 */
function showInputSection() {
  inputSection.classList.remove("hidden");
  processingSection.classList.remove("visible");
}

/**
 * Switch the UI to the processing view with a fresh progress bar and
 * the pause control hidden until playback starts.
 */
function showProcessingSection() {
  inputSection.classList.add("hidden");
  processingSection.classList.add("visible");
  processingProgressBar.style.width = "0%";
  pauseBtn.classList.add("hidden");
}

/**
 * Display a status message to the user.
 * @param {string} message - HTML message to display.
 * @param {string} type - Status type: 'loading', 'error', or 'success'.
 */
function showStatus(message, type) {
  status.innerHTML = message;
  status.className = `status visible ${type}`;
}
300
+
301
/**
 * Abort the in-flight generation request, tear down audio playback, and
 * return the UI to the input view.
 */
function stopGeneration() {
  // Cancel the streaming fetch, if one is active.
  if (currentAbortController) {
    currentAbortController.abort();
    currentAbortController = null;
  }

  // Halt playback and release the audio element's resources.
  audio.pause();
  audio.currentTime = 0;
  audio.src = "";
  audio.load(); // Force release of audio resources

  // Reset pause state and controls.
  isPaused = false;
  updatePauseButton();
  downloadBtn.classList.add("hidden");
  pauseBtn.classList.add("hidden");
  currentAudioBlob = null;

  // Reset the processing progress bar.
  processingProgressBar.style.width = "0%";

  showStatus('<i class="fa-solid fa-ban"></i> Generation stopped', "error");
  showInputSection();
}
332
+
333
// Stop playback when the page is closed, navigated away, or hidden.
// "pagehide" is registered as well because it fires more reliably on
// mobile browsers and during back/forward navigation.
for (const evt of ["beforeunload", "pagehide"]) {
  window.addEventListener(evt, () => {
    audio.pause();
    audio.src = "";
  });
}
344
+
345
/**
 * Pause or resume playback from the processing controls.
 */
function togglePause() {
  if (audio.paused) {
    audio.play().catch(() => {});
    isPaused = false;
  } else {
    audio.pause();
    isPaused = true;
  }
  updatePauseButton();
}

/**
 * Sync the pause button's icon and tooltip with the playback state.
 */
function updatePauseButton() {
  const icon = pauseBtn.querySelector("i");
  const showResume = isPaused || audio.paused;
  icon.className = showResume ? "fa-solid fa-play" : "fa-solid fa-pause";
  pauseBtn.title = showResume ? "Resume" : "Pause";
}
372
+
373
/**
 * Format remaining time for display.
 * @param {number} seconds - Remaining time in seconds.
 * @returns {string} "~N min remaining" above a minute, else "~Ns remaining".
 */
function formatTimeRemaining(seconds) {
  return seconds > 60
    ? `~${Math.ceil(seconds / 60)} min remaining`
    : `~${Math.ceil(seconds)}s remaining`;
}
384
+
385
/**
 * Process the server's SSE stream of progress updates for a generation job.
 * Starts audio playback as soon as the "start" event provides a job_id,
 * drives the progress UI from "progress" events, and finalizes the player
 * on "complete".
 *
 * Fix: SSE lines can be split across network chunks. The previous version
 * split each decoded chunk by "\n" independently, silently dropping any
 * "data: " line that straddled a read boundary, and distinguished real
 * errors from JSON parse errors by inspecting the error message. Incomplete
 * lines are now buffered between reads, and parsing is separated from
 * event dispatch so no message sniffing is needed.
 *
 * @param {Response} response - Fetch response with SSE stream.
 * @param {string} docName - Document name for display.
 * @returns {Promise<void>}
 * @throws {Error} If the stream contains an error event or fails.
 */
async function processStream(response, docName) {
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let lastStatus = "";
  let jobId = null;
  let audioStarted = false;
  let buffer = ""; // Holds a partial SSE line between chunk reads

  // Reset estimated duration
  estimatedDuration = 0;

  // Dispatch a single parsed SSE event. Throws on server-reported errors.
  const handleEvent = (data) => {
    if (data.type === "error") {
      throw new Error(data.message || "TTS generation failed");
    } else if (data.type === "start" && data.job_id) {
      // Got job ID - start audio stream immediately
      jobId = data.job_id;
      // Capture initial duration estimate
      if (data.estimated_remaining) {
        estimatedDuration = data.estimated_remaining;
      }
      // Display document info
      updateDocInfo(data);
      if (!audioStarted) {
        audioStarted = true;
        // Set audio source to the stream endpoint; the browser starts
        // playing as data arrives.
        audio.src = `/api/audio/${jobId}`;
        audio.load();
        // Try to play (may need user interaction first time)
        audio.play().catch(() => {
          // Autoplay blocked - will play when user clicks
        });
        updatePlayButton();
        // Pause button will be shown by the 'playing' event listener
      }
      const timeStr = formatTimeRemaining(data.estimated_remaining);
      showStatus(
        `<span class="spinner"></span>ETA ${timeStr}`,
        "loading"
      );
      processingProgressBar.style.width = "5%";
    } else if (data.type === "progress") {
      lastStatus = data.status;
      const timeStr = formatTimeRemaining(data.estimated_remaining);
      showStatus(
        `<span class="spinner"></span>${data.percent}% • ETA ${timeStr}`,
        "loading"
      );
      processingProgressBar.style.width = `${data.percent}%`;
    } else if (data.type === "complete") {
      // Generation complete - show the player.
      if (data.total_time) {
        // Keep a rough lower bound on duration once generation finishes.
        estimatedDuration = Math.max(estimatedDuration, audio.currentTime + 10);
      }
      filename.textContent = docName;
      currentDocName = docName;
      player.classList.add("visible");
      processingProgressBar.style.width = "100%";
      showInputSection();
      showStatus(
        `<i class="fa-solid fa-circle-check"></i> Done in ${data.total_time}s`,
        "success"
      );
      updatePlayerProgress();

      // Fetch audio blob for download capability
      if (jobId) {
        fetchAudioBlob(jobId);
      }
    }
  };

  // Parse one raw SSE line; returns the payload object or null for
  // non-data lines and malformed/partial JSON.
  const parseLine = (line) => {
    if (!line.startsWith("data: ")) {
      return null;
    }
    try {
      return JSON.parse(line.slice(6));
    } catch {
      return null; // Ignore malformed payloads
    }
  };

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        break;
      }

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      // The last element may be an incomplete line; keep it for next read.
      buffer = lines.pop();

      for (const line of lines) {
        const data = parseLine(line);
        if (data) {
          handleEvent(data);
        }
      }
    }

    // Flush a final complete line left in the buffer (stream ended
    // without a trailing newline).
    const tail = parseLine(buffer);
    if (tail) {
      handleEvent(tail);
    }
  } catch (streamError) {
    // Re-throw with more context and preserve the original cause
    const context = lastStatus ? ` (during: ${lastStatus})` : "";
    throw new Error(`Stream error${context}: ${streamError.message}`, { cause: streamError });
  }
}
499
+
500
/**
 * Handle file upload and TTS conversion.
 * Validates type and size, streams progress via SSE, and starts playback.
 * @param {File} file - The uploaded file.
 */
async function handleFile(file) {
  // Validate file type
  if (!file.name.toLowerCase().endsWith(".pdf")) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please select a PDF file', "error");
    return;
  }

  // Validate file size
  if (file.size > MAX_FILE_SIZE) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> File too large. Maximum size is 50MB.', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Extracting text...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;

  const formData = new FormData();
  formData.append("file", file);
  formData.append("language", getSelectedLanguage());

  // Abort controller lets the Stop button cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-stream", {
      method: "POST",
      body: formData,
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      // Fix: the error body is usually JSON with a "detail" field, but a
      // proxy or crash can return non-JSON; don't let response.json()
      // throw a confusing secondary error.
      let detail = "Failed to process document";
      try {
        const error = await response.json();
        if (error.detail) {
          detail = error.detail;
        }
      } catch {
        // Keep the generic message.
      }
      throw new Error(detail);
    }

    // Process stream handles both progress SSE and starting audio playback
    await processStream(response, file.name);
  } catch (error) {
    if (error.name === "AbortError") {
      // User cancelled - already handled in stopGeneration
      return;
    }
    showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
    showInputSection();
  } finally {
    currentAbortController = null;
  }
}
555
+
556
/**
 * Handle URL submission and TTS conversion.
 * Validates the URL, streams progress via SSE, and starts playback.
 * @param {string} url - The URL to process.
 */
async function handleUrl(url) {
  url = url.trim();

  if (!url) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter a URL', "error");
    return;
  }

  // Validate URL format
  try {
    new URL(url);
  } catch {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter a valid URL', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Fetching content...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;
  urlSubmit.disabled = true;

  // Abort controller lets the Stop button cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-url-stream", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url,
        language: getSelectedLanguage()
      }),
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      // Fix: the error body is usually JSON with a "detail" field, but a
      // proxy or crash can return non-JSON; don't let response.json()
      // throw a confusing secondary error.
      let detail = "Failed to process document";
      try {
        const error = await response.json();
        if (error.detail) {
          detail = error.detail;
        }
      } catch {
        // Keep the generic message.
      }
      throw new Error(detail);
    }

    // Derive a display name from the final path segment of the URL.
    const urlPath = new URL(url).pathname;
    const docName = urlPath.split("/").pop() || "document";

    // Process stream handles both progress SSE and starting audio playback
    await processStream(response, docName);
  } catch (error) {
    if (error.name === "AbortError") {
      // User cancelled - already handled in stopGeneration
      return;
    }
    showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
    showInputSection();
  } finally {
    urlSubmit.disabled = false;
    currentAbortController = null;
  }
}
622
+
623
/**
 * Handle text submission and TTS conversion.
 * Validates length, streams progress via SSE, and starts playback.
 * @param {string} text - The text to process.
 */
async function handleText(text) {
  text = text.trim();

  if (!text) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter some text', "error");
    return;
  }

  if (text.length > 500000) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Text too long (max 500,000 characters)', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Processing text...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;
  textSubmit.disabled = true;

  // Abort controller lets the Stop button cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-text-stream", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        text,
        language: getSelectedLanguage()
      }),
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      // Fix: the error body is usually JSON with a "detail" field, but a
      // proxy or crash can return non-JSON; don't let response.json()
      // throw a confusing secondary error.
      let detail = "Failed to process text";
      try {
        const error = await response.json();
        if (error.detail) {
          detail = error.detail;
        }
      } catch {
        // Keep the generic message.
      }
      throw new Error(detail);
    }

    // Process stream handles both progress SSE and starting audio playback
    await processStream(response, "Pasted Text");
  } catch (error) {
    if (error.name === "AbortError") {
      // User cancelled - already handled in stopGeneration
      return;
    }
    showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
    showInputSection();
  } finally {
    textSubmit.disabled = false;
    currentAbortController = null;
  }
}
682
+
683
// --- Tab switching -------------------------------------------------------
tabs.forEach((tab) => {
  tab.addEventListener("click", () => {
    tabs.forEach((other) => other.classList.remove("active"));
    tabContents.forEach((content) => content.classList.remove("active"));
    tab.classList.add("active");
    document.getElementById(`${tab.dataset.tab}-tab`).classList.add("active");
  });
});

// --- Drag & drop upload --------------------------------------------------
dropZone.addEventListener("dragover", (event) => {
  event.preventDefault();
  dropZone.classList.add("dragover");
});

dropZone.addEventListener("dragleave", () => {
  dropZone.classList.remove("dragover");
});

dropZone.addEventListener("drop", (event) => {
  event.preventDefault();
  dropZone.classList.remove("dragover");
  const dropped = event.dataTransfer.files;
  if (dropped.length > 0) {
    handleFile(dropped[0]);
  }
});

// Clicking anywhere on the drop zone (except the input or its label)
// opens the file picker.
dropZone.addEventListener("click", (event) => {
  if (event.target !== fileInput && !event.target.classList.contains("file-label")) {
    fileInput.click();
  }
});

fileInput.addEventListener("change", () => {
  if (fileInput.files.length > 0) {
    handleFile(fileInput.files[0]);
  }
});

// --- URL submission (click or Enter) -------------------------------------
urlSubmit.addEventListener("click", () => handleUrl(urlInput.value));
urlInput.addEventListener("keypress", (event) => {
  if (event.key === "Enter") {
    handleUrl(urlInput.value);
  }
});

// --- Text submission (click or Ctrl/Cmd+Enter) ---------------------------
textSubmit.addEventListener("click", () => handleText(textInput.value));
textInput.addEventListener("keydown", (event) => {
  if (event.key === "Enter" && (event.ctrlKey || event.metaKey)) {
    handleText(textInput.value);
  }
});

// --- Generation / playback controls --------------------------------------
stopBtn.addEventListener("click", stopGeneration);
pauseBtn.addEventListener("click", togglePause);
downloadBtn.addEventListener("click", downloadAudio);

// Keep the pause button in sync with the audio element's state.
["play", "pause"].forEach((evt) => audio.addEventListener(evt, updatePauseButton));
audio.addEventListener("ended", () => {
  isPaused = false;
  updatePauseButton();
});

// --- Language selection --------------------------------------------------
languageButtons.forEach((btn) => {
  btn.addEventListener("click", () => {
    languageButtons.forEach((other) => other.classList.remove("active"));
    btn.classList.add("active");
    selectedLanguage = btn.dataset.language;
  });
});
src/talking_snake/static/apple-touch-icon.png ADDED

Git LFS Details

  • SHA256: 989b16d28890ccb0f7448a3b9908ccd8d32f6e21905d5a19911e91c3f20b3321
  • Pointer size: 130 Bytes
  • Size of remote file: 16.2 kB
src/talking_snake/static/favicon.png ADDED

Git LFS Details

  • SHA256: b5e1394d12d7e68102bbc5c840d08e2273813b8c217e7672e5eead49021c699f
  • Pointer size: 131 Bytes
  • Size of remote file: 253 kB
src/talking_snake/static/icon-192.png ADDED

Git LFS Details

  • SHA256: f092b628061742568c93d060f3549b356b4e2d32816f30412960e34da3b18c9e
  • Pointer size: 130 Bytes
  • Size of remote file: 18.4 kB
src/talking_snake/static/icon-512.png ADDED

Git LFS Details

  • SHA256: 8622da6773cd5becce47e2b1e4a2ff5b5eb25d9335a472808cde51dce1f33ddc
  • Pointer size: 131 Bytes
  • Size of remote file: 107 kB
src/talking_snake/static/index.html ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Talking Snake - PDF & Web to Speech</title>
7
+
8
+ <!-- PWA / Mobile App Configuration -->
9
+ <meta name="application-name" content="Talking Snake">
10
+ <meta name="theme-color" content="#1a1a2e">
11
+ <meta name="mobile-web-app-capable" content="yes">
12
+ <link rel="manifest" href="/static/manifest.json">
13
+
14
+ <!-- iOS PWA Configuration -->
15
+ <meta name="apple-mobile-web-app-capable" content="yes">
16
+ <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
17
+ <meta name="apple-mobile-web-app-title" content="Talking Snake">
18
+ <link rel="apple-touch-icon" href="/static/apple-touch-icon.png">
19
+ <link rel="apple-touch-icon" sizes="180x180" href="/static/apple-touch-icon.png">
20
+ <link rel="apple-touch-icon" sizes="152x152" href="/static/apple-touch-icon.png">
21
+ <link rel="apple-touch-icon" sizes="120x120" href="/static/apple-touch-icon.png">
22
+
23
+ <!-- Standard favicon -->
24
+ <link rel="icon" type="image/png" href="/static/favicon.png">
25
+ <link rel="icon" type="image/png" sizes="192x192" href="/static/icon-192.png">
26
+ <link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">
27
+
28
+ <link rel="stylesheet" href="/static/styles.css">
29
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
30
+ <script src="https://unpkg.com/htmx.org@2.0.4"></script>
31
+ </head>
32
+ <body>
33
+ <div class="main-content">
34
+ <img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
35
+ <h1>Talking Snake</h1>
36
+ <p class="subtitle">Transform PDFs & Web into Audio</p>
37
+
38
+ <div class="container">
39
+ <div class="input-section" id="inputSection">
40
+ <div class="options-row">
41
+ <div class="language-selector">
42
+ <span class="style-label">Language:</span>
43
+ <div class="style-buttons" id="languageButtons">
44
+ <button class="style-btn lang-btn active" data-language="english" title="English">
45
+ 🇬🇧
46
+ </button>
47
+ <button class="style-btn lang-btn" data-language="chinese" title="Chinese">
48
+ 🇨🇳
49
+ </button>
50
+ <button class="style-btn lang-btn" data-language="japanese" title="Japanese">
51
+ 🇯🇵
52
+ </button>
53
+ <button class="style-btn lang-btn" data-language="korean" title="Korean">
54
+ 🇰🇷
55
+ </button>
56
+ </div>
57
+ </div>
58
+ </div>
59
+
60
+ <div class="tabs">
61
+ <button class="tab active" data-tab="upload"><i class="fa-solid fa-upload"></i> Upload File</button>
62
+ <button class="tab" data-tab="url"><i class="fa-solid fa-link"></i> From URL</button>
63
+ <button class="tab" data-tab="text"><i class="fa-solid fa-keyboard"></i> Paste Text</button>
64
+ </div>
65
+
66
+ <div class="tab-content active" id="upload-tab">
67
+ <div class="drop-zone" id="dropZone">
68
+ <i class="fa-solid fa-file-pdf drop-icon"></i>
69
+ <p>Drag & drop a PDF here</p>
70
+ <label class="file-label">
71
+ <i class="fa-solid fa-folder-open"></i> Choose File
72
+ <input type="file" id="fileInput" accept=".pdf">
73
+ </label>
74
+ <p class="hint">Supports PDF documents up to 50MB</p>
75
+ </div>
76
+ </div>
77
+
78
+ <div class="tab-content" id="url-tab">
79
+ <div class="url-form">
80
+ <input type="url" id="urlInput" placeholder="https://example.com/article or .pdf">
81
+ <button class="submit-btn" id="urlSubmit"><i class="fa-solid fa-microphone"></i> Read Content</button>
82
+ <p class="hint">Enter a link to a PDF or web page (articles, docs, blogs)</p>
83
+ </div>
84
+ </div>
85
+
86
+ <div class="tab-content" id="text-tab">
87
+ <div class="text-form">
88
+ <textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
89
+ <button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
90
+ <p class="hint">Paste any text you want to hear read aloud</p>
91
+ </div>
92
+ </div>
93
+ </div>
94
+
95
+ <div class="processing-section" id="processingSection">
96
+ <div class="processing-row-1">
97
+ <div class="doc-info" id="docInfo"></div>
98
+ </div>
99
+ <div class="processing-row-2">
100
+ <div class="status" id="status"></div>
101
+ <div class="processing-progress-container" id="processingProgressContainer">
102
+ <div class="processing-progress-bar" id="processingProgressBar"></div>
103
+ </div>
104
+ <div class="control-buttons">
105
+ <button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
106
+ <button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
107
+ </div>
108
+ </div>
109
+ </div>
110
+
111
+ <div class="device-info" id="deviceInfo"></div>
112
+
113
+ <div class="player" id="player">
114
+ <div class="filename" id="filename"></div>
115
+ <div class="custom-player">
116
+ <button class="player-btn play-btn" id="playerPlayBtn" title="Play/Pause">
117
+ <i class="fa-solid fa-play"></i>
118
+ </button>
119
+ <div class="progress-container" id="progressContainer">
120
+ <div class="progress-bar" id="progressBar"></div>
121
+ <input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
122
+ </div>
123
+ <span class="time-display" id="timeDisplay">0:00 / 0:00</span>
124
+ <button class="player-btn volume-btn" id="volumeBtn" title="Mute/Unmute">
125
+ <i class="fa-solid fa-volume-high"></i>
126
+ </button>
127
+ <button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
128
+ <i class="fa-solid fa-download"></i>
129
+ </button>
130
+ </div>
131
+ <audio id="audio" preload="auto"></audio>
132
+ </div>
133
+ </div>
134
+ </div>
135
+
136
+ <footer>
137
+ <p>Built with <i class="fa-solid fa-heart"></i> for listeners everywhere | <a href="https://github.com/LucaCappelletti94/talking-snake" target="_blank" rel="noopener noreferrer"><i class="fa-brands fa-github"></i> GitHub</a></p>
138
+ </footer>
139
+
140
+ <script src="/static/app.js"></script>
141
+ </body>
142
+ </html>
src/talking_snake/static/manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Talking Snake",
3
+ "short_name": "Talking Snake",
4
+ "description": "Transform PDFs & Web into Audio",
5
+ "start_url": "/",
6
+ "display": "standalone",
7
+ "background_color": "#1a1a2e",
8
+ "theme_color": "#1a1a2e",
9
+ "orientation": "portrait-primary",
10
+ "icons": [
11
+ {
12
+ "src": "/static/favicon.png",
13
+ "sizes": "64x64",
14
+ "type": "image/png"
15
+ },
16
+ {
17
+ "src": "/static/icon-192.png",
18
+ "sizes": "192x192",
19
+ "type": "image/png",
20
+ "purpose": "any maskable"
21
+ },
22
+ {
23
+ "src": "/static/icon-512.png",
24
+ "sizes": "512x512",
25
+ "type": "image/png",
26
+ "purpose": "any maskable"
27
+ },
28
+ {
29
+ "src": "/static/apple-touch-icon.png",
30
+ "sizes": "180x180",
31
+ "type": "image/png"
32
+ }
33
+ ],
34
+ "categories": ["utilities", "productivity"],
35
+ "lang": "en"
36
+ }
src/talking_snake/static/sample.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:131aea479804ac10ad86674780fca80134775ef547e808339f66408eb90ffadb
3
+ size 291884
src/talking_snake/static/styles.css ADDED
@@ -0,0 +1,848 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/**
 * Talking Snake - Main Stylesheet
 * A warm, accessible color scheme inspired by the talking snake logo
 */

:root {
  /* Warm, friendly palette inspired by the talking snake logo */
  --bg: #fff7e9; /* Warm cream background */
  --surface: #fff; /* Clean white cards */
  --primary: #d4763a; /* Warm orange - friendly & energetic */
  --primary-hover: #c06830; /* Darker orange for hover */
  --secondary: #5a8f5a; /* Soft green - snake accent */
  --text: #3d3425; /* Warm dark brown - easy on eyes */
  --text-muted: #7a6f5f; /* Muted brown */
  --border: #e5d9c8; /* Warm border */
  --success: #5a8f5a; /* Green for success states */
  --error: #c45a4a; /* Soft red for errors */
}

* {
  box-sizing: border-box;
}

body {
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, sans-serif;
  background: var(--bg);
  color: var(--text);
  min-height: 100vh;
  margin: 0;
  padding: 1.5rem;
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  line-height: 1.4;
}

.main-content {
  display: flex;
  flex-direction: column;
  align-items: center;
  flex: 1;
  justify-content: center;
  width: 100%;
}

h1 {
  font-size: 1.75rem;
  margin: 0 0 0.25rem;
  color: var(--primary);
}

.subtitle {
  color: var(--text-muted);
  margin: 0 0 1rem;
  font-size: 0.9rem;
}

.container {
  max-width: 500px;
  width: 100%;
}

/* Options Row - Style and Language selectors */
.options-row {
  display: flex;
  justify-content: center;
  gap: 1.5rem;
  margin-bottom: 1rem;
  flex-wrap: wrap;
}

/* Style Selector */
.style-selector,
.language-selector {
  display: flex;
  align-items: center;
  gap: 0.5rem;
  flex-wrap: wrap;
}

.style-label {
  font-size: 0.85rem;
  color: var(--text-muted);
}

.style-buttons {
  display: flex;
  gap: 0.35rem;
}

.style-btn {
  width: 38px;
  height: 38px;
  border: 1px solid var(--border);
  border-radius: 6px;
  background: var(--surface);
  color: var(--text-muted);
  cursor: pointer;
  font-size: 0.95rem;
  transition: all 0.15s ease;
  display: flex;
  align-items: center;
  justify-content: center;
}

/* Language buttons use emoji flags */
.style-btn.lang-btn {
  font-size: 1.2rem;
}

.style-btn:hover {
  border-color: var(--primary);
  color: var(--text);
}

.style-btn.active {
  /* FIX: comma-form rgb() with a fourth alpha argument is invalid per the
     CSS Color spec; use the modern slash-alpha syntax instead. */
  background: rgb(212 118 58 / 0.15);
  border-color: var(--primary);
  color: var(--primary);
}

/* Input Section - hidden during processing */
.input-section.hidden {
  display: none;
}

/* Processing Section - two row layout */
.processing-section {
  display: none;
  flex-direction: column;
  gap: 0.75rem;
  padding: 1rem 1.25rem;
  background: var(--surface);
  border-radius: 10px;
  border: 1px solid var(--border);
  width: 100%;
}

.processing-section.visible {
  display: flex;
}

/* Row 1: Document info */
.processing-row-1 {
  display: flex;
  align-items: center;
  width: 100%;
}

/* Row 2: Status, progress, buttons */
.processing-row-2 {
  display: flex;
  align-items: center;
  gap: 0.75rem;
}

/* Document Info - fills first row */
.doc-info {
  display: flex;
  align-items: center;
  gap: 0.75rem;
  font-size: 0.85rem;
  color: var(--text);
  width: 100%;
  min-width: 0;
}

.doc-info:empty {
  display: none;
}

.doc-info .doc-name {
  font-weight: 600;
  display: flex;
  align-items: center;
  gap: 0.4rem;
  flex: 1;
  min-width: 0;
}

.doc-info .doc-name i {
  color: var(--primary);
  flex-shrink: 0;
}

.doc-info .doc-name-text {
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}

.doc-info .doc-pages,
.doc-info .doc-chars {
  color: var(--text-muted);
  font-size: 0.75rem;
  display: flex;
  align-items: center;
  gap: 0.25rem;
  white-space: nowrap;
  flex-shrink: 0;
}

.doc-info .doc-pages i,
.doc-info .doc-chars i {
  font-size: 0.7rem;
  opacity: 0.6;
}

/* Status in processing */
.processing-section .status {
  padding: 0;
  background: none;
  font-size: 0.8rem;
  white-space: nowrap;
  flex-shrink: 0;
}

/* Processing progress bar */
.processing-progress-container {
  flex: 1;
  height: 6px;
  background: var(--bg);
  border-radius: 3px;
  overflow: hidden;
  min-width: 60px;
}

.processing-progress-bar {
  height: 100%;
  background: linear-gradient(90deg, var(--primary) 0%, #c06030 100%);
  border-radius: 3px;
  width: 0%;
  transition: width 0.3s ease;
}

/* Control buttons row */
.control-buttons {
  display: flex;
  gap: 0.5rem;
  flex-shrink: 0;
}

.control-btn {
  width: 36px;
  height: 36px;
  padding: 0;
  color: white;
  border: none;
  border-radius: 8px;
  cursor: pointer;
  font-size: 0.9rem;
  transition: all 0.15s ease;
  display: flex;
  align-items: center;
  justify-content: center;
}

.control-btn.hidden {
  display: none;
}

.control-btn:hover {
  filter: brightness(1.1);
}

.pause-btn {
  background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
}

.pause-btn:hover {
  animation: gradient-shift 0.8s ease infinite;
}

.stop-btn {
  background: linear-gradient(135deg, var(--error), #8b3a30, var(--error));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
}

.stop-btn:hover {
  animation: gradient-shift 0.8s ease infinite;
}

@keyframes gradient-idle {
  0%, 100% { background-position: 0% 50%; }
  50% { background-position: 100% 50%; }
}

@keyframes gradient-shift {
  0% { background-position: 0% 50%; }
  50% { background-position: 100% 50%; }
  100% { background-position: 0% 50%; }
}

/* Drop Zone */
.drop-zone {
  border: 2px dashed var(--border);
  border-radius: 8px;
  padding: 1.5rem 1rem;
  text-align: center;
  transition: all 0.2s ease;
  cursor: pointer;
  background: var(--surface);
}

.drop-zone:hover,
.drop-zone.dragover {
  border-color: var(--primary);
  /* FIX: slash-alpha syntax (was invalid comma-form rgb() with 4 args) */
  background: rgb(212 118 58 / 0.08);
}

.drop-zone p {
  margin: 0 0 0.75rem;
  font-size: 0.95rem;
}

.drop-zone .hint {
  color: var(--text-muted);
  font-size: 0.8rem;
}

.drop-icon {
  font-size: 2.5rem;
  color: var(--primary);
  margin-bottom: 0.75rem;
  display: block;
}

/* Tabs */
.tabs {
  display: flex;
  gap: 0.25rem;
  margin-bottom: 0.75rem;
}

.tab {
  flex: 1;
  padding: 0.5rem 0.75rem;
  background: var(--surface);
  border: 1px solid var(--border);
  border-radius: 6px;
  color: var(--text-muted);
  cursor: pointer;
  font-size: 0.85rem;
  transition: all 0.15s ease;
}

.tab:hover {
  border-color: var(--primary);
  color: var(--text);
}

.tab.active {
  /* FIX: slash-alpha syntax (was invalid comma-form rgb() with 4 args) */
  background: rgb(212 118 58 / 0.12);
  border-color: var(--primary);
  color: var(--primary);
}

.tab-content {
  display: none;
}

.tab-content.active {
  display: block;
}

/* URL Form */
.url-form {
  background: var(--surface);
  border-radius: 8px;
  padding: 1rem;
}

.url-form input[type="url"] {
  width: 100%;
  padding: 0.6rem 0.75rem;
  background: var(--bg);
  border: 1px solid var(--border);
  border-radius: 6px;
  color: var(--text);
  font-size: 0.9rem;
  margin-bottom: 0.75rem;
  transition: border-color 0.15s ease;
}

.url-form input[type="url"]:focus {
  outline: none;
  border-color: var(--primary);
}

.url-form input[type="url"]::placeholder {
  color: var(--text-muted);
}

/* Text Form */
.text-form {
  background: var(--surface);
  border-radius: 8px;
  padding: 1rem;
}

.text-form textarea {
  width: 100%;
  padding: 0.6rem 0.75rem;
  background: var(--bg);
  border: 1px solid var(--border);
  border-radius: 6px;
  color: var(--text);
  font-size: 0.9rem;
  margin-bottom: 0.75rem;
  transition: border-color 0.15s ease;
  resize: vertical;
  min-height: 120px;
  font-family: inherit;
  line-height: 1.5;
}

.text-form textarea:focus {
  outline: none;
  border-color: var(--primary);
}

.text-form textarea::placeholder {
  color: var(--text-muted);
}

.text-form .hint {
  color: var(--text-muted);
  font-size: 0.8rem;
  text-align: center;
  margin: 0;
}

/* Buttons */
.submit-btn {
  width: 100%;
  padding: 0.6rem 1rem;
  background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: white;
  border: none;
  border-radius: 8px;
  cursor: pointer;
  font-size: 0.9rem;
  font-weight: 500;
  transition: filter 0.15s ease;
  margin-bottom: 0.5rem;
}

.submit-btn:hover {
  filter: brightness(1.1);
  animation: gradient-shift 0.8s ease infinite;
}

.submit-btn:disabled {
  opacity: 0.6;
  cursor: not-allowed;
  filter: none;
  animation: none;
}

.url-form .hint {
  color: var(--text-muted);
  font-size: 0.8rem;
  text-align: center;
  margin: 0;
}

input[type="file"] {
  display: none;
}

.file-label {
  display: inline-block;
  padding: 0.5rem 1rem;
  background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: white;
  border-radius: 8px;
  cursor: pointer;
  font-weight: 500;
  font-size: 0.9rem;
  transition: filter 0.15s ease;
}

.file-label:hover {
  filter: brightness(1.1);
  animation: gradient-shift 0.8s ease infinite;
}

/* Device Info - Subtle footer-like display */
.device-info {
  display: none;
  justify-content: center;
  align-items: center;
  gap: 1rem;
  padding: 0.75rem 1rem;
  font-size: 0.7rem;
  color: var(--text-muted);
  margin-top: 0.5rem;
  opacity: 0.7;
}

.device-info.visible {
  display: flex;
  flex-wrap: wrap;
}

.device-info i {
  color: var(--primary);
  opacity: 0.8;
}

.device-memory {
  opacity: 0.9;
}

.device-batch {
  background: var(--surface);
  padding: 0.2rem 0.5rem;
  border-radius: 4px;
  font-size: 0.65rem;
}

/* Icon spacing in buttons and tabs */
.tab i,
.submit-btn i,
.file-label i {
  margin-right: 0.4rem;
}

/* Status Messages */
.status {
  font-size: 0.85rem;
  display: none;
}

.status.visible {
  display: block;
}

.status i {
  margin-right: 0.4rem;
}

.status.loading {
  color: var(--text-muted);
}

.status.error {
  color: var(--error);
}

.status.success {
  color: var(--success);
}

/* Audio Player */
.player {
  margin-top: 1.5rem;
  width: 100%;
  display: none;
  padding: 1.25rem;
  background: var(--surface);
  border-radius: 12px;
  border: 1px solid var(--border);
}

.player.visible {
  display: block;
}

/* Hidden audio element */
#audio {
  display: none;
}

/* Custom Audio Player */
.custom-player {
  display: flex;
  align-items: center;
  gap: 0.75rem;
}

.player-btn {
  width: 36px;
  height: 36px;
  border: none;
  border-radius: 8px;
  background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: white;
  cursor: pointer;
  display: flex;
  align-items: center;
  justify-content: center;
  font-size: 0.85rem;
  transition: filter 0.15s ease;
  flex-shrink: 0;
}

.player-btn:hover {
  filter: brightness(1.1);
  animation: gradient-shift 0.8s ease infinite;
}

.player-btn.volume-btn {
  /* NOTE(review): #252540 is a dark navy blended with the light cream --bg;
     this looks like a leftover from a dark theme — confirm intended look. */
  background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: var(--text-muted);
  width: 32px;
  height: 32px;
  font-size: 0.8rem;
}

.player-btn.volume-btn:hover {
  color: var(--text);
  animation: gradient-shift 0.8s ease infinite;
}

.player-btn.download-btn {
  /* NOTE(review): same #252540 dark-theme leftover as .volume-btn above. */
  background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
  background-size: 200% 200%;
  animation: gradient-idle 3s ease infinite;
  color: var(--text-muted);
  width: 32px;
  height: 32px;
  font-size: 0.8rem;
}

.player-btn.download-btn:hover {
  color: var(--primary);
  animation: gradient-shift 0.8s ease infinite;
}

.progress-container {
  flex: 1;
  height: 6px;
  background: var(--bg);
  border-radius: 3px;
  position: relative;
  cursor: pointer;
}

.progress-bar {
  height: 100%;
  background: var(--primary);
  border-radius: 3px;
  width: 0%;
  transition: width 0.1s ease;
  pointer-events: none;
}

.progress-slider {
  position: absolute;
  top: 0;
  left: 0;
  width: 100%;
  height: 100%;
  opacity: 0;
  cursor: pointer;
  margin: 0;
  appearance: none;
}

.progress-slider::-webkit-slider-thumb {
  appearance: none;
  width: 14px;
  height: 14px;
  background: var(--primary);
  border-radius: 50%;
  cursor: pointer;
}

.progress-slider::-moz-range-thumb {
  width: 14px;
  height: 14px;
  background: var(--primary);
  border-radius: 50%;
  cursor: pointer;
  border: none;
}

.time-display {
  font-size: 0.75rem;
  color: var(--text-muted);
  min-width: 80px;
  text-align: center;
  font-variant-numeric: tabular-nums;
}

.filename {
  margin-bottom: 0.75rem;
  font-size: 0.85rem;
  font-weight: 500;
  color: var(--text);
  word-break: break-all;
}

/* Spinner Animation */
.spinner {
  display: inline-block;
  width: 14px;
  height: 14px;
  border: 2px solid var(--text-muted);
  border-top-color: var(--primary);
  border-radius: 50%;
  animation: spin 1s linear infinite;
  margin-right: 0.4rem;
  vertical-align: middle;
}

@keyframes spin {
  to {
    transform: rotate(360deg);
  }
}

/* Footer */
footer {
  margin-top: auto;
  padding-top: 1.5rem;
  color: var(--text-muted);
  font-size: 0.75rem;
  flex-shrink: 0;
}

footer a {
  color: var(--primary);
  text-decoration: none;
}

footer a:hover {
  text-decoration: underline;
}

footer i.fa-heart {
  color: var(--error);
}

footer i.fa-github {
  margin-right: 0.2rem;
}

/* Logo */
.logo {
  width: 250px;
  height: auto;
  margin-bottom: 0.75rem;
}

/* Tablet styles */
@media (width <= 768px) {
  body {
    padding: 1rem;
  }

  h1 {
    font-size: 1.5rem;
  }

  .logo {
    width: 200px;
  }

  .drop-zone {
    padding: 1.25rem 1rem;
  }

  .tabs {
    flex-direction: column;
  }

  .tab {
    width: 100%;
  }
}

/* Mobile styles */
@media (width <= 480px) {
  body {
    padding: 0.75rem;
  }

  h1 {
    font-size: 1.35rem;
  }

  .subtitle {
    font-size: 0.8rem;
  }

  .logo {
    width: 160px;
  }

  .drop-zone {
    padding: 1rem;
  }

  .drop-zone p {
    font-size: 0.9rem;
  }

  .url-form {
    padding: 0.75rem;
  }

  .url-form input[type="url"] {
    padding: 0.5rem;
    font-size: 0.85rem;
  }

  .submit-btn,
  .file-label {
    padding: 0.5rem 0.75rem;
    font-size: 0.85rem;
  }

  .filename {
    font-size: 0.75rem;
    padding: 0.4rem 0.5rem;
  }

  footer {
    font-size: 0.7rem;
    text-align: center;
  }
}

/* Ensure touch targets are large enough */
@media (pointer: coarse) {
  .tab,
  .submit-btn,
  .file-label {
    min-height: 44px;
    display: flex;
    align-items: center;
    justify-content: center;
  }
}
src/talking_snake/static/talking_snake.png ADDED

Git LFS Details

  • SHA256: 2fcd44f86e8dd2a1a7e04e3275cf46f94a7f16d24c0bae0e7debab29e1aa6305
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
src/talking_snake/tts.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TTS engine wrapper for Qwen3-TTS."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import wave
7
+ from abc import ABC, abstractmethod
8
+ from collections.abc import Iterator
9
+ from typing import TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ import numpy as np
13
+ import numpy.typing as npt
14
+
15
+
16
class TTSEngineProtocol(ABC):
    """Abstract base class describing the TTS engine contract.

    Implementations can be injected into callers, which also makes
    mocking straightforward in tests.

    NOTE(review): despite the name, this is an ``abc.ABC`` rather than a
    ``typing.Protocol``.
    """

    @abstractmethod
    def synthesize(self, text: str) -> Iterator[bytes]:
        """Convert ``text`` into audio.

        Args:
            text: Text to synthesize.

        Yields:
            WAV audio data chunks.
        """
        ...

    @property
    @abstractmethod
    def sample_rate(self) -> int:
        """Sample rate (Hz) of the audio this engine produces."""
        ...

    @property
    def batch_size(self) -> int:
        """Number of chunks processed in parallel; subclasses may override (default: 1)."""
        return 1
41
+
42
+
43
# Professional narration style prompt.
# Passed as the `instruct` argument to the model; it asks for a clear,
# audiobook-like delivery rather than casual speech.
PROFESSIONAL_STYLE = (
    "Read this as a professional narrator with clear enunciation, "
    "measured pacing, and an authoritative yet warm tone. "
    "Speak naturally as if presenting an audiobook or documentary. "
    "Avoid sounding robotic or monotone. Emphasize key points and maintain a steady rhythm. "
    "Use appropriate intonation to convey meaning and keep the listener engaged. "
    "This is not casual conversation, but a polished narration style. "
    "Use proper diction, read correctly acronyms, and pronounce all words clearly."
)

# Language (lowercase) -> default voice name, used when the caller does not
# select an explicit voice. Unknown languages fall back to "Ryan" at the
# lookup site in QwenTTSEngine.__init__.
LANGUAGE_VOICES: dict[str, str] = {
    "english": "Ryan",
    "chinese": "Vivian",
    "japanese": "Ono_Anna",
    "korean": "Sohee",
}

# Default chunk size (characters) for streaming synthesis.
# Larger chunks = more stable voice, fewer artifacts at chunk boundaries.
# Smaller chunks = faster first audio but potential voice instability.
# 1200 chars provides a good balance for natural speech flow.
DEFAULT_CHUNK_SIZE = 1200
68
+
69
+
70
class QwenTTSEngine(TTSEngineProtocol):
    """TTS engine using the Qwen3-TTS model.

    Lazily imports heavyweight dependencies (torch, qwen_tts) inside
    methods so the module can be imported without them installed.
    """

    # Available voices for CustomVoice model:
    # Chinese: Vivian, Serena, Uncle_Fu, Dylan (Beijing), Eric (Sichuan)
    # English: Ryan, Aiden
    # Japanese: Ono_Anna
    # Korean: Sohee
    AVAILABLE_VOICES = [
        "Vivian",
        "Serena",
        "Uncle_Fu",
        "Dylan",
        "Eric",
        "Ryan",
        "Aiden",
        "Ono_Anna",
        "Sohee",
    ]

    def __init__(
        self,
        voice: str | None = None,
        language: str = "english",
        device: str = "cuda",
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        model_name: str = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    ) -> None:
        """Initialize the TTS engine.

        Args:
            voice: Voice name to use for synthesis. If None, uses default for language.
                Available voices:
                    Chinese: Vivian, Serena, Uncle_Fu, Dylan, Eric
                    English: Ryan, Aiden
                    Japanese: Ono_Anna
                    Korean: Sohee
            language: Language for TTS. One of: english, chinese, japanese, korean.
                Sets default voice if voice is None.
            device: Device to run the model on ('cuda' or 'cpu').
            chunk_size: Maximum characters per chunk (smaller = faster streaming start).
            model_name: HuggingFace model identifier.
        """
        import logging
        import warnings

        import torch
        from qwen_tts import Qwen3TTSModel

        # Suppress the pad_token_id warning from transformers
        logging.getLogger("transformers.generation.utils").setLevel(logging.ERROR)
        warnings.filterwarnings("ignore", message=".*pad_token_id.*")

        self.language = language.lower()
        # Unknown languages fall back to the English default voice.
        self.voice = voice or LANGUAGE_VOICES.get(self.language, "Ryan")
        self.device = device
        self.chunk_size = chunk_size
        # NOTE(review): sample rate is hard-coded to 24 kHz here but
        # synthesize() uses the rate returned by the model — confirm they match.
        self._sample_rate = 24000
        self._batch_size = 1  # Will be calculated after model loads

        # Determine dtype based on device
        dtype = torch.bfloat16 if device == "cuda" else torch.float32

        # Try to use flash attention on CUDA
        attn_impl = "flash_attention_2" if device == "cuda" else "eager"

        try:
            self.model = Qwen3TTSModel.from_pretrained(
                model_name,
                device_map=device,
                dtype=dtype,
                attn_implementation=attn_impl,
            )
        except Exception:
            # Fallback without flash attention
            self.model = Qwen3TTSModel.from_pretrained(
                model_name,
                device_map=device,
                dtype=dtype,
            )

        # Calculate optimal batch size based on available VRAM
        if device == "cuda":
            self._batch_size = self._calculate_batch_size()
            print(f" Batch size: {self._batch_size} (based on available VRAM)")

    def _calculate_batch_size(self) -> int:
        """Calculate optimal batch size based on available GPU memory.

        Returns:
            Recommended batch size for parallel chunk processing.
        """
        import torch

        if not torch.cuda.is_available():
            return 1

        try:
            # Get GPU memory info
            # NOTE(review): always queries device index 0; if self.device is
            # e.g. "cuda:1" this reads the wrong card — confirm.
            gpu_mem = torch.cuda.get_device_properties(0).total_memory
            allocated = torch.cuda.memory_allocated(0)
            reserved = torch.cuda.memory_reserved(0)

            # Available memory (conservative estimate)
            available = gpu_mem - max(allocated, reserved)

            # Model uses ~6GB, each batch item needs ~2-3GB for generation
            # Use conservative 3GB per batch item estimate
            mem_per_batch = 3 * 1024 * 1024 * 1024  # 3GB

            # Calculate batch size, minimum 1, cap at 8
            batch_size = max(1, min(8, int(available / mem_per_batch)))

            return batch_size
        except Exception:
            # Best-effort: any CUDA introspection failure falls back to serial.
            return 1

    @property
    def sample_rate(self) -> int:
        """Return the sample rate of generated audio."""
        return self._sample_rate

    @property
    def batch_size(self) -> int:
        """Return the current batch size."""
        return self._batch_size

    def synthesize(self, text: str) -> Iterator[bytes]:
        """Synthesize text to WAV audio using batched GPU inference.

        Args:
            text: Text to synthesize.

        Yields:
            WAV audio data chunks.
        """
        if not text.strip():
            return

        # Split text into chunks for streaming
        chunks = self._split_text(text)

        # First chunk includes WAV header.
        # NOTE(review): the header's declared data length covers only the first
        # chunk; subsequent yields are raw PCM. Consumers must tolerate a WAV
        # stream whose RIFF sizes understate the payload — confirm with callers.
        first_chunk = True

        # Process chunks in batches for GPU efficiency
        batch_size = self._batch_size

        for i in range(0, len(chunks), batch_size):
            batch = chunks[i : i + batch_size]

            # Filter empty chunks
            batch = [c for c in batch if c.strip()]
            if not batch:
                continue

            # Always use batched call for consistent GPU memory allocation
            # Use professional narration style for clear, authoritative delivery
            # NOTE(review): scalar vs. list arguments assume generate_custom_voice
            # accepts both forms symmetrically — verify against qwen_tts API.
            batch_instruct = (
                [PROFESSIONAL_STYLE] * len(batch) if len(batch) > 1 else PROFESSIONAL_STYLE
            )
            audios, sr = self.model.generate_custom_voice(
                text=batch if len(batch) > 1 else batch[0],
                speaker=[self.voice] * len(batch) if len(batch) > 1 else self.voice,
                instruct=batch_instruct,
                # Use lower temperature for more stable, consistent voice
                temperature=0.7,
                repetition_penalty=1.1,
            )

            # Ensure audios is a list for consistent iteration
            if len(batch) == 1:
                audios = [audios]

            # Yield each audio chunk in order
            for audio in audios:
                wav_bytes = self._audio_to_wav(audio, sr, include_header=first_chunk)
                first_chunk = False
                yield wav_bytes

    def _split_text(self, text: str, max_chars: int | None = None) -> list[str]:
        """Split text into chunks suitable for TTS.

        Splits on sentence boundaries when possible.

        Args:
            text: Text to split.
            max_chars: Maximum characters per chunk. Uses self.chunk_size if None.

        Returns:
            List of text chunks.
        """
        import re

        if max_chars is None:
            max_chars = self.chunk_size

        # Split on sentence boundaries (after ., ! or ? followed by whitespace)
        sentences = re.split(r"(?<=[.!?])\s+", text)

        chunks: list[str] = []
        current_chunk: list[str] = []
        current_length = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Flush the current chunk before it would exceed max_chars.
            # NOTE(review): a single sentence longer than max_chars is kept
            # intact, so chunks can exceed the limit — confirm acceptable.
            if current_length + len(sentence) > max_chars and current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            current_chunk.append(sentence)
            current_length += len(sentence) + 1  # +1 for the joining space

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def _audio_to_wav(
        self,
        audio: npt.NDArray[np.float32] | list[float],
        sample_rate: int,
        include_header: bool = True,
    ) -> bytes:
        """Convert audio array to WAV bytes.

        Args:
            audio: Audio data as numpy array or list.
            sample_rate: Sample rate of the audio.
            include_header: Whether to include WAV header.

        Returns:
            WAV audio data as bytes (or raw 16-bit PCM when include_header=False).
        """
        import numpy as np

        # Convert to numpy array if needed
        if isinstance(audio, list):
            audio = np.array(audio, dtype=np.float32)

        # Ensure audio is 1D (mono)
        if audio.ndim > 1:
            audio = audio.flatten()

        # Clip to [-1, 1] and convert to 16-bit PCM
        audio = np.clip(audio, -1.0, 1.0)
        audio_int16 = (audio * 32767).astype(np.int16)

        if include_header:
            # Write full WAV file (mono, 16-bit)
            buffer = io.BytesIO()
            with wave.open(buffer, "wb") as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)  # 16-bit
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(audio_int16.tobytes())
            result: bytes = buffer.getvalue()
            return result
        else:
            # Return raw PCM data (headerless continuation of the stream)
            pcm_data: bytes = audio_int16.tobytes()
            return pcm_data
336
+
337
+
338
class MockTTSEngine(TTSEngineProtocol):
    """Mock TTS engine that emits silence, for use in tests."""

    def __init__(self, sample_rate: int = 24000) -> None:
        """Create a mock engine.

        Args:
            sample_rate: Sample rate (Hz) of the silent audio it emits.
        """
        self._sample_rate = sample_rate

    @property
    def sample_rate(self) -> int:
        """Sample rate (Hz) of the generated audio."""
        return self._sample_rate

    def synthesize(self, text: str) -> Iterator[bytes]:
        """Yield one silent WAV whose length scales with the word count.

        Args:
            text: Text to "synthesize"; only its word count matters.

        Yields:
            A single complete WAV payload containing silence.
        """
        if not text.strip():
            return

        # ~0.1 seconds of silence per whitespace-separated word.
        word_count = len(text.split())
        frame_count = int(self._sample_rate * 0.1 * max(1, word_count))

        # 16-bit mono PCM silence: two zero bytes per frame.
        pcm = b"\x00\x00" * frame_count

        out = io.BytesIO()
        with wave.open(out, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(self._sample_rate)
            wav_file.writeframes(pcm)

        yield out.getvalue()
uv.lock ADDED
The diff for this file is too large to render. See raw diff