Spaces:
Sleeping
Sleeping
Commit ·
19933fe
1
Parent(s): 618f5ab
Initial deployment with LFS
Browse files- .gitattributes +2 -0
- Dockerfile +31 -0
- README.md +10 -5
- pyproject.toml +111 -0
- src/talking_snake/__init__.py +3 -0
- src/talking_snake/__main__.py +103 -0
- src/talking_snake/__pycache__/__init__.cpython-312.pyc +0 -0
- src/talking_snake/__pycache__/__main__.cpython-312.pyc +0 -0
- src/talking_snake/__pycache__/app.cpython-312.pyc +0 -0
- src/talking_snake/__pycache__/extract.cpython-312.pyc +0 -0
- src/talking_snake/__pycache__/tts.cpython-312.pyc +0 -0
- src/talking_snake/app.py +935 -0
- src/talking_snake/extract.py +489 -0
- src/talking_snake/static/app.js +773 -0
- src/talking_snake/static/apple-touch-icon.png +3 -0
- src/talking_snake/static/favicon.png +3 -0
- src/talking_snake/static/icon-192.png +3 -0
- src/talking_snake/static/icon-512.png +3 -0
- src/talking_snake/static/index.html +142 -0
- src/talking_snake/static/manifest.json +36 -0
- src/talking_snake/static/sample.wav +3 -0
- src/talking_snake/static/styles.css +848 -0
- src/talking_snake/static/talking_snake.png +3 -0
- src/talking_snake/tts.py +381 -0
- uv.lock +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.12-slim

# Install system dependencies (sox is required by qwen-tts for audio processing).
# --no-install-recommends avoids pulling optional packages; the apt lists are
# removed in the same layer so they never land in the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    sox \
    libsox-dev \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install uv; skip pip's download cache to keep the layer small
RUN pip install --no-cache-dir uv

# Create non-root user for HF Spaces
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy project files
COPY --chown=user . .

# Install dependencies (production only, no dev extras)
RUN uv sync --no-dev

# Expose port 7860 (HF Spaces default)
EXPOSE 7860

# Run the app
CMD ["uv", "run", "talking-snake", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
---
|
| 2 |
title: Talking Snake
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
-
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Talking Snake
|
| 3 |
+
emoji: 🐍
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
+
app_port: 7860
|
| 10 |
+
suggested_hardware: l4x1
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Talking Snake
|
| 14 |
+
|
| 15 |
+
PDF and web page to speech using Qwen3-TTS.
|
| 16 |
+
|
| 17 |
+
Click "Duplicate this Space" to deploy your own instance (L4 or A100 recommended for speed).
|
pyproject.toml
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "talking-snake"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Just a talking snake that reads PDFs and web pages aloud."
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
license = { text = "MIT" }
|
| 7 |
+
requires-python = ">=3.11"
|
| 8 |
+
authors = [{ name = "Luca" }]
|
| 9 |
+
keywords = ["tts", "pdf", "speech", "audiobook", "text-to-speech", "listening"]
|
| 10 |
+
classifiers = [
|
| 11 |
+
"Development Status :: 3 - Alpha",
|
| 12 |
+
"Intended Audience :: End Users/Desktop",
|
| 13 |
+
"License :: OSI Approved :: MIT License",
|
| 14 |
+
"Operating System :: POSIX :: Linux",
|
| 15 |
+
"Programming Language :: Python :: 3.11",
|
| 16 |
+
"Programming Language :: Python :: 3.12",
|
| 17 |
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
# System dependencies (not installable via pip):
|
| 21 |
+
# - sox: Audio processing tool required by qwen-tts
|
| 22 |
+
# Ubuntu/Debian: sudo apt-get install sox libsox-dev
|
| 23 |
+
# macOS: brew install sox
|
| 24 |
+
# Fedora: sudo dnf install sox sox-devel
|
| 25 |
+
|
| 26 |
+
dependencies = [
|
| 27 |
+
"fastapi>=0.115.0",
|
| 28 |
+
"uvicorn[standard]>=0.32.0",
|
| 29 |
+
"qwen-tts>=0.1.1",
|
| 30 |
+
"torch>=2.5.0",
|
| 31 |
+
"pdfminer.six>=20260107",
|
| 32 |
+
"python-multipart>=0.0.12",
|
| 33 |
+
"jinja2>=3.1.4",
|
| 34 |
+
"httpx>=0.27.0",
|
| 35 |
+
"trafilatura>=2.0.0",
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
[project.optional-dependencies]
|
| 39 |
+
dev = [
|
| 40 |
+
"pytest>=8.3.0",
|
| 41 |
+
"pytest-asyncio>=0.24.0",
|
| 42 |
+
"pytest-cov>=6.0.0",
|
| 43 |
+
"httpx>=0.27.0",
|
| 44 |
+
"ruff>=0.8.0",
|
| 45 |
+
"mypy>=1.14.0",
|
| 46 |
+
"pre-commit>=4.0.0",
|
| 47 |
+
]
|
| 48 |
+
# Flash Attention for ~2x faster inference (requires CUDA 11.6+)
|
| 49 |
+
# Install separately: pip install flash-attn --no-build-isolation
|
| 50 |
+
fast = [
|
| 51 |
+
"flash-attn>=2.5.0",
|
| 52 |
+
]
|
| 53 |
+
|
| 54 |
+
[project.scripts]
|
| 55 |
+
talking-snake = "talking_snake.__main__:main"
|
| 56 |
+
|
| 57 |
+
[build-system]
|
| 58 |
+
requires = ["hatchling"]
|
| 59 |
+
build-backend = "hatchling.build"
|
| 60 |
+
|
| 61 |
+
[tool.hatch.build.targets.wheel]
|
| 62 |
+
packages = ["src/talking_snake"]
|
| 63 |
+
|
| 64 |
+
[tool.pytest.ini_options]
|
| 65 |
+
asyncio_mode = "auto"
|
| 66 |
+
testpaths = ["tests"]
|
| 67 |
+
markers = [
|
| 68 |
+
"slow: marks tests as slow (run with --run-slow)",
|
| 69 |
+
]
|
| 70 |
+
|
| 71 |
+
[tool.ruff]
|
| 72 |
+
line-length = 100
|
| 73 |
+
target-version = "py311"
|
| 74 |
+
|
| 75 |
+
[tool.ruff.lint]
|
| 76 |
+
select = ["E", "F", "I", "N", "W", "UP"]
|
| 77 |
+
|
| 78 |
+
[tool.ruff.lint.per-file-ignores]
|
| 79 |
+
# PDF xref tables require trailing whitespace per spec
|
| 80 |
+
"tests/conftest.py" = ["W291"]
|
| 81 |
+
|
| 82 |
+
[tool.coverage.run]
|
| 83 |
+
source = ["src/talking_snake"]
|
| 84 |
+
branch = true
|
| 85 |
+
omit = [
|
| 86 |
+
"*/tests/*",
|
| 87 |
+
"*/__main__.py",
|
| 88 |
+
]
|
| 89 |
+
|
| 90 |
+
[tool.coverage.report]
|
| 91 |
+
exclude_lines = [
|
| 92 |
+
"pragma: no cover",
|
| 93 |
+
"if TYPE_CHECKING:",
|
| 94 |
+
"raise NotImplementedError",
|
| 95 |
+
"if __name__ == .__main__.:",
|
| 96 |
+
"class QwenTTSEngine",
|
| 97 |
+
"def _audio_to_wav",
|
| 98 |
+
"def _split_text",
|
| 99 |
+
"import torch",
|
| 100 |
+
"from qwen_tts",
|
| 101 |
+
]
|
| 102 |
+
show_missing = true
|
| 103 |
+
skip_covered = true
|
| 104 |
+
fail_under = 70
|
| 105 |
+
|
| 106 |
+
[tool.mypy]
|
| 107 |
+
python_version = "3.11"
|
| 108 |
+
warn_return_any = true
|
| 109 |
+
warn_unused_configs = true
|
| 110 |
+
disallow_untyped_defs = true
|
| 111 |
+
ignore_missing_imports = true
|
src/talking_snake/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF-to-Speech web server using Qwen3-TTS - listen to any content."""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
src/talking_snake/__main__.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI entry point for the Reader server."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def main() -> int:
|
| 10 |
+
"""Main entry point for the Reader CLI.
|
| 11 |
+
|
| 12 |
+
Returns:
|
| 13 |
+
Exit code (0 for success).
|
| 14 |
+
"""
|
| 15 |
+
parser = argparse.ArgumentParser(
|
| 16 |
+
prog="reader",
|
| 17 |
+
description="PDF-to-Speech web server - listen to any content",
|
| 18 |
+
)
|
| 19 |
+
parser.add_argument(
|
| 20 |
+
"--voice",
|
| 21 |
+
type=str,
|
| 22 |
+
default=None,
|
| 23 |
+
help="Voice name for TTS. Options: Vivian, Serena, Uncle_Fu, Dylan, Eric, "
|
| 24 |
+
"Ryan, Aiden, Ono_Anna, Sohee (default: auto based on language)",
|
| 25 |
+
)
|
| 26 |
+
parser.add_argument(
|
| 27 |
+
"--language",
|
| 28 |
+
type=str,
|
| 29 |
+
default="english",
|
| 30 |
+
choices=["english", "chinese", "japanese", "korean"],
|
| 31 |
+
help="Language for TTS (default: english). Sets default voice if --voice not specified.",
|
| 32 |
+
)
|
| 33 |
+
parser.add_argument(
|
| 34 |
+
"--host",
|
| 35 |
+
type=str,
|
| 36 |
+
default="0.0.0.0",
|
| 37 |
+
help="Host to bind the server to (default: 0.0.0.0)",
|
| 38 |
+
)
|
| 39 |
+
parser.add_argument(
|
| 40 |
+
"--port",
|
| 41 |
+
type=int,
|
| 42 |
+
default=8000,
|
| 43 |
+
help="Port to bind the server to (default: 8000)",
|
| 44 |
+
)
|
| 45 |
+
parser.add_argument(
|
| 46 |
+
"--device",
|
| 47 |
+
type=str,
|
| 48 |
+
default="cuda",
|
| 49 |
+
choices=["cuda", "cpu"],
|
| 50 |
+
help="Device to run the TTS model on (default: cuda)",
|
| 51 |
+
)
|
| 52 |
+
parser.add_argument(
|
| 53 |
+
"--reload",
|
| 54 |
+
action="store_true",
|
| 55 |
+
help="Enable auto-reload for development",
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
args = parser.parse_args()
|
| 59 |
+
|
| 60 |
+
print("🚀 Starting Reader server...")
|
| 61 |
+
print(f" Language: {args.language}")
|
| 62 |
+
print(f" Voice: {args.voice or 'auto'}")
|
| 63 |
+
print(f" Device: {args.device}")
|
| 64 |
+
print(f" URL: http://{args.host}:{args.port}")
|
| 65 |
+
print()
|
| 66 |
+
|
| 67 |
+
# Import here to avoid slow startup for --help
|
| 68 |
+
import uvicorn
|
| 69 |
+
|
| 70 |
+
from talking_snake.app import create_app
|
| 71 |
+
from talking_snake.tts import QwenTTSEngine
|
| 72 |
+
|
| 73 |
+
# Initialize TTS engine
|
| 74 |
+
print("📦 Loading TTS model (this may take a moment)...")
|
| 75 |
+
try:
|
| 76 |
+
tts_engine = QwenTTSEngine(
|
| 77 |
+
voice=args.voice,
|
| 78 |
+
language=args.language,
|
| 79 |
+
device=args.device,
|
| 80 |
+
)
|
| 81 |
+
except Exception as e:
|
| 82 |
+
print(f"❌ Failed to load TTS model: {e}", file=sys.stderr)
|
| 83 |
+
return 1
|
| 84 |
+
|
| 85 |
+
print("✅ TTS model loaded!")
|
| 86 |
+
print()
|
| 87 |
+
|
| 88 |
+
# Create app with engine
|
| 89 |
+
app = create_app(tts_engine=tts_engine)
|
| 90 |
+
|
| 91 |
+
# Run server
|
| 92 |
+
uvicorn.run(
|
| 93 |
+
app,
|
| 94 |
+
host=args.host,
|
| 95 |
+
port=args.port,
|
| 96 |
+
log_level="info",
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
return 0
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
if __name__ == "__main__":
|
| 103 |
+
sys.exit(main())
|
src/talking_snake/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (260 Bytes). View file
|
|
|
src/talking_snake/__pycache__/__main__.cpython-312.pyc
ADDED
|
Binary file (3.44 kB). View file
|
|
|
src/talking_snake/__pycache__/app.cpython-312.pyc
ADDED
|
Binary file (34.7 kB). View file
|
|
|
src/talking_snake/__pycache__/extract.cpython-312.pyc
ADDED
|
Binary file (18.6 kB). View file
|
|
|
src/talking_snake/__pycache__/tts.cpython-312.pyc
ADDED
|
Binary file (13.1 kB). View file
|
|
|
src/talking_snake/app.py
ADDED
|
@@ -0,0 +1,935 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application for PDF-to-Speech server."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import io
|
| 6 |
+
import json
|
| 7 |
+
import queue
|
| 8 |
+
import struct
|
| 9 |
+
import threading
|
| 10 |
+
import time
|
| 11 |
+
import uuid
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import TYPE_CHECKING
|
| 14 |
+
from urllib.parse import urlparse
|
| 15 |
+
|
| 16 |
+
import httpx
|
| 17 |
+
import trafilatura
|
| 18 |
+
from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
|
| 19 |
+
from fastapi.responses import HTMLResponse, StreamingResponse
|
| 20 |
+
from fastapi.staticfiles import StaticFiles
|
| 21 |
+
from pydantic import BaseModel
|
| 22 |
+
|
| 23 |
+
from talking_snake.extract import clean_text, extract_text, get_page_count
|
| 24 |
+
from talking_snake.tts import (
|
| 25 |
+
DEFAULT_CHUNK_SIZE,
|
| 26 |
+
LANGUAGE_VOICES,
|
| 27 |
+
MockTTSEngine,
|
| 28 |
+
TTSEngineProtocol,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
if TYPE_CHECKING:
|
| 32 |
+
from collections.abc import Iterator
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Request timeout for fetching URLs (seconds)
|
| 36 |
+
URL_FETCH_TIMEOUT = 60.0
|
| 37 |
+
# Maximum file size to fetch (50MB)
|
| 38 |
+
MAX_FILE_SIZE = 50 * 1024 * 1024
|
| 39 |
+
|
| 40 |
+
# Initial estimate for time calculation before calibration
|
| 41 |
+
# This value is refined after the first chunk is processed
|
| 42 |
+
# RTX 4090 + flash-attn: ~0.001s/char, RTX 4090: ~0.002s/char, RTX 3060: ~0.005s/char
|
| 43 |
+
INITIAL_SECONDS_PER_CHAR = 0.002 # Optimistic GPU estimate, calibrates after first chunk
|
| 44 |
+
|
| 45 |
+
# Job timeout (seconds) - jobs are cleaned up after this time
|
| 46 |
+
JOB_TIMEOUT = 3600 # 1 hour
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class AudioJob:
    """A single audio-generation job whose output is streamed through a queue."""

    def __init__(self, job_id: str):
        self.job_id = job_id
        # Queue items are raw audio byte chunks; None is the end-of-stream sentinel.
        self.audio_queue: queue.Queue[bytes | None] = queue.Queue()
        self.started = time.time()
        self.completed = False
        self.error: str | None = None
        # Default sample rate; the TTS engine is expected to overwrite this.
        self.sample_rate = 24000
        self.header_sent = False

    def put_audio(self, audio_bytes: bytes) -> None:
        """Enqueue one chunk of generated audio."""
        self.audio_queue.put(audio_bytes)

    def finish(self) -> None:
        """Mark generation as complete and push the end-of-stream sentinel."""
        self.completed = True
        self.audio_queue.put(None)

    def set_error(self, error: str) -> None:
        """Record a failure message, then terminate the stream."""
        self.error = error
        self.completed = True
        self.audio_queue.put(None)
| 76 |
+
|
| 77 |
+
class JobManager:
    """Thread-safe registry of in-flight audio generation jobs."""

    def __init__(self) -> None:
        self._jobs: dict[str, AudioJob] = {}
        self._lock = threading.Lock()

    def create_job(self) -> AudioJob:
        """Register a fresh job and return it; expired jobs are evicted too."""
        job = AudioJob(str(uuid.uuid4()))
        with self._lock:
            self._jobs[job.job_id] = job
            self._cleanup_old_jobs()
        return job

    def get_job(self, job_id: str) -> AudioJob | None:
        """Look up a job by ID, or None if unknown."""
        with self._lock:
            return self._jobs.get(job_id)

    def remove_job(self, job_id: str) -> None:
        """Forget a job; unknown IDs are silently ignored."""
        with self._lock:
            self._jobs.pop(job_id, None)

    def _cleanup_old_jobs(self) -> None:
        """Drop jobs older than JOB_TIMEOUT. Caller must hold the lock."""
        cutoff = time.time() - JOB_TIMEOUT
        expired = [jid for jid, job in self._jobs.items() if job.started < cutoff]
        for jid in expired:
            del self._jobs[jid]
|
| 110 |
+
|
| 111 |
+
# Global job manager
|
| 112 |
+
_job_manager = JobManager()
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class UrlRequest(BaseModel):
    """Payload for URL-based reading: the target URL plus the TTS language."""

    url: str
    language: str = "english"
| 121 |
+
|
| 122 |
+
class TextRequest(BaseModel):
    """Payload for reading raw text directly, with the TTS language."""

    text: str
    language: str = "english"
| 128 |
+
|
| 129 |
+
class EstimateResponse(BaseModel):
    """Synthesis-time estimate: input size, chunking, and predicted duration."""

    text_length: int
    chunk_count: int
    estimated_seconds: float
    estimated_minutes: float
| 137 |
+
|
| 138 |
+
# Global TTS engine instance (set during startup)
|
| 139 |
+
_tts_engine: TTSEngineProtocol | None = None
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def create_app(tts_engine: TTSEngineProtocol | None = None) -> FastAPI:
    """Build and configure the FastAPI application.

    Args:
        tts_engine: TTS engine to use. If None, uses MockTTSEngine.

    Returns:
        Configured FastAPI application.
    """
    # Handlers are module-level functions that read this shared engine.
    global _tts_engine
    _tts_engine = tts_engine or MockTTSEngine()

    app = FastAPI(
        title="Reader",
        description="PDF-to-Speech web server - listen to any content",
        version="0.1.0",
    )

    # Serve bundled static assets when the directory is present.
    static_dir = Path(__file__).parent / "static"
    if static_dir.exists():
        app.mount("/static", StaticFiles(directory=static_dir), name="static")

    # Wire up the HTTP API (table-driven to keep registration uniform).
    app.add_api_route("/", index, methods=["GET"], response_class=HTMLResponse)
    for path, handler in (
        ("/api/read", read_pdf),
        ("/api/read-url", read_url),
        ("/api/read-stream", read_pdf_stream),
        ("/api/read-url-stream", read_url_stream),
        ("/api/read-text-stream", read_text_stream),
    ):
        app.add_api_route(path, handler, methods=["POST"])
    for path, handler in (
        ("/api/audio/{job_id}", stream_audio),
        ("/api/languages", get_languages),
        ("/api/device-info-stream", stream_device_info),
        ("/api/health", health_check),
    ):
        app.add_api_route(path, handler, methods=["GET"])

    return app
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
async def index(request: Request) -> HTMLResponse:
    """Serve the main page, or a minimal fallback when assets are missing.

    Args:
        request: The incoming request (required by the route signature).

    Returns:
        HTML response with the main page.
    """
    index_file = Path(__file__).parent / "static" / "index.html"
    if index_file.exists():
        return HTMLResponse(content=index_file.read_text())

    # Fallback so the app still responds when static files were not packaged.
    return HTMLResponse(
        content="<h1>Reader</h1><p>Static files not found.</p>",
        status_code=200,
    )
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
async def read_pdf(file: UploadFile = File(...)) -> StreamingResponse:
    """Read a PDF and return synthesized speech.

    Args:
        file: Uploaded PDF file.

    Returns:
        Streaming WAV audio response.

    Raises:
        HTTPException: If file is not a PDF or extraction fails.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Validate file type (extension check only; extraction will reject non-PDFs too)
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # Read file content
    try:
        pdf_bytes = await file.read()
    except Exception as e:
        # Chain the cause so tracebacks point at the real failure (B904).
        raise HTTPException(status_code=400, detail=f"Failed to read file: {e}") from e

    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Empty file")

    # Extract text
    try:
        text = extract_text(pdf_bytes)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to extract text: {e}") from e

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text found in PDF")

    # Stream TTS audio lazily so playback can begin before synthesis finishes
    def generate_audio() -> Iterator[bytes]:
        assert _tts_engine is not None
        yield from _tts_engine.synthesize(text)

    return StreamingResponse(
        generate_audio(),
        media_type="audio/wav",
        headers={
            "Content-Disposition": f'inline; filename="{Path(file.filename).stem}.wav"',
        },
    )
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
async def read_url(request: UrlRequest) -> StreamingResponse:
    """Read content from a URL (PDF or web page) and return synthesized speech.

    For PDFs: extracts text and removes headers/footers/page numbers.
    For web pages: extracts main article content, removing navigation,
    sidebars, footers, ads, and other boilerplate.

    Args:
        request: Request containing the URL to fetch.

    Returns:
        Streaming WAV audio response.

    Raises:
        HTTPException: If URL is invalid, fetch fails, or extraction fails.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Validate URL
    url = request.url.strip()
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="Only HTTP/HTTPS URLs are supported")

    # Determine if this is a PDF or web page (may be refined by content-type below)
    is_pdf = parsed.path.lower().endswith(".pdf")

    # Fetch the content
    try:
        async with httpx.AsyncClient(timeout=URL_FETCH_TIMEOUT, follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()

            # Check content length if available (cheap early rejection)
            content_length = response.headers.get("content-length")
            if content_length and int(content_length) > MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {MAX_FILE_SIZE // 1024 // 1024}MB",
                )

            content = response.content

            # Re-check actual size: servers may omit or lie about content-length
            if len(content) > MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {MAX_FILE_SIZE // 1024 // 1024}MB",
                )

            # Also check content-type header to detect PDFs served without .pdf extension
            content_type = response.headers.get("content-type", "").lower()
            if "application/pdf" in content_type:
                is_pdf = True

    except httpx.TimeoutException as e:
        raise HTTPException(status_code=408, detail="Request timed out while fetching URL") from e
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to fetch URL: HTTP {e.response.status_code}",
        ) from e
    except httpx.RequestError as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {e}") from e

    if not content:
        raise HTTPException(status_code=400, detail="Empty content at URL")

    # Extract text based on content type
    if is_pdf:
        try:
            text = extract_text(content)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract PDF text: {e}") from e
    else:
        # Use trafilatura to extract main content from HTML.
        # This removes navigation, sidebars, footers, ads, etc.
        try:
            extracted = trafilatura.extract(
                content,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
                favor_precision=True,
            )
            # clean_text normalizes the extracted article for TTS
            text = clean_text(extracted) if extracted else ""
        except Exception as e:
            raise HTTPException(
                status_code=400, detail=f"Failed to extract page content: {e}"
            ) from e

    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="No readable content found at URL")

    # Derive a download filename from the URL. Fix: this value was computed
    # but never used — the Content-Disposition header hard-coded a placeholder.
    filename = Path(parsed.path).stem or parsed.netloc or "document"

    # Stream TTS audio
    def generate_audio() -> Iterator[bytes]:
        assert _tts_engine is not None
        yield from _tts_engine.synthesize(text)

    return StreamingResponse(
        generate_audio(),
        media_type="audio/wav",
        headers={
            "Content-Disposition": f'inline; filename="{filename}.wav"',
        },
    )
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
async def health_check() -> dict[str, str]:
    """Report service liveness.

    Returns:
        A mapping with a single "status" key set to "ok".
    """
    return dict(status="ok")
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
async def get_languages() -> dict[str, list[str]]:
    """List the languages the TTS voices support.

    Returns:
        A mapping with a "languages" key holding the available language names.
    """
    # Iterating a dict yields its keys, in insertion order.
    names = [name for name in LANGUAGE_VOICES]
    return {"languages": names}
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def _get_device_info() -> dict:
    """Collect device and model information with live memory statistics.

    Returns:
        Dict describing device type, memory usage, and (when the TTS engine
        is loaded) its batch and chunk sizes.
    """
    import torch

    info: dict = {
        "device": "cpu",
        "device_name": "CPU",
        "memory_used_gb": 0,
        "memory_total_gb": 0,
        "memory_percent": 0,
        "batch_size": 1,
    }

    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        reserved = torch.cuda.memory_reserved(0)
        allocated = torch.cuda.memory_allocated(0)
        total = props.total_memory

        # Reserved memory reflects what PyTorch actually holds (cache included),
        # so report whichever of reserved/allocated is larger.
        used = max(reserved, allocated)

        gib = 1024**3
        info.update(
            device="cuda",
            device_name=props.name,
            memory_used_gb=round(used / gib, 1),
            memory_total_gb=round(total / gib, 1),
            memory_percent=round((used / total) * 100, 1) if total > 0 else 0,
            # Allocated is reported separately to aid debugging.
            memory_allocated_gb=round(allocated / gib, 1),
        )

    if _tts_engine is not None:
        info["batch_size"] = getattr(_tts_engine, "batch_size", 1)
        info["chunk_size"] = getattr(_tts_engine, "chunk_size", 800)

    return info
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
async def stream_device_info() -> StreamingResponse:
    """Stream device info updates via SSE.

    Returns:
        SSE stream with device info updates every 3 seconds.
    """
    import asyncio
    from collections.abc import AsyncIterator
    from concurrent.futures import ThreadPoolExecutor

    # Single worker: device polling is cheap and strictly sequential.
    executor = ThreadPoolExecutor(max_workers=1)

    async def generate_events() -> AsyncIterator[str]:
        """Generate SSE events for device info."""
        # get_event_loop() is deprecated inside coroutines; use the running loop.
        loop = asyncio.get_running_loop()
        try:
            while True:
                try:
                    # Run torch calls in the executor to avoid blocking the event loop.
                    info = await loop.run_in_executor(executor, _get_device_info)
                    yield f"data: {json.dumps(info)}\n\n"
                except Exception as e:
                    # Send error info but keep the stream alive.
                    yield f'data: {{"error": "{e!s}"}}\n\n'
                await asyncio.sleep(3)
        finally:
            # Release the worker thread when the client disconnects; without
            # this, each request leaked a thread for the process lifetime.
            executor.shutdown(wait=False)

    return StreamingResponse(
        generate_events(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
def _estimate_time(
    text: str, seconds_per_char: float = INITIAL_SECONDS_PER_CHAR
) -> tuple[int, float]:
    """Estimate processing time for text.

    Args:
        text: Text to process.
        seconds_per_char: Calibrated rate (defaults to initial estimate).

    Returns:
        Tuple of (chunk_count, estimated_seconds).
    """
    n_chars = len(text)
    # Chunks are roughly 500 characters each; round up (ceiling division via
    # negation), with a minimum of one chunk even for empty text.
    chunk_count = max(1, -(-n_chars // 500))
    return chunk_count, n_chars * seconds_per_char
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
def _create_wav_header(sample_rate: int = 24000, bits_per_sample: int = 16) -> bytes:
|
| 483 |
+
"""Create a WAV header for streaming (unknown length).
|
| 484 |
+
|
| 485 |
+
Uses maximum possible file size since we don't know the final length.
|
| 486 |
+
|
| 487 |
+
Args:
|
| 488 |
+
sample_rate: Audio sample rate.
|
| 489 |
+
bits_per_sample: Bits per sample.
|
| 490 |
+
|
| 491 |
+
Returns:
|
| 492 |
+
WAV header bytes.
|
| 493 |
+
"""
|
| 494 |
+
channels = 1
|
| 495 |
+
byte_rate = sample_rate * channels * bits_per_sample // 8
|
| 496 |
+
block_align = channels * bits_per_sample // 8
|
| 497 |
+
|
| 498 |
+
# Use maximum size for streaming (will be truncated on close)
|
| 499 |
+
max_size = 0x7FFFFFFF
|
| 500 |
+
|
| 501 |
+
header = io.BytesIO()
|
| 502 |
+
header.write(b"RIFF")
|
| 503 |
+
header.write(struct.pack("<I", max_size))
|
| 504 |
+
header.write(b"WAVE")
|
| 505 |
+
header.write(b"fmt ")
|
| 506 |
+
header.write(struct.pack("<I", 16)) # fmt chunk size
|
| 507 |
+
header.write(struct.pack("<H", 1)) # PCM format
|
| 508 |
+
header.write(struct.pack("<H", channels))
|
| 509 |
+
header.write(struct.pack("<I", sample_rate))
|
| 510 |
+
header.write(struct.pack("<I", byte_rate))
|
| 511 |
+
header.write(struct.pack("<H", block_align))
|
| 512 |
+
header.write(struct.pack("<H", bits_per_sample))
|
| 513 |
+
header.write(b"data")
|
| 514 |
+
header.write(struct.pack("<I", max_size - 36))
|
| 515 |
+
|
| 516 |
+
return header.getvalue()
|
| 517 |
+
|
| 518 |
+
|
| 519 |
+
def _generate_audio_to_job(
    job: AudioJob,
    text: str,
    tts_engine: TTSEngineProtocol,
    language: str = "english",
    doc_name: str = "document",
    doc_type: str = "text",
    page_count: int | None = None,
) -> Iterator[bytes]:
    """Generate audio with progress events via SSE, streaming audio to job queue.

    This function sends progress events via SSE while simultaneously writing
    audio data to the job's queue for streaming by another endpoint.
    Supports batched GPU inference for faster processing.

    Args:
        job: AudioJob to write audio data to.
        text: Text to synthesize.
        tts_engine: TTS engine to use.
        language: Language for TTS (english, chinese, japanese, korean).
        doc_name: Name of the document being processed.
        doc_type: Type of document (pdf, url, text).
        page_count: Number of pages (for PDFs).

    Yields:
        SSE event frames (bytes): "start", then one "progress" per chunk,
        then either "complete" or "error".
    """
    import re

    # Apply language if the engine supports it
    if hasattr(tts_engine, "set_language"):
        tts_engine.set_language(language)

    # Get chunk size and batch size from engine
    chunk_size = getattr(tts_engine, "chunk_size", DEFAULT_CHUNK_SIZE)
    batch_size = getattr(tts_engine, "batch_size", 1)

    # Split text into chunks (same logic as TTS engine) so the progress totals
    # computed here line up with what the engine actually emits.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks: list[str] = []
    current_chunk: list[str] = []
    current_length = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        # Flush the current chunk when adding this sentence would exceed the
        # chunk size; a single over-long sentence still becomes its own chunk.
        if current_length + len(sentence) > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += len(sentence) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    total_chunks = len(chunks) if chunks else 1
    total_chars = sum(len(c) for c in chunks)

    # Use initial estimate before calibration
    seconds_per_char = INITIAL_SECONDS_PER_CHAR
    estimated_total = total_chars * seconds_per_char

    # Send initial progress event with job_id and batch info
    progress_data = {
        "type": "start",
        "job_id": job.job_id,
        "current": 0,
        "total": total_chunks,
        "percent": 0,
        "estimated_remaining": estimated_total,
        "batch_size": batch_size,
        "doc_name": doc_name,
        "doc_type": doc_type,
        "page_count": page_count,
        "total_chars": total_chars,
        "status": f"Starting (batch size: {batch_size})...",
    }
    yield f"event: start\ndata: {json.dumps(progress_data)}\n\n".encode()

    # Generate audio - the TTS engine handles batching internally
    # We pass the full text and let it process in optimized batches
    start_time = time.time()
    chunks_processed = 0

    try:
        for audio_bytes in tts_engine.synthesize(text):
            # Write audio to job queue for streaming
            job.put_audio(audio_bytes)
            chunks_processed += 1

            # Calibrate time estimate using the observed per-chunk rate
            elapsed = time.time() - start_time
            if chunks_processed > 0:
                time_per_chunk = elapsed / chunks_processed
                remaining_chunks = total_chunks - chunks_processed
                remaining = remaining_chunks * time_per_chunk
            else:
                remaining = estimated_total

            progress_data = {
                "type": "progress",
                "current": chunks_processed,
                "total": total_chunks,
                "percent": int((chunks_processed / total_chunks) * 100),
                "estimated_remaining": round(max(0, remaining), 1),
                # min() guards against the engine yielding more pieces than
                # our local chunk split predicted.
                "chars_processed": sum(
                    len(chunks[i]) for i in range(min(chunks_processed, len(chunks)))
                ),
                "total_chars": total_chars,
                "status": f"Processing chunk {chunks_processed}/{total_chunks}",
            }
            yield f"event: progress\ndata: {json.dumps(progress_data)}\n\n".encode()

    except Exception as e:
        # Mark the job failed so the audio endpoint stops waiting, then
        # notify the SSE client and end the stream.
        error_msg = f"TTS generation failed: {e!s}"
        error_data = {
            "type": "error",
            "message": error_msg,
            "chunk": chunks_processed + 1,
            "total_chunks": total_chunks,
        }
        job.set_error(error_msg)
        yield f"event: error\ndata: {json.dumps(error_data)}\n\n".encode()
        return

    # Signal audio generation complete
    job.finish()

    # Send completion event
    total_time = time.time() - start_time
    complete_data = {
        "type": "complete",
        "total_time": round(total_time, 1),
        "chunks_processed": chunks_processed,
        "batch_size": batch_size,
    }
    yield f"event: complete\ndata: {json.dumps(complete_data)}\n\n".encode()
|
| 660 |
+
|
| 661 |
+
|
| 662 |
+
async def stream_audio(job_id: str) -> StreamingResponse:
    """Stream audio data for a job.

    This endpoint streams the raw WAV audio as it's being generated.
    The browser can start playing as soon as data arrives.

    Args:
        job_id: The job ID to stream audio for.

    Returns:
        Streaming WAV audio response.

    Raises:
        HTTPException: 404 if no job with this ID exists.
    """
    job = _job_manager.get_job(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")

    def generate_audio() -> Iterator[bytes]:
        # Send WAV header first so the client can begin decoding immediately
        yield _create_wav_header(sample_rate=24000)

        # Stream audio data as it becomes available
        while True:
            try:
                # Wait for audio data with timeout; None is the end-of-stream
                # sentinel pushed by the generator side.
                audio_data = job.audio_queue.get(timeout=300)  # 5 min timeout
                if audio_data is None:
                    # End of stream
                    break
                # Skip WAV headers from individual chunks, only send raw PCM
                if audio_data[:4] == b"RIFF":
                    # This is a WAV file, extract just the PCM data
                    # WAV header is 44 bytes for standard PCM
                    # NOTE(review): assumes every chunk has exactly a 44-byte
                    # header (no extra RIFF chunks) — confirm against the
                    # engine's output format.
                    yield audio_data[44:]
                else:
                    yield audio_data
            except queue.Empty:
                # Timeout waiting for data; give up rather than hang forever
                break

        # Clean up job after streaming
        _job_manager.remove_job(job_id)

    return StreamingResponse(
        generate_audio(),
        media_type="audio/wav",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
async def read_pdf_stream(
    file: UploadFile = File(...),
    language: str = Form("english"),
) -> StreamingResponse:
    """Read a PDF with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        file: Uploaded PDF file.
        language: Language for TTS (english, chinese, japanese, korean).

    Returns:
        Streaming response with progress events including job_id.

    Raises:
        HTTPException: 500 if the TTS engine is not initialized; 400 for a
            non-PDF upload, unreadable/empty file, failed extraction, or a
            PDF containing no text.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    # Validate language; silently fall back to English rather than rejecting
    if language not in LANGUAGE_VOICES:
        language = "english"

    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        pdf_bytes = await file.read()
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")

    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Empty file")

    try:
        text = extract_text(pdf_bytes)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to extract text: {e}")

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text found in PDF")

    # Get page count for progress display; it is cosmetic, so failures are
    # tolerated and reported as unknown.
    try:
        page_count = get_page_count(pdf_bytes)
    except Exception:
        page_count = None

    # Create a job for this request
    job = _job_manager.create_job()

    return StreamingResponse(
        _generate_audio_to_job(
            job,
            text,
            _tts_engine,
            language,
            doc_name=file.filename or "document.pdf",
            doc_type="pdf",
            page_count=page_count,
        ),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
|
| 781 |
+
|
| 782 |
+
|
| 783 |
+
async def read_text_stream(request: TextRequest) -> StreamingResponse:
    """Read pasted text with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        request: Text request containing the text to read and language.

    Returns:
        Streaming response with progress events including job_id.

    Raises:
        HTTPException: 500 if the TTS engine is not initialized; 400 for
            missing, oversized, or unreadable text.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    text = request.text.strip()
    # Unknown languages silently fall back to English.
    language = request.language if request.language in LANGUAGE_VOICES else "english"

    if not text:
        raise HTTPException(status_code=400, detail="Text is required")

    if len(text) > 500000:  # ~500KB limit for pasted text
        raise HTTPException(status_code=400, detail="Text too long (max 500,000 characters)")

    # Normalize the text for TTS before chunking.
    text = clean_text(text)

    if not text.strip():
        raise HTTPException(status_code=400, detail="No readable text provided")

    # Create a job so the audio endpoint can pick up the stream.
    job = _job_manager.create_job()

    event_stream = _generate_audio_to_job(
        job,
        text,
        _tts_engine,
        language,
        doc_name="Pasted Text",
        doc_type="text",
    )
    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_stream,
        media_type="text/event-stream",
        headers=sse_headers,
    )
|
| 831 |
+
|
| 832 |
+
|
| 833 |
+
async def read_url_stream(request: UrlRequest) -> StreamingResponse:
    """Read content from URL with streaming progress updates.

    Returns SSE events for progress. Audio is streamed separately via /api/audio/{job_id}.

    Args:
        request: URL request containing the URL to fetch and language.

    Returns:
        Streaming response with progress events including job_id.

    Raises:
        HTTPException: 500 if the TTS engine is not initialized; 408/400 for
            fetch or extraction failures, oversized downloads, or pages with
            no readable content.
    """
    if _tts_engine is None:
        raise HTTPException(status_code=500, detail="TTS engine not initialized")

    url = request.url.strip()
    # Unknown languages silently fall back to English.
    language = request.language if request.language in LANGUAGE_VOICES else "english"

    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            raise HTTPException(status_code=400, detail="URL must use HTTP or HTTPS")
    except Exception as e:
        # NOTE(review): this also catches the HTTPException raised just above,
        # re-wrapping it as "Invalid URL: ..." — confirm whether that message
        # change is intended.
        raise HTTPException(status_code=400, detail=f"Invalid URL: {e}")

    # Determine if this is a PDF or HTML page; the content-type check below
    # can still upgrade an extension-less URL to PDF handling.
    is_pdf = url.lower().endswith(".pdf")

    try:
        async with httpx.AsyncClient(timeout=URL_FETCH_TIMEOUT, follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()

            content_type = response.headers.get("content-type", "").lower()
            if "application/pdf" in content_type:
                is_pdf = True

            if len(response.content) > MAX_FILE_SIZE:
                raise HTTPException(status_code=400, detail="File too large (max 50MB)")

            content = response.content

    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=400, detail=f"Failed to fetch URL: HTTP {e.response.status_code}"
        )
    except httpx.RequestError as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {e}")

    if is_pdf:
        try:
            text = extract_text(content)
            page_count = get_page_count(content)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract PDF text: {e}")
    else:
        page_count = None
        # trafilatura strips navigation, sidebars, footers, ads, etc.
        try:
            extracted = trafilatura.extract(
                content,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
                favor_precision=True,
            )
            if extracted:
                text = clean_text(extracted)
            else:
                text = ""
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to extract page content: {e}")

    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="No readable content found at URL")

    # Extract document name from URL; fall back to the host for bare "/" paths
    url_path = urlparse(url).path
    doc_name = url_path.split("/")[-1] if url_path else url
    if not doc_name or doc_name == "/":
        doc_name = urlparse(url).netloc

    # Create a job for this request
    job = _job_manager.create_job()

    return StreamingResponse(
        _generate_audio_to_job(
            job,
            text,
            _tts_engine,
            language,
            doc_name=doc_name,
            doc_type="pdf" if is_pdf else "url",
            page_count=page_count,
        ),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
|
src/talking_snake/extract.py
ADDED
|
@@ -0,0 +1,489 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF text extraction and cleaning for TTS processing."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import io
|
| 6 |
+
import re
|
| 7 |
+
from collections import Counter
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
|
| 10 |
+
from pdfminer.high_level import extract_pages
|
| 11 |
+
from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBoxHorizontal, LTTextLineHorizontal
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class TextBlock:
    """A block of text with positional metadata."""

    text: str  # stripped text content of the block
    y_ratio: float  # 0.0 = bottom, 1.0 = top
    font_size: float  # average character size across the block
    page_num: int  # 1-based page number the block came from
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
    """Extract text blocks from PDF with positional information.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        List of TextBlock objects with text and metadata.
    """
    laparams = LAParams(
        line_margin=0.5,
        word_margin=0.1,
        char_margin=2.0,
        boxes_flow=0.5,
    )

    blocks: list[TextBlock] = []
    stream = io.BytesIO(pdf_bytes)

    for page_num, page_layout in enumerate(extract_pages(stream, laparams=laparams), start=1):
        if not isinstance(page_layout, LTPage):
            continue

        page_height = page_layout.height

        for element in page_layout:
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            text = element.get_text().strip()
            if not text:
                continue

            # Vertical position as a ratio: 0 = page bottom, 1 = page top.
            y_ratio = element.y0 / page_height if page_height > 0 else 0.5

            # Average the per-character sizes to approximate the block's font size.
            char_sizes = [
                char.size
                for line in element
                if isinstance(line, LTTextLineHorizontal)
                for char in line
                if isinstance(char, LTChar)
            ]
            avg_font_size = sum(char_sizes) / len(char_sizes) if char_sizes else 10.0

            blocks.append(
                TextBlock(
                    text=text,
                    y_ratio=y_ratio,
                    font_size=avg_font_size,
                    page_num=page_num,
                )
            )

    return blocks
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def get_page_count(pdf_bytes: bytes) -> int:
    """Get the number of pages in a PDF.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Number of pages in the PDF.
    """
    # Walking the layout generator is the simplest page counter pdfminer offers.
    pages = extract_pages(io.BytesIO(pdf_bytes), laparams=LAParams())
    return sum(1 for _ in pages)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def extract_text(pdf_bytes: bytes) -> str:
    """Extract and clean text from a PDF file.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Cleaned text suitable for TTS.
    """
    blocks = extract_text_blocks(pdf_bytes)
    if not blocks:
        return ""

    # Drop headers/footers/page numbers, then join blocks as paragraphs.
    kept = clean_text_blocks(blocks)
    joined = "\n\n".join(block.text for block in kept)

    # Apply TTS-specific normalization
    return normalize_for_tts(joined)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
    """Remove headers, footers, page numbers, and other artifacts.

    Applies multiple heuristics:
    1. Remove blocks in top/bottom margins (likely headers/footers)
    2. Remove repeated text across pages (likely running headers)
    3. Remove standalone page numbers
    4. Remove very short lines that look like artifacts

    Args:
        blocks: List of TextBlock objects.

    Returns:
        Filtered list of TextBlock objects.
    """
    if not blocks:
        return []

    # Text that recurs on at least half the pages is treated as a running
    # header/footer.
    occurrences = Counter(block.text for block in blocks)
    total_pages = max(block.page_num for block in blocks)
    repeat_cutoff = max(2, total_pages // 2)
    boilerplate = {text for text, count in occurrences.items() if count >= repeat_cutoff}

    # The median font size separates body text from captions/footnotes.
    sizes = sorted(block.font_size for block in blocks)
    median_size = sizes[len(sizes) // 2] if sizes else 10.0

    def keep(block: TextBlock) -> bool:
        """Return True when the block looks like body text worth reading."""
        if block.y_ratio > 0.90:  # header zone (top 10%)
            return False
        if block.y_ratio < 0.10:  # footer zone (bottom 10%)
            return False
        if block.text in boilerplate:  # repeated running header/footer
            return False
        if is_page_number(block.text):
            return False
        # Short text in a notably smaller font is likely a caption/footnote.
        if len(block.text) < 20 and block.font_size < median_size * 0.8:
            return False
        return True

    return [block for block in blocks if keep(block)]
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def is_page_number(text: str) -> bool:
    """Check if text is likely a page number.

    Args:
        text: Text to check.

    Returns:
        True if text appears to be a page number.
    """
    candidate = text.strip()

    # Bare Arabic numerals are the most common case.
    if candidate.isdigit():
        return True

    # Remaining heuristics: roman numerals, "Page N" / "N of M", and "- N -".
    patterned_checks = (
        (r"^[ivxlcdmIVXLCDM]+$", 0),
        (r"^(page\s*)?\d+(\s*(of|/)\s*\d+)?$", re.IGNORECASE),
        (r"^[-–—]\s*\d+\s*[-–—]$", 0),
    )
    return any(re.match(pattern, candidate, flags) for pattern, flags in patterned_checks)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def clean_text(text: str) -> str:
    """Clean raw text for TTS processing.

    Simpler companion to the positional block cleaner: it operates on
    already-extracted text with no layout information.

    Args:
        text: Raw text to clean.

    Returns:
        Cleaned text suitable for TTS.
    """
    # Keep only meaningful lines: non-empty, not a bare page number, and
    # long enough not to be an extraction artifact.
    kept = [
        stripped
        for stripped in (raw.strip() for raw in text.split("\n"))
        if stripped and not is_page_number(stripped) and len(stripped) >= 3
    ]
    result = "\n".join(kept)

    # === FIX HYPHENATED/SPLIT WORDS ===
    # Words broken across lines are common in PDFs and web content.
    result = re.sub(r"(\w)-\n\s*(\w)", r"\1\2", result)  # word-\nword
    result = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", result)  # word- \n word
    result = re.sub(r"(\w)- (\w)", r"\1\2", result)  # word- word (copy-paste)
    result = re.sub(r"-\n([a-z])", r"\1", result)  # hyphen + lowercase continuation

    # === FIX LINE BREAK ARTIFACTS ===
    # Undo hard wrapping: a single newline after a non-terminal character
    # becomes a space, while blank lines (paragraph breaks) are preserved.
    result = re.sub(r"(?<![.!?:;\n])\n(?!\n)", " ", result)

    # Collapse excess whitespace
    result = re.sub(r"\n{3,}", "\n\n", result)
    result = re.sub(r"[ \t]+", " ", result)

    # Final TTS-specific normalization pass
    result = normalize_for_tts(result)

    return result.strip()
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def normalize_for_tts(text: str) -> str:
    """Normalize text for natural TTS pronunciation.

    Strips technical noise (URLs, hashes, file paths), converts code
    operators, symbols and abbreviations into speakable words, and tidies
    punctuation so TTS models neither stall nor mispronounce.

    Args:
        text: Text to normalize.

    Returns:
        Normalized text optimized for TTS.
    """
    # === REMOVE URLS AND TECHNICAL STRINGS FIRST ===
    # URLs (various formats) - remove completely
    text = re.sub(r"https?://[^\s<>\"')\]]+", "", text)
    text = re.sub(r"www\.[^\s<>\"')\]]+", "", text)
    text = re.sub(r"ftp://[^\s<>\"')\]]+", "", text)

    # UUIDs - must come before the git hash pattern
    uuid_pattern = (
        r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-"
        r"[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
    )
    text = re.sub(uuid_pattern, "", text)

    # Git commit hashes (7-40 hex chars standalone)
    text = re.sub(
        r"(?<![a-zA-Z0-9])[0-9a-f]{7,40}(?![a-zA-Z0-9])", "", text, flags=re.IGNORECASE
    )

    # Hex color codes (#fff, #ffffff)
    text = re.sub(r"#[0-9a-fA-F]{3,8}\b", "", text)

    # Long hex/base64 strings (likely encoded data)
    text = re.sub(r"\b[A-Za-z0-9+/]{20,}={0,2}\b", "", text)

    # File paths (Unix and Windows style)
    text = re.sub(r"[/\\][\w./\\-]+\.\w+", "", text)

    # IP addresses
    text = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "", text)

    # Port numbers after a colon.
    # BUGFIX: require a non-digit before the colon so clock times like
    # "3:30" are not stripped down to "3".
    text = re.sub(r"(?<!\d):\d{2,5}\b", "", text)

    # Remove email addresses
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", text)

    # SHA/MD5 style hashes with prefix
    text = re.sub(r"\b(sha\d*|md5|hash)[:\s]*[0-9a-f]+\b", "", text, flags=re.IGNORECASE)

    # HTML/XML tags.
    # BUGFIX: moved up from the cleanup section - it previously ran AFTER
    # "<" and ">" were replaced with "less than"/"greater than" below, so
    # it could never match and tags were read out loud.  A word character
    # must follow "<" so comparisons like "a < b" are left alone.
    text = re.sub(r"</?\w[^>]*>", "", text)

    # CamelCase: split into words (e.g., "getUserName" -> "get User Name")
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)

    # snake_case: replace underscores with spaces
    text = re.sub(r"(\w)_(\w)", r"\1 \2", text)

    # Function calls: "func()" -> "func"
    text = re.sub(r"(\w+)\(\)", r"\1", text)

    # Arrow functions/operators: -> and =>
    text = text.replace("->", " returns ")
    text = text.replace("=>", " arrow ")

    # Common code operators spoken naturally.
    # BUGFIX: longest operators first - "!=" and "==" previously ran before
    # "!==" and "===", so the strict-equality replacements were dead code.
    text = text.replace("!==", " strictly not equals ")
    text = text.replace("===", " strictly equals ")
    text = text.replace("!=", " not equals ")
    text = text.replace("==", " equals ")
    text = text.replace("&&", " and ")
    text = text.replace("||", " or ")
    text = text.replace("++", " increment ")
    text = text.replace("--", " decrement ")

    # File extensions: ".py" -> " dot py" (only for common extensions)
    ext_pattern = r"\.(py|js|ts|html|css|json|xml|md|txt|csv|pdf)\b"
    text = re.sub(ext_pattern, r" dot \1", text, flags=re.IGNORECASE)

    # Remove standalone hashes/pound signs (not hashtags)
    text = re.sub(r"(?<!\w)#(?!\w)", "", text)

    # Backticks (markdown code) and triple quotes
    text = text.replace("`", "")
    text = text.replace('"""', "")
    text = text.replace("'''", "")

    # === UNICODE NORMALIZATION ===
    # Smart quotes -> ASCII quotes (escape sequences keep the intent
    # unambiguous regardless of the editor's encoding display).
    for smart, plain in (
        ("\u201c", '"'),  # left double quotation mark
        ("\u201d", '"'),  # right double quotation mark
        ("\u2018", "'"),  # left single quotation mark
        ("\u2019", "'"),  # right single quotation mark
        ("\u201e", '"'),  # double low-9 quotation mark
        ("\u201f", '"'),  # double high-reversed-9 quotation mark
    ):
        text = text.replace(smart, plain)

    # Normalize dashes to a standard hyphen (em-dash keeps surrounding
    # spaces to force a natural pause).
    for dash, repl in (
        ("\u2013", "-"),    # en-dash
        ("\u2014", " - "),  # em-dash
        ("\u2015", " - "),  # horizontal bar
        ("\u2010", "-"),    # Unicode hyphen
        ("\u2011", "-"),    # non-breaking hyphen
        ("\u2043", "-"),    # hyphen bullet
        ("\u2212", "-"),    # minus sign
    ):
        text = text.replace(dash, repl)

    # Normalize ellipsis
    text = text.replace("\u2026", "...")
    text = re.sub(r"\.{4,}", "...", text)  # Limit to 3 dots

    # Normalize other Unicode punctuation
    for sym, repl in (
        ("\u2022", ","),            # bullet point
        ("\u00b7", " "),            # middle dot
        ("\u2027", " "),            # hyphenation point
        ("\u203b", " "),            # reference mark
        ("\u2020", ""),             # dagger (footnote)
        ("\u2021", ""),             # double dagger
        ("\u00a7", "section "),     # section sign
        ("\u00b6", ""),             # pilcrow
        ("\u00a9", "copyright "),
        ("\u00ae", " registered "),
        ("\u2122", " trademark "),
        ("\u00b0", " degrees "),
    ):
        text = text.replace(sym, repl)

    # === SPACING AROUND PUNCTUATION ===
    text = re.sub(r"\s*-\s*-\s*", " - ", text)  # double dash
    text = re.sub(r"(\w)\s*-\s*(\w)", r"\1 - \2", text)  # word-dash-word

    # Fix missing space after punctuation
    text = re.sub(r"([.!?])([A-Z])", r"\1 \2", text)
    text = re.sub(r",([A-Za-z])", r", \1", text)

    # Collapse runs of the same punctuation mark
    for mark in (",", ";", ":", "!", "?"):
        text = re.sub(f"[{mark}]{{2,}}", mark, text)

    # === NUMBERS AND SPECIAL NOTATIONS ===
    # Vulgar fraction characters -> words
    fractions = {
        "\u00bd": " one half ",
        "\u2153": " one third ",
        "\u2154": " two thirds ",
        "\u00bc": " one quarter ",
        "\u00be": " three quarters ",
        "\u2155": " one fifth ",
        "\u2156": " two fifths ",
        "\u2157": " three fifths ",
        "\u2158": " four fifths ",
        "\u2159": " one sixth ",
        "\u215a": " five sixths ",
        "\u215b": " one eighth ",
        "\u215c": " three eighths ",
        "\u215d": " five eighths ",
        "\u215e": " seven eighths ",
    }
    for sym, words in fractions.items():
        text = text.replace(sym, words)

    # Percentage and math symbols (single "=" / "+" must run AFTER the
    # multi-character operator replacements above).
    text = text.replace("%", " percent")
    text = text.replace("&", " and ")
    text = text.replace("+", " plus ")
    text = text.replace("=", " equals ")
    text = text.replace("<", " less than ")
    text = text.replace(">", " greater than ")
    text = text.replace("\u2264", " less than or equal to ")
    text = text.replace("\u2265", " greater than or equal to ")
    text = text.replace("\u2260", " not equal to ")
    text = text.replace("\u00b1", " plus or minus ")
    text = text.replace("\u00d7", " times ")
    text = text.replace("\u00f7", " divided by ")

    # === ABBREVIATIONS AND SPECIAL CASES ===
    text = re.sub(r"\be\.g\.", "for example", text, flags=re.IGNORECASE)
    text = re.sub(r"\bi\.e\.", "that is", text, flags=re.IGNORECASE)
    text = re.sub(r"\betc\.", "etcetera", text, flags=re.IGNORECASE)
    text = re.sub(r"\bvs\.", "versus", text, flags=re.IGNORECASE)
    text = re.sub(r"\bDr\.", "Doctor", text)
    text = re.sub(r"\bMr\.", "Mister", text)
    text = re.sub(r"\bMrs\.", "Missus", text)
    text = re.sub(r"\bMs\.", "Miss", text)
    text = re.sub(r"\bProf\.", "Professor", text)
    text = re.sub(r"\bSt\.", "Saint", text)
    text = re.sub(r"\bNo\.\s*(\d)", r"Number \1", text)
    text = re.sub(r"\bFig\.", "Figure", text, flags=re.IGNORECASE)
    text = re.sub(r"\bVol\.", "Volume", text, flags=re.IGNORECASE)
    text = re.sub(r"\bpp\.", "pages", text, flags=re.IGNORECASE)
    text = re.sub(r"\bp\.\s*(\d)", r"page \1", text, flags=re.IGNORECASE)

    # === BRACKETS AND PARENTHESES ===
    text = re.sub(r"\[([^\]]+)\]", r"(\1)", text)  # square to round
    text = re.sub(r"\{([^}]+)\}", r"(\1)", text)  # curly to round

    # Remove citation numbers like [1], [2,3], [1-5]
    text = re.sub(r"\[\d+(?:[-,]\d+)*\]", "", text)
    text = re.sub(r"\(\d+(?:[-,]\d+)*\)", "", text)

    # === CLEANUP ===
    # Remove standalone special characters
    text = re.sub(r"\s+[#@*^~`|\\]+\s+", " ", text)
    # (HTML tag removal used to live here; it now runs near the top,
    # before "<" and ">" are verbalized.)

    # Normalize multiple spaces
    text = re.sub(r"[ \t]+", " ", text)

    # Remove spaces before punctuation
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)

    # Ensure space after punctuation (but not before another punctuation)
    text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)

    # Strip each line and collapse leftover blank lines
    text = "\n".join(line.strip() for line in text.split("\n"))
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text
|
src/talking_snake/static/app.js
ADDED
|
@@ -0,0 +1,773 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Talking Snake - Main Application Script
 * Handles file upload, URL submission, and audio streaming
 */

// DOM Elements
const dropZone = document.getElementById("dropZone");
const fileInput = document.getElementById("fileInput");
const urlInput = document.getElementById("urlInput");
const urlSubmit = document.getElementById("urlSubmit");
const textInput = document.getElementById("textInput");
const textSubmit = document.getElementById("textSubmit");
const status = document.getElementById("status"); // status banner container
const player = document.getElementById("player");
const audio = document.getElementById("audio"); // the <audio> playback element
const filename = document.getElementById("filename");
const tabs = document.querySelectorAll(".tab");
const tabContents = document.querySelectorAll(".tab-content");
const inputSection = document.getElementById("inputSection");
const processingSection = document.getElementById("processingSection");
const stopBtn = document.getElementById("stopBtn");
const pauseBtn = document.getElementById("pauseBtn");
const deviceInfo = document.getElementById("deviceInfo");
const docInfo = document.getElementById("docInfo");
const languageButtons = document.querySelectorAll("#languageButtons .style-btn");
const processingProgressBar = document.getElementById("processingProgressBar");

// Custom player elements
const playerPlayBtn = document.getElementById("playerPlayBtn");
const progressBar = document.getElementById("progressBar");
const progressSlider = document.getElementById("progressSlider");
const timeDisplay = document.getElementById("timeDisplay");
const volumeBtn = document.getElementById("volumeBtn");
const downloadBtn = document.getElementById("downloadBtn");

// Constants
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB

// State
let currentAbortController = null; // AbortController for the in-flight generation request
let selectedLanguage = "english"; // currently selected TTS language
let isPaused = false; // whether playback is user-paused
let estimatedDuration = 0; // Estimated total duration from server
let isMuted = false; // player mute toggle state
let currentAudioBlob = null; // Store audio blob for download
let currentDocName = ""; // Store document name for download filename
|
| 47 |
+
|
| 48 |
+
/**
 * Format a time value in seconds as M:SS.
 * Invalid inputs (NaN, Infinity, negatives) render as "0:00".
 * @param {number} seconds - Time in seconds.
 * @returns {string} Clock-style string, e.g. "1:05".
 */
function formatTime(seconds) {
  if (!isFinite(seconds) || seconds < 0) {
    return "0:00";
  }
  const whole = Math.floor(seconds);
  const minutes = Math.floor(whole / 60);
  const remainder = whole % 60;
  return `${minutes}:${String(remainder).padStart(2, "0")}`;
}
|
| 59 |
+
|
| 60 |
+
/**
 * Format a number in human-readable form (1.2K, 3.4M, etc.)
 * A trailing ".0" is dropped, so 1000 renders as "1K".
 * @param {number} num - Value to format.
 * @returns {string} Abbreviated representation.
 */
function formatNumber(num) {
  const abbreviate = (divisor, suffix) =>
    (num / divisor).toFixed(1).replace(/\.0$/, "") + suffix;
  if (num >= 1000000) {
    return abbreviate(1000000, "M");
  }
  if (num >= 1000) {
    return abbreviate(1000, "K");
  }
  return String(num);
}
|
| 72 |
+
|
| 73 |
+
/**
 * Get icon for document type
 * @param {string} docType - One of "pdf", "url" or "text".
 * @returns {string} Font Awesome icon class; "fa-file" for unknown types.
 */
function getDocTypeIcon(docType) {
  const ICONS = {
    pdf: "fa-file-pdf",
    url: "fa-link",
    text: "fa-file-lines",
  };
  // hasOwnProperty guard avoids picking up inherited keys like "constructor"
  return Object.prototype.hasOwnProperty.call(ICONS, docType)
    ? ICONS[docType]
    : "fa-file";
}
|
| 84 |
+
|
| 85 |
+
/**
 * Update the document info display
 * Renders the document name plus optional page-count and character-count
 * chips into the #docInfo element.
 * @param {Object} data - Payload carrying doc_type, doc_name, page_count
 *   and total_chars (sent with the stream "start" event).
 */
function updateDocInfo(data) {
  const icon = getDocTypeIcon(data.doc_type);
  const docName = data.doc_name || "Document";
  // Chips are rendered only when the server actually provided the values
  const pageInfo = data.page_count ? `<span class="doc-pages"><i class="fa-solid fa-file"></i> ${data.page_count}p</span>` : "";
  const charInfo = data.total_chars ? `<span class="doc-chars"><i class="fa-solid fa-font"></i> ${formatNumber(data.total_chars)}</span>` : "";

  // NOTE(review): docName is interpolated into innerHTML without escaping;
  // if doc_name can carry user-controlled markup this is an XSS vector -
  // confirm the server sanitizes it.
  docInfo.innerHTML = `
    <span class="doc-name" title="${docName}"><i class="fa-solid ${icon}"></i><span class="doc-name-text">${docName}</span></span>
    ${pageInfo}
    ${charInfo}
  `;
}
|
| 100 |
+
|
| 101 |
+
/**
 * Update the custom player progress bar and time display.
 * Falls back to the server-provided duration estimate when the element's
 * own duration is missing or implausible (a streaming artifact).
 */
function updatePlayerProgress() {
  const current = audio.currentTime || 0;

  let total = audio.duration;
  const unusable = !isFinite(total) || total > 36000 || total <= 0;
  if (unusable) {
    total = estimatedDuration || current + 60; // last-resort fallback
  }

  const pct = total > 0 ? (current / total) * 100 : 0;
  progressBar.style.width = `${Math.min(pct, 100)}%`;
  progressSlider.value = pct;
  timeDisplay.textContent = `${formatTime(current)} / ${formatTime(total)}`;
}
|
| 117 |
+
|
| 118 |
+
/**
 * Handle seeking via the progress slider.
 * Uses the estimated duration when the element's duration is unusable.
 * @param {Event} e - "input" event from the range slider (value 0-100).
 */
function handleSeek(e) {
  let total = audio.duration;
  if (!isFinite(total) || total > 36000) {
    total = estimatedDuration || 60;
  }
  const fraction = parseFloat(e.target.value) / 100;
  audio.currentTime = fraction * total;
  updatePlayerProgress();
}
|
| 130 |
+
|
| 131 |
+
/**
 * Toggle play/pause for the custom player.
 * Play rejections (e.g. autoplay policy) are swallowed on purpose.
 */
function togglePlayerPlay() {
  if (!audio.paused) {
    audio.pause();
    return;
  }
  audio.play().catch(() => {});
}
|
| 141 |
+
|
| 142 |
+
/**
 * Update the custom player's play/pause button icon to match the
 * current playback state.
 */
function updatePlayButton() {
  const iconEl = playerPlayBtn.querySelector("i");
  iconEl.className = audio.paused ? "fa-solid fa-play" : "fa-solid fa-pause";
}
|
| 153 |
+
|
| 154 |
+
/**
 * Toggle mute on the audio element and swap the volume icon accordingly.
 */
function toggleMute() {
  isMuted = !isMuted;
  audio.muted = isMuted;
  volumeBtn.querySelector("i").className = isMuted
    ? "fa-solid fa-volume-xmark"
    : "fa-solid fa-volume-high";
}
|
| 163 |
+
|
| 164 |
+
/**
 * Update device info display from SSE data
 * @param {Object} info - Device info object with device, device_name,
 *   memory_used_gb, memory_total_gb, memory_percent and batch_size fields.
 */
function updateDeviceInfo(info) {
  // GPU gets a chip icon; anything else is shown as a generic server (CPU)
  const icon = info.device === "cuda" ? "fa-microchip" : "fa-server";
  const memoryInfo = info.device === "cuda"
    ? `${info.memory_used_gb}GB / ${info.memory_total_gb}GB (${info.memory_percent}%)`
    : "CPU mode";
  deviceInfo.innerHTML = `
    <i class="fa-solid ${icon}"></i>
    <span>${info.device_name}</span>
    <span class="device-memory">${memoryInfo}</span>
    <span class="device-batch">Batch: ${info.batch_size}</span>
  `;
  deviceInfo.classList.add("visible");
}
|
| 181 |
+
|
| 182 |
+
/**
 * Initialize device info SSE stream
 * Subscribes to /api/device-info-stream and forwards each JSON payload to
 * updateDeviceInfo. On stream error the connection is closed and a fresh
 * subscription is attempted after 5 seconds.
 */
function initDeviceInfoStream() {
  const eventSource = new EventSource("/api/device-info-stream");

  eventSource.onmessage = (event) => {
    try {
      const info = JSON.parse(event.data);
      updateDeviceInfo(info);
    } catch {
      // Silently fail - device info is optional
    }
  };

  eventSource.onerror = () => {
    // On error, close and try to reconnect after a delay
    eventSource.close();
    setTimeout(initDeviceInfoStream, 5000);
  };
}
|
| 203 |
+
|
| 204 |
+
// Start device info SSE stream
initDeviceInfoStream();

// Custom player event listeners
playerPlayBtn.addEventListener("click", togglePlayerPlay);
progressSlider.addEventListener("input", handleSeek);
volumeBtn.addEventListener("click", toggleMute);
// Keep the play/pause icon in sync no matter how playback was toggled
audio.addEventListener("play", updatePlayButton);
audio.addEventListener("pause", updatePlayButton);
audio.addEventListener("timeupdate", updatePlayerProgress);
audio.addEventListener("ended", () => {
  updatePlayButton();
  progressBar.style.width = "100%";
});
// Show pause button when audio actually starts playing
audio.addEventListener("playing", () => {
  pauseBtn.classList.remove("hidden");
});
|
| 222 |
+
|
| 223 |
+
/**
 * Fetch audio blob from the server for download capability
 * Stores the blob in currentAudioBlob and reveals the download button on
 * success; failures are logged and otherwise ignored (download is optional).
 * @param {string} jobId - The job ID for the audio
 */
async function fetchAudioBlob(jobId) {
  try {
    const response = await fetch(`/api/audio/${jobId}`);
    if (response.ok) {
      currentAudioBlob = await response.blob();
      // Show download button
      downloadBtn.classList.remove("hidden");
    }
  } catch (error) {
    console.error("Failed to fetch audio for download:", error);
  }
}
|
| 239 |
+
|
| 240 |
+
/**
 * Download the current audio as a WAV file.
 * No-op when no audio blob has been fetched yet. The filename is derived
 * from the current document name with its extension swapped for ".wav".
 */
function downloadAudio() {
  if (!currentAudioBlob) {
    return;
  }

  const objectUrl = URL.createObjectURL(currentAudioBlob);
  // Strip any existing extension from the document name, then append .wav
  const baseName = (currentDocName || "audio").replace(/\.[^.]+$/, "");

  const link = document.createElement("a");
  link.href = objectUrl;
  link.download = `${baseName}.wav`;

  // The anchor must be in the DOM for click() to trigger a download
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  URL.revokeObjectURL(objectUrl);
}
|
| 263 |
+
|
| 264 |
+
/**
 * Get the currently selected language
 * @returns {string} The selected language name (module state; defaults to
 *   "english" - presumably updated by the language-button handlers defined
 *   elsewhere in this file).
 */
function getSelectedLanguage() {
  return selectedLanguage;
}
|
| 271 |
+
|
| 272 |
+
/**
 * Show the input section and hide the processing section.
 */
function showInputSection() {
  processingSection.classList.remove("visible");
  inputSection.classList.remove("hidden");
}
|
| 279 |
+
|
| 280 |
+
/**
 * Show the processing section and hide the input section.
 * Also resets the processing progress bar and hides the pause button
 * (it reappears once audio actually starts playing).
 */
function showProcessingSection() {
  // Reset processing UI state before revealing the panel
  processingProgressBar.style.width = "0%";
  pauseBtn.classList.add("hidden");

  inputSection.classList.add("hidden");
  processingSection.classList.add("visible");
}
|
| 290 |
+
|
| 291 |
+
/**
 * Show a status message to the user.
 * @param {string} message - HTML message to display.
 * @param {string} type - Status type: 'loading', 'error', or 'success'.
 */
function showStatus(message, type) {
  status.innerHTML = message;
  status.className = ["status", "visible", type].join(" ");
}
|
| 300 |
+
|
| 301 |
+
/**
 * Stop the current generation and audio playback
 * Aborts the in-flight request, tears down the audio element, resets all
 * player state/UI, and returns the user to the input section.
 */
function stopGeneration() {
  // Stop the fetch request
  if (currentAbortController) {
    currentAbortController.abort();
    currentAbortController = null;
  }

  // Stop audio playback and clear source
  audio.pause();
  audio.currentTime = 0;
  audio.src = "";
  audio.load(); // Force release of audio resources

  // Reset pause state
  isPaused = false;
  updatePauseButton();

  // Hide download button and pause button
  downloadBtn.classList.add("hidden");
  pauseBtn.classList.add("hidden");
  currentAudioBlob = null; // drop the blob so a stale download is impossible

  // Reset progress bar
  processingProgressBar.style.width = "0%";

  showStatus('<i class="fa-solid fa-ban"></i> Generation stopped', "error");
  showInputSection();
}
|
| 332 |
+
|
| 333 |
+
// Stop audio when page is closed or navigated away
window.addEventListener("beforeunload", () => {
  audio.pause();
  audio.src = "";
});

// Also handle page hide (works better on mobile and for navigation)
window.addEventListener("pagehide", () => {
  audio.pause();
  audio.src = "";
});
|
| 344 |
+
|
| 345 |
+
/**
 * Toggle pause/play state for the streaming playback controls.
 * Keeps the module-level isPaused flag and the pause button in sync.
 */
function togglePause() {
  const resuming = audio.paused;
  if (resuming) {
    audio.play().catch(() => {});
  } else {
    audio.pause();
  }
  isPaused = !resuming;
  updatePauseButton();
}
|
| 358 |
+
|
| 359 |
+
/**
 * Update the pause button's icon and tooltip based on playback state.
 */
function updatePauseButton() {
  const iconEl = pauseBtn.querySelector("i");
  const showResume = isPaused || audio.paused;
  iconEl.className = showResume ? "fa-solid fa-play" : "fa-solid fa-pause";
  pauseBtn.title = showResume ? "Resume" : "Pause";
}
|
| 372 |
+
|
| 373 |
+
/**
 * Format remaining time for display.
 * Values above one minute are rounded up to whole minutes; otherwise
 * whole seconds are shown.
 * @param {number} seconds - Remaining time in seconds.
 * @returns {string} Human-readable remaining-time string.
 */
function formatTimeRemaining(seconds) {
  const overAMinute = seconds > 60;
  const amount = overAMinute ? Math.ceil(seconds / 60) : Math.ceil(seconds);
  return overAMinute ? `~${amount} min remaining` : `~${amount}s remaining`;
}
|
| 384 |
+
|
| 385 |
+
/**
 * Process SSE stream for progress updates.
 * Sets up the audio stream as soon as a job_id is received.
 *
 * @param {Response} response - Fetch response carrying the SSE stream
 * @param {string} docName - Document name for display
 * @returns {Promise<void>}
 * @throws {Error} If the stream contains an error event or fails
 */
async function processStream(response, docName) {
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let lastStatus = "";
  let jobId = null;
  let audioStarted = false;
  // BUG FIX: carry a partial line across network chunk boundaries. SSE
  // events are newline-delimited, but a fetch chunk can end mid-line;
  // previously such a split "data: {...}" payload failed JSON.parse and
  // the event was silently dropped.
  let pending = "";

  // Reset the module-level duration estimate for this job.
  estimatedDuration = 0;

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        break;
      }

      pending += decoder.decode(value, { stream: true });
      const lines = pending.split("\n");
      // The last element is either "" (chunk ended on a newline) or an
      // incomplete line — keep it for the next read.
      pending = lines.pop();

      for (const line of lines) {
        if (!line.startsWith("data: ")) {
          continue;
        }
        let data;
        try {
          data = JSON.parse(line.slice(6));
        } catch {
          // Malformed event payload — skip it rather than abort the stream.
          continue;
        }

        if (data.type === "error") {
          throw new Error(data.message || "TTS generation failed");
        } else if (data.type === "start" && data.job_id) {
          // Got job ID — start the audio stream immediately.
          jobId = data.job_id;
          // Capture the initial duration estimate when provided.
          if (data.estimated_remaining) {
            estimatedDuration = data.estimated_remaining;
          }
          // Display document info (page/char counts etc.).
          updateDocInfo(data);
          if (!audioStarted) {
            audioStarted = true;
            // Point the audio element at the stream endpoint; the browser
            // starts playing as data arrives.
            audio.src = `/api/audio/${jobId}`;
            audio.load();
            // Try to play (may need user interaction the first time).
            audio.play().catch(() => {
              // Autoplay blocked — will play when the user clicks.
            });
            updatePlayButton();
            // Pause button will be shown by the 'playing' event listener.
          }
          const timeStr = formatTimeRemaining(data.estimated_remaining);
          showStatus(
            `<span class="spinner"></span>ETA ${timeStr}`,
            "loading"
          );
          // Nudge the progress bar off zero so the user sees motion.
          processingProgressBar.style.width = "5%";
        } else if (data.type === "progress") {
          lastStatus = data.status;
          const timeStr = formatTimeRemaining(data.estimated_remaining);
          showStatus(
            `<span class="spinner"></span>${data.percent}% • ETA ${timeStr}`,
            "loading"
          );
          processingProgressBar.style.width = `${data.percent}%`;
        } else if (data.type === "complete") {
          // Generation complete — show the player.
          if (data.total_time) {
            // Rough extension of the duration estimate; the real duration
            // comes from the audio element once metadata loads.
            estimatedDuration = Math.max(estimatedDuration, audio.currentTime + 10);
          }
          filename.textContent = docName;
          currentDocName = docName;
          player.classList.add("visible");
          processingProgressBar.style.width = "100%";
          showInputSection();
          showStatus(
            `<i class="fa-solid fa-circle-check"></i> Done in ${data.total_time}s`,
            "success"
          );
          updatePlayerProgress();

          // Fetch the finished audio blob so the download button works.
          if (jobId) {
            fetchAudioBlob(jobId);
          }
        }
      }
    }
  } catch (streamError) {
    // Re-throw with more context and preserve the original cause.
    const context = lastStatus ? ` (during: ${lastStatus})` : "";
    throw new Error(`Stream error${context}: ${streamError.message}`, { cause: streamError });
  }
}
|
| 499 |
+
|
| 500 |
+
/**
 * Handle a file upload and kick off TTS conversion.
 * @param {File} file - The uploaded file
 */
async function handleFile(file) {
  // Reject anything that is not a PDF, by extension.
  const isPdf = file.name.toLowerCase().endsWith(".pdf");
  if (!isPdf) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please select a PDF file', "error");
    return;
  }

  // Mirror the server-side size limit on the client.
  if (file.size > MAX_FILE_SIZE) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> File too large. Maximum size is 50MB.', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Extracting text...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;

  const payload = new FormData();
  payload.append("file", file);
  payload.append("language", getSelectedLanguage());

  // Expose a controller so the stop button can cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-stream", {
      method: "POST",
      body: payload,
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || "Failed to process document");
    }

    // processStream drives both the SSE progress events and audio playback.
    await processStream(response, file.name);
  } catch (error) {
    // A user-initiated cancel is already handled by stopGeneration.
    if (error.name !== "AbortError") {
      showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
      showInputSection();
    }
  } finally {
    currentAbortController = null;
  }
}
|
| 555 |
+
|
| 556 |
+
/**
 * Handle a URL submission and kick off TTS conversion.
 * @param {string} url - The URL to process
 */
async function handleUrl(url) {
  url = url.trim();

  if (!url) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter a URL', "error");
    return;
  }

  // The URL constructor throws on malformed input — use it as a validator
  // and keep the parsed result for the display name below.
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter a valid URL', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Fetching content...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;
  urlSubmit.disabled = true;

  // Expose a controller so the stop button can cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-url-stream", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ url, language: getSelectedLanguage() }),
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || "Failed to process document");
    }

    // Derive a display name from the last path segment of the URL.
    const docName = parsed.pathname.split("/").pop() || "document";

    // processStream drives both the SSE progress events and audio playback.
    await processStream(response, docName);
  } catch (error) {
    // A user-initiated cancel is already handled by stopGeneration.
    if (error.name !== "AbortError") {
      showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
      showInputSection();
    }
  } finally {
    urlSubmit.disabled = false;
    currentAbortController = null;
  }
}
|
| 622 |
+
|
| 623 |
+
/**
 * Handle pasted-text submission and kick off TTS conversion.
 * @param {string} text - The text to process
 */
async function handleText(text) {
  text = text.trim();

  if (!text) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Please enter some text', "error");
    return;
  }

  if (text.length > 500000) {
    showStatus('<i class="fa-solid fa-triangle-exclamation"></i> Text too long (max 500,000 characters)', "error");
    return;
  }

  showProcessingSection();
  showStatus('<span class="spinner"></span> Processing text...', "loading");
  player.classList.remove("visible");
  downloadBtn.classList.add("hidden");
  currentAudioBlob = null;
  textSubmit.disabled = true;

  // Expose a controller so the stop button can cancel this request.
  currentAbortController = new AbortController();

  try {
    const response = await fetch("/api/read-text-stream", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ text, language: getSelectedLanguage() }),
      signal: currentAbortController.signal,
    });

    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || "Failed to process text");
    }

    // processStream drives both the SSE progress events and audio playback.
    await processStream(response, "Pasted Text");
  } catch (error) {
    // A user-initiated cancel is already handled by stopGeneration.
    if (error.name !== "AbortError") {
      showStatus(`<i class="fa-solid fa-circle-exclamation"></i> ${error.message}`, "error");
      showInputSection();
    }
  } finally {
    textSubmit.disabled = false;
    currentAbortController = null;
  }
}
|
| 682 |
+
|
| 683 |
+
// ---------------------------------------------------------------------------
// Top-level event wiring. Runs once at script load; all referenced elements
// are looked up earlier in this file.
// ---------------------------------------------------------------------------

// Tab switching: exactly one tab button and its matching "<id>-tab" panel
// carry the "active" class at a time.
tabs.forEach((tab) => {
  tab.addEventListener("click", () => {
    tabs.forEach((t) => t.classList.remove("active"));
    tabContents.forEach((tc) => tc.classList.remove("active"));
    tab.classList.add("active");
    document.getElementById(`${tab.dataset.tab}-tab`).classList.add("active");
  });
});

// Drag and drop handlers: preventDefault on dragover is required for the
// drop event to fire; "dragover" class provides the visual highlight.
dropZone.addEventListener("dragover", (e) => {
  e.preventDefault();
  dropZone.classList.add("dragover");
});

dropZone.addEventListener("dragleave", () => {
  dropZone.classList.remove("dragover");
});

dropZone.addEventListener("drop", (e) => {
  e.preventDefault();
  dropZone.classList.remove("dragover");

  // Only the first dropped file is processed.
  const files = e.dataTransfer.files;
  if (files.length > 0) {
    handleFile(files[0]);
  }
});

// Click anywhere in the drop zone opens the file picker — except on the
// input itself or its label, which already trigger it (avoids double-open).
dropZone.addEventListener("click", (e) => {
  if (e.target !== fileInput && !e.target.classList.contains("file-label")) {
    fileInput.click();
  }
});

fileInput.addEventListener("change", () => {
  if (fileInput.files.length > 0) {
    handleFile(fileInput.files[0]);
  }
});

// URL submission via button click or Enter key.
// NOTE(review): "keypress" is deprecated in favor of "keydown"; it still
// fires for Enter in current browsers — confirm before changing.
urlSubmit.addEventListener("click", () => {
  handleUrl(urlInput.value);
});

urlInput.addEventListener("keypress", (e) => {
  if (e.key === "Enter") {
    handleUrl(urlInput.value);
  }
});

// Text submission via button click.
textSubmit.addEventListener("click", () => {
  handleText(textInput.value);
});

// Allow Ctrl+Enter (Cmd+Enter on macOS) to submit text, since plain Enter
// inserts a newline in the textarea.
textInput.addEventListener("keydown", (e) => {
  if (e.key === "Enter" && (e.ctrlKey || e.metaKey)) {
    handleText(textInput.value);
  }
});

// Stop button: aborts the in-flight generation request.
stopBtn.addEventListener("click", stopGeneration);

// Pause button: toggles playback of the streaming audio.
pauseBtn.addEventListener("click", togglePause);

// Download button: saves the fetched audio blob.
downloadBtn.addEventListener("click", downloadAudio);

// Keep the pause button's icon in sync with actual audio state, including
// changes not initiated through our buttons (e.g. media keys).
audio.addEventListener("play", updatePauseButton);
audio.addEventListener("pause", updatePauseButton);
audio.addEventListener("ended", () => {
  isPaused = false;
  updatePauseButton();
});

// Language selection: single-select toggle group; the chosen value is read
// later via selectedLanguage when submitting a job.
languageButtons.forEach((btn) => {
  btn.addEventListener("click", () => {
    languageButtons.forEach((b) => b.classList.remove("active"));
    btn.classList.add("active");
    selectedLanguage = btn.dataset.language;
  });
});
|
src/talking_snake/static/apple-touch-icon.png
ADDED
|
|
Git LFS Details
|
src/talking_snake/static/favicon.png
ADDED
|
|
Git LFS Details
|
src/talking_snake/static/icon-192.png
ADDED
|
|
Git LFS Details
|
src/talking_snake/static/icon-512.png
ADDED
|
|
Git LFS Details
|
src/talking_snake/static/index.html
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Talking Snake - PDF & Web to Speech</title>

    <!-- PWA / Mobile App Configuration -->
    <meta name="application-name" content="Talking Snake">
    <meta name="theme-color" content="#1a1a2e">
    <meta name="mobile-web-app-capable" content="yes">
    <link rel="manifest" href="/static/manifest.json">

    <!-- iOS PWA Configuration -->
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
    <meta name="apple-mobile-web-app-title" content="Talking Snake">
    <link rel="apple-touch-icon" href="/static/apple-touch-icon.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/static/apple-touch-icon.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/static/apple-touch-icon.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/static/apple-touch-icon.png">

    <!-- Standard favicon -->
    <link rel="icon" type="image/png" href="/static/favicon.png">
    <link rel="icon" type="image/png" sizes="192x192" href="/static/icon-192.png">
    <link rel="icon" type="image/png" sizes="512x512" href="/static/icon-512.png">

    <link rel="stylesheet" href="/static/styles.css">
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA==" crossorigin="anonymous" referrerpolicy="no-referrer">
    <!-- NOTE(review): htmx is loaded but no hx-* attributes appear in this
         document; all interactivity lives in /static/app.js — confirm htmx
         is actually needed before removing. -->
    <script src="https://unpkg.com/htmx.org@2.0.4"></script>
</head>
<body>
    <div class="main-content">
        <img src="/static/talking_snake.png" alt="Talking Snake" class="logo">
        <h1>Talking Snake</h1>
        <p class="subtitle">Transform PDFs & Web into Audio</p>

        <div class="container">
            <!-- Input section: language picker plus three input tabs.
                 app.js toggles this away while a job is processing. -->
            <div class="input-section" id="inputSection">
                <div class="options-row">
                    <div class="language-selector">
                        <span class="style-label">Language:</span>
                        <!-- data-language values are read by app.js on click -->
                        <div class="style-buttons" id="languageButtons">
                            <button class="style-btn lang-btn active" data-language="english" title="English">
                                🇬🇧
                            </button>
                            <button class="style-btn lang-btn" data-language="chinese" title="Chinese">
                                🇨🇳
                            </button>
                            <button class="style-btn lang-btn" data-language="japanese" title="Japanese">
                                🇯🇵
                            </button>
                            <button class="style-btn lang-btn" data-language="korean" title="Korean">
                                🇰🇷
                            </button>
                        </div>
                    </div>
                </div>

                <!-- Each data-tab value maps to the panel with id "<value>-tab" -->
                <div class="tabs">
                    <button class="tab active" data-tab="upload"><i class="fa-solid fa-upload"></i> Upload File</button>
                    <button class="tab" data-tab="url"><i class="fa-solid fa-link"></i> From URL</button>
                    <button class="tab" data-tab="text"><i class="fa-solid fa-keyboard"></i> Paste Text</button>
                </div>

                <div class="tab-content active" id="upload-tab">
                    <div class="drop-zone" id="dropZone">
                        <i class="fa-solid fa-file-pdf drop-icon"></i>
                        <p>Drag &amp; drop a PDF here</p>
                        <label class="file-label">
                            <i class="fa-solid fa-folder-open"></i> Choose File
                            <input type="file" id="fileInput" accept=".pdf">
                        </label>
                        <p class="hint">Supports PDF documents up to 50MB</p>
                    </div>
                </div>

                <div class="tab-content" id="url-tab">
                    <div class="url-form">
                        <input type="url" id="urlInput" placeholder="https://example.com/article or .pdf">
                        <button class="submit-btn" id="urlSubmit"><i class="fa-solid fa-microphone"></i> Read Content</button>
                        <p class="hint">Enter a link to a PDF or web page (articles, docs, blogs)</p>
                    </div>
                </div>

                <div class="tab-content" id="text-tab">
                    <div class="text-form">
                        <textarea id="textInput" placeholder="Paste or type your text here..." rows="6"></textarea>
                        <button class="submit-btn" id="textSubmit"><i class="fa-solid fa-microphone"></i> Read Text</button>
                        <p class="hint">Paste any text you want to hear read aloud</p>
                    </div>
                </div>
            </div>

            <!-- Processing section: status text, job progress bar and
                 pause/stop controls, shown by app.js during generation. -->
            <div class="processing-section" id="processingSection">
                <div class="processing-row-1">
                    <div class="doc-info" id="docInfo"></div>
                </div>
                <div class="processing-row-2">
                    <div class="status" id="status"></div>
                    <div class="processing-progress-container" id="processingProgressContainer">
                        <div class="processing-progress-bar" id="processingProgressBar"></div>
                    </div>
                    <div class="control-buttons">
                        <button class="control-btn pause-btn hidden" id="pauseBtn" title="Pause/Resume"><i class="fa-solid fa-pause"></i></button>
                        <button class="control-btn stop-btn" id="stopBtn" title="Stop generation"><i class="fa-solid fa-stop"></i></button>
                    </div>
                </div>
            </div>

            <div class="device-info" id="deviceInfo"></div>

            <!-- Custom audio player; the hidden <audio> element is driven
                 entirely by app.js (src is set to /api/audio/<job_id>). -->
            <div class="player" id="player">
                <div class="filename" id="filename"></div>
                <div class="custom-player">
                    <button class="player-btn play-btn" id="playerPlayBtn" title="Play/Pause">
                        <i class="fa-solid fa-play"></i>
                    </button>
                    <div class="progress-container" id="progressContainer">
                        <div class="progress-bar" id="progressBar"></div>
                        <input type="range" class="progress-slider" id="progressSlider" min="0" max="100" value="0">
                    </div>
                    <span class="time-display" id="timeDisplay">0:00 / 0:00</span>
                    <button class="player-btn volume-btn" id="volumeBtn" title="Mute/Unmute">
                        <i class="fa-solid fa-volume-high"></i>
                    </button>
                    <button class="player-btn download-btn hidden" id="downloadBtn" title="Download Audio">
                        <i class="fa-solid fa-download"></i>
                    </button>
                </div>
                <audio id="audio" preload="auto"></audio>
            </div>
        </div>
    </div>

    <footer>
        <p>Built with <i class="fa-solid fa-heart"></i> for listeners everywhere | <a href="https://github.com/LucaCappelletti94/talking-snake" target="_blank" rel="noopener noreferrer"><i class="fa-brands fa-github"></i> GitHub</a></p>
    </footer>

    <script src="/static/app.js"></script>
</body>
</html>
|
src/talking_snake/static/manifest.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Talking Snake",
|
| 3 |
+
"short_name": "Talking Snake",
|
| 4 |
+
"description": "Transform PDFs & Web into Audio",
|
| 5 |
+
"start_url": "/",
|
| 6 |
+
"display": "standalone",
|
| 7 |
+
"background_color": "#1a1a2e",
|
| 8 |
+
"theme_color": "#1a1a2e",
|
| 9 |
+
"orientation": "portrait-primary",
|
| 10 |
+
"icons": [
|
| 11 |
+
{
|
| 12 |
+
"src": "/static/favicon.png",
|
| 13 |
+
"sizes": "64x64",
|
| 14 |
+
"type": "image/png"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"src": "/static/icon-192.png",
|
| 18 |
+
"sizes": "192x192",
|
| 19 |
+
"type": "image/png",
|
| 20 |
+
"purpose": "any maskable"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"src": "/static/icon-512.png",
|
| 24 |
+
"sizes": "512x512",
|
| 25 |
+
"type": "image/png",
|
| 26 |
+
"purpose": "any maskable"
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"src": "/static/apple-touch-icon.png",
|
| 30 |
+
"sizes": "180x180",
|
| 31 |
+
"type": "image/png"
|
| 32 |
+
}
|
| 33 |
+
],
|
| 34 |
+
"categories": ["utilities", "productivity"],
|
| 35 |
+
"lang": "en"
|
| 36 |
+
}
|
src/talking_snake/static/sample.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:131aea479804ac10ad86674780fca80134775ef547e808339f66408eb90ffadb
|
| 3 |
+
size 291884
|
src/talking_snake/static/styles.css
ADDED
|
@@ -0,0 +1,848 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Talking Snake - Main Stylesheet
 * A warm, accessible color scheme inspired by the talking snake logo
 *
 * Fixes in this revision:
 * - Alpha colors use rgba() instead of the comma-syntax rgb() with a 4th
 *   argument, which only newer (CSS Color 4) browsers parse.
 * - Media queries use max-width instead of the newer range syntax
 *   (width <= N), which older browsers ignore entirely.
 */

:root {
    /* Warm, friendly palette inspired by the talking snake logo */
    --bg: #fff7e9; /* Warm cream background */
    --surface: #fff; /* Clean white cards */
    --primary: #d4763a; /* Warm orange - friendly & energetic */
    --primary-hover: #c06830; /* Darker orange for hover */
    --secondary: #5a8f5a; /* Soft green - snake accent */
    --text: #3d3425; /* Warm dark brown - easy on eyes */
    --text-muted: #7a6f5f; /* Muted brown */
    --border: #e5d9c8; /* Warm border */
    --success: #5a8f5a; /* Green for success states */
    --error: #c45a4a; /* Soft red for errors */
}

* {
    box-sizing: border-box;
}

body {
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, sans-serif;
    background: var(--bg);
    color: var(--text);
    min-height: 100vh;
    margin: 0;
    padding: 1.5rem;
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    line-height: 1.4;
}

.main-content {
    display: flex;
    flex-direction: column;
    align-items: center;
    flex: 1;
    justify-content: center;
    width: 100%;
}

h1 {
    font-size: 1.75rem;
    margin: 0 0 0.25rem;
    color: var(--primary);
}

.subtitle {
    color: var(--text-muted);
    margin: 0 0 1rem;
    font-size: 0.9rem;
}

.container {
    max-width: 500px;
    width: 100%;
}

/* Options Row - Style and Language selectors */
.options-row {
    display: flex;
    justify-content: center;
    gap: 1.5rem;
    margin-bottom: 1rem;
    flex-wrap: wrap;
}

/* Style Selector */
.style-selector,
.language-selector {
    display: flex;
    align-items: center;
    gap: 0.5rem;
    flex-wrap: wrap;
}

.style-label {
    font-size: 0.85rem;
    color: var(--text-muted);
}

.style-buttons {
    display: flex;
    gap: 0.35rem;
}

.style-btn {
    width: 38px;
    height: 38px;
    border: 1px solid var(--border);
    border-radius: 6px;
    background: var(--surface);
    color: var(--text-muted);
    cursor: pointer;
    font-size: 0.95rem;
    transition: all 0.15s ease;
    display: flex;
    align-items: center;
    justify-content: center;
}

/* Language buttons use emoji flags */
.style-btn.lang-btn {
    font-size: 1.2rem;
}

.style-btn:hover {
    border-color: var(--primary);
    color: var(--text);
}

.style-btn.active {
    /* --primary (#d4763a) at 15% alpha; rgba() for legacy-browser support */
    background: rgba(212, 118, 58, 0.15);
    border-color: var(--primary);
    color: var(--primary);
}

/* Input Section - hidden during processing */
.input-section.hidden {
    display: none;
}

/* Processing Section - two row layout */
.processing-section {
    display: none;
    flex-direction: column;
    gap: 0.75rem;
    padding: 1rem 1.25rem;
    background: var(--surface);
    border-radius: 10px;
    border: 1px solid var(--border);
    width: 100%;
}

.processing-section.visible {
    display: flex;
}

/* Row 1: Document info */
.processing-row-1 {
    display: flex;
    align-items: center;
    width: 100%;
}

/* Row 2: Status, progress, buttons */
.processing-row-2 {
    display: flex;
    align-items: center;
    gap: 0.75rem;
}

/* Document Info - fills first row */
.doc-info {
    display: flex;
    align-items: center;
    gap: 0.75rem;
    font-size: 0.85rem;
    color: var(--text);
    width: 100%;
    min-width: 0;
}

.doc-info:empty {
    display: none;
}

.doc-info .doc-name {
    font-weight: 600;
    display: flex;
    align-items: center;
    gap: 0.4rem;
    flex: 1;
    min-width: 0;
}

.doc-info .doc-name i {
    color: var(--primary);
    flex-shrink: 0;
}

.doc-info .doc-name-text {
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}

.doc-info .doc-pages,
.doc-info .doc-chars {
    color: var(--text-muted);
    font-size: 0.75rem;
    display: flex;
    align-items: center;
    gap: 0.25rem;
    white-space: nowrap;
    flex-shrink: 0;
}

.doc-info .doc-pages i,
.doc-info .doc-chars i {
    font-size: 0.7rem;
    opacity: 0.6;
}

/* Status in processing */
.processing-section .status {
    padding: 0;
    background: none;
    font-size: 0.8rem;
    white-space: nowrap;
    flex-shrink: 0;
}

/* Processing progress bar */
.processing-progress-container {
    flex: 1;
    height: 6px;
    background: var(--bg);
    border-radius: 3px;
    overflow: hidden;
    min-width: 60px;
}

.processing-progress-bar {
    height: 100%;
    background: linear-gradient(90deg, var(--primary) 0%, #c06030 100%);
    border-radius: 3px;
    width: 0%;
    transition: width 0.3s ease;
}

/* Control buttons row */
.control-buttons {
    display: flex;
    gap: 0.5rem;
    flex-shrink: 0;
}

.control-btn {
    width: 36px;
    height: 36px;
    padding: 0;
    color: white;
    border: none;
    border-radius: 8px;
    cursor: pointer;
    font-size: 0.9rem;
    transition: all 0.15s ease;
    display: flex;
    align-items: center;
    justify-content: center;
}

.control-btn.hidden {
    display: none;
}

.control-btn:hover {
    filter: brightness(1.1);
}

.pause-btn {
    background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
    background-size: 200% 200%;
    animation: gradient-idle 3s ease infinite;
}

.pause-btn:hover {
    animation: gradient-shift 0.8s ease infinite;
}

.stop-btn {
    background: linear-gradient(135deg, var(--error), #8b3a30, var(--error));
    background-size: 200% 200%;
    animation: gradient-idle 3s ease infinite;
}

.stop-btn:hover {
    animation: gradient-shift 0.8s ease infinite;
}

@keyframes gradient-idle {
    0%, 100% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
}

@keyframes gradient-shift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Drop Zone */
.drop-zone {
    border: 2px dashed var(--border);
    border-radius: 8px;
    padding: 1.5rem 1rem;
    text-align: center;
    transition: all 0.2s ease;
    cursor: pointer;
    background: var(--surface);
}

.drop-zone:hover,
.drop-zone.dragover {
    border-color: var(--primary);
    /* --primary at 8% alpha */
    background: rgba(212, 118, 58, 0.08);
}

.drop-zone p {
    margin: 0 0 0.75rem;
    font-size: 0.95rem;
}

.drop-zone .hint {
    color: var(--text-muted);
    font-size: 0.8rem;
}

.drop-icon {
    font-size: 2.5rem;
    color: var(--primary);
    margin-bottom: 0.75rem;
    display: block;
}

/* Tabs */
.tabs {
    display: flex;
    gap: 0.25rem;
    margin-bottom: 0.75rem;
}

.tab {
    flex: 1;
    padding: 0.5rem 0.75rem;
    background: var(--surface);
    border: 1px solid var(--border);
    border-radius: 6px;
    color: var(--text-muted);
    cursor: pointer;
    font-size: 0.85rem;
    transition: all 0.15s ease;
}

.tab:hover {
    border-color: var(--primary);
    color: var(--text);
}

.tab.active {
    /* --primary at 12% alpha */
    background: rgba(212, 118, 58, 0.12);
    border-color: var(--primary);
    color: var(--primary);
}

.tab-content {
    display: none;
}

.tab-content.active {
    display: block;
}

/* URL Form */
.url-form {
    background: var(--surface);
    border-radius: 8px;
    padding: 1rem;
}

.url-form input[type="url"] {
    width: 100%;
    padding: 0.6rem 0.75rem;
    background: var(--bg);
    border: 1px solid var(--border);
    border-radius: 6px;
    color: var(--text);
    font-size: 0.9rem;
    margin-bottom: 0.75rem;
    transition: border-color 0.15s ease;
}

.url-form input[type="url"]:focus {
    outline: none;
    border-color: var(--primary);
}

.url-form input[type="url"]::placeholder {
    color: var(--text-muted);
}

/* Text Form */
.text-form {
    background: var(--surface);
    border-radius: 8px;
    padding: 1rem;
}

.text-form textarea {
    width: 100%;
    padding: 0.6rem 0.75rem;
    background: var(--bg);
    border: 1px solid var(--border);
    border-radius: 6px;
    color: var(--text);
    font-size: 0.9rem;
    margin-bottom: 0.75rem;
    transition: border-color 0.15s ease;
    resize: vertical;
    min-height: 120px;
    font-family: inherit;
    line-height: 1.5;
}

.text-form textarea:focus {
    outline: none;
    border-color: var(--primary);
}

.text-form textarea::placeholder {
    color: var(--text-muted);
}

.text-form .hint {
    color: var(--text-muted);
    font-size: 0.8rem;
    text-align: center;
    margin: 0;
}

/* Buttons */
.submit-btn {
    width: 100%;
    padding: 0.6rem 1rem;
    background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
    background-size: 200% 200%;
    animation: gradient-idle 3s ease infinite;
    color: white;
    border: none;
    border-radius: 8px;
    cursor: pointer;
    font-size: 0.9rem;
    font-weight: 500;
    transition: filter 0.15s ease;
    margin-bottom: 0.5rem;
}

.submit-btn:hover {
    filter: brightness(1.1);
    animation: gradient-shift 0.8s ease infinite;
}

.submit-btn:disabled {
    opacity: 0.6;
    cursor: not-allowed;
    filter: none;
    animation: none;
}

.url-form .hint {
    color: var(--text-muted);
    font-size: 0.8rem;
    text-align: center;
    margin: 0;
}

input[type="file"] {
    display: none;
}

.file-label {
    display: inline-block;
    padding: 0.5rem 1rem;
    background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
    background-size: 200% 200%;
    animation: gradient-idle 3s ease infinite;
    color: white;
    border-radius: 8px;
    cursor: pointer;
    font-weight: 500;
    font-size: 0.9rem;
    transition: filter 0.15s ease;
}

.file-label:hover {
    filter: brightness(1.1);
    animation: gradient-shift 0.8s ease infinite;
}

/* Device Info - Subtle footer-like display */
.device-info {
    display: none;
    justify-content: center;
    align-items: center;
    gap: 1rem;
    padding: 0.75rem 1rem;
    font-size: 0.7rem;
    color: var(--text-muted);
    margin-top: 0.5rem;
    opacity: 0.7;
}

.device-info.visible {
    display: flex;
    flex-wrap: wrap;
}

.device-info i {
    color: var(--primary);
    opacity: 0.8;
}

.device-memory {
    opacity: 0.9;
}

.device-batch {
    background: var(--surface);
    padding: 0.2rem 0.5rem;
    border-radius: 4px;
    font-size: 0.65rem;
}

/* Icon spacing in buttons and tabs */
.tab i,
.submit-btn i,
.file-label i {
    margin-right: 0.4rem;
}

/* Status Messages */
.status {
    font-size: 0.85rem;
    display: none;
}

.status.visible {
    display: block;
}

.status i {
    margin-right: 0.4rem;
}

.status.loading {
    color: var(--text-muted);
}

.status.error {
    color: var(--error);
}

.status.success {
    color: var(--success);
}

/* Audio Player */
.player {
    margin-top: 1.5rem;
    width: 100%;
    display: none;
    padding: 1.25rem;
    background: var(--surface);
    border-radius: 12px;
    border: 1px solid var(--border);
}

.player.visible {
    display: block;
}

/* Hidden audio element */
#audio {
    display: none;
}

/* Custom Audio Player */
.custom-player {
    display: flex;
    align-items: center;
    gap: 0.75rem;
}

.player-btn {
    width: 36px;
    height: 36px;
    border: none;
    border-radius: 8px;
    background: linear-gradient(135deg, var(--primary), #c06030, var(--primary));
    background-size: 200% 200%;
    animation: gradient-idle 3s ease infinite;
    color: white;
    cursor: pointer;
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 0.85rem;
    transition: filter 0.15s ease;
    flex-shrink: 0;
}

.player-btn:hover {
    filter: brightness(1.1);
    animation: gradient-shift 0.8s ease infinite;
}

.player-btn.volume-btn {
    /* NOTE(review): #252540 is a dark navy against the warm cream --bg —
       looks like a leftover from a previous dark theme; confirm intent. */
    background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
    background-size: 200% 200%;
    animation: gradient-idle 3s ease infinite;
    color: var(--text-muted);
    width: 32px;
    height: 32px;
    font-size: 0.8rem;
}

.player-btn.volume-btn:hover {
    color: var(--text);
    animation: gradient-shift 0.8s ease infinite;
}

.player-btn.download-btn {
    /* NOTE(review): same dark-theme leftover as .volume-btn above. */
    background: linear-gradient(135deg, var(--bg), #252540, var(--bg));
    background-size: 200% 200%;
    animation: gradient-idle 3s ease infinite;
    color: var(--text-muted);
    width: 32px;
    height: 32px;
    font-size: 0.8rem;
}

.player-btn.download-btn:hover {
    color: var(--primary);
    animation: gradient-shift 0.8s ease infinite;
}

.progress-container {
    flex: 1;
    height: 6px;
    background: var(--bg);
    border-radius: 3px;
    position: relative;
    cursor: pointer;
}

.progress-bar {
    height: 100%;
    background: var(--primary);
    border-radius: 3px;
    width: 0%;
    transition: width 0.1s ease;
    pointer-events: none;
}

.progress-slider {
    position: absolute;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    opacity: 0;
    cursor: pointer;
    margin: 0;
    appearance: none;
}

.progress-slider::-webkit-slider-thumb {
    appearance: none;
    width: 14px;
    height: 14px;
    background: var(--primary);
    border-radius: 50%;
    cursor: pointer;
}

.progress-slider::-moz-range-thumb {
    width: 14px;
    height: 14px;
    background: var(--primary);
    border-radius: 50%;
    cursor: pointer;
    border: none;
}

.time-display {
    font-size: 0.75rem;
    color: var(--text-muted);
    min-width: 80px;
    text-align: center;
    font-variant-numeric: tabular-nums;
}

.filename {
    margin-bottom: 0.75rem;
    font-size: 0.85rem;
    font-weight: 500;
    color: var(--text);
    word-break: break-all;
}

/* Spinner Animation */
.spinner {
    display: inline-block;
    width: 14px;
    height: 14px;
    border: 2px solid var(--text-muted);
    border-top-color: var(--primary);
    border-radius: 50%;
    animation: spin 1s linear infinite;
    margin-right: 0.4rem;
    vertical-align: middle;
}

@keyframes spin {
    to {
        transform: rotate(360deg);
    }
}

/* Footer */
footer {
    margin-top: auto;
    padding-top: 1.5rem;
    color: var(--text-muted);
    font-size: 0.75rem;
    flex-shrink: 0;
}

footer a {
    color: var(--primary);
    text-decoration: none;
}

footer a:hover {
    text-decoration: underline;
}

footer i.fa-heart {
    color: var(--error);
}

footer i.fa-github {
    margin-right: 0.2rem;
}

/* Logo */
.logo {
    width: 250px;
    height: auto;
    margin-bottom: 0.75rem;
}

/* Tablet styles (max-width is equivalent to `width <= 768px`
   but also understood by pre-range-syntax browsers) */
@media (max-width: 768px) {
    body {
        padding: 1rem;
    }

    h1 {
        font-size: 1.5rem;
    }

    .logo {
        width: 200px;
    }

    .drop-zone {
        padding: 1.25rem 1rem;
    }

    .tabs {
        flex-direction: column;
    }

    .tab {
        width: 100%;
    }
}

/* Mobile styles */
@media (max-width: 480px) {
    body {
        padding: 0.75rem;
    }

    h1 {
        font-size: 1.35rem;
    }

    .subtitle {
        font-size: 0.8rem;
    }

    .logo {
        width: 160px;
    }

    .drop-zone {
        padding: 1rem;
    }

    .drop-zone p {
        font-size: 0.9rem;
    }

    .url-form {
        padding: 0.75rem;
    }

    .url-form input[type="url"] {
        padding: 0.5rem;
        font-size: 0.85rem;
    }

    .submit-btn,
    .file-label {
        padding: 0.5rem 0.75rem;
        font-size: 0.85rem;
    }

    .filename {
        font-size: 0.75rem;
        padding: 0.4rem 0.5rem;
    }

    footer {
        font-size: 0.7rem;
        text-align: center;
    }
}

/* Ensure touch targets are large enough */
@media (pointer: coarse) {
    .tab,
    .submit-btn,
    .file-label {
        min-height: 44px;
        display: flex;
        align-items: center;
        justify-content: center;
    }
}
|
src/talking_snake/static/talking_snake.png
ADDED
|
Git LFS Details
|
src/talking_snake/tts.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""TTS engine wrapper for Qwen3-TTS."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import io
|
| 6 |
+
import wave
|
| 7 |
+
from abc import ABC, abstractmethod
|
| 8 |
+
from collections.abc import Iterator
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
+
if TYPE_CHECKING:
|
| 12 |
+
import numpy as np
|
| 13 |
+
import numpy.typing as npt
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class TTSEngineProtocol(ABC):
    """Abstract interface for text-to-speech engines.

    Concrete engines (and test doubles) implement this so callers can be
    written against the interface rather than a specific model backend.
    """

    @abstractmethod
    def synthesize(self, text: str) -> Iterator[bytes]:
        """Convert ``text`` into audio.

        Args:
            text: The text to render as speech.

        Yields:
            Successive chunks of WAV-encoded audio data.
        """

    @property
    @abstractmethod
    def sample_rate(self) -> int:
        """Sample rate (Hz) of the audio this engine emits."""

    @property
    def batch_size(self) -> int:
        """How many chunks may be synthesized per call; engines that
        support parallelism override this. Defaults to serial (1)."""
        return 1
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Professional narration style prompt passed as the `instruct` argument to
# the TTS model: it steers delivery toward clear, authoritative,
# audiobook-style narration rather than casual speech.
PROFESSIONAL_STYLE: str = (
    "Read this as a professional narrator with clear enunciation, "
    "measured pacing, and an authoritative yet warm tone. "
    "Speak naturally as if presenting an audiobook or documentary. "
    "Avoid sounding robotic or monotone. Emphasize key points and maintain a steady rhythm. "
    "Use appropriate intonation to convey meaning and keep the listener engaged. "
    "This is not casual conversation, but a polished narration style. "
    "Use proper diction, read correctly acronyms, and pronounce all words clearly."
)

# Language name (lowercase) -> default voice used when the caller does not
# pick one explicitly. Voice names must match the model's speaker set.
LANGUAGE_VOICES: dict[str, str] = {
    "english": "Ryan",
    "chinese": "Vivian",
    "japanese": "Ono_Anna",
    "korean": "Sohee",
}

# Default chunk size (characters) for streaming synthesis.
# Larger chunks = more stable voice, fewer artifacts at boundaries.
# Smaller chunks = faster first audio but potential voice instability.
# 1200 chars provides good balance for natural speech flow.
DEFAULT_CHUNK_SIZE: int = 1200
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class QwenTTSEngine(TTSEngineProtocol):
|
| 71 |
+
"""TTS engine using Qwen3-TTS model."""
|
| 72 |
+
|
| 73 |
+
# Available voices for CustomVoice model:
|
| 74 |
+
# Chinese: Vivian, Serena, Uncle_Fu, Dylan (Beijing), Eric (Sichuan)
|
| 75 |
+
# English: Ryan, Aiden
|
| 76 |
+
# Japanese: Ono_Anna
|
| 77 |
+
# Korean: Sohee
|
| 78 |
+
AVAILABLE_VOICES = [
|
| 79 |
+
"Vivian",
|
| 80 |
+
"Serena",
|
| 81 |
+
"Uncle_Fu",
|
| 82 |
+
"Dylan",
|
| 83 |
+
"Eric",
|
| 84 |
+
"Ryan",
|
| 85 |
+
"Aiden",
|
| 86 |
+
"Ono_Anna",
|
| 87 |
+
"Sohee",
|
| 88 |
+
]
|
| 89 |
+
|
| 90 |
+
def __init__(
    self,
    voice: str | None = None,
    language: str = "english",
    device: str = "cuda",
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    model_name: str = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
) -> None:
    """Initialize the TTS engine and load the model onto ``device``.

    Args:
        voice: Voice name to use for synthesis. If None, uses default for language.
            Available voices:
            Chinese: Vivian, Serena, Uncle_Fu, Dylan, Eric
            English: Ryan, Aiden
            Japanese: Ono_Anna
            Korean: Sohee
        language: Language for TTS. One of: english, chinese, japanese, korean.
            Sets default voice if voice is None.
        device: Device to run the model on ('cuda' or 'cpu').
        chunk_size: Maximum characters per chunk (smaller = faster streaming start).
        model_name: HuggingFace model identifier.
    """
    # Heavy imports are kept local so importing this module stays cheap
    # when the engine is never instantiated.
    import logging
    import warnings

    import torch
    from qwen_tts import Qwen3TTSModel

    # Suppress the pad_token_id warning from transformers (noisy on every
    # generate() call; not actionable here).
    logging.getLogger("transformers.generation.utils").setLevel(logging.ERROR)
    warnings.filterwarnings("ignore", message=".*pad_token_id.*")

    self.language = language.lower()
    # Unknown languages fall back to the English voice "Ryan".
    self.voice = voice or LANGUAGE_VOICES.get(self.language, "Ryan")
    self.device = device
    self.chunk_size = chunk_size
    # NOTE(review): 24 kHz is assumed to be the model's output rate —
    # confirm against the model config / the `sr` returned by generation.
    self._sample_rate = 24000
    self._batch_size = 1  # Will be calculated after model loads

    # bfloat16 on GPU for speed/memory; full float32 on CPU.
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    # Try to use flash attention on CUDA; "eager" is the safe CPU path.
    attn_impl = "flash_attention_2" if device == "cuda" else "eager"

    try:
        self.model = Qwen3TTSModel.from_pretrained(
            model_name,
            device_map=device,
            dtype=dtype,
            attn_implementation=attn_impl,
        )
    except Exception:
        # Fallback without flash attention (e.g. flash-attn not installed
        # or unsupported GPU) — retry with the library default.
        self.model = Qwen3TTSModel.from_pretrained(
            model_name,
            device_map=device,
            dtype=dtype,
        )

    # Calculate optimal batch size based on available VRAM (after the
    # model is loaded, so its footprint is already accounted for).
    if device == "cuda":
        self._batch_size = self._calculate_batch_size()
        print(f" Batch size: {self._batch_size} (based on available VRAM)")
|
| 155 |
+
|
| 156 |
+
def _calculate_batch_size(self) -> int:
|
| 157 |
+
"""Calculate optimal batch size based on available GPU memory.
|
| 158 |
+
|
| 159 |
+
Returns:
|
| 160 |
+
Recommended batch size for parallel chunk processing.
|
| 161 |
+
"""
|
| 162 |
+
import torch
|
| 163 |
+
|
| 164 |
+
if not torch.cuda.is_available():
|
| 165 |
+
return 1
|
| 166 |
+
|
| 167 |
+
try:
|
| 168 |
+
# Get GPU memory info
|
| 169 |
+
gpu_mem = torch.cuda.get_device_properties(0).total_memory
|
| 170 |
+
allocated = torch.cuda.memory_allocated(0)
|
| 171 |
+
reserved = torch.cuda.memory_reserved(0)
|
| 172 |
+
|
| 173 |
+
# Available memory (conservative estimate)
|
| 174 |
+
available = gpu_mem - max(allocated, reserved)
|
| 175 |
+
|
| 176 |
+
# Model uses ~6GB, each batch item needs ~2-3GB for generation
|
| 177 |
+
# Use conservative 3GB per batch item estimate
|
| 178 |
+
mem_per_batch = 3 * 1024 * 1024 * 1024 # 3GB
|
| 179 |
+
|
| 180 |
+
# Calculate batch size, minimum 1, cap at 8
|
| 181 |
+
batch_size = max(1, min(8, int(available / mem_per_batch)))
|
| 182 |
+
|
| 183 |
+
return batch_size
|
| 184 |
+
except Exception:
|
| 185 |
+
return 1
|
| 186 |
+
|
| 187 |
+
@property
|
| 188 |
+
def sample_rate(self) -> int:
|
| 189 |
+
"""Return the sample rate of generated audio."""
|
| 190 |
+
return self._sample_rate
|
| 191 |
+
|
| 192 |
+
@property
|
| 193 |
+
def batch_size(self) -> int:
|
| 194 |
+
"""Return the current batch size."""
|
| 195 |
+
return self._batch_size
|
| 196 |
+
|
| 197 |
+
def synthesize(self, text: str) -> Iterator[bytes]:
    """Synthesize text to WAV audio using batched GPU inference.

    Args:
        text: Text to synthesize.

    Yields:
        WAV audio data chunks; the first chunk carries the WAV header,
        later chunks are raw PCM continuations of the same stream.
    """
    if not text.strip():
        return

    # Split the input so each model call stays within the chunk limit.
    pieces = self._split_text(text)
    step = self._batch_size
    header_pending = True  # only the first emitted chunk gets a WAV header

    # Process chunks in fixed-size groups for GPU efficiency.
    for start in range(0, len(pieces), step):
        group = [p for p in pieces[start : start + step] if p.strip()]
        if not group:
            continue

        single = len(group) == 1
        # Always go through the batched call path so GPU memory allocation
        # stays consistent; a one-item group is passed as scalars, larger
        # groups as parallel lists. Professional narration style gives a
        # clear, authoritative delivery.
        audios, sr = self.model.generate_custom_voice(
            text=group[0] if single else group,
            speaker=self.voice if single else [self.voice] * len(group),
            instruct=PROFESSIONAL_STYLE if single else [PROFESSIONAL_STYLE] * len(group),
            # Lower temperature keeps the voice stable and consistent.
            temperature=0.7,
            repetition_penalty=1.1,
        )

        # Normalize the scalar return shape so the yield loop is uniform,
        # then emit each audio chunk in order.
        for audio in ([audios] if single else audios):
            yield self._audio_to_wav(audio, sr, include_header=header_pending)
            header_pending = False
def _split_text(self, text: str, max_chars: int | None = None) -> list[str]:
|
| 251 |
+
"""Split text into chunks suitable for TTS.
|
| 252 |
+
|
| 253 |
+
Splits on sentence boundaries when possible.
|
| 254 |
+
|
| 255 |
+
Args:
|
| 256 |
+
text: Text to split.
|
| 257 |
+
max_chars: Maximum characters per chunk. Uses self.chunk_size if None.
|
| 258 |
+
|
| 259 |
+
Returns:
|
| 260 |
+
List of text chunks.
|
| 261 |
+
"""
|
| 262 |
+
import re
|
| 263 |
+
|
| 264 |
+
if max_chars is None:
|
| 265 |
+
max_chars = self.chunk_size
|
| 266 |
+
|
| 267 |
+
# Split on sentence boundaries
|
| 268 |
+
sentences = re.split(r"(?<=[.!?])\s+", text)
|
| 269 |
+
|
| 270 |
+
chunks: list[str] = []
|
| 271 |
+
current_chunk: list[str] = []
|
| 272 |
+
current_length = 0
|
| 273 |
+
|
| 274 |
+
for sentence in sentences:
|
| 275 |
+
sentence = sentence.strip()
|
| 276 |
+
if not sentence:
|
| 277 |
+
continue
|
| 278 |
+
|
| 279 |
+
if current_length + len(sentence) > max_chars and current_chunk:
|
| 280 |
+
chunks.append(" ".join(current_chunk))
|
| 281 |
+
current_chunk = []
|
| 282 |
+
current_length = 0
|
| 283 |
+
|
| 284 |
+
current_chunk.append(sentence)
|
| 285 |
+
current_length += len(sentence) + 1
|
| 286 |
+
|
| 287 |
+
if current_chunk:
|
| 288 |
+
chunks.append(" ".join(current_chunk))
|
| 289 |
+
|
| 290 |
+
return chunks
|
| 291 |
+
|
| 292 |
+
def _audio_to_wav(
|
| 293 |
+
self,
|
| 294 |
+
audio: npt.NDArray[np.float32] | list[float],
|
| 295 |
+
sample_rate: int,
|
| 296 |
+
include_header: bool = True,
|
| 297 |
+
) -> bytes:
|
| 298 |
+
"""Convert audio array to WAV bytes.
|
| 299 |
+
|
| 300 |
+
Args:
|
| 301 |
+
audio: Audio data as numpy array or list.
|
| 302 |
+
sample_rate: Sample rate of the audio.
|
| 303 |
+
include_header: Whether to include WAV header.
|
| 304 |
+
|
| 305 |
+
Returns:
|
| 306 |
+
WAV audio data as bytes.
|
| 307 |
+
"""
|
| 308 |
+
import numpy as np
|
| 309 |
+
|
| 310 |
+
# Convert to numpy array if needed
|
| 311 |
+
if isinstance(audio, list):
|
| 312 |
+
audio = np.array(audio, dtype=np.float32)
|
| 313 |
+
|
| 314 |
+
# Ensure audio is 1D
|
| 315 |
+
if audio.ndim > 1:
|
| 316 |
+
audio = audio.flatten()
|
| 317 |
+
|
| 318 |
+
# Normalize and convert to 16-bit PCM
|
| 319 |
+
audio = np.clip(audio, -1.0, 1.0)
|
| 320 |
+
audio_int16 = (audio * 32767).astype(np.int16)
|
| 321 |
+
|
| 322 |
+
if include_header:
|
| 323 |
+
# Write full WAV file
|
| 324 |
+
buffer = io.BytesIO()
|
| 325 |
+
with wave.open(buffer, "wb") as wav_file:
|
| 326 |
+
wav_file.setnchannels(1)
|
| 327 |
+
wav_file.setsampwidth(2) # 16-bit
|
| 328 |
+
wav_file.setframerate(sample_rate)
|
| 329 |
+
wav_file.writeframes(audio_int16.tobytes())
|
| 330 |
+
result: bytes = buffer.getvalue()
|
| 331 |
+
return result
|
| 332 |
+
else:
|
| 333 |
+
# Return raw PCM data
|
| 334 |
+
pcm_data: bytes = audio_int16.tobytes()
|
| 335 |
+
return pcm_data
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
class MockTTSEngine(TTSEngineProtocol):
    """Mock TTS engine for testing.

    Produces silent WAV audio whose duration scales with the word count
    of the input text, without loading any model.
    """

    def __init__(self, sample_rate: int = 24000) -> None:
        """Initialize the mock TTS engine.

        Args:
            sample_rate: Sample rate for generated audio.
        """
        self._sample_rate = sample_rate

    @property
    def sample_rate(self) -> int:
        """Sample rate of generated audio."""
        return self._sample_rate

    def synthesize(self, text: str) -> Iterator[bytes]:
        """Yield a single silent WAV clip sized to the input text.

        Args:
            text: Text to synthesize (used only to determine duration).

        Yields:
            One complete WAV payload of silence; nothing for blank text.
        """
        if not text.strip():
            return

        # ~0.1 seconds of silence per word, at least one word's worth.
        n_samples = int(self._sample_rate * 0.1 * max(1, len(text.split())))
        frames = b"\x00\x00" * n_samples  # 16-bit zero samples

        # Assemble a complete mono WAV file around the silence.
        out = io.BytesIO()
        with wave.open(out, "wb") as wav:
            wav.setnchannels(1)
            wav.setsampwidth(2)  # 16-bit
            wav.setframerate(self._sample_rate)
            wav.writeframes(frames)

        yield out.getvalue()
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|