Spaces:
Running
Running
Update app
Browse files- app.py +82 -4
- requirements.txt +2 -0
app.py
CHANGED
|
@@ -25,6 +25,19 @@ import re
|
|
| 25 |
import gradio as gr
|
| 26 |
from openai import OpenAI
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 29 |
# Configuration
|
| 30 |
# ββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -348,6 +361,69 @@ def generate_silence(duration_sec: float, output_path: str):
|
|
| 348 |
)
|
| 349 |
|
| 350 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
# ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 352 |
# Main pipeline
|
| 353 |
# ββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -364,8 +440,10 @@ def generate_audiobook(
|
|
| 364 |
# ββ Resolve text source ββ
|
| 365 |
if file_input is not None:
|
| 366 |
try:
|
| 367 |
-
|
| 368 |
-
|
|
|
|
|
|
|
| 369 |
except Exception as e:
|
| 370 |
raise gr.Error(f"Failed to read file: {e}")
|
| 371 |
elif text_input and text_input.strip():
|
|
@@ -563,8 +641,8 @@ with gr.Blocks(
|
|
| 563 |
)
|
| 564 |
|
| 565 |
file_input = gr.File(
|
| 566 |
-
label="Or Upload a
|
| 567 |
-
file_types=[".txt", ".md", ".text"],
|
| 568 |
type="filepath",
|
| 569 |
)
|
| 570 |
|
|
|
|
| 25 |
import gradio as gr
|
| 26 |
from openai import OpenAI
|
| 27 |
|
| 28 |
+
# Optional document parsers — installed via requirements.txt
|
| 29 |
+
try:
|
| 30 |
+
import pypdf
|
| 31 |
+
HAS_PYPDF = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
HAS_PYPDF = False
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
import docx
|
| 37 |
+
HAS_DOCX = True
|
| 38 |
+
except ImportError:
|
| 39 |
+
HAS_DOCX = False
|
| 40 |
+
|
| 41 |
# ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
# Configuration
|
| 43 |
# ββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 361 |
)
|
| 362 |
|
| 363 |
|
| 364 |
+
# ──────────────────────────────────────────────
|
| 365 |
+
# Document text extraction
|
| 366 |
+
# ──────────────────────────────────────────────
|
| 367 |
+
def extract_text_from_pdf(filepath: str) -> str:
    """Extract the text of every page of a PDF file using pypdf.

    Each page's text is stripped of surrounding whitespace; pages that
    yield no text after stripping are skipped so they do not contribute
    empty entries to the result.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The non-empty page texts joined by a blank line.

    Raises:
        ImportError: If pypdf could not be imported at module load time.
    """
    if not HAS_PYPDF:
        raise ImportError("pypdf is not installed. Cannot read PDF files.")
    reader = pypdf.PdfReader(filepath)
    pages = []
    for page in reader.pages:
        # Strip *before* the emptiness check so a whitespace-only page is
        # dropped instead of being appended as "" (same order of
        # operations as extract_text_from_docx). ``or ""`` guards against
        # an extractor that yields None.
        text = (page.extract_text() or "").strip()
        if text:
            pages.append(text)
    return "\n\n".join(pages)
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def extract_text_from_docx(filepath: str) -> str:
    """Return the text of a .docx file, one blank line between paragraphs.

    Paragraph text is stripped of surrounding whitespace and paragraphs
    that are empty after stripping are left out.

    Args:
        filepath: Path of the .docx file to read.

    Returns:
        The non-empty paragraph texts joined by a blank line.

    Raises:
        ImportError: If python-docx could not be imported at module load
            time.
    """
    if not HAS_DOCX:
        raise ImportError("python-docx is not installed. Cannot read Word files.")
    document = docx.Document(filepath)
    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
    return "\n\n".join(chunk for chunk in stripped if chunk)
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
def extract_text_from_file(filepath: str) -> str:
    """Extract text from a file, choosing a parser by its extension.

    ``.pdf`` is handled by ``extract_text_from_pdf`` and ``.docx`` by
    ``extract_text_from_docx``. A legacy ``.doc`` file is first converted
    to ``.docx`` with a headless LibreOffice run; if that does not produce
    a readable document, a ``gr.Error`` asks the user to re-save the file.
    Any other extension is read as UTF-8 text, with undecodable bytes
    replaced rather than raising.

    Args:
        filepath: Path of the file to read.

    Returns:
        The extracted text.

    Raises:
        gr.Error: If a ``.doc`` file could not be converted and read.
    """
    ext = os.path.splitext(filepath)[1].lower()

    if ext == ".pdf":
        return extract_text_from_pdf(filepath)

    if ext == ".docx":
        return extract_text_from_docx(filepath)

    if ext == ".doc":
        # .doc (old format) — try converting with LibreOffice if available
        tmp_dir = None
        try:
            tmp_dir = tempfile.mkdtemp()
            subprocess.run(
                ["libreoffice", "--headless", "--convert-to", "docx",
                 "--outdir", tmp_dir, filepath],
                capture_output=True, check=True, timeout=60,
            )
            docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
            docx_path = os.path.join(tmp_dir, docx_name)
            if os.path.exists(docx_path):
                return extract_text_from_docx(docx_path)
        except Exception:
            # Deliberate best-effort: a missing LibreOffice binary, a
            # failed or timed-out conversion, or an unreadable result all
            # fall through to the user-facing error below.
            pass
        finally:
            # Always remove the scratch directory, not only on success;
            # otherwise every failed conversion leaks a temp directory.
            if tmp_dir is not None:
                shutil.rmtree(tmp_dir, ignore_errors=True)
        raise gr.Error(
            "Cannot read .doc files directly. Please save as .docx or .pdf and re-upload."
        )

    # Plain text files (.txt, .md, etc.)
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        return f.read()
|
| 425 |
+
|
| 426 |
+
|
| 427 |
# ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 428 |
# Main pipeline
|
| 429 |
# ββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 440 |
# ββ Resolve text source ββ
|
| 441 |
if file_input is not None:
|
| 442 |
try:
|
| 443 |
+
progress(0.02, desc="Extracting text from document...")
|
| 444 |
+
text = extract_text_from_file(file_input)
|
| 445 |
+
except gr.Error:
|
| 446 |
+
raise
|
| 447 |
except Exception as e:
|
| 448 |
raise gr.Error(f"Failed to read file: {e}")
|
| 449 |
elif text_input and text_input.strip():
|
|
|
|
| 641 |
)
|
| 642 |
|
| 643 |
file_input = gr.File(
|
| 644 |
+
label="Or Upload a Document (.txt, .md, .pdf, .docx)",
|
| 645 |
+
file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
|
| 646 |
type="filepath",
|
| 647 |
)
|
| 648 |
|
requirements.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
openai>=1.52.0
|
| 2 |
gradio>=5.25.0
|
| 3 |
audioop-lts; python_version >= "3.13"
|
|
|
|
|
|
|
|
|
| 1 |
openai>=1.52.0
|
| 2 |
gradio>=5.25.0
|
| 3 |
audioop-lts; python_version >= "3.13"
|
| 4 |
+
pypdf>=4.0.0
|
| 5 |
+
python-docx>=1.1.0
|