Spaces:

PlotweaverModel
/

AudioBook

Running

App Files Files Community

PlotweaverModel committed 10 days ago

Commit

21a8f39

verified ·

1 Parent(s): 500a984

Update app

Browse files

Files changed (2) hide show

app.py +82 -4
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -25,6 +25,19 @@ import re
 import gradio as gr
 from openai import OpenAI
 # ──────────────────────────────────────────────
 # Configuration
 # ──────────────────────────────────────────────
@@ -348,6 +361,69 @@ def generate_silence(duration_sec: float, output_path: str):
     )
 # ──────────────────────────────────────────────
 # Main pipeline
 # ──────────────────────────────────────────────
@@ -364,8 +440,10 @@ def generate_audiobook(
     # ── Resolve text source ──
     if file_input is not None:
         try:
-            with open(file_input, "r", encoding="utf-8", errors="replace") as f:
-                text = f.read()
         except Exception as e:
             raise gr.Error(f"Failed to read file: {e}")
     elif text_input and text_input.strip():
@@ -563,8 +641,8 @@ with gr.Blocks(
             )
             file_input = gr.File(
-                label="Or Upload a Text File (.txt, .md)",
-                file_types=[".txt", ".md", ".text"],
                 type="filepath",
             )

 import gradio as gr
 from openai import OpenAI
+# Optional document parsers — installed via requirements.txt
+try:
+    import pypdf
+    HAS_PYPDF = True
+except ImportError:
+    HAS_PYPDF = False
+try:
+    import docx
+    HAS_DOCX = True
+except ImportError:
+    HAS_DOCX = False
 # ──────────────────────────────────────────────
 # Configuration
 # ──────────────────────────────────────────────
     )
+# ──────────────────────────────────────────────
+# Document text extraction
+# ──────────────────────────────────────────────
+def extract_text_from_pdf(filepath: str) -> str:
+    """Extract text from a PDF file using pypdf."""
+    if not HAS_PYPDF:
+        raise ImportError("pypdf is not installed. Cannot read PDF files.")
+    reader = pypdf.PdfReader(filepath)
+    pages = []
+    for page in reader.pages:
+        text = page.extract_text()
+        if text:
+            pages.append(text.strip())
+    return "\n\n".join(pages)
+def extract_text_from_docx(filepath: str) -> str:
+    """Extract text from a .docx file using python-docx."""
+    if not HAS_DOCX:
+        raise ImportError("python-docx is not installed. Cannot read Word files.")
+    doc = docx.Document(filepath)
+    paragraphs = []
+    for para in doc.paragraphs:
+        text = para.text.strip()
+        if text:
+            paragraphs.append(text)
+    return "\n\n".join(paragraphs)
+def extract_text_from_file(filepath: str) -> str:
+    """Extract text from a file based on its extension."""
+    ext = os.path.splitext(filepath)[1].lower()
+    if ext == ".pdf":
+        return extract_text_from_pdf(filepath)
+    elif ext in (".docx", ".doc"):
+        if ext == ".doc":
+            # .doc (old format) — try converting with LibreOffice if available
+            try:
+                tmp_dir = tempfile.mkdtemp()
+                subprocess.run(
+                    ["libreoffice", "--headless", "--convert-to", "docx",
+                     "--outdir", tmp_dir, filepath],
+                    capture_output=True, check=True, timeout=60,
+                )
+                docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
+                docx_path = os.path.join(tmp_dir, docx_name)
+                if os.path.exists(docx_path):
+                    text = extract_text_from_docx(docx_path)
+                    shutil.rmtree(tmp_dir, ignore_errors=True)
+                    return text
+            except Exception:
+                pass
+            raise gr.Error(
+                "Cannot read .doc files directly. Please save as .docx or .pdf and re-upload."
+            )
+        return extract_text_from_docx(filepath)
+    else:
+        # Plain text files (.txt, .md, etc.)
+        with open(filepath, "r", encoding="utf-8", errors="replace") as f:
+            return f.read()
 # ──────────────────────────────────────────────
 # Main pipeline
 # ──────────────────────────────────────────────
     # ── Resolve text source ──
     if file_input is not None:
         try:
+            progress(0.02, desc="Extracting text from document...")
+            text = extract_text_from_file(file_input)
+        except gr.Error:
+            raise
         except Exception as e:
             raise gr.Error(f"Failed to read file: {e}")
     elif text_input and text_input.strip():
             )
             file_input = gr.File(
+                label="Or Upload a Document (.txt, .md, .pdf, .docx)",
+                file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
                 type="filepath",
             )

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
 openai>=1.52.0
 gradio>=5.25.0
 audioop-lts; python_version >= "3.13"

 openai>=1.52.0
 gradio>=5.25.0
 audioop-lts; python_version >= "3.13"
+pypdf>=4.0.0
+python-docx>=1.1.0