PlotweaverModel committed on
Commit
21a8f39
·
verified ·
1 Parent(s): 500a984

Update app

Browse files
Files changed (2) hide show
  1. app.py +82 -4
  2. requirements.txt +2 -0
app.py CHANGED
@@ -25,6 +25,19 @@ import re
25
  import gradio as gr
26
  from openai import OpenAI
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # ──────────────────────────────────────────────
29
  # Configuration
30
  # ──────────────────────────────────────────────
@@ -348,6 +361,69 @@ def generate_silence(duration_sec: float, output_path: str):
348
  )
349
 
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  # ──────────────────────────────────────────────
352
  # Main pipeline
353
  # ──────────────────────────────────────────────
@@ -364,8 +440,10 @@ def generate_audiobook(
364
  # ── Resolve text source ──
365
  if file_input is not None:
366
  try:
367
- with open(file_input, "r", encoding="utf-8", errors="replace") as f:
368
- text = f.read()
 
 
369
  except Exception as e:
370
  raise gr.Error(f"Failed to read file: {e}")
371
  elif text_input and text_input.strip():
@@ -563,8 +641,8 @@ with gr.Blocks(
563
  )
564
 
565
  file_input = gr.File(
566
- label="Or Upload a Text File (.txt, .md)",
567
- file_types=[".txt", ".md", ".text"],
568
  type="filepath",
569
  )
570
 
 
25
  import gradio as gr
26
  from openai import OpenAI
27
 
28
+ # Optional document parsers — installed via requirements.txt
29
+ try:
30
+ import pypdf
31
+ HAS_PYPDF = True
32
+ except ImportError:
33
+ HAS_PYPDF = False
34
+
35
+ try:
36
+ import docx
37
+ HAS_DOCX = True
38
+ except ImportError:
39
+ HAS_DOCX = False
40
+
41
  # ──────────────────────────────────────────────
42
  # Configuration
43
  # ──────────────────────────────────────────────
 
361
  )
362
 
363
 
364
+ # ──────────────────────────────────────────────
365
+ # Document text extraction
366
+ # ──────────────────────────────────────────────
367
def extract_text_from_pdf(filepath: str) -> str:
    """Extract text from a PDF file using pypdf.

    Args:
        filepath: Path to the PDF file to read.

    Returns:
        The text of every page that yields non-blank text, each stripped of
        surrounding whitespace and joined with a blank line between pages.

    Raises:
        ImportError: If pypdf could not be imported at module load time.
    """
    if not HAS_PYPDF:
        raise ImportError("pypdf is not installed. Cannot read PDF files.")

    reader = pypdf.PdfReader(filepath)
    # Strip *before* testing for emptiness: a page whose extracted text is
    # whitespace only would otherwise contribute an empty segment and leave
    # runs of extra blank lines in the joined output.
    stripped_pages = ((page.extract_text() or "").strip() for page in reader.pages)
    return "\n\n".join(text for text in stripped_pages if text)
378
+
379
+
380
def extract_text_from_docx(filepath: str) -> str:
    """Return the non-empty paragraphs of a .docx file as one string.

    Each paragraph's text is stripped of surrounding whitespace, blank
    paragraphs are dropped, and the rest are joined with a blank line.

    Raises:
        ImportError: If python-docx could not be imported at module load time.
    """
    if not HAS_DOCX:
        raise ImportError("python-docx is not installed. Cannot read Word files.")

    document = docx.Document(filepath)
    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
    return "\n\n".join(chunk for chunk in stripped if chunk)
391
+
392
+
393
def extract_text_from_file(filepath: str) -> str:
    """Extract text from a file, choosing a parser from its extension.

    The extension is compared case-insensitively:

    * ``.pdf``  -> ``extract_text_from_pdf``
    * ``.docx`` -> ``extract_text_from_docx``
    * ``.doc``  -> converted to .docx with LibreOffice (best effort), then
      read with ``extract_text_from_docx``
    * anything else is read as UTF-8 text, with undecodable bytes replaced.

    Args:
        filepath: Path to the file to read.

    Returns:
        The extracted text.

    Raises:
        gr.Error: If a ``.doc`` file cannot be converted and read.
    """
    ext = os.path.splitext(filepath)[1].lower()

    if ext == ".pdf":
        return extract_text_from_pdf(filepath)

    if ext == ".docx":
        return extract_text_from_docx(filepath)

    if ext == ".doc":
        # .doc (old format) — try converting with LibreOffice if available.
        try:
            tmp_dir = tempfile.mkdtemp()
            try:
                subprocess.run(
                    ["libreoffice", "--headless", "--convert-to", "docx",
                     "--outdir", tmp_dir, filepath],
                    capture_output=True, check=True, timeout=60,
                )
                docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
                docx_path = os.path.join(tmp_dir, docx_name)
                if os.path.exists(docx_path):
                    return extract_text_from_docx(docx_path)
            finally:
                # Remove the scratch directory on every exit path, including
                # a failed or timed-out conversion, so it does not leak.
                shutil.rmtree(tmp_dir, ignore_errors=True)
        except Exception:
            # Deliberate best effort: a missing LibreOffice binary, a failed
            # conversion or an unreadable result all fall through to the
            # user-facing error below.
            pass
        raise gr.Error(
            "Cannot read .doc files directly. Please save as .docx or .pdf and re-upload."
        )

    # Plain text files (.txt, .md, etc.)
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        return f.read()
425
+
426
+
427
  # ──────────────────────────────────────────────
428
  # Main pipeline
429
  # ──────────────────────────────────────────────
 
440
  # ── Resolve text source ──
441
  if file_input is not None:
442
  try:
443
+ progress(0.02, desc="Extracting text from document...")
444
+ text = extract_text_from_file(file_input)
445
+ except gr.Error:
446
+ raise
447
  except Exception as e:
448
  raise gr.Error(f"Failed to read file: {e}")
449
  elif text_input and text_input.strip():
 
641
  )
642
 
643
  file_input = gr.File(
644
+ label="Or Upload a Document (.txt, .md, .pdf, .docx)",
645
+ file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
646
  type="filepath",
647
  )
648
 
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  openai>=1.52.0
2
  gradio>=5.25.0
3
  audioop-lts; python_version >= "3.13"
 
 
 
1
  openai>=1.52.0
2
  gradio>=5.25.0
3
  audioop-lts; python_version >= "3.13"
4
+ pypdf>=4.0.0
5
+ python-docx>=1.1.0