Update app.py
Browse files
app.py
CHANGED
|
@@ -17,6 +17,7 @@ import gradio as gr
|
|
| 17 |
|
| 18 |
# ---- Parsers ----
|
| 19 |
from docx import Document
|
|
|
|
| 20 |
|
| 21 |
# ---- Embeddings ----
|
| 22 |
# We try sentence-transformers. If unavailable (or offline), we fall back to HashingVectorizer.
|
|
@@ -37,8 +38,53 @@ def _load_st_model():
|
|
| 37 |
except Exception as e:
|
| 38 |
return None
|
| 39 |
|
| 40 |
-
def
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
paras = [p.text.strip() for p in doc.paragraphs]
|
| 43 |
paras = [p for p in paras if p and not p.isspace()]
|
| 44 |
return paras
|
|
@@ -295,7 +341,7 @@ def process_pipeline(docx_file, units_mode, K, iters, beta, bins, tau, seed):
|
|
| 295 |
return gr.update(value="# Please upload a .docx file."), None, None, None, None
|
| 296 |
|
| 297 |
# Read file
|
| 298 |
-
paras =
|
| 299 |
units = paragraphs_to_units(paras, mode=units_mode)
|
| 300 |
|
| 301 |
if len(units) == 0:
|
|
@@ -380,7 +426,7 @@ with gr.Blocks(title="Constellation Harvest Regularization (CHR)") as demo:
|
|
| 380 |
|
| 381 |
with gr.Row():
|
| 382 |
with gr.Column(scale=1):
|
| 383 |
-
docx_file = gr.File(label=".docx document", file_types=[".docx"])
|
| 384 |
units_mode = gr.Radio(choices=["paragraphs", "sentences"], value="paragraphs", label="Unit granularity")
|
| 385 |
K = gr.Slider(2, 24, value=8, step=1, label="K (number of constellations)")
|
| 386 |
iters = gr.Slider(5, 100, value=30, step=1, label="Iterations")
|
|
|
|
| 17 |
|
| 18 |
# ---- Parsers ----
|
| 19 |
from docx import Document
|
| 20 |
+
import traceback
|
| 21 |
|
| 22 |
# ---- Embeddings ----
|
| 23 |
# We try sentence-transformers. If unavailable (or offline), we fall back to HashingVectorizer.
|
|
|
|
| 38 |
except Exception as e:
|
| 39 |
return None
|
| 40 |
|
| 41 |
+
def _resolve_file_input(file_obj):
    """Return ``(bytes_io, display_name)`` for a variety of Gradio/HF file input shapes.

    Shapes are tried in this order:

    1. ``dict`` with an on-disk ``'path'``/``'name'``, or raw bytes under ``'data'``.
    2. File-like object exposing ``read()`` (a ``name`` attribute is optional).
    3. Filesystem path given as ``str`` or ``os.PathLike``.
    4. Raw ``bytes`` / ``bytearray``.

    Returns:
        A ``(io.BytesIO, str)`` tuple holding the file content and a display
        name, or ``(None, "upload.docx")`` when the input matches no shape.

    Raises:
        OSError: if a path that looks like a regular file cannot be opened.
    """
    import io
    import os

    default_name = "upload.docx"

    def _read_path(path):
        # Load the whole file into memory so the caller never holds an open handle.
        with open(path, "rb") as f:
            return io.BytesIO(f.read()), os.path.basename(path) or default_name

    # 1) Dict shape (some Gradio environments)
    if isinstance(file_obj, dict):
        # Prefer an on-disk path if present. isfile() rather than exists():
        # a directory "exists" but open() on it would raise.
        for key in ("path", "name"):
            p = file_obj.get(key)
            if isinstance(p, str) and os.path.isfile(p):
                return _read_path(p)
        # Raw bytes in 'data'
        data = file_obj.get("data")
        if isinstance(data, (bytes, bytearray)):
            return io.BytesIO(bytes(data)), file_obj.get("orig_name", default_name)

    # 2) File-like object (tempfile wrapper, open handle, in-memory buffer)
    if hasattr(file_obj, "read"):
        raw_name = getattr(file_obj, "name", None)
        # ``name`` may be missing (BytesIO) or an int (fd-backed file); only a
        # non-empty string is usable as a path or display name.
        name_is_path = isinstance(raw_name, str) and bool(raw_name)
        display = (os.path.basename(raw_name) if name_is_path else "") or default_name
        try:
            if hasattr(file_obj, "seek"):
                file_obj.seek(0)
            content = file_obj.read()
        except Exception:
            # Deliberate best-effort: a closed or unseekable handle should not
            # abort resolution while the path fallback below may still work.
            content = None
        if isinstance(content, (bytes, bytearray)):
            return io.BytesIO(bytes(content)), display
        # Fallback: handle was closed or opened in text mode; reopen by path.
        if name_is_path and os.path.isfile(raw_name):
            return _read_path(raw_name)

    # 3) Path string / path-like object
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
        if isinstance(path, bytes):
            path = os.fsdecode(path)
        if os.path.isfile(path):
            return _read_path(path)

    # 4) Raw bytes
    if isinstance(file_obj, (bytes, bytearray)):
        return io.BytesIO(bytes(file_obj)), default_name

    # Unknown shape
    return None, default_name
|
| 82 |
+
|
| 83 |
+
def read_docx_any(file_obj) -> List[str]:
    """Read an uploaded .docx and return its non-empty paragraph texts.

    The upload is first normalized to an in-memory stream via
    ``_resolve_file_input`` and then parsed with ``Document``.

    Returns:
        The stripped text of each paragraph, with empty paragraphs dropped.

    Raises:
        ValueError: if the upload cannot be resolved to a byte stream.
    """
    stream, _display_name = _resolve_file_input(file_obj)
    if stream is None:
        raise ValueError("Could not read uploaded .docx file; unsupported input shape.")

    # Strip each paragraph first, then keep only those with visible content.
    stripped = (para.text.strip() for para in Document(stream).paragraphs)
    return [text for text in stripped if text and not text.isspace()]
|
|
|
|
| 341 |
return gr.update(value="# Please upload a .docx file."), None, None, None, None
|
| 342 |
|
| 343 |
# Read file
|
| 344 |
+
paras = read_docx_any(docx_file)
|
| 345 |
units = paragraphs_to_units(paras, mode=units_mode)
|
| 346 |
|
| 347 |
if len(units) == 0:
|
|
|
|
| 426 |
|
| 427 |
with gr.Row():
|
| 428 |
with gr.Column(scale=1):
|
| 429 |
+
docx_file = gr.File(label=".docx document", file_types=[".docx"], file_count="single")
|
| 430 |
units_mode = gr.Radio(choices=["paragraphs", "sentences"], value="paragraphs", label="Unit granularity")
|
| 431 |
K = gr.Slider(2, 24, value=8, step=1, label="K (number of constellations)")
|
| 432 |
iters = gr.Slider(5, 100, value=30, step=1, label="Iterations")
|