Update app.py
app.py
CHANGED
|
@@ -1,10 +1,16 @@
  # app.py
  """
- Gradio app
-
-
-
-
  gradio>=3.0
  PyMuPDF
  pytesseract
|
|
@@ -12,11 +18,11 @@ Requirements (add to requirements.txt for HF Space or local venv):
  openai>=1.0.0
  jsonschema

- System packages
  tesseract-ocr
  poppler-utils

- Put OPENAI_API_KEY into
  """

  import os
|
|
@@ -31,20 +37,18 @@ from PIL import Image
  import fitz  # PyMuPDF
  import pytesseract
  from jsonschema import validate as json_validate, ValidationError
-
- # new OpenAI client surface
  from openai import OpenAI

  # -----------------------
- # Config
  # -----------------------
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
  if not OPENAI_API_KEY:
-     raise RuntimeError("OPENAI_API_KEY not found in environment. Add to HF Space Secrets or env var.")

  client = OpenAI(api_key=OPENAI_API_KEY)

- LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
  MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))

  # -----------------------
|
|
@@ -53,17 +57,8 @@ MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
  METADATA_SCHEMA = {
      "type": "object",
      "required": [
-         "doc_id",
-         "title",
-         "summary",
-         "doc_type",
-         "source",
-         "tags",
-         "tag_confidences",
-         "taxonomy_path",
-         "extracted_entities",
-         "raw_url",
-         "ingest_timestamp",
      ],
      "properties": {
          "doc_id": {"type": "string"},
|
|
@@ -82,33 +77,38 @@ METADATA_SCHEMA = {
  }

  # -----------------------
- #
  # -----------------------
- def extract_text_from_pdf(path: str) -> str:
      try:
          doc = fitz.open(path)
      except Exception as e:
          raise RuntimeError(f"Failed to open PDF: {e}")
-
      texts: List[str] = []
      for i in range(len(doc)):
          page = doc.load_page(i)
          txt = page.get_text("text").strip()
          if txt:
              texts.append(txt)
          else:
-
              pix = page.get_pixmap(dpi=200)
              with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                  pix.save(tmp.name)
                  ocr_text = pytesseract.image_to_string(Image.open(tmp.name))
              texts.append(ocr_text)
      return "\n\n".join(texts).strip()


- def extract_text_from_image(path: str) -> str:
      img = Image.open(path).convert("RGB")
-


  def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
|
|
@@ -127,17 +127,10 @@ def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
      return chunks

  # -----------------------
- #
  # -----------------------
- def save_uploaded_to_tmp(file_obj):
-     ""
-     Accepts common Gradio upload types:
-     - file-like (has .read())
-     - dict-like {"name": ..., "data": b'...'}
-     - path string
-     - objects with .name attribute pointing to a path (NamedString)
-     Returns (tmp_path, original_filename)
-     """
      # file-like
      if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
          try:
|
|
@@ -148,13 +141,13 @@ def save_uploaded_to_tmp(file_obj):
|
|
| 148 |
suffix = os.path.splitext(name)[1] or ""
|
| 149 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 150 |
tmp.write(content)
|
|
|
|
| 151 |
return tmp.name, os.path.basename(name)
|
| 152 |
-
except Exception:
|
| 153 |
-
|
| 154 |
-
|
| 155 |
# dict-like
|
| 156 |
-
if isinstance(file_obj, dict):
|
| 157 |
-
|
| 158 |
data = file_obj["data"]
|
| 159 |
if isinstance(data, str):
|
| 160 |
data = data.encode("utf-8")
|
|
@@ -162,11 +155,14 @@ def save_uploaded_to_tmp(file_obj):
|
|
| 162 |
suffix = os.path.splitext(name)[1] or ""
|
| 163 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 164 |
tmp.write(data)
|
|
|
|
| 165 |
return tmp.name, os.path.basename(name)
|
| 166 |
-
|
|
|
|
| 167 |
# path string
|
| 168 |
if isinstance(file_obj, str):
|
| 169 |
if os.path.exists(file_obj):
|
|
|
|
| 170 |
return file_obj, os.path.basename(file_obj)
|
| 171 |
try:
|
| 172 |
with open(file_obj, "rb") as f:
|
|
@@ -174,11 +170,11 @@ def save_uploaded_to_tmp(file_obj):
|
|
| 174 |
suffix = os.path.splitext(file_obj)[1] or ""
|
| 175 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 176 |
tmp.write(data)
|
|
|
|
| 177 |
return tmp.name, os.path.basename(file_obj)
|
| 178 |
-
except Exception:
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
# object with .name attribute referencing existing path
|
| 182 |
name = getattr(file_obj, "name", None)
|
| 183 |
if name and isinstance(name, str):
|
| 184 |
try:
|
|
@@ -187,21 +183,16 @@ def save_uploaded_to_tmp(file_obj):
|
|
| 187 |
suffix = os.path.splitext(name)[1] or ""
|
| 188 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 189 |
tmp.write(data)
|
|
|
|
| 190 |
return tmp.name, os.path.basename(name)
|
| 191 |
-
except Exception:
|
| 192 |
-
|
| 193 |
-
|
| 194 |
raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. repr: {repr(file_obj)[:400]}")
|
| 195 |
|
| 196 |
-
|
| 197 |
# -----------------------
|
| 198 |
-
# JSON extraction & validation
|
| 199 |
# -----------------------
|
| 200 |
def extract_json_from_text(text: str) -> str:
|
| 201 |
-
"""
|
| 202 |
-
Prefer explicit markers <<BEGIN_JSON>> ... <<END_JSON>>.
|
| 203 |
-
Otherwise try to get the last {...} block, then first {...} block.
|
| 204 |
-
"""
|
| 205 |
m = re.search(r"<<BEGIN_JSON>>(.*?)<<END_JSON>>", text, re.DOTALL)
|
| 206 |
if m:
|
| 207 |
return m.group(1).strip()
|
|
@@ -215,79 +206,42 @@ def extract_json_from_text(text: str) -> str:
|
|
| 215 |
|
| 216 |
|
| 217 |
def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
|
| 218 |
-
"""
|
| 219 |
-
Returns (ok, parsed_dict_or_none, error_message_or_empty)
|
| 220 |
-
"""
|
| 221 |
try:
|
| 222 |
parsed = json.loads(json_text)
|
| 223 |
except Exception as e:
|
| 224 |
return False, None, f"json.loads error: {e}"
|
| 225 |
-
|
| 226 |
try:
|
| 227 |
json_validate(parsed, METADATA_SCHEMA)
|
| 228 |
except ValidationError as e:
|
| 229 |
return False, parsed, f"schema validation error: {e}"
|
| 230 |
except Exception as e:
|
| 231 |
-
# other validation errors
|
| 232 |
return False, parsed, f"schema validation unexpected error: {e}"
|
| 233 |
-
|
| 234 |
return True, parsed, ""
|
| 235 |
|
| 236 |
-
|
| 237 |
# -----------------------
|
| 238 |
-
# LLM
|
| 239 |
# -----------------------
|
| 240 |
-
def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], max_attempts: int =
|
| 241 |
-
""
|
| 242 |
-
Robust LLM call:
|
| 243 |
-
- uses system message to enforce JSON-only output between markers
|
| 244 |
-
- retries up to max_attempts
|
| 245 |
-
- if model returns partial/invalid JSON, asks model to repair it
|
| 246 |
-
- validates the JSON against METADATA_SCHEMA
|
| 247 |
-
Returns:
|
| 248 |
-
- valid metadata dict OR dict with keys like _parsing_error/raw_output for UI consumption
|
| 249 |
-
"""
|
| 250 |
system_msg = (
|
| 251 |
"You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
|
| 252 |
-
"
|
| 253 |
-
"Wrap the JSON in explicit markers: <<BEGIN_JSON>> and <<END_JSON>>. "
|
| 254 |
-
"Do not include any commentary, explanation, or text outside those markers."
|
| 255 |
-
)
|
| 256 |
-
|
| 257 |
-
prompt_intro = (
|
| 258 |
-
f"Document title: {title}\n\n"
|
| 259 |
-
f"Short document text (first ~1000 chars): {short_text}\n\n"
|
| 260 |
-
"Top content chunks (short):\n"
|
| 261 |
)
|
| 262 |
-
|
| 263 |
prompt_chunks = ""
|
| 264 |
for i, c in enumerate(top_chunks[:6]):
|
| 265 |
chunk_text_clean = c[:800].replace("\n", " ")
|
| 266 |
prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
|
| 267 |
-
|
| 268 |
prompt_end = (
|
| 269 |
-
"Task: Produce a
|
| 270 |
-
"
|
| 271 |
-
"
|
| 272 |
-
"Guidelines:\n"
|
| 273 |
-
"- summary: 1-2 sentences.\n"
|
| 274 |
-
"- doc_type: short enum-like string (e.g., architecture_comparison).\n"
|
| 275 |
-
"- tags: up to 8 short tags like arch:docai.\n"
|
| 276 |
-
"- tag_confidences: floats 0-1 for each tag.\n"
|
| 277 |
-
"- taxonomy_path: hierarchical list.\n\n"
|
| 278 |
-
"Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>.\n"
|
| 279 |
)
|
| 280 |
-
|
| 281 |
-
user_prompt = prompt_intro + prompt_chunks + prompt_end
|
| 282 |
-
|
| 283 |
-
messages = [
|
| 284 |
-
{"role": "system", "content": system_msg},
|
| 285 |
-
{"role": "user", "content": user_prompt},
|
| 286 |
-
]
|
| 287 |
-
|
| 288 |
last_raw = None
|
| 289 |
|
| 290 |
for attempt in range(1, max_attempts + 1):
|
|
|
|
| 291 |
try:
|
| 292 |
resp = client.chat.completions.create(
|
| 293 |
model=LLM_MODEL,
|
|
@@ -295,147 +249,79 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], m
|
|
| 295 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 296 |
)
|
| 297 |
except Exception as e:
|
| 298 |
-
|
|
|
|
| 299 |
|
| 300 |
-
# extract text
|
| 301 |
try:
|
| 302 |
-
|
| 303 |
except Exception:
|
| 304 |
try:
|
| 305 |
-
|
| 306 |
except Exception:
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
|
| 311 |
-
#
|
| 312 |
-
json_text = extract_json_from_text(
|
| 313 |
if not json_text:
|
| 314 |
-
|
| 315 |
if attempt < max_attempts:
|
| 316 |
-
fix_prompt = (
|
| 317 |
-
"The previous response did not include a JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> markers, "
|
| 318 |
-
"or returned invalid JSON. Here is the raw output:\n\n"
|
| 319 |
-
f"{text}\n\n"
|
| 320 |
-
"Please return ONLY a valid JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>>. "
|
| 321 |
-
"Do not include anything else."
|
| 322 |
-
)
|
| 323 |
messages = [
|
| 324 |
{"role": "system", "content": system_msg},
|
| 325 |
-
{"role": "user", "content":
|
| 326 |
]
|
| 327 |
continue
|
| 328 |
else:
|
| 329 |
-
return {"_parsing_error": True, "raw_output": last_raw, "
|
| 330 |
|
| 331 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 332 |
if ok:
|
| 333 |
-
|
|
|
|
|
|
|
| 334 |
else:
|
| 335 |
-
|
| 336 |
if attempt < max_attempts:
|
| 337 |
-
repair_prompt = (
|
| 338 |
-
"The JSON you returned is invalid or does not meet the schema. Here is the JSON you returned:\n\n"
|
| 339 |
-
f"{json_text}\n\n"
|
| 340 |
-
"Please return ONLY a corrected JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> that includes the required keys: "
|
| 341 |
-
"doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
|
| 342 |
-
"If you must guess missing fields, use reasonable defaults (empty string or empty list/map)."
|
| 343 |
-
)
|
| 344 |
messages = [
|
| 345 |
{"role": "system", "content": system_msg},
|
| 346 |
-
{"role": "user", "content":
|
| 347 |
]
|
| 348 |
continue
|
| 349 |
else:
|
| 350 |
-
return {
|
| 351 |
-
"_parsing_error": True,
|
| 352 |
-
"raw_output": last_raw,
|
| 353 |
-
"parsed_partial": parsed_or_partial,
|
| 354 |
-
"parse_error": parse_err,
|
| 355 |
-
}
|
| 356 |
|
| 357 |
-
return {"_parsing_error": True, "raw_output": last_raw
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
#
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
# extract text
|
| 370 |
-
try:
|
| 371 |
-
if orig_name.lower().endswith(".pdf"):
|
| 372 |
-
extracted_text = extract_text_from_pdf(tmp_path)
|
| 373 |
else:
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
return {"error": f"Text extraction failed: {e}"}
|
| 377 |
-
|
| 378 |
-
if not extracted_text:
|
| 379 |
-
return {"error": "No text found in document after extraction."}
|
| 380 |
-
|
| 381 |
-
chunks = chunk_text(extracted_text)
|
| 382 |
-
sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
|
| 383 |
-
top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
|
| 384 |
-
|
| 385 |
-
short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
|
| 386 |
-
|
| 387 |
-
metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks, max_attempts=3)
|
| 388 |
-
|
| 389 |
-
# If API error
|
| 390 |
-
if metadata.get("_api_error"):
|
| 391 |
-
return {"error": metadata.get("error")}
|
| 392 |
-
|
| 393 |
-
# If parsing/validation error, include raw_output so UI can show & repair
|
| 394 |
-
if metadata.get("_parsing_error"):
|
| 395 |
-
return {
|
| 396 |
-
"error": "LLM output parsing failed. See raw_output.",
|
| 397 |
-
"raw_output": metadata.get("raw_output"),
|
| 398 |
-
"parsed_partial": metadata.get("parsed_partial"),
|
| 399 |
-
"parse_error": metadata.get("parse_error"),
|
| 400 |
-
}
|
| 401 |
-
|
| 402 |
-
# Ensure minimal keys and timestamp
|
| 403 |
-
now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
|
| 404 |
-
metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
|
| 405 |
-
metadata.setdefault("title", orig_name)
|
| 406 |
-
metadata.setdefault("source", "user_upload")
|
| 407 |
-
metadata.setdefault("raw_url", "")
|
| 408 |
-
metadata.setdefault("ingest_timestamp", now)
|
| 409 |
-
|
| 410 |
-
return metadata
|
| 411 |
|
| 412 |
-
|
| 413 |
-
# -----------------------
|
| 414 |
-
# Repair-only function (user-triggered) - repair raw_output into valid JSON
|
| 415 |
-
# -----------------------
|
| 416 |
-
def repair_raw_output(raw_output: str, max_attempts: int = 2) -> Dict[str, Any]:
|
| 417 |
-
"""
|
| 418 |
-
Send the raw output back to the model and ask for corrected JSON between markers.
|
| 419 |
-
This function is useful if the initial parsing failed and you want a manual 'Repair' button in UI.
|
| 420 |
-
"""
|
| 421 |
system_msg = (
|
| 422 |
-
"You are an
|
| 423 |
-
"
|
| 424 |
-
"<<BEGIN_JSON>> and <<END_JSON>>. Do NOT include any other text."
|
| 425 |
)
|
| 426 |
-
|
| 427 |
repair_prompt = (
|
| 428 |
-
"Here is the raw output
|
| 429 |
-
|
| 430 |
-
"
|
| 431 |
-
"Ensure the object contains keys: doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
|
| 432 |
-
"If a field is missing, use a reasonable default (empty string, empty list, or empty map)."
|
| 433 |
)
|
| 434 |
-
|
| 435 |
-
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": repair_prompt}]
|
| 436 |
-
|
| 437 |
last_raw = None
|
| 438 |
for attempt in range(1, max_attempts + 1):
|
|
|
|
| 439 |
try:
|
| 440 |
resp = client.chat.completions.create(
|
| 441 |
model=LLM_MODEL,
|
|
@@ -443,116 +329,268 @@ def repair_raw_output(raw_output: str, max_attempts: int = 2) -> Dict[str, Any]:
|
|
| 443 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 444 |
)
|
| 445 |
except Exception as e:
|
| 446 |
-
|
| 447 |
-
|
| 448 |
try:
|
| 449 |
-
|
| 450 |
except Exception:
|
| 451 |
try:
|
| 452 |
-
|
| 453 |
except Exception:
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
json_text = extract_json_from_text(
|
| 458 |
if not json_text:
|
|
|
|
| 459 |
if attempt < max_attempts:
|
| 460 |
-
messages = [
|
| 461 |
-
{"role": "system", "content": system_msg},
|
| 462 |
-
{"role": "user", "content": "Your previous reply did not include a JSON block. Please return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
|
| 463 |
-
]
|
| 464 |
continue
|
| 465 |
else:
|
| 466 |
-
return {"_parsing_error": True, "raw_output": last_raw, "
|
| 467 |
-
|
| 468 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 469 |
if ok:
|
| 470 |
-
|
|
|
|
| 471 |
else:
|
|
|
|
| 472 |
if attempt < max_attempts:
|
| 473 |
-
messages = [
|
| 474 |
-
{"role": "system", "content": system_msg},
|
| 475 |
-
{"role": "user", "content": "The JSON you returned is invalid. Please correct and return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
|
| 476 |
-
]
|
| 477 |
continue
|
| 478 |
else:
|
| 479 |
-
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err}
|
|
|
|
| 480 |
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
# -----------------------
|
| 484 |
# Gradio UI
|
| 485 |
# -----------------------
|
| 486 |
-
with gr.Blocks(title="DocClassify —
|
| 487 |
-
gr.Markdown("## 📂 Upload
|
| 488 |
with gr.Row():
|
| 489 |
with gr.Column(scale=1):
|
| 490 |
uploader = gr.File(label="Upload PDF / Image", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff"])
|
| 491 |
run_button = gr.Button("Process document")
|
| 492 |
status = gr.Textbox(label="Status", value="", interactive=False)
|
| 493 |
download_button = gr.File(label="Download metadata JSON", visible=False)
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
| 495 |
with gr.Column(scale=1):
|
| 496 |
-
output_json = gr.JSON(label="
|
| 497 |
-
raw_output_box = gr.Textbox(label="
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
|
|
|
| 507 |
try:
|
| 508 |
result = process_file(file_obj)
|
| 509 |
except Exception as e:
|
| 510 |
-
return
|
| 511 |
-
|
|
|
|
|
|
|
| 512 |
if result.get("error"):
|
| 513 |
-
#
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
if
|
| 518 |
-
|
| 519 |
-
#
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
# success
|
|
|
|
| 523 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 524 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 525 |
-
json.dump(
|
| 526 |
-
|
| 527 |
-
return
|
| 528 |
-
|
| 529 |
-
def
|
| 530 |
-
if not
|
| 531 |
-
return {}, "No
|
| 532 |
-
try
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
if repaired.get("_api_error"):
|
| 538 |
-
return {}, f"Repair API error: {repaired.get('error')}", None
|
| 539 |
-
|
| 540 |
if repaired.get("_parsing_error"):
|
| 541 |
-
|
| 542 |
-
display
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
# success -> create download file
|
| 546 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 547 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 548 |
-
json.dump(
|
| 549 |
-
|
| 550 |
-
|
|
|
|
|
|
|
| 551 |
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
|
| 556 |
-
# launch
|
| 557 |
if __name__ == "__main__":
|
| 558 |
demo.launch()
|
|
|
|
  1 |   # app.py
  2 |   """
  3 | + Final Gradio app — robust document tagging + automated taxonomy via GPT-5 (OpenAI new client).
  4 | + Features:
  5 | + - Upload PDF or Image
  6 | + - Extract text (PyMuPDF + Tesseract fallback)
  7 | + - Chunk text, call GPT-5 to produce JSON metadata between markers <<BEGIN_JSON>><<END_JSON>>
  8 | + - Validate JSON with jsonschema
  9 | + - Automatic repair attempts + manual-repair (paste raw output)
 10 | + - Detailed step-by-step logs displayed on the UI and full GPT response shown
 11 | + - Download metadata JSON on success
 12 | +
 13 | + Requirements (requirements.txt):
 14 |   gradio>=3.0
 15 |   PyMuPDF
 16 |   pytesseract
 18 |   openai>=1.0.0
 19 |   jsonschema
 20 |
 21 | + System packages (apt-packages for HF Spaces):
 22 |   tesseract-ocr
 23 |   poppler-utils
 24 |
 25 | + Put OPENAI_API_KEY into HF Space Secrets or environment.
 26 |   """
 27 |
 28 |   import os
 37 |   import fitz  # PyMuPDF
 38 |   import pytesseract
 39 |   from jsonschema import validate as json_validate, ValidationError
 40 |   from openai import OpenAI
 41 |
 42 |   # -----------------------
 43 | + # Config & OpenAI client
 44 |   # -----------------------
 45 |   OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 46 |   if not OPENAI_API_KEY:
 47 | +     raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to HF Space Secrets or env var.")
 48 |
 49 |   client = OpenAI(api_key=OPENAI_API_KEY)
 50 |
 51 | + LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")  # change if needed
 52 |   MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
 53 |
 54 |   # -----------------------
 57 |   METADATA_SCHEMA = {
 58 |       "type": "object",
 59 |       "required": [
 60 | +         "doc_id", "title", "summary", "doc_type", "source", "tags",
 61 | +         "tag_confidences", "taxonomy_path", "extracted_entities", "raw_url", "ingest_timestamp"
 62 |       ],
 63 |       "properties": {
 64 |           "doc_id": {"type": "string"},
 77 |   }
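For reference, a minimal object that carries every key in the "required" list above might look like the sketch below. The values are made up, and the property types other than doc_id are assumed from how the prompts in this file describe them (tags as a list of strings, tag_confidences as a tag-to-float map, taxonomy_path as a list, extracted_entities as a map), since the full "properties" block is collapsed in the diff.

```python
# Illustrative only: a metadata dict with every required key from METADATA_SCHEMA.
# Values are made up; types beyond doc_id are assumptions drawn from the prompt text.
import datetime
from jsonschema import validate

sample_metadata = {
    "doc_id": "example_doc",
    "title": "example_doc.pdf",
    "summary": "One or two sentences describing the document.",
    "doc_type": "architecture_comparison",
    "source": "user_upload",
    "tags": ["arch:docai"],
    "tag_confidences": {"arch:docai": 0.9},
    "taxonomy_path": ["architecture", "document_ai"],
    "extracted_entities": {},
    "raw_url": "",
    "ingest_timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
}

validate(sample_metadata, METADATA_SCHEMA)  # raises jsonschema.ValidationError on a bad shape
```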
|
| 78 |
|
| 79 |
# -----------------------
|
| 80 |
+
# Helpers: extraction & chunking
|
| 81 |
# -----------------------
|
| 82 |
+
def extract_text_from_pdf(path: str, log: List[str]) -> str:
|
| 83 |
+
log.append(f"Opening PDF: {path}")
|
| 84 |
try:
|
| 85 |
doc = fitz.open(path)
|
| 86 |
except Exception as e:
|
| 87 |
raise RuntimeError(f"Failed to open PDF: {e}")
|
|
|
|
| 88 |
texts: List[str] = []
|
| 89 |
for i in range(len(doc)):
|
| 90 |
page = doc.load_page(i)
|
| 91 |
txt = page.get_text("text").strip()
|
| 92 |
if txt:
|
| 93 |
+
log.append(f"Page {i+1}: text extracted ({len(txt)} chars)")
|
| 94 |
texts.append(txt)
|
| 95 |
else:
|
| 96 |
+
log.append(f"Page {i+1}: no text found, performing OCR fallback")
|
| 97 |
pix = page.get_pixmap(dpi=200)
|
| 98 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
|
| 99 |
pix.save(tmp.name)
|
| 100 |
ocr_text = pytesseract.image_to_string(Image.open(tmp.name))
|
| 101 |
+
log.append(f"Page {i+1}: OCR extracted ({len(ocr_text)} chars)")
|
| 102 |
texts.append(ocr_text)
|
| 103 |
return "\n\n".join(texts).strip()
|
| 104 |
|
| 105 |
|
| 106 |
+
def extract_text_from_image(path: str, log: List[str]) -> str:
|
| 107 |
+
log.append(f"OCR on image: {path}")
|
| 108 |
img = Image.open(path).convert("RGB")
|
| 109 |
+
txt = pytesseract.image_to_string(img).strip()
|
| 110 |
+
log.append(f"OCR extracted ({len(txt)} chars)")
|
| 111 |
+
return txt
|
| 112 |
|
| 113 |
|
| 114 |
def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
|
|
|
|
| 127 |
return chunks
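The body of chunk_text is collapsed in this diff, so only its signature and final return are visible. A minimal character-window splitter with the same signature could look like the sketch below; it is an assumption for illustration, not the code from this commit.

```python
# Not the code from this commit: chunk_text's body is collapsed in the diff above.
# A minimal character-window splitter with the same signature might look like this.
from typing import List

def chunk_text_sketch(text: str, max_chars: int = 3000) -> List[str]:
    chunks: List[str] = []
    for start in range(0, len(text), max_chars):
        chunks.append(text[start:start + max_chars])
    return chunks
```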
|
| 128 |
|
| 129 |
# -----------------------
|
| 130 |
+
# Upload handling
|
| 131 |
# -----------------------
|
| 132 |
+
def save_uploaded_to_tmp(file_obj, log: List[str]):
|
| 133 |
+
log.append(f"Saving uploaded object of type {type(file_obj)}")
|
|
|
|
|
| 134 |
# file-like
|
| 135 |
if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
|
| 136 |
try:
|
|
|
|
| 141 |
suffix = os.path.splitext(name)[1] or ""
|
| 142 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 143 |
tmp.write(content)
|
| 144 |
+
log.append(f"Saved uploaded file-like as {tmp.name}")
|
| 145 |
return tmp.name, os.path.basename(name)
|
| 146 |
+
except Exception as e:
|
| 147 |
+
log.append(f"file-like save failed: {e}")
|
|
|
|
| 148 |
# dict-like
|
| 149 |
+
if isinstance(file_obj, dict) and "data" in file_obj and "name" in file_obj:
|
| 150 |
+
try:
|
| 151 |
data = file_obj["data"]
|
| 152 |
if isinstance(data, str):
|
| 153 |
data = data.encode("utf-8")
|
|
|
|
| 155 |
suffix = os.path.splitext(name)[1] or ""
|
| 156 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 157 |
tmp.write(data)
|
| 158 |
+
log.append(f"Saved dict-like upload as {tmp.name}")
|
| 159 |
return tmp.name, os.path.basename(name)
|
| 160 |
+
except Exception as e:
|
| 161 |
+
log.append(f"dict-like save failed: {e}")
|
| 162 |
# path string
|
| 163 |
if isinstance(file_obj, str):
|
| 164 |
if os.path.exists(file_obj):
|
| 165 |
+
log.append(f"Upload was path string existing on disk: {file_obj}")
|
| 166 |
return file_obj, os.path.basename(file_obj)
|
| 167 |
try:
|
| 168 |
with open(file_obj, "rb") as f:
|
|
|
|
| 170 |
suffix = os.path.splitext(file_obj)[1] or ""
|
| 171 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 172 |
tmp.write(data)
|
| 173 |
+
log.append(f"Copied path-string file to {tmp.name}")
|
| 174 |
return tmp.name, os.path.basename(file_obj)
|
| 175 |
+
except Exception as e:
|
| 176 |
+
log.append(f"path-string handling failed: {e}")
|
| 177 |
+
# object with .name attr
|
|
|
|
| 178 |
name = getattr(file_obj, "name", None)
|
| 179 |
if name and isinstance(name, str):
|
| 180 |
try:
|
|
|
|
| 183 |
suffix = os.path.splitext(name)[1] or ""
|
| 184 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 185 |
tmp.write(data)
|
| 186 |
+
log.append(f"Saved file from .name attr to {tmp.name}")
|
| 187 |
return tmp.name, os.path.basename(name)
|
| 188 |
+
except Exception as e:
|
| 189 |
+
log.append(f".name-based save failed: {e}")
|
|
|
|
| 190 |
raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. repr: {repr(file_obj)[:400]}")
|
| 191 |
|
|
|
|
| 192 |
# -----------------------
|
| 193 |
+
# JSON extraction & validation
|
| 194 |
# -----------------------
|
| 195 |
def extract_json_from_text(text: str) -> str:
|
|
|
|
|
|
| 196 |
m = re.search(r"<<BEGIN_JSON>>(.*?)<<END_JSON>>", text, re.DOTALL)
|
| 197 |
if m:
|
| 198 |
return m.group(1).strip()
|
|
|
|
| 206 |
|
| 207 |
|
| 208 |
def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
|
|
|
|
|
| 209 |
try:
|
| 210 |
parsed = json.loads(json_text)
|
| 211 |
except Exception as e:
|
| 212 |
return False, None, f"json.loads error: {e}"
|
|
|
|
| 213 |
try:
|
| 214 |
json_validate(parsed, METADATA_SCHEMA)
|
| 215 |
except ValidationError as e:
|
| 216 |
return False, parsed, f"schema validation error: {e}"
|
| 217 |
except Exception as e:
|
|
|
|
| 218 |
return False, parsed, f"schema validation unexpected error: {e}"
|
|
|
|
| 219 |
return True, parsed, ""
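As a quick sanity check of the marker-based flow, the snippet below runs a made-up reply through extract_json_from_text and try_parse_and_validate; the payload is deliberately incomplete to show the situation the retry and repair branches are written for.

```python
# Hypothetical model reply, used only to illustrate extraction + validation.
reply = (
    "Some preamble the model should not have written.\n"
    "<<BEGIN_JSON>>\n"
    '{"doc_id": "example_doc", "title": "example_doc.pdf"}\n'
    "<<END_JSON>>"
)

json_text = extract_json_from_text(reply)           # the {...} between the markers
ok, parsed, err = try_parse_and_validate(json_text)
# ok is False here: the fragment parses as JSON but lacks required schema keys,
# which is exactly the case the retry/repair prompts in this file handle.
```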
|
| 220 |
|
|
|
|
| 221 |
# -----------------------
|
| 222 |
+
# LLM interactions (metadata, repair, autocomplete)
|
| 223 |
# -----------------------
|
| 224 |
+
def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 2):
|
| 225 |
+
log.append("Preparing prompt for metadata generation")
|
|
|
|
|
| 226 |
system_msg = (
|
| 227 |
"You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
|
| 228 |
+
"Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
|
|
|
|
|
| 229 |
)
|
| 230 |
+
prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
|
| 231 |
prompt_chunks = ""
|
| 232 |
for i, c in enumerate(top_chunks[:6]):
|
| 233 |
chunk_text_clean = c[:800].replace("\n", " ")
|
| 234 |
prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
|
|
|
|
| 235 |
prompt_end = (
|
| 236 |
+
"Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
|
| 237 |
+
"tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp. "
|
| 238 |
+
"Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>."
|
|
|
|
|
| 239 |
)
|
| 240 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": prompt_intro + prompt_chunks + prompt_end}]
|
|
|
|
|
| 241 |
last_raw = None
|
| 242 |
|
| 243 |
for attempt in range(1, max_attempts + 1):
|
| 244 |
+
log.append(f"Calling OpenAI (attempt {attempt})")
|
| 245 |
try:
|
| 246 |
resp = client.chat.completions.create(
|
| 247 |
model=LLM_MODEL,
|
|
|
|
| 249 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 250 |
)
|
| 251 |
except Exception as e:
|
| 252 |
+
log.append(f"OpenAI API call failed: {e}")
|
| 253 |
+
return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
|
| 254 |
|
| 255 |
+
# extract full model response text for UI logs
|
| 256 |
try:
|
| 257 |
+
full_text = resp.choices[0].message["content"].strip()
|
| 258 |
except Exception:
|
| 259 |
try:
|
| 260 |
+
full_text = resp.choices[0].message.content.strip()
|
| 261 |
except Exception:
|
| 262 |
+
full_text = str(resp)
|
| 263 |
+
last_raw = full_text
|
| 264 |
+
log.append("OpenAI response received (raw length: " + str(len(full_text)) + ")")
|
| 265 |
|
| 266 |
+
# attempt to extract JSON
|
| 267 |
+
json_text = extract_json_from_text(full_text)
|
| 268 |
if not json_text:
|
| 269 |
+
log.append("No JSON found in response")
|
| 270 |
if attempt < max_attempts:
|
|
|
|
|
| 271 |
messages = [
|
| 272 |
{"role": "system", "content": system_msg},
|
| 273 |
+
{"role": "user", "content": "Previous response lacked JSON markers. Return only JSON between <<BEGIN_JSON>> and <<END_JSON>>."},
|
| 274 |
]
|
| 275 |
continue
|
| 276 |
else:
|
| 277 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 278 |
|
| 279 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 280 |
if ok:
|
| 281 |
+
log.append("JSON parsed and validated successfully")
|
| 282 |
+
# attach model raw response as well
|
| 283 |
+
return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
|
| 284 |
else:
|
| 285 |
+
log.append(f"JSON parsed but schema validation failed: {parse_err}")
|
| 286 |
if attempt < max_attempts:
|
|
|
|
|
| 287 |
messages = [
|
| 288 |
{"role": "system", "content": system_msg},
|
| 289 |
+
{"role": "user", "content": "The JSON you returned is invalid vs schema. Return corrected JSON only between markers."},
|
| 290 |
]
|
| 291 |
continue
|
| 292 |
else:
|
| 293 |
+
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
|
|
|
|
| 294 |
|
| 295 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 296 |
|
| 297 |
+
def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
|
| 298 |
+
log.append("Starting repair flow")
|
| 299 |
+
# if manual JSON pasted by user, try parse+validate directly
|
| 300 |
+
if manual_pasted_json:
|
| 301 |
+
log.append("User provided manual pasted JSON — trying to parse and validate")
|
| 302 |
+
jtxt = extract_json_from_text(manual_pasted_json) or manual_pasted_json
|
| 303 |
+
ok, parsed, err = try_parse_and_validate(jtxt)
|
| 304 |
+
if ok:
|
| 305 |
+
log.append("Manual pasted JSON validated successfully")
|
| 306 |
+
return {"metadata": parsed, "log": log, "raw_response": manual_pasted_json}
|
|
|
|
|
| 307 |
else:
|
| 308 |
+
log.append(f"Manual pasted JSON validation failed: {err}")
|
| 309 |
+
return {"_parsing_error": True, "raw_output": manual_pasted_json, "parsed_partial": parsed, "parse_error": err, "log": log}
|
|
|
|
|
| 310 |
|
| 311 |
+
# otherwise instruct model to repair the raw_output
|
|
|
|
|
| 312 |
system_msg = (
|
| 313 |
+
"You are an assistant that must extract and/or correct a malformed JSON from the user's raw_output. "
|
| 314 |
+
"Return ONLY a corrected JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
|
|
|
|
| 315 |
)
|
|
|
|
| 316 |
repair_prompt = (
|
| 317 |
+
"Here is the raw output (possibly containing a malformed JSON). Extract and return a corrected JSON object "
|
| 318 |
+
"containing keys: doc_id,title,summary,doc_type,source,tags,tag_confidences,taxonomy_path,extracted_entities,raw_url,ingest_timestamp. "
|
| 319 |
+
"If fields are missing, use reasonable defaults (empty string, empty list or empty map)."
|
|
|
|
|
|
|
| 320 |
)
|
| 321 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": repair_prompt + "\n\nRaw output:\n\n" + (raw_output or "")}]
|
|
|
|
|
|
|
| 322 |
last_raw = None
|
| 323 |
for attempt in range(1, max_attempts + 1):
|
| 324 |
+
log.append(f"Repair attempt {attempt}")
|
| 325 |
try:
|
| 326 |
resp = client.chat.completions.create(
|
| 327 |
model=LLM_MODEL,
|
|
|
|
| 329 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 330 |
)
|
| 331 |
except Exception as e:
|
| 332 |
+
log.append(f"Repair API call failed: {e}")
|
| 333 |
+
return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
|
| 334 |
try:
|
| 335 |
+
full_text = resp.choices[0].message["content"].strip()
|
| 336 |
except Exception:
|
| 337 |
try:
|
| 338 |
+
full_text = resp.choices[0].message.content.strip()
|
| 339 |
except Exception:
|
| 340 |
+
full_text = str(resp)
|
| 341 |
+
last_raw = full_text
|
| 342 |
+
log.append("Repair model response received (raw length: " + str(len(full_text)) + ")")
|
| 343 |
+
json_text = extract_json_from_text(full_text)
|
| 344 |
if not json_text:
|
| 345 |
+
log.append("Repair response contained no JSON")
|
| 346 |
if attempt < max_attempts:
|
| 347 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": "Your previous reply did not include the JSON. Return ONLY the corrected JSON between markers."}]
|
|
|
|
|
| 348 |
continue
|
| 349 |
else:
|
| 350 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
|
|
|
| 351 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 352 |
if ok:
|
| 353 |
+
log.append("Repair produced valid JSON")
|
| 354 |
+
return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
|
| 355 |
else:
|
| 356 |
+
log.append(f"Repair produced JSON but validation failed: {parse_err}")
|
| 357 |
if attempt < max_attempts:
|
| 358 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": "Your JSON is invalid. Please correct and return ONLY the corrected JSON between markers."}]
|
|
|
|
|
| 359 |
continue
|
| 360 |
else:
|
| 361 |
+
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
| 362 |
+
return {"_parsing_error": True, "raw_output": last_raw or "", "log": log, "raw_response": last_raw or ""}
|
| 363 |
|
| 364 |
+
def auto_complete_partial(parsed_partial: Dict[str, Any], orig_name: str, extracted_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 2):
|
| 365 |
+
log.append("Starting auto-complete for parsed partial")
|
| 366 |
+
system_msg = (
|
| 367 |
+
"You are an assistant that must fill missing metadata fields for a document. "
|
| 368 |
+
"Return ONLY a single JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> with the exact keys: "
|
| 369 |
+
"doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
|
| 370 |
+
"If you cannot infer a value, use reasonable defaults."
|
| 371 |
+
)
|
| 372 |
+
partial_str = json.dumps(parsed_partial, ensure_ascii=False)
|
| 373 |
+
short_text = (extracted_text[:1200] + "...") if len(extracted_text) > 1200 else extracted_text
|
| 374 |
+
prompt = f"Original filename: {orig_name}\n\nPreviously parsed partial JSON:\n{partial_str}\n\nDocument short text:\n{short_text}\n\nTop chunks:\n"
|
| 375 |
+
for i, c in enumerate(top_chunks[:6]):
|
| 376 |
+
prompt += f"CHUNK_{i+1}: {c[:900].replace(chr(10), ' ')}\n\n"
|
| 377 |
+
prompt += ("Task: Fill any missing or empty fields in the JSON above using the document context. "
|
| 378 |
+
"Return ONLY the completed JSON wrapped between <<BEGIN_JSON>> and <<END_JSON>>.")
|
| 379 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": prompt}]
|
| 380 |
+
last_raw = None
|
| 381 |
+
for attempt in range(1, max_attempts + 1):
|
| 382 |
+
log.append(f"Auto-complete attempt {attempt}")
|
| 383 |
+
try:
|
| 384 |
+
resp = client.chat.completions.create(
|
| 385 |
+
model=LLM_MODEL,
|
| 386 |
+
messages=messages,
|
| 387 |
+
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 388 |
+
)
|
| 389 |
+
except Exception as e:
|
| 390 |
+
log.append(f"Auto-complete API call failed: {e}")
|
| 391 |
+
return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log}
|
| 392 |
+
try:
|
| 393 |
+
full_text = resp.choices[0].message["content"].strip()
|
| 394 |
+
except Exception:
|
| 395 |
+
try:
|
| 396 |
+
full_text = resp.choices[0].message.content.strip()
|
| 397 |
+
except Exception:
|
| 398 |
+
full_text = str(resp)
|
| 399 |
+
last_raw = full_text
|
| 400 |
+
log.append("Auto-complete model response received")
|
| 401 |
+
json_text = extract_json_from_text(full_text)
|
| 402 |
+
if not json_text:
|
| 403 |
+
log.append("Auto-complete response had no JSON")
|
| 404 |
+
if attempt < max_attempts:
|
| 405 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": "Return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."}]
|
| 406 |
+
continue
|
| 407 |
+
else:
|
| 408 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 409 |
+
ok, parsed_or_partial2, parse_err = try_parse_and_validate(json_text)
|
| 410 |
+
if ok:
|
| 411 |
+
log.append("Auto-complete succeeded and validated")
|
| 412 |
+
return {"metadata": parsed_or_partial2, "log": log, "raw_response": full_text}
|
| 413 |
+
else:
|
| 414 |
+
log.append(f"Auto-complete produced JSON but validation failed: {parse_err}")
|
| 415 |
+
if attempt < max_attempts:
|
| 416 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": "The JSON you returned is invalid. Please correct and return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."}]
|
| 417 |
+
continue
|
| 418 |
+
else:
|
| 419 |
+
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial2, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
| 420 |
+
return {"_parsing_error": True, "raw_output": last_raw or "", "log": log, "raw_response": last_raw or ""}
|
| 421 |
+
|
| 422 |
+
# -----------------------
|
| 423 |
+
# Orchestration: process file
|
| 424 |
+
# -----------------------
|
| 425 |
+
def process_file(file_obj):
|
| 426 |
+
ui_log: List[str] = []
|
| 427 |
+
try:
|
| 428 |
+
tmp_path, orig_name = save_uploaded_to_tmp(file_obj, ui_log)
|
| 429 |
+
except Exception as e:
|
| 430 |
+
ui_log.append(f"Failed to save upload: {e}")
|
| 431 |
+
return {"error": f"Failed to save uploaded file: {e}", "log": ui_log, "raw_response": ""}
|
| 432 |
+
|
| 433 |
+
try:
|
| 434 |
+
if orig_name.lower().endswith(".pdf"):
|
| 435 |
+
extracted_text = extract_text_from_pdf(tmp_path, ui_log)
|
| 436 |
+
else:
|
| 437 |
+
extracted_text = extract_text_from_image(tmp_path, ui_log)
|
| 438 |
+
except Exception as e:
|
| 439 |
+
ui_log.append(f"Text extraction failed: {e}")
|
| 440 |
+
return {"error": f"Text extraction failed: {e}", "log": ui_log, "raw_response": ""}
|
| 441 |
+
|
| 442 |
+
if not extracted_text:
|
| 443 |
+
ui_log.append("No text found after extraction.")
|
| 444 |
+
return {"error": "No text found in document after extraction.", "log": ui_log, "raw_response": ""}
|
| 445 |
+
|
| 446 |
+
chunks = chunk_text(extracted_text)
|
| 447 |
+
ui_log.append(f"Document split into {len(chunks)} chunks")
|
| 448 |
+
sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
|
| 449 |
+
top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
|
| 450 |
+
short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
|
| 451 |
+
|
| 452 |
+
# Primary LLM call
|
| 453 |
+
result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=2)
|
| 454 |
+
|
| 455 |
+
# If API error
|
| 456 |
+
if result.get("_api_error"):
|
| 457 |
+
return {"error": result.get("error"), "log": ui_log + result.get("log", []), "raw_response": result.get("raw_response")}
|
| 458 |
+
|
| 459 |
+
# If parsing error, attempt auto-complete if we have parsed_partial
|
| 460 |
+
if result.get("_parsing_error"):
|
| 461 |
+
ui_log += result.get("log", [])
|
| 462 |
+
raw_out = result.get("raw_output", result.get("raw_response", ""))
|
| 463 |
+
parsed_partial = result.get("parsed_partial", {})
|
| 464 |
+
ui_log.append("Initial parse failed; attempting auto-complete if partial available")
|
| 465 |
+
if parsed_partial:
|
| 466 |
+
ac = auto_complete_partial(parsed_partial, orig_name, extracted_text, top_chunks, ui_log, max_attempts=2)
|
| 467 |
+
if ac.get("_api_error"):
|
| 468 |
+
ui_log += ac.get("log", [])
|
| 469 |
+
return {"error": "Auto-complete API error", "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
|
| 470 |
+
if ac.get("_parsing_error"):
|
| 471 |
+
ui_log += ac.get("log", [])
|
| 472 |
+
return {"error": "LLM output parsing failed. See raw_output.", "raw_output": ac.get("raw_output", raw_out), "parsed_partial": ac.get("parsed_partial"), "parse_error": ac.get("parse_error"), "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
|
| 473 |
+
# success
|
| 474 |
+
metadata = ac.get("metadata")
|
| 475 |
+
ui_log += ac.get("log", [])
|
| 476 |
+
ui_log.append("Auto-complete produced metadata")
|
| 477 |
+
# ensure defaults
|
| 478 |
+
now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
|
| 479 |
+
metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
|
| 480 |
+
metadata.setdefault("title", orig_name)
|
| 481 |
+
metadata.setdefault("source", "user_upload")
|
| 482 |
+
metadata.setdefault("raw_url", "")
|
| 483 |
+
metadata.setdefault("ingest_timestamp", now)
|
| 484 |
+
return {"metadata": metadata, "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
|
| 485 |
+
else:
|
| 486 |
+
ui_log.append("No parsed_partial to auto-complete; returning raw output for manual repair")
|
| 487 |
+
return {"error": "LLM output parsing failed. See raw_output.", "raw_output": raw_out, "parsed_partial": parsed_partial, "parse_error": result.get("parse_error"), "log": ui_log, "raw_response": result.get("raw_response", raw_out)}
|
| 488 |
+
|
| 489 |
+
# success path
|
| 490 |
+
metadata = result.get("metadata")
|
| 491 |
+
ui_log += result.get("log", [])
|
| 492 |
+
raw_model_response = result.get("raw_response")
|
| 493 |
+
now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
|
| 494 |
+
metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
|
| 495 |
+
metadata.setdefault("title", orig_name)
|
| 496 |
+
metadata.setdefault("source", "user_upload")
|
| 497 |
+
metadata.setdefault("raw_url", "")
|
| 498 |
+
metadata.setdefault("ingest_timestamp", now)
|
| 499 |
+
ui_log.append("Metadata generation successful")
|
| 500 |
+
return {"metadata": metadata, "log": ui_log, "raw_response": raw_model_response}
|
| 501 |
|
| 502 |
# -----------------------
|
| 503 |
# Gradio UI
|
| 504 |
# -----------------------
|
| 505 |
+
with gr.Blocks(title="DocClassify — Final Robust") as demo:
|
| 506 |
+
gr.Markdown("## 📂 Upload PDF / Image → automated taxonomy & tagging (GPT-5). Logs & GPT response shown below.")
|
| 507 |
with gr.Row():
|
| 508 |
with gr.Column(scale=1):
|
| 509 |
uploader = gr.File(label="Upload PDF / Image", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff"])
|
| 510 |
run_button = gr.Button("Process document")
|
| 511 |
status = gr.Textbox(label="Status", value="", interactive=False)
|
| 512 |
download_button = gr.File(label="Download metadata JSON", visible=False)
|
| 513 |
+
gr.Markdown("### Manual repair (paste raw LLM output if needed)")
|
| 514 |
+
manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response if you need manual repair")
|
| 515 |
+
repair_from_paste_btn = gr.Button("Repair from pasted raw output")
|
| 516 |
+
repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
|
| 517 |
with gr.Column(scale=1):
|
| 518 |
+
output_json = gr.JSON(label="Metadata JSON (parsed)")
|
| 519 |
+
raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
|
| 520 |
+
logs_box = gr.Textbox(label="Step-by-step logs", lines=12, interactive=False)
|
| 521 |
+
|
| 522 |
+
# state holders
|
| 523 |
+
last_raw_state = gr.State(value=None) # store last raw model response
|
| 524 |
+
last_metadata_file = gr.State(value=None) # path to downloadable json
|
| 525 |
+
|
| 526 |
+
def on_process(file_obj):
|
| 527 |
+
if not file_obj:
|
| 528 |
+
return {}, "No file uploaded", None, "", ""
|
| 529 |
+
status_msg = "Processing..."
|
| 530 |
try:
|
| 531 |
result = process_file(file_obj)
|
| 532 |
except Exception as e:
|
| 533 |
+
return {}, f"Failed: {e}", None, "", "\n".join([f"Exception: {e}"])
|
| 534 |
+
# handle errors and success
|
| 535 |
+
logs = result.get("log", [])
|
| 536 |
+
raw_response = result.get("raw_response", "")
|
| 537 |
if result.get("error"):
|
| 538 |
+
# show raw_output for manual repair if present
|
| 539 |
+
raw_out = result.get("raw_output", raw_response) or ""
|
| 540 |
+
parsed_partial = result.get("parsed_partial")
|
| 541 |
+
display = {"error": result.get("error")}
|
| 542 |
+
if parsed_partial is not None:
|
| 543 |
+
display["parsed_partial"] = parsed_partial
|
| 544 |
+
# put logs and raw_response into UI
|
| 545 |
+
logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
|
| 546 |
+
return display, f"Error: {result.get('error')}", None, raw_out, logs_text
|
| 547 |
+
# success -> create temp file for download
|
| 548 |
+
metadata = result.get("metadata")
|
| 549 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 550 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 551 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
| 552 |
+
logs_text = "\n".join(logs)
|
| 553 |
+
return metadata, "Done", tmpf.name, raw_response or "", logs_text
|
| 554 |
+
|
| 555 |
+
def on_repair_from_paste(manual_text):
|
| 556 |
+
if not manual_text:
|
| 557 |
+
return {}, "No pasted raw output provided.", None, "", "No pasted raw output provided."
|
| 558 |
+
# try repair using model (or direct parse)
|
| 559 |
+
ui_log = ["Repair-from-paste initiated"]
|
| 560 |
+
repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
|
| 561 |
+
logs_text = "\n".join(repaired.get("log", ui_log))
|
|
|
|
| 562 |
if repaired.get("_api_error"):
|
| 563 |
+
return {}, f"Repair API error: {repaired.get('error')}", None, repaired.get("raw_response", manual_text), logs_text
|
|
|
|
| 564 |
if repaired.get("_parsing_error"):
|
| 565 |
+
display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
|
| 566 |
+
return display, "Repair failed", None, repaired.get("raw_response", manual_text), logs_text
|
| 567 |
+
# success
|
| 568 |
+
metadata = repaired.get("metadata")
|
|
|
|
| 569 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 570 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 571 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
| 572 |
+
return metadata, "Repair succeeded", tmpf.name, repaired.get("raw_response", manual_text), logs_text
|
| 573 |
+
|
| 574 |
+
def on_repair_auto(raw_response_text):
|
| 575 |
+
if not raw_response_text:
|
| 576 |
+
return {}, "No raw_response available for auto repair. Run process or paste raw output.", None, "", "No raw_response available."
|
| 577 |
+
ui_log = ["Auto repair initiated"]
|
| 578 |
+
repaired = repair_raw_output(raw_output=raw_response_text, manual_pasted_json=None, log=ui_log, max_attempts=2)
|
| 579 |
+
logs_text = "\n".join(repaired.get("log", ui_log))
|
| 580 |
+
if repaired.get("_api_error"):
|
| 581 |
+
return {}, f"Repair API error: {repaired.get('error')}", None, repaired.get("raw_response", raw_response_text), logs_text
|
| 582 |
+
if repaired.get("_parsing_error"):
|
| 583 |
+
display = {"error": "Auto-repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
|
| 584 |
+
return display, "Auto-repair failed", None, repaired.get("raw_response", raw_response_text), logs_text
|
| 585 |
+
metadata = repaired.get("metadata")
|
| 586 |
+
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 587 |
+
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 588 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
| 589 |
+
return metadata, "Auto-repair succeeded", tmpf.name, repaired.get("raw_response", raw_response_text), logs_text
|
| 590 |
|
| 591 |
+
run_button.click(on_process, inputs=[uploader], outputs=[output_json, status, download_button, raw_output_box, logs_box])
|
| 592 |
+
repair_from_paste_btn.click(on_repair_from_paste, inputs=[manual_raw_input], outputs=[output_json, status, download_button, raw_output_box, logs_box])
|
| 593 |
+
repair_auto_btn.click(on_repair_auto, inputs=[raw_output_box], outputs=[output_json, status, download_button, raw_output_box, logs_box])
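Note the contract shared by the three handlers wired here: each returns a 5-tuple (parsed JSON dict, status string, downloadable file path or None, raw model text, newline-joined log), in the same order as the outputs list, so the same components display results from processing, manual repair, and automatic repair.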
|
| 594 |
|
|
|
|
| 595 |
if __name__ == "__main__":
|
| 596 |
demo.launch()
|