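"""Gradio Space app: replaces red-colored text in an uploaded Word document
with matching content extracted from an uploaded PDF, by chaining four helper
scripts (extract_pdf_data.py, extract_red_text.py, update_docx_with_pdf.py,
updated_word.py) that live alongside this file."""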
import gradio as gr
import tempfile
import os
import shutil
import subprocess
from pathlib import Path
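# Resolve helper scripts relative to this file so they are found regardless of
# the process's current working directory.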
SCRIPT_DIR = Path(__file__).resolve().parent
def run_cmd(cmd, cwd=None, env=None):
"""Run a command, print nice logs, and also save them to run.log in cwd."""
cwd = str(cwd or os.getcwd())
print(f"π¦ Running: {' '.join(cmd)} (cwd={cwd})")
proc = subprocess.run(
cmd,
cwd=cwd,
env=env,
capture_output=True,
text=True
)
if proc.stdout:
print("π© STDOUT:")
print(proc.stdout)
if proc.stderr:
print("π₯ STDERR:")
print(proc.stderr)
# Save to run.log for debugging
try:
runlog = Path(cwd) / "run.log"
with open(runlog, "a", encoding="utf-8") as f:
f.write(f"$ {' '.join(cmd)}\n")
if proc.stdout:
f.write(proc.stdout + "\n")
if proc.stderr:
f.write(proc.stderr + "\n")
print(f"π§Ύ Run log saved to: {runlog}")
except Exception as e:
print(f"β οΈ Could not write run.log: {e}")
if proc.returncode != 0:
# Let Gradio see the failure so it surfaces properly
raise subprocess.CalledProcessError(proc.returncode, cmd, proc.stdout, proc.stderr)
return proc
def _locate_pdf_json(temp_dir: str) -> str:
"""
Your extractor writes a JSON like <pdf_stem>_comprehensive_data.json.
Find it (and a few common fallbacks). Raise if not found.
"""
td = Path(temp_dir)
# Prefer exactly-named file if present
candidates = [
td / "pdf_data.json", # legacy name (if ever created)
td / "input_comprehensive_data.json", # most common from your logs
td / "comprehensive_data.json", # another common alias
td / "output.json", # generic
]
for p in candidates:
if p.exists():
print(f"β
Using PDF JSON: {p}")
return str(p)
# Generic pattern: anything *_comprehensive_data.json
globs = list(td.glob("*_comprehensive_data.json"))
if globs:
print(f"β
Using PDF JSON (glob): {globs[0]}")
return str(globs[0])
# If still not found, surface a helpful error
searched = ", ".join(str(p) for p in candidates) + ", " + str(td / "*_comprehensive_data.json")
raise FileNotFoundError(
f"PDF JSON not found. Looked for: {searched}\nTemp dir: {temp_dir}"
)
def process_files(pdf_file, word_file):
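    """
    Run the full pipeline on an uploaded PDF/DOCX pair and return the path of
    the updated .docx:
      1) extract_pdf_data.py    : PDF  -> <stem>_comprehensive_data.json
      2) extract_red_text.py    : DOCX -> word_data.json (red text)
      3) update_docx_with_pdf.py: merge the two JSON files
      4) updated_word.py        : apply the merged JSON back onto the DOCX
    """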
    # Create a unique temporary directory for this run
    temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
    print(f"Temp dir: {temp_dir}")
    # Define standard filenames for use in the pipeline
    pdf_path = os.path.join(temp_dir, "input.pdf")
    word_path = os.path.join(temp_dir, "input.docx")
    word_json_path = os.path.join(temp_dir, "word_data.json")
    updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
    final_docx_path = os.path.join(temp_dir, "updated.docx")
    # Copy the uploaded files to the temp directory
    shutil.copy(pdf_file, pdf_path)
    print(f"PDF copied to: {pdf_path}")
    shutil.copy(word_file, word_path)
    print(f"DOCX copied to: {word_path}")
    # 1) PDF → JSON (extractor writes <stem>_comprehensive_data.json into cwd)
    run_cmd(["python", str(SCRIPT_DIR / "extract_pdf_data.py"), pdf_path], cwd=temp_dir)
    # Find the JSON produced by the extractor
    pdf_json_path = _locate_pdf_json(temp_dir)
    # 2) DOCX red text → JSON
    run_cmd(["python", str(SCRIPT_DIR / "extract_red_text.py"), word_path, word_json_path], cwd=temp_dir)
    # 3) Merge JSON (uses the resolved pdf_json_path)
    run_cmd(["python", str(SCRIPT_DIR / "update_docx_with_pdf.py"), word_json_path, pdf_json_path, updated_json_path], cwd=temp_dir)
    # 4) Apply updates to DOCX
    run_cmd(["python", str(SCRIPT_DIR / "updated_word.py"), word_path, updated_json_path, final_docx_path], cwd=temp_dir)
    # Return the final .docx file
    return final_docx_path
iface = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(label="Upload PDF File", type="filepath"),
        gr.File(label="Upload Word File", type="filepath")
    ],
    outputs=gr.File(label="Download Updated Word File"),
    title="Red Text Replacer",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF."
)
if __name__ == "__main__":
    iface.launch()