import gradio as gr
import tempfile
import os
import shutil
import subprocess
from pathlib import Path

SCRIPT_DIR = Path(__file__).resolve().parent

def run_cmd(cmd, cwd=None, env=None):
    """Run a command, print nice logs, and also save them to run.log in cwd."""
    cwd = str(cwd or os.getcwd())
    print(f"🟦 Running: {' '.join(cmd)}  (cwd={cwd})")
    proc = subprocess.run(
        cmd,
        cwd=cwd,
        env=env,
        capture_output=True,
        text=True
    )
    if proc.stdout:
        print("🟩 STDOUT:")
        print(proc.stdout)
    if proc.stderr:
        print("πŸŸ₯ STDERR:")
        print(proc.stderr)
    # Save to run.log for debugging
    try:
        runlog = Path(cwd) / "run.log"
        with open(runlog, "a", encoding="utf-8") as f:
            f.write(f"$ {' '.join(cmd)}\n")
            if proc.stdout:
                f.write(proc.stdout + "\n")
            if proc.stderr:
                f.write(proc.stderr + "\n")
        print(f"🧾 Run log saved to: {runlog}")
    except Exception as e:
        print(f"⚠️ Could not write run.log: {e}")

    if proc.returncode != 0:
        # Let Gradio see the failure so it surfaces properly
        raise subprocess.CalledProcessError(proc.returncode, cmd, proc.stdout, proc.stderr)
    return proc

def _locate_pdf_json(temp_dir: str) -> str:
    """
    Your extractor writes a JSON like <pdf_stem>_comprehensive_data.json.
    Find it (and a few common fallbacks). Raise if not found.
    """
    td = Path(temp_dir)

    # Prefer exactly-named file if present
    candidates = [
        td / "pdf_data.json",                    # legacy name (if ever created)
        td / "input_comprehensive_data.json",    # most common from your logs
        td / "comprehensive_data.json",          # another common alias
        td / "output.json",                      # generic
    ]
    for p in candidates:
        if p.exists():
            print(f"βœ… Using PDF JSON: {p}")
            return str(p)

    # Generic pattern: anything *_comprehensive_data.json
    globs = list(td.glob("*_comprehensive_data.json"))
    if globs:
        print(f"βœ… Using PDF JSON (glob): {globs[0]}")
        return str(globs[0])

    # If still not found, surface a helpful error
    searched = ", ".join(str(p) for p in candidates) + ", " + str(td / "*_comprehensive_data.json")
    raise FileNotFoundError(
        f"PDF JSON not found. Looked for: {searched}\nTemp dir: {temp_dir}"
    )

def process_files(pdf_file, word_file):
    # Create a unique temporary directory for this run
    temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
    print(f"πŸ“‚ Temp dir: {temp_dir}")

    # Define standard filenames for use in the pipeline
    pdf_path = os.path.join(temp_dir, "input.pdf")
    word_path = os.path.join(temp_dir, "input.docx")
    word_json_path = os.path.join(temp_dir, "word_data.json")
    updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
    final_docx_path = os.path.join(temp_dir, "updated.docx")

    # Copy the uploaded files to the temp directory
    shutil.copy(pdf_file, pdf_path)
    print(f"πŸ“„ PDF copied to: {pdf_path}")
    shutil.copy(word_file, word_path)
    print(f"πŸ“ DOCX copied to: {word_path}")

    # 1) PDF → JSON  (extractor writes <stem>_comprehensive_data.json into cwd)
    run_cmd(["python", str(SCRIPT_DIR / "extract_pdf_data.py"), pdf_path], cwd=temp_dir)

    # Find the JSON produced by the extractor
    pdf_json_path = _locate_pdf_json(temp_dir)

    # 2) DOCX red text → JSON
    run_cmd(["python", str(SCRIPT_DIR / "extract_red_text.py"), word_path, word_json_path], cwd=temp_dir)

    # 3) Merge JSON (uses the resolved pdf_json_path)
    run_cmd(["python", str(SCRIPT_DIR / "update_docx_with_pdf.py"), word_json_path, pdf_json_path, updated_json_path], cwd=temp_dir)

    # 4) Apply updates to DOCX
    run_cmd(["python", str(SCRIPT_DIR / "updated_word.py"), word_path, updated_json_path, final_docx_path], cwd=temp_dir)

    # Return the final .docx file
    return final_docx_path

iface = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(label="Upload PDF File", type="filepath"),
        gr.File(label="Upload Word File", type="filepath")
    ],
    outputs=gr.File(label="Download Updated Word File"),
    title="Red Text Replacer",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF."
)

if __name__ == "__main__":
    iface.launch()
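
# Usage sketch: the pipeline can also be exercised without the Gradio UI by calling
# process_files() directly. The sample paths below are hypothetical placeholders;
# the helper scripts (extract_pdf_data.py, extract_red_text.py,
# update_docx_with_pdf.py, updated_word.py) are assumed to sit next to this file,
# which is what SCRIPT_DIR already expects.
#
#   out_path = process_files("samples/report.pdf", "samples/template.docx")
#   print("Updated DOCX written to:", out_path)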