Spaces:
Running
Running
| import gradio as gr | |
| import tempfile | |
| import os | |
| import shutil | |
| import subprocess | |
| from pathlib import Path | |
# Absolute directory containing this script (symlinks resolved). The pipeline's
# helper scripts are looked up relative to this directory.
SCRIPT_DIR = Path(__file__).resolve().parent
def run_cmd(cmd, cwd=None, env=None):
    """Execute *cmd*, echo its output, and append it to ``run.log``.

    Args:
        cmd: Command and its arguments as a list of strings.
        cwd: Directory to run in; the current working directory when falsy.
        env: Environment mapping passed to ``subprocess.run`` unchanged.

    Returns:
        The ``subprocess.CompletedProcess`` of a run that exited with 0.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured stdout and stderr are attached.
    """
    workdir = str(cwd or os.getcwd())
    command_line = " ".join(cmd)
    print(f"π¦ Running: {command_line} (cwd={workdir})")

    result = subprocess.run(
        cmd, cwd=workdir, env=env, capture_output=True, text=True
    )
    out, err = result.stdout, result.stderr

    if out:
        print("π© STDOUT:")
        print(out)
    if err:
        print("π₯ STDERR:")
        print(err)

    # Writing the log is best-effort: a failure here is reported but must not
    # hide the outcome of the command itself.
    try:
        log_path = Path(workdir) / "run.log"
        entries = [f"$ {command_line}\n"]
        entries.extend(stream + "\n" for stream in (out, err) if stream)
        with open(log_path, "a", encoding="utf-8") as log:
            log.writelines(entries)
        print(f"π§Ύ Run log saved to: {log_path}")
    except Exception as exc:
        print(f"β οΈ Could not write run.log: {exc}")

    # Raise only after logging so the failing output is captured as well.
    if result.returncode != 0:
        raise subprocess.CalledProcessError(result.returncode, cmd, out, err)
    return result
| def _locate_pdf_json(temp_dir: str) -> str: | |
| """ | |
| Your extractor writes a JSON like <pdf_stem>_comprehensive_data.json. | |
| Find it (and a few common fallbacks). Raise if not found. | |
| """ | |
| td = Path(temp_dir) | |
| # Prefer exactly-named file if present | |
| candidates = [ | |
| td / "pdf_data.json", # legacy name (if ever created) | |
| td / "input_comprehensive_data.json", # most common from your logs | |
| td / "comprehensive_data.json", # another common alias | |
| td / "output.json", # generic | |
| ] | |
| for p in candidates: | |
| if p.exists(): | |
| print(f"β Using PDF JSON: {p}") | |
| return str(p) | |
| # Generic pattern: anything *_comprehensive_data.json | |
| globs = list(td.glob("*_comprehensive_data.json")) | |
| if globs: | |
| print(f"β Using PDF JSON (glob): {globs[0]}") | |
| return str(globs[0]) | |
| # If still not found, surface a helpful error | |
| searched = ", ".join(str(p) for p in candidates) + ", " + str(td / "*_comprehensive_data.json") | |
| raise FileNotFoundError( | |
| f"PDF JSON not found. Looked for: {searched}\nTemp dir: {temp_dir}" | |
| ) | |
def process_files(pdf_file, word_file):
    """Run the four-step pipeline on an uploaded PDF and Word document.

    Both uploads are copied into a fresh temporary directory, then the helper
    scripts located in ``SCRIPT_DIR`` are run in sequence:

    1. ``extract_pdf_data.py``     -- PDF -> JSON
    2. ``extract_red_text.py``     -- DOCX -> JSON
    3. ``update_docx_with_pdf.py`` -- merge the two JSON files
    4. ``updated_word.py``         -- apply the merged JSON to the DOCX

    The temporary directory is not removed by this function: it holds the
    returned document and the ``run.log`` written by ``run_cmd``.

    Args:
        pdf_file: Filesystem path of the uploaded PDF.
        word_file: Filesystem path of the uploaded Word document.

    Returns:
        Path of the updated ``.docx`` file inside the temporary directory.

    Raises:
        ValueError: If either upload is missing.
        subprocess.CalledProcessError: If any pipeline step exits non-zero.
        FileNotFoundError: If the PDF extractor's JSON output cannot be found.
    """
    # Local import keeps this function independent of the module's import list.
    import sys

    # Fail early with a clear message instead of letting shutil.copy raise an
    # opaque TypeError when an upload slot was left empty.
    if not pdf_file or not word_file:
        raise ValueError("Both a PDF file and a Word file must be uploaded.")

    # Run the helper scripts with the interpreter running this app; a bare
    # "python" may resolve to a different installation or to nothing at all.
    python = sys.executable or "python"

    # Each run gets its own working directory.
    temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
    print(f"π Temp dir: {temp_dir}")

    # Fixed file names used throughout the pipeline.
    pdf_path = os.path.join(temp_dir, "input.pdf")
    word_path = os.path.join(temp_dir, "input.docx")
    word_json_path = os.path.join(temp_dir, "word_data.json")
    updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
    final_docx_path = os.path.join(temp_dir, "updated.docx")

    shutil.copy(pdf_file, pdf_path)
    print(f"π PDF copied to: {pdf_path}")
    shutil.copy(word_file, word_path)
    print(f"π DOCX copied to: {word_path}")

    # 1) PDF -> JSON. The output file name is chosen by the extractor, so it
    #    is looked up afterwards rather than passed in.
    run_cmd([python, str(SCRIPT_DIR / "extract_pdf_data.py"), pdf_path], cwd=temp_dir)
    pdf_json_path = _locate_pdf_json(temp_dir)

    # 2) DOCX -> JSON.
    run_cmd(
        [python, str(SCRIPT_DIR / "extract_red_text.py"), word_path, word_json_path],
        cwd=temp_dir,
    )

    # 3) Merge the Word JSON with the PDF JSON.
    run_cmd(
        [
            python,
            str(SCRIPT_DIR / "update_docx_with_pdf.py"),
            word_json_path,
            pdf_json_path,
            updated_json_path,
        ],
        cwd=temp_dir,
    )

    # 4) Apply the merged data to the DOCX.
    run_cmd(
        [
            python,
            str(SCRIPT_DIR / "updated_word.py"),
            word_path,
            updated_json_path,
            final_docx_path,
        ],
        cwd=temp_dir,
    )

    return final_docx_path
# Gradio front end: two file uploads in, the updated Word document out.
# type="filepath" makes Gradio pass each upload to process_files as a path.
iface = gr.Interface(
    title="Red Text Replacer",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF.",
    fn=process_files,
    inputs=[
        gr.File(type="filepath", label="Upload PDF File"),
        gr.File(type="filepath", label="Upload Word File"),
    ],
    outputs=gr.File(label="Download Updated Word File"),
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()