"""Streamlit demo: OCR a PDF with OCRmyPDF, then optionally run LangExtract.

The script is executed top to bottom by Streamlit on every rerun. The
environment variables below are set before ``streamlit`` is imported.
"""

import os
import json
import shutil
import subprocess
import tempfile
from pathlib import Path

# Point HOME at /tmp.
# NOTE(review): presumably the deployment's default HOME is not writable -- confirm.
os.environ["HOME"] = "/tmp"
Path("/tmp").mkdir(parents=True, exist_ok=True)

# Also point every Streamlit-related path at /tmp.
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["STREAMLIT_CACHE_DIR"] = "/tmp"
os.environ["STREAMLIT_GLOBAL_DATA_DIR"] = "/tmp"
os.environ["STREAMLIT_RUNTIME_DIR"] = "/tmp"
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"
Path("/tmp/.streamlit").mkdir(parents=True, exist_ok=True)

# Optional: turn off usage statistics.
os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
os.environ.setdefault("STREAMLIT_SERVER_ENABLE_CORS", "false")
os.environ.setdefault("STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION", "false")

# These imports intentionally come after the environment setup above.
import streamlit as st
import langextract as lx  # pip: langextract[openai]

# Original page setup.
st.set_page_config(page_title="LangmyOCR (Streamlit)", layout="wide")
st.title("LangmyOCR: OCRmyPDF + LangExtract (Streamlit Demo)")
st.caption("先 OCR,后(可选)结构化抽取与交互式复核。数据仅用于会话处理。")


# ---------------- Utilities ----------------
def has_bin(name: str) -> bool:
    """Return True when an executable called *name* is found on PATH.

    ``shutil.which`` searches the PATH of the current process, which is the
    same PATH ``subprocess.run`` uses when the tool is launched later, and
    it avoids interpolating *name* into a shell command line.
    """
    return shutil.which(name) is not None


def _build_ocr_cmd(in_path, out_pdf, sidecar, langs, rotate_pages, deskew,
                   clean, optimize_level, force_ocr, skip_text, export_sidecar):
    """Assemble the ``ocrmypdf`` argument list for one run."""
    cmd = ["ocrmypdf", "--output-type", "pdfa"]  # PDF/A: archive friendly
    if export_sidecar:
        cmd += ["--sidecar", str(sidecar)]
    if force_ocr:
        cmd.append("--force-ocr")
    if skip_text:
        cmd.append("--skip-text")
    cmd += ["--optimize", str(optimize_level)]
    if clean:
        cmd.append("--clean")
    if deskew:
        cmd.append("--deskew")
    if rotate_pages:
        cmd.append("--rotate-pages")
    cmd += ["-l", langs, str(in_path), str(out_pdf)]
    return cmd


def run_ocr(pdf_file, langs: str, rotate_pages: bool, deskew: bool, clean: bool,
            optimize_level: int, force_ocr: bool, skip_text: bool,
            export_sidecar: bool):
    """Run OCRmyPDF on an uploaded PDF.

    Args:
        pdf_file: File-like object supporting ``seek`` and ``read`` (the
            value returned by ``st.file_uploader``), or None.
        langs: Language string passed to ``ocrmypdf -l``.
        rotate_pages, deskew, clean: Toggle the matching ocrmypdf flags.
        optimize_level: Value passed to ``--optimize``.
        force_ocr, skip_text: Toggle ``--force-ocr`` / ``--skip-text``.
            OCRmyPDF accepts only one of them, so selecting both is
            rejected here before the tool is launched.
        export_sidecar: When True, also request a ``--sidecar`` text file.

    Returns:
        ``(output_pdf_path, sidecar_path_or_None, preview_text)`` on success,
        ``(None, None, None)`` after reporting an error through ``st.error``.
    """
    failure = (None, None, None)

    if pdf_file is None:
        st.error("请先上传 PDF。")
        return failure
    if not has_bin("ocrmypdf"):
        st.error("系统未检测到 ocrmypdf,可检查 Docker/依赖安装。")
        return failure
    if force_ocr and skip_text:
        st.error("--skip-text 与 --force-ocr 不能同时启用,请只选择其一。")
        return failure

    # Rewind first: the upload may already have been read during this session.
    try:
        pdf_file.seek(0)
        pdf_content = pdf_file.read()
    except Exception as e:
        st.error(f"读取 PDF 文件失败:{e}")
        return failure
    if not pdf_content:
        st.error("PDF 文件内容为空。")
        return failure

    work = Path(tempfile.mkdtemp(prefix="ocr_"))
    out_pdf = work / "output_ocr.pdf"
    sidecar = work / "out.txt"

    # Persist the upload so ocrmypdf can read it from disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(pdf_content)
        in_path = Path(tmp.name)

    cmd = _build_ocr_cmd(
        in_path, out_pdf, sidecar, langs.strip(), rotate_pages, deskew, clean,
        optimize_level, force_ocr, skip_text, export_sidecar,
    )

    try:
        with st.status("正在执行 OCR …", expanded=False) as s:
            proc = subprocess.run(
                cmd, capture_output=True, text=True,
                encoding="utf-8", errors="replace",
            )
            if proc.returncode != 0:
                s.update(label="OCR 失败", state="error")
                st.error(f"OCR 失败(退出码 {proc.returncode})")
                st.code(proc.stderr[-2000:], language="bash")
                # Nothing usable was produced; drop the work directory.
                shutil.rmtree(work, ignore_errors=True)
                return failure
            s.update(label="OCR 完成", state="complete")
    finally:
        # The temporary input copy is no longer needed once ocrmypdf exits.
        in_path.unlink(missing_ok=True)

    preview = ""
    sidecar_path = None
    if export_sidecar and sidecar.exists():
        sidecar_path = str(sidecar)
        try:
            preview = sidecar.read_text(encoding="utf-8", errors="ignore")[:3000]
        except OSError:
            preview = "(sidecar 文本预览读取失败)"
    return str(out_pdf), sidecar_path, preview


def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
    """Run LangExtract over OCR text and write JSONL + HTML review files.

    Args:
        sidecar_text: Text to extract from; only the first 15000 characters
            are sent to the model.
        provider: ``"None"``, ``"Gemini"`` or ``"OpenAI"``; selects which
            environment variable supplies the API key.
        model_id: Model identifier forwarded to ``lx.extract``.
        prompt: Accepted for interface compatibility.
            NOTE(review): this value is not used; the built-in strict prompt
            below is always sent instead -- confirm that this is intended.

    Returns:
        ``(html_path, jsonl_path, status_message)``; both paths are None
        when extraction is skipped or fails.
    """
    if not sidecar_text:
        return None, None, "没有可供抽取的文本。"
    if provider == "None":
        return None, None, "未选择模型,跳过抽取。"

    # 1) Read the API key; fenced output is enabled for every provider
    #    (Gemini included) and schema constraints stay off for now.
    fence_output = True
    use_schema_constraints = False
    key_env = {"Gemini": "LANGEXTRACT_API_KEY", "OpenAI": "OPENAI_API_KEY"}
    env_name = key_env.get(provider)
    if env_name is None:
        return None, None, "未知的 provider。"
    api_key = os.environ.get(env_name)
    if not api_key:
        return None, None, f"未检测到 {provider} API Key({env_name})。"

    # 2) Tightened prompt that demands a bare JSON array.
    strict_prompt = (
        "You are an information extraction engine. "
        "Extract legal entities, events, relationships, and evidence anchors from the input text. "
        "Return ONLY a JSON array, no prose, no markdown, no comments. "
        "Schema per item: {"
        "\"class\": one of [\"party\",\"event\",\"date\",\"relation\",\"evidence\"], "
        "\"text\": string (exact span), "
        "\"attributes\": object (key-value), "
        "\"source_hint\": string (optional page/line) "
        "}."
    )

    # 3) Minimal runnable few-shot example close to the legal use case.
    examples = [
        lx.data.ExampleData(
            text="On 15 February 2022, Dr Gavin Soo completed a medicolegal report to Walker Law Group.",
            extractions=[
                lx.data.Extraction(
                    extraction_class="party",
                    extraction_text="Walker Law Group",
                    attributes={"role": "law_firm"},
                ),
                lx.data.Extraction(
                    extraction_class="event",
                    extraction_text="completed a medicolegal report",
                    attributes={"actor": "Dr Gavin Soo"},
                ),
                lx.data.Extraction(
                    extraction_class="date",
                    extraction_text="15 February 2022",
                    attributes={},
                ),
            ],
        )
    ]

    work = Path(tempfile.mkdtemp(prefix="lx_"))
    jsonl_path = work / "extractions.jsonl"
    html_path = work / "review.html"

    def _try_extract(prompt_text):
        # Input is truncated so that very long documents are not sent whole.
        return lx.extract(
            text_or_documents=sidecar_text[:15000],
            prompt_description=prompt_text.strip(),
            examples=examples,
            model_id=model_id.strip(),
            api_key=api_key,
            fence_output=fence_output,
            use_schema_constraints=use_schema_constraints,
        )

    # 4) Try once; on any failure retry once with an even stricter prompt.
    with st.status("正在进行结构化抽取 …", expanded=False) as s:
        try:
            result = _try_extract(strict_prompt)
        except Exception:
            hard_prompt = (
                strict_prompt
                + " Output must be a compact JSON array. Do not include any other text."
            )
            try:
                result = _try_extract(hard_prompt)
            except Exception as e2:
                s.update(label="抽取失败", state="error")
                return None, None, f"LangExtract 抽取失败:{e2}"

        # Save the result and build the HTML review page.
        try:
            # Pass output_dir explicitly so nothing is created relative to
            # the current working directory.
            lx.io.save_annotated_documents(
                [result], output_name=jsonl_path.name, output_dir=str(work)
            )
            html_content = lx.visualize(str(jsonl_path))
            # The visualisation may be a str or an object exposing ``.data``.
            html_text = html_content.data if hasattr(html_content, "data") else html_content
            html_path.write_text(html_text, encoding="utf-8")
        except Exception as e:
            s.update(label="可视化失败", state="error")
            return None, None, f"可视化失败:{e}"

        s.update(label="抽取完成", state="complete")
    return str(html_path), str(jsonl_path), "抽取成功。"


# ---------------- UI ----------------
with st.sidebar:
    st.header("参数")
    # Bundle upload + parameters + submit in one form so that a button rerun
    # does not drop the file_uploader value.
    with st.form("run_form", clear_on_submit=False):
        pdf = st.file_uploader(
            "上传扫描 PDF", type=["pdf"], accept_multiple_files=False, key="pdf_uploader"
        )
        langs = st.text_input("OCR 语言(Tesseract 语法)", value="eng+chi_sim")
        col_a, col_b, col_c = st.columns(3)
        with col_a:
            rotate_pages = st.checkbox("自动旋转校正", value=True)
        with col_b:
            deskew = st.checkbox("去偏斜", value=True)
        with col_c:
            clean = st.checkbox("清理底噪/污渍", value=True)
        optimize_level = st.select_slider("优化级别", options=[0, 1, 2], value=1)
        skip_text = st.checkbox("跳过已有文本层 (--skip-text)", value=True)
        force_ocr = st.checkbox("强制重做文本层 (--force-ocr) [谨慎]", value=False)
        export_sidecar = st.checkbox("导出 sidecar 文本", value=True)

        st.markdown("---")
        provider = st.selectbox("抽取提供方", ["None", "Gemini", "OpenAI"], index=0)
        model_id = st.text_input("模型 ID", value="gemini-2.5-flash")
        prompt = st.text_area(
            "抽取任务描述(建议按你的法律场景定制)",
            value=("Extract legal entities, events, relationships, and evidence anchors. "
                   "Return JSON objects with fields: {party, role, event, date, relation, citation, quote}. "
                   "Preserve exact source spans for traceability."),
            height=160,
        )
        submitted = st.form_submit_button("运行 OCR(+可选抽取)", type="primary")

col1, col2 = st.columns([1, 1])
with col1:
    st.subheader("OCR 结果")
    ocr_pdf_slot = st.empty()
    sidecar_slot = st.empty()
    preview_slot = st.empty()
with col2:
    st.subheader("抽取与复核")
    html_slot = st.empty()
    jsonl_slot = st.empty()
    status_slot = st.empty()

# Helper: show that a file has been received (visible before submitting).
if "pdf_uploader" in st.session_state and st.session_state["pdf_uploader"]:
    st.sidebar.success(f"已选择:{st.session_state['pdf_uploader'].name} "
                       f"({st.session_state['pdf_uploader'].size/1024:.1f} KB)")

if submitted:
    # Debug information about the submitted upload.
    if pdf is None:
        st.error("PDF 为 None - 检查文件上传")
    else:
        st.info(f"PDF 文件信息:名称={pdf.name}, 大小={pdf.size} bytes")

    out_pdf, sidecar_path, preview = run_ocr(
        pdf, langs, rotate_pages, deskew, clean, optimize_level,
        force_ocr, skip_text, export_sidecar
    )

    if out_pdf:
        with open(out_pdf, "rb") as f:
            ocr_pdf_slot.download_button("下载 OCR 后 PDF", f, file_name="output_ocr.pdf")

    if sidecar_path:
        with open(sidecar_path, "rb") as f:
            sidecar_slot.download_button("下载 sidecar 文本", f, file_name="out.txt")
        preview_slot.text_area("sidecar 文本预览(前 3000 字)", value=preview, height=240)

    if sidecar_path and provider != "None":
        txt = Path(sidecar_path).read_text(encoding="utf-8", errors="ignore")
        html_path, jsonl_path, status = run_extract(txt, provider, model_id, prompt)
        status_slot.info(status)
        if html_path and Path(html_path).exists():
            html_content = Path(html_path).read_text(encoding="utf-8", errors="ignore")
            # NOTE(review): rendered at the current page position, not in html_slot.
            st.components.v1.html(html_content, height=650, scrolling=True)
        if jsonl_path and Path(jsonl_path).exists():
            with open(jsonl_path, "rb") as f:
                jsonl_slot.download_button("下载抽取结果 JSONL", f, file_name="extractions.jsonl")