LangmyOCR / app.py
nanoapple's picture
Update app.py
1788c95 verified
import os
import json
import tempfile
import subprocess
from pathlib import Path
# 统一把 HOME 指到 /tmp
os.environ["HOME"] = "/tmp"
Path("/tmp").mkdir(parents=True, exist_ok=True)
# 再确保所有 streamlit 相关路径也指向 /tmp
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["STREAMLIT_CACHE_DIR"] = "/tmp"
os.environ["STREAMLIT_GLOBAL_DATA_DIR"] = "/tmp"
os.environ["STREAMLIT_RUNTIME_DIR"] = "/tmp"
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"
Path("/tmp/.streamlit").mkdir(parents=True, exist_ok=True)
# 可选:关掉使用统计
os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
os.environ.setdefault("STREAMLIT_SERVER_ENABLE_CORS", "false")
os.environ.setdefault("STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION", "false")
import streamlit as st
import langextract as lx # pip: langextract[openai]
# 你的原始页面设置
st.set_page_config(page_title="LangmyOCR (Streamlit)", layout="wide")
st.title("LangmyOCR: OCRmyPDF + LangExtract (Streamlit Demo)")
st.caption("先 OCR,后(可选)结构化抽取与交互式复核。数据仅用于会话处理。")
# ---------------- Utilities ----------------
def has_bin(name: str) -> bool:
return subprocess.run(["bash", "-lc", f"command -v {name} >/dev/null 2>&1"]).returncode == 0
def run_ocr(pdf_file, langs: str, rotate_pages: bool, deskew: bool, clean: bool,
optimize_level: int, force_ocr: bool, skip_text: bool, export_sidecar: bool):
if pdf_file is None:
st.error("请先上传 PDF。")
return None, None, None
if not has_bin("ocrmypdf"):
st.error("系统未检测到 ocrmypdf,可检查 Docker/依赖安装。")
return None, None, None
# 修复:重置文件指针到开头,然后读取内容
try:
pdf_file.seek(0) # 重置文件指针
pdf_content = pdf_file.read()
if not pdf_content:
st.error("PDF 文件内容为空。")
return None, None, None
except Exception as e:
st.error(f"读取 PDF 文件失败:{e}")
return None, None, None
# 保存上传文件到临时路径
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
tmp.write(pdf_content) # 使用读取到的内容
in_path = Path(tmp.name)
work = Path(tempfile.mkdtemp(prefix="ocr_"))
out_pdf = work / "output_ocr.pdf"
sidecar = work / "out.txt"
cmd = ["ocrmypdf", "-l", langs, str(in_path), str(out_pdf)]
# 选项(插入到命令靠前位置,保持可读)
if rotate_pages: cmd.insert(1, "--rotate-pages")
if deskew: cmd.insert(1, "--deskew")
if clean: cmd.insert(1, "--clean")
cmd[1:1] = ["--optimize", str(optimize_level)]
if skip_text: cmd.insert(1, "--skip-text")
if force_ocr: cmd.insert(1, "--force-ocr")
if export_sidecar: cmd[1:1] = ["--sidecar", str(sidecar)]
cmd[1:1] = ["--output-type", "pdfa"] # 归档友好
with st.status("正在执行 OCR …", expanded=False) as s:
proc = subprocess.run(cmd, capture_output=True, text=True)
if proc.returncode != 0:
s.update(label="OCR 失败", state="error")
st.error(f"OCR 失败(退出码 {proc.returncode})")
st.code(proc.stderr[-2000:], language="bash")
return None, None, None
s.update(label="OCR 完成", state="complete")
preview = ""
sidecar_path = None
if export_sidecar and sidecar.exists():
sidecar_path = str(sidecar)
try:
preview = sidecar.read_text(encoding="utf-8", errors="ignore")[:3000]
except Exception:
preview = "(sidecar 文本预览读取失败)"
return str(out_pdf), sidecar_path, preview
def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
if not sidecar_text:
return None, None, "没有可供抽取的文本。"
if provider == "None":
return None, None, "未选择模型,跳过抽取。"
# 1) 读取 Key,并统一默认打开 fence_output
fence_output = True # << 对 Gemini 也打开
use_schema_constraints = False # 先不启 Schema(必要时再开)
if provider == "Gemini":
api_key = os.environ.get("LANGEXTRACT_API_KEY")
if not api_key:
return None, None, "未检测到 Gemini API Key(LANGEXTRACT_API_KEY)。"
elif provider == "OpenAI":
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
return None, None, "未检测到 OpenAI API Key(OPENAI_API_KEY)。"
else:
return None, None, "未知的 provider。"
# 2) 收紧提示语(覆盖面向法律的 schema),严格要求“只返回 JSON 数组”
strict_prompt = (
"You are an information extraction engine. "
"Extract legal entities, events, relationships, and evidence anchors from the input text. "
"Return ONLY a JSON array, no prose, no markdown, no comments. "
"Schema per item: {"
"\"class\": one of [\"party\",\"event\",\"date\",\"relation\",\"evidence\"], "
"\"text\": string (exact span), "
"\"attributes\": object (key-value), "
"\"source_hint\": string (optional page/line) "
"}."
)
# 3) 精简可运行的 few-shot(与法律场景贴近)
examples = [
lx.data.ExampleData(
text="On 15 February 2022, Dr Gavin Soo completed a medicolegal report to Walker Law Group.",
extractions=[
lx.data.Extraction(
extraction_class="party",
extraction_text="Walker Law Group",
attributes={"role": "law_firm"},
),
lx.data.Extraction(
extraction_class="event",
extraction_text="completed a medicolegal report",
attributes={"actor": "Dr Gavin Soo"},
),
lx.data.Extraction(
extraction_class="date",
extraction_text="15 February 2022",
attributes={}
),
],
)
]
# 4) 先跑一次;若解析失败,再以更强硬提示重试一次
work = Path(tempfile.mkdtemp(prefix="lx_"))
jsonl_path = work / "extractions.jsonl"
html_path = work / "review.html"
raw_path1 = work / "raw_attempt1.txt"
raw_path2 = work / "raw_attempt2.txt"
def _try_extract(prompt_text):
# LangExtract 没有公开 raw 输出参数,我们用 try/except 捕获并让其保存在日志(同时缩短输入验证)
return lx.extract(
text_or_documents=sidecar_text[:15000], # 先限长,避免超长触发安全策略
prompt_description=prompt_text.strip(),
examples=examples,
model_id=model_id.strip(),
api_key=api_key,
fence_output=fence_output,
use_schema_constraints=use_schema_constraints,
)
with st.status("正在进行结构化抽取 …", expanded=False) as s:
try:
result = _try_extract(strict_prompt)
except Exception as e1:
# 第一次失败:很可能是返回了非 JSON。我们把提示再加强,强调 “only JSON array”
hard_prompt = strict_prompt + " Output must be a compact JSON array. Do not include any other text."
try:
result = _try_extract(hard_prompt)
except Exception as e2:
s.update(label="抽取失败", state="error")
return None, None, f"LangExtract 抽取失败:{e2}"
# 保存结果并可视化
try:
lx.io.save_annotated_documents([result], output_name=str(jsonl_path))
html_content = lx.visualize(str(jsonl_path))
html_path.write_text(html_content, encoding="utf-8")
except Exception as e:
s.update(label="可视化失败", state="error")
return None, None, f"可视化失败:{e}"
s.update(label="抽取完成", state="complete")
return str(html_path), str(jsonl_path), "抽取成功。"
# ---------------- UI ----------------
with st.sidebar:
st.header("参数")
# 用 form 把"上传 + 参数 + 提交"打包,避免按钮重跑导致 file_uploader 丢值
with st.form("run_form", clear_on_submit=False):
pdf = st.file_uploader("上传扫描 PDF", type=["pdf"], accept_multiple_files=False, key="pdf_uploader")
langs = st.text_input("OCR 语言(Tesseract 语法)", value="eng+chi_sim")
col_a, col_b, col_c = st.columns(3)
with col_a:
rotate_pages = st.checkbox("自动旋转校正", value=True)
with col_b:
deskew = st.checkbox("去偏斜", value=True)
with col_c:
clean = st.checkbox("清理底噪/污渍", value=True)
optimize_level = st.select_slider("优化级别", options=[0,1,2], value=1)
skip_text = st.checkbox("跳过已有文本层 (--skip-text)", value=True)
force_ocr = st.checkbox("强制重做文本层 (--force-ocr) [谨慎]", value=False)
export_sidecar = st.checkbox("导出 sidecar 文本", value=True)
st.markdown("---")
provider = st.selectbox("抽取提供方", ["None", "Gemini", "OpenAI"], index=0)
model_id = st.text_input("模型 ID", value="gemini-2.5-flash")
prompt = st.text_area(
"抽取任务描述(建议按你的法律场景定制)",
value=("Extract legal entities, events, relationships, and evidence anchors. "
"Return JSON objects with fields: {party, role, event, date, relation, citation, quote}. "
"Preserve exact source spans for traceability."),
height=160,
)
submitted = st.form_submit_button("运行 OCR(+可选抽取)", type="primary")
col1, col2 = st.columns([1,1])
with col1:
st.subheader("OCR 结果")
ocr_pdf_slot = st.empty()
sidecar_slot = st.empty()
preview_slot = st.empty()
with col2:
st.subheader("抽取与复核")
html_slot = st.empty()
jsonl_slot = st.empty()
status_slot = st.empty()
# 辅助:显示文件已被接收(提交前就可见,便于确认)
if "pdf_uploader" in st.session_state and st.session_state["pdf_uploader"]:
st.sidebar.success(f"已选择:{st.session_state['pdf_uploader'].name} "
f"({st.session_state['pdf_uploader'].size/1024:.1f} KB)")
if submitted:
# 添加调试信息
if pdf is None:
st.error("PDF 为 None - 检查文件上传")
else:
st.info(f"PDF 文件信息:名称={pdf.name}, 大小={pdf.size} bytes")
out_pdf, sidecar_path, preview = run_ocr(
pdf, langs, rotate_pages, deskew, clean, optimize_level,
force_ocr, skip_text, export_sidecar
)
if out_pdf:
with open(out_pdf, "rb") as f:
ocr_pdf_slot.download_button("下载 OCR 后 PDF", f, file_name="output_ocr.pdf")
if sidecar_path:
with open(sidecar_path, "rb") as f:
sidecar_slot.download_button("下载 sidecar 文本", f, file_name="out.txt")
preview_slot.text_area("sidecar 文本预览(前 3000 字)", value=preview, height=240)
if sidecar_path and provider != "None":
txt = Path(sidecar_path).read_text(encoding="utf-8", errors="ignore")
html_path, jsonl_path, status = run_extract(txt, provider, model_id, prompt)
status_slot.info(status)
if html_path and Path(html_path).exists():
html_content = Path(html_path).read_text(encoding="utf-8", errors="ignore")
st.components.v1.html(html_content, height=650, scrolling=True)
if jsonl_path and Path(jsonl_path).exists():
with open(jsonl_path, "rb") as f:
jsonl_slot.download_button("下载抽取结果 JSONL", f, file_name="extractions.jsonl")