|
import json
import os
import shutil
import subprocess
import tempfile
from pathlib import Path
|
|
|
|
|
os.environ["HOME"] = "/tmp" |
|
Path("/tmp").mkdir(parents=True, exist_ok=True) |
|
|
|
|
|
os.environ["XDG_CACHE_HOME"] = "/tmp" |
|
os.environ["STREAMLIT_CACHE_DIR"] = "/tmp" |
|
os.environ["STREAMLIT_GLOBAL_DATA_DIR"] = "/tmp" |
|
os.environ["STREAMLIT_RUNTIME_DIR"] = "/tmp" |
|
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit" |
|
Path("/tmp/.streamlit").mkdir(parents=True, exist_ok=True) |
|
|
|
|
|
os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false" |
|
os.environ.setdefault("STREAMLIT_SERVER_ENABLE_CORS", "false") |
|
os.environ.setdefault("STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION", "false") |
|
|
|
import streamlit as st |
|
import langextract as lx |
|
|
|
|
|
st.set_page_config(page_title="LangmyOCR (Streamlit)", layout="wide") |
|
st.title("LangmyOCR: OCRmyPDF + LangExtract (Streamlit Demo)") |
|
st.caption("先 OCR,后(可选)结构化抽取与交互式复核。数据仅用于会话处理。") |
|
|
|
|
|
def has_bin(name: str) -> bool: |
|
return subprocess.run(["bash", "-lc", f"command -v {name} >/dev/null 2>&1"]).returncode == 0 |
|
|
|
def run_ocr(pdf_file, langs: str, rotate_pages: bool, deskew: bool, clean: bool, |
|
optimize_level: int, force_ocr: bool, skip_text: bool, export_sidecar: bool): |
|
if pdf_file is None: |
|
st.error("请先上传 PDF。") |
|
return None, None, None |
|
|
|
if not has_bin("ocrmypdf"): |
|
st.error("系统未检测到 ocrmypdf,可检查 Docker/依赖安装。") |
|
return None, None, None |
|
|
|
|
|
try: |
|
pdf_file.seek(0) |
|
pdf_content = pdf_file.read() |
|
if not pdf_content: |
|
st.error("PDF 文件内容为空。") |
|
return None, None, None |
|
except Exception as e: |
|
st.error(f"读取 PDF 文件失败:{e}") |
|
return None, None, None |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp: |
|
tmp.write(pdf_content) |
|
in_path = Path(tmp.name) |
|
|
|
work = Path(tempfile.mkdtemp(prefix="ocr_")) |
|
out_pdf = work / "output_ocr.pdf" |
|
sidecar = work / "out.txt" |
|
|
|
cmd = ["ocrmypdf", "-l", langs, str(in_path), str(out_pdf)] |
|
|
|
if rotate_pages: cmd.insert(1, "--rotate-pages") |
|
if deskew: cmd.insert(1, "--deskew") |
|
if clean: cmd.insert(1, "--clean") |
|
cmd[1:1] = ["--optimize", str(optimize_level)] |
|
if skip_text: cmd.insert(1, "--skip-text") |
|
if force_ocr: cmd.insert(1, "--force-ocr") |
|
if export_sidecar: cmd[1:1] = ["--sidecar", str(sidecar)] |
|
cmd[1:1] = ["--output-type", "pdfa"] |
|
|
|
with st.status("正在执行 OCR …", expanded=False) as s: |
|
proc = subprocess.run(cmd, capture_output=True, text=True) |
|
if proc.returncode != 0: |
|
s.update(label="OCR 失败", state="error") |
|
st.error(f"OCR 失败(退出码 {proc.returncode})") |
|
st.code(proc.stderr[-2000:], language="bash") |
|
return None, None, None |
|
s.update(label="OCR 完成", state="complete") |
|
|
|
preview = "" |
|
sidecar_path = None |
|
if export_sidecar and sidecar.exists(): |
|
sidecar_path = str(sidecar) |
|
try: |
|
preview = sidecar.read_text(encoding="utf-8", errors="ignore")[:3000] |
|
except Exception: |
|
preview = "(sidecar 文本预览读取失败)" |
|
|
|
return str(out_pdf), sidecar_path, preview |
|
|
|
|
|
def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str): |
|
if not sidecar_text: |
|
return None, None, "没有可供抽取的文本。" |
|
if provider == "None": |
|
return None, None, "未选择模型,跳过抽取。" |
|
|
|
|
|
fence_output = True |
|
use_schema_constraints = False |
|
if provider == "Gemini": |
|
api_key = os.environ.get("LANGEXTRACT_API_KEY") |
|
if not api_key: |
|
return None, None, "未检测到 Gemini API Key(LANGEXTRACT_API_KEY)。" |
|
elif provider == "OpenAI": |
|
api_key = os.environ.get("OPENAI_API_KEY") |
|
if not api_key: |
|
return None, None, "未检测到 OpenAI API Key(OPENAI_API_KEY)。" |
|
else: |
|
return None, None, "未知的 provider。" |
|
|
|
|
|
strict_prompt = ( |
|
"You are an information extraction engine. " |
|
"Extract legal entities, events, relationships, and evidence anchors from the input text. " |
|
"Return ONLY a JSON array, no prose, no markdown, no comments. " |
|
"Schema per item: {" |
|
"\"class\": one of [\"party\",\"event\",\"date\",\"relation\",\"evidence\"], " |
|
"\"text\": string (exact span), " |
|
"\"attributes\": object (key-value), " |
|
"\"source_hint\": string (optional page/line) " |
|
"}." |
|
) |
|
|
|
|
|
examples = [ |
|
lx.data.ExampleData( |
|
text="On 15 February 2022, Dr Gavin Soo completed a medicolegal report to Walker Law Group.", |
|
extractions=[ |
|
lx.data.Extraction( |
|
extraction_class="party", |
|
extraction_text="Walker Law Group", |
|
attributes={"role": "law_firm"}, |
|
), |
|
lx.data.Extraction( |
|
extraction_class="event", |
|
extraction_text="completed a medicolegal report", |
|
attributes={"actor": "Dr Gavin Soo"}, |
|
), |
|
lx.data.Extraction( |
|
extraction_class="date", |
|
extraction_text="15 February 2022", |
|
attributes={} |
|
), |
|
], |
|
) |
|
] |
|
|
|
|
|
work = Path(tempfile.mkdtemp(prefix="lx_")) |
|
jsonl_path = work / "extractions.jsonl" |
|
html_path = work / "review.html" |
|
raw_path1 = work / "raw_attempt1.txt" |
|
raw_path2 = work / "raw_attempt2.txt" |
|
|
|
def _try_extract(prompt_text): |
|
|
|
return lx.extract( |
|
text_or_documents=sidecar_text[:15000], |
|
prompt_description=prompt_text.strip(), |
|
examples=examples, |
|
model_id=model_id.strip(), |
|
api_key=api_key, |
|
fence_output=fence_output, |
|
use_schema_constraints=use_schema_constraints, |
|
) |
|
|
|
with st.status("正在进行结构化抽取 …", expanded=False) as s: |
|
try: |
|
result = _try_extract(strict_prompt) |
|
except Exception as e1: |
|
|
|
hard_prompt = strict_prompt + " Output must be a compact JSON array. Do not include any other text." |
|
try: |
|
result = _try_extract(hard_prompt) |
|
except Exception as e2: |
|
s.update(label="抽取失败", state="error") |
|
return None, None, f"LangExtract 抽取失败:{e2}" |
|
|
|
|
|
try: |
|
lx.io.save_annotated_documents([result], output_name=str(jsonl_path)) |
|
html_content = lx.visualize(str(jsonl_path)) |
|
html_path.write_text(html_content, encoding="utf-8") |
|
except Exception as e: |
|
s.update(label="可视化失败", state="error") |
|
return None, None, f"可视化失败:{e}" |
|
|
|
s.update(label="抽取完成", state="complete") |
|
|
|
return str(html_path), str(jsonl_path), "抽取成功。" |
|
|
|
|
|
|
|
with st.sidebar: |
|
st.header("参数") |
|
|
|
|
|
with st.form("run_form", clear_on_submit=False): |
|
pdf = st.file_uploader("上传扫描 PDF", type=["pdf"], accept_multiple_files=False, key="pdf_uploader") |
|
|
|
langs = st.text_input("OCR 语言(Tesseract 语法)", value="eng+chi_sim") |
|
col_a, col_b, col_c = st.columns(3) |
|
with col_a: |
|
rotate_pages = st.checkbox("自动旋转校正", value=True) |
|
with col_b: |
|
deskew = st.checkbox("去偏斜", value=True) |
|
with col_c: |
|
clean = st.checkbox("清理底噪/污渍", value=True) |
|
|
|
optimize_level = st.select_slider("优化级别", options=[0,1,2], value=1) |
|
skip_text = st.checkbox("跳过已有文本层 (--skip-text)", value=True) |
|
force_ocr = st.checkbox("强制重做文本层 (--force-ocr) [谨慎]", value=False) |
|
export_sidecar = st.checkbox("导出 sidecar 文本", value=True) |
|
|
|
st.markdown("---") |
|
provider = st.selectbox("抽取提供方", ["None", "Gemini", "OpenAI"], index=0) |
|
model_id = st.text_input("模型 ID", value="gemini-2.5-flash") |
|
prompt = st.text_area( |
|
"抽取任务描述(建议按你的法律场景定制)", |
|
value=("Extract legal entities, events, relationships, and evidence anchors. " |
|
"Return JSON objects with fields: {party, role, event, date, relation, citation, quote}. " |
|
"Preserve exact source spans for traceability."), |
|
height=160, |
|
) |
|
|
|
submitted = st.form_submit_button("运行 OCR(+可选抽取)", type="primary") |
|
|
|
|
|
col1, col2 = st.columns([1,1]) |
|
with col1: |
|
st.subheader("OCR 结果") |
|
ocr_pdf_slot = st.empty() |
|
sidecar_slot = st.empty() |
|
preview_slot = st.empty() |
|
|
|
with col2: |
|
st.subheader("抽取与复核") |
|
html_slot = st.empty() |
|
jsonl_slot = st.empty() |
|
status_slot = st.empty() |
|
|
|
|
|
if "pdf_uploader" in st.session_state and st.session_state["pdf_uploader"]: |
|
st.sidebar.success(f"已选择:{st.session_state['pdf_uploader'].name} " |
|
f"({st.session_state['pdf_uploader'].size/1024:.1f} KB)") |
|
|
|
if submitted: |
|
|
|
if pdf is None: |
|
st.error("PDF 为 None - 检查文件上传") |
|
else: |
|
st.info(f"PDF 文件信息:名称={pdf.name}, 大小={pdf.size} bytes") |
|
|
|
out_pdf, sidecar_path, preview = run_ocr( |
|
pdf, langs, rotate_pages, deskew, clean, optimize_level, |
|
force_ocr, skip_text, export_sidecar |
|
) |
|
if out_pdf: |
|
with open(out_pdf, "rb") as f: |
|
ocr_pdf_slot.download_button("下载 OCR 后 PDF", f, file_name="output_ocr.pdf") |
|
if sidecar_path: |
|
with open(sidecar_path, "rb") as f: |
|
sidecar_slot.download_button("下载 sidecar 文本", f, file_name="out.txt") |
|
preview_slot.text_area("sidecar 文本预览(前 3000 字)", value=preview, height=240) |
|
|
|
if sidecar_path and provider != "None": |
|
txt = Path(sidecar_path).read_text(encoding="utf-8", errors="ignore") |
|
html_path, jsonl_path, status = run_extract(txt, provider, model_id, prompt) |
|
status_slot.info(status) |
|
if html_path and Path(html_path).exists(): |
|
html_content = Path(html_path).read_text(encoding="utf-8", errors="ignore") |
|
st.components.v1.html(html_content, height=650, scrolling=True) |
|
if jsonl_path and Path(jsonl_path).exists(): |
|
with open(jsonl_path, "rb") as f: |
|
jsonl_slot.download_button("下载抽取结果 JSONL", f, file_name="extractions.jsonl") |