# Legal_OCR / app.py
import os

# Ensure Poppler is available before pdf2image is used (not preinstalled on the Space)
os.system("apt-get update -y && apt-get install -y poppler-utils")

import re
import fitz  # PyMuPDF
import numpy as np
import spacy
import gradio as gr
from spacy.matcher import Matcher
from paddleocr import PPStructure
from pdf2image import convert_from_path
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
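# On Hugging Face Spaces, listing `poppler-utils` in a `packages.txt` file is the
# cleaner way to install this dependency; the apt-get call above is a runtime fallback.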
# --- Initialization ---
structure_engine = PPStructure(table=True, ocr=True, layout=True)
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Regex & matcher setup
date_pattern = r"\d{2}-[A-Za-z]{3}-\d{2}|\d{2}\.\d{2}\.\d{2}"  # e.g. 12-Mar-21 or 12.03.21
party_pattern = r"M/s [A-Za-z\s&-]+(?:Consortium)?"  # firm names prefixed with "M/s"
pattern = [{"LOWER": "claimant"}, {"IS_PUNCT": True, "OP": "?"}, {"ENT_TYPE": "ORG"}]
matcher.add("CLAIMANT", [pattern])
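# Illustrative matches for the patterns above (made-up strings):
#   re.findall(date_pattern, "signed 12-Mar-21, amended 01.04.22")
#     -> ['12-Mar-21', '01.04.22']
#   re.findall(party_pattern, "between M/s Alpha Infra Consortium and the State")
#     -> ['M/s Alpha Infra Consortium and the State']  (the character class is greedy)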
# Load Legal-BERT pipelines
ner_model = "nlpaueb/legal-bert-base-uncased"
token_model = AutoModelForTokenClassification.from_pretrained(ner_model)
tokenizer = AutoTokenizer.from_pretrained(ner_model)
ner_pipeline = pipeline("ner", model=token_model, tokenizer=tokenizer, aggregation_strategy="simple")
clf_pipeline = pipeline("text-classification", model=ner_model)
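# NOTE: nlpaueb/legal-bert-base-uncased is a base (masked-LM) checkpoint, so the
# token-classification and text-classification heads above start out randomly
# initialized; swap in fine-tuned checkpoints to get meaningful entity/clause labels.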

# Helper functions
def extract_text_from_pdf(pdf_path):
    """Extract each page's embedded text layer with PyMuPDF."""
    doc = fitz.open(pdf_path)
    pages = []
    for i, page in enumerate(doc, start=1):
        pages.append({"page": i, "text": page.get_text("text") or ""})
    doc.close()
    return pages

def extract_content_from_images(pdf_path):
    """Rasterize each page and run PP-Structure layout analysis / OCR on it."""
    images = convert_from_path(pdf_path)
    results = []
    for i, img in enumerate(images, start=1):
        img_np = np.array(img)
        res = structure_engine(img_np)
        text_lines, tables = [], []
        for block in res:
            if block['type'] == 'text':
                text_lines += [line['text'] for line in block['res'] if 'text' in line]
            elif block['type'] == 'table' and 'html' in block['res']:
                tables.append(block['res']['html'])
        results.append({"page": i, "ocr_text": " ".join(text_lines), "tables_html": tables})
    return results
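# The PP-Structure result shape relied on above, trimmed to the keys this
# script reads (illustrative):
#   [{'type': 'text',  'res': [{'text': '...', 'confidence': 0.97}, ...]},
#    {'type': 'table', 'res': {'html': '<table>...</table>'}}]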

def extract_metadata(text):
    """Pull dates, parties, claimants, tribunals, and clauses out of page text."""
    meta = {"dates": [], "parties": [], "claimants": [], "tribunals": [], "relationships": [], "clauses": []}
    # Regex
    meta['dates'] = re.findall(date_pattern, text)
    meta['parties'] = re.findall(party_pattern, text)
    # spaCy NER + rule-based matcher
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'ORG' and ent.text not in meta['parties']:
            meta['parties'].append(ent.text)
        if ent.label_ == 'GPE':
            meta['tribunals'].append(ent.text)
    for match_id, start, end in matcher(doc):
        meta['claimants'].append(doc[start:end].text)
    # Legal-BERT NER
    for ent in ner_pipeline(text):
        grp = ent['entity_group']
        if grp in ('ORG', 'PARTY') and ent['word'] not in meta['parties']:
            meta['parties'].append(ent['word'])
        if grp == 'GPE' and ent['word'] not in meta['tribunals']:
            meta['tribunals'].append(ent['word'])
    # Clause classification (sentence-level; keep only confident predictions)
    for sent in text.split('. '):
        if len(sent) < 10:
            continue
        try:
            res = clf_pipeline(sent)[0]
            if res['score'] > 0.7:
                meta['clauses'].append({'type': res['label'], 'text': sent})
        except Exception:
            continue
    return meta
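# Optional helper (a minimal sketch, not wired into extract_metadata above): the
# regex, spaCy, and Legal-BERT passes can emit near-duplicates, so a caller may
# want to dedupe each list while keeping first-seen order.
def dedupe(values):
    seen = set()
    return [v for v in values if not (v in seen or seen.add(v))]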

def process_pdf(file_obj):
    # Gradio passes the upload as a temp file; use its path directly
    pdf_path = file_obj.name
    # 1. Embedded text layer
    text_pages = extract_text_from_pdf(pdf_path)
    # 2. OCR & tables from page images
    img_content = extract_content_from_images(pdf_path)
    # 3. Per-page metadata from the text layer
    metadata = [{"page": page['page'], "metadata": extract_metadata(page['text'])}
                for page in text_pages]
    # Combine into one JSON-serializable result
    output = {
        "text_pages": text_pages,
        "image_content": img_content,
        "metadata": metadata,
    }
    return output
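# The result serializes to JSON along these lines (illustrative):
#   {"text_pages":    [{"page": 1, "text": "..."}],
#    "image_content": [{"page": 1, "ocr_text": "...", "tables_html": ["<table>..."]}],
#    "metadata":      [{"page": 1, "metadata": {"dates": [...], "parties": [...], ...}}]}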

# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", file_types=['.pdf']),
    outputs=gr.JSON(label="Extraction Result"),
    title="PDF OCR & Metadata Extractor",
    description="Upload a PDF, wait for processing, and view structured JSON output including text, OCR, tables, and metadata.",
)

if __name__ == '__main__':
    iface.launch()
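# When running inside a plain container rather than on Spaces, binding to all
# interfaces is a common tweak (an assumption, not required here):
#   iface.launch(server_name="0.0.0.0", server_port=7860)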