# Legal_OCR / app.py
import os

# Ensure Poppler is available before pdf2image is used (not preinstalled on the Space)
os.system("apt-get update -y && apt-get install -y poppler-utils")

import re
import fitz  # PyMuPDF
import numpy as np
import spacy
import gradio as gr
from spacy.matcher import Matcher
from paddleocr import PPStructure
from pdf2image import convert_from_path
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
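# On Hugging Face Spaces, listing `poppler-utils` in a `packages.txt` file is the
# cleaner way to install this dependency; the apt-get call above is a runtime fallback.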
# --- Initialization ---
structure_engine = PPStructure(table=True, ocr=True, layout=True)
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Regex & matcher setup
date_pattern = r"\d{2}-[A-Za-z]{3}-\d{2}|\d{2}\.\d{2}\.\d{2}"  # e.g. 12-Mar-21 or 12.03.21
party_pattern = r"M/s [A-Za-z\s&-]+(?:Consortium)?"  # firm names prefixed with "M/s"
pattern = [{"LOWER": "claimant"}, {"IS_PUNCT": True, "OP": "?"}, {"ENT_TYPE": "ORG"}]
matcher.add("CLAIMANT", [pattern])
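# Illustrative matches for the patterns above (made-up strings):
#   re.findall(date_pattern, "signed 12-Mar-21, amended 01.04.22")
#     -> ['12-Mar-21', '01.04.22']
#   re.findall(party_pattern, "between M/s Alpha Infra Consortium and the State")
#     -> ['M/s Alpha Infra Consortium and the State']  (the character class is greedy)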
# Load Legal-BERT pipelines
ner_model = "nlpaueb/legal-bert-base-uncased"
token_model = AutoModelForTokenClassification.from_pretrained(ner_model)
tokenizer = AutoTokenizer.from_pretrained(ner_model)
ner_pipeline = pipeline("ner", model=token_model, tokenizer=tokenizer, aggregation_strategy="simple")
clf_pipeline = pipeline("text-classification", model=ner_model)
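# NOTE: nlpaueb/legal-bert-base-uncased is a base (masked-LM) checkpoint, so the
# token-classification and text-classification heads above start out randomly
# initialized; swap in fine-tuned checkpoints to get meaningful entity/clause labels.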

# Helper functions
def extract_text_from_pdf(pdf_path):
    """Extract each page's embedded text layer with PyMuPDF."""
    doc = fitz.open(pdf_path)
    pages = []
    for i, page in enumerate(doc, start=1):
        pages.append({"page": i, "text": page.get_text("text") or ""})
    doc.close()
    return pages

def extract_content_from_images(pdf_path):
    """Rasterize each page and run PP-Structure layout analysis / OCR on it."""
    images = convert_from_path(pdf_path)
    results = []
    for i, img in enumerate(images, start=1):
        img_np = np.array(img)
        res = structure_engine(img_np)
        text_lines, tables = [], []
        for block in res:
            if block['type'] == 'text':
                text_lines += [line['text'] for line in block['res'] if 'text' in line]
            elif block['type'] == 'table' and 'html' in block['res']:
                tables.append(block['res']['html'])
        results.append({"page": i, "ocr_text": " ".join(text_lines), "tables_html": tables})
    return results
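# The PP-Structure result shape relied on above, trimmed to the keys this
# script reads (illustrative):
#   [{'type': 'text',  'res': [{'text': '...', 'confidence': 0.97}, ...]},
#    {'type': 'table', 'res': {'html': '<table>...</table>'}}]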

def extract_metadata(text):
    """Pull dates, parties, claimants, tribunals, and clauses out of page text."""
    meta = {"dates": [], "parties": [], "claimants": [], "tribunals": [], "relationships": [], "clauses": []}
    # Regex
    meta['dates'] = re.findall(date_pattern, text)
    meta['parties'] = re.findall(party_pattern, text)
    # spaCy NER + rule-based matcher
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'ORG' and ent.text not in meta['parties']:
            meta['parties'].append(ent.text)
        if ent.label_ == 'GPE':
            meta['tribunals'].append(ent.text)
    for match_id, start, end in matcher(doc):
        meta['claimants'].append(doc[start:end].text)
    # Legal-BERT NER
    for ent in ner_pipeline(text):
        grp = ent['entity_group']
        if grp in ('ORG', 'PARTY') and ent['word'] not in meta['parties']:
            meta['parties'].append(ent['word'])
        if grp == 'GPE' and ent['word'] not in meta['tribunals']:
            meta['tribunals'].append(ent['word'])
    # Clause classification (sentence-level; keep only confident predictions)
    for sent in text.split('. '):
        if len(sent) < 10:
            continue
        try:
            res = clf_pipeline(sent)[0]
            if res['score'] > 0.7:
                meta['clauses'].append({'type': res['label'], 'text': sent})
        except Exception:
            continue
    return meta
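# Optional helper (a minimal sketch, not wired into extract_metadata above): the
# regex, spaCy, and Legal-BERT passes can emit near-duplicates, so a caller may
# want to dedupe each list while keeping first-seen order.
def dedupe(values):
    seen = set()
    return [v for v in values if not (v in seen or seen.add(v))]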

def process_pdf(file_obj):
    # Gradio passes the upload as a temp file; use its path directly
    pdf_path = file_obj.name
    # 1. Embedded text layer
    text_pages = extract_text_from_pdf(pdf_path)
    # 2. OCR & tables from page images
    img_content = extract_content_from_images(pdf_path)
    # 3. Per-page metadata from the text layer
    metadata = [{"page": page['page'], "metadata": extract_metadata(page['text'])}
                for page in text_pages]
    # Combine into one JSON-serializable result
    output = {
        "text_pages": text_pages,
        "image_content": img_content,
        "metadata": metadata,
    }
    return output
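# The result serializes to JSON along these lines (illustrative):
#   {"text_pages":    [{"page": 1, "text": "..."}],
#    "image_content": [{"page": 1, "ocr_text": "...", "tables_html": ["<table>..."]}],
#    "metadata":      [{"page": 1, "metadata": {"dates": [...], "parties": [...], ...}}]}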

# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", file_types=['.pdf']),
    outputs=gr.JSON(label="Extraction Result"),
    title="PDF OCR & Metadata Extractor",
    description="Upload a PDF, wait for processing, and view structured JSON output including text, OCR, tables, and metadata.",
)

if __name__ == '__main__':
    iface.launch()
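# When running inside a plain container rather than on Spaces, binding to all
# interfaces is a common tweak (an assumption, not required here):
#   iface.launch(server_name="0.0.0.0", server_port=7860)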