Spaces:

vaibhavbalar
/

PDF_Extractor

Sleeping

App Files Files Community

PDF_Extractor / app.py

vaibhavbalar

Update app.py

cbbc2a6 verified 2 months ago

raw

history blame contribute delete

4.78 kB

	import gradio as gr
	import PyPDF2
	import pandas as pd
	import re
	import io

	def extract_with_lines(pdf_path):
	    """Extract the text of a PDF, tagging every line with its position.

	    Each extracted line is prefixed with ``[Page P Line L]`` (both numbers
	    are 1-based) so the preview shows where a piece of text came from.

	    Args:
	        pdf_path: Filesystem path of the PDF to read. ``None`` or an empty
	            string means "no file selected".

	    Returns:
	        The tagged lines joined with newlines, ``"[NO TEXT FOUND]"`` when
	        no page yields any text, or ``""`` when no path was supplied.
	    """
	    # Guard before touching the filesystem: open(None) raises TypeError.
	    # NOTE(review): presumably the upload widget passes None when the file
	    # is cleared -- confirm against the Gradio version in use.
	    if not pdf_path:
	        return ""

	    tagged = []
	    with open(pdf_path, "rb") as f:
	        reader = PyPDF2.PdfReader(f)
	        for page_no, page in enumerate(reader.pages, start=1):
	            page_text = page.extract_text()
	            if not page_text:
	                # Pages without extractable text contribute nothing.
	                continue
	            tagged.extend(
	                f"[Page {page_no} Line {line_no}] {line}"
	                for line_no, line in enumerate(page_text.splitlines(), start=1)
	            )
	    return "\n".join(tagged) if tagged else "[NO TEXT FOUND]"

	def get_sample_context(raw_text, example):
	    """Show each occurrence of the taught sample with its neighbouring lines.

	    Args:
	        raw_text: The extracted text, one logical line per text line.
	        example: The sample value to look for; surrounding whitespace is
	            ignored.

	    Returns:
	        One snippet per matching line (previous line, the highlighted match,
	        next line), with snippets separated by ``---``; or a fixed
	        "no match" message when the sample is blank or never occurs.
	    """
	    text_lines = raw_text.splitlines()
	    needle = example.strip()
	    last_index = len(text_lines) - 1

	    snippets = []
	    if needle:
	        for idx, current in enumerate(text_lines):
	            if needle not in current:
	                continue
	            # Neighbours fall back to an empty string at either end.
	            before = text_lines[idx - 1] if idx > 0 else ""
	            after = text_lines[idx + 1] if idx < last_index else ""
	            snippets.append(f"...\n{before}\n>>> {current} <<<\n{after}\n...")

	    if snippets:
	        return "\n---\n".join(snippets)
	    return "No match for example in extracted text."

	def guess_extraction_regex(sample_value, all_lines):
	    """Infer a line-matching pattern from one taught sample value.

	    The first line containing the sample decides the pattern:

	    * if the line reads ``<label>: <sample>``, the pattern matches
	      ``<label>`` followed by a colon (optional whitespace on either side)
	      and captures what comes after it;
	    * otherwise, if some text precedes the sample on that line, the pattern
	      matches that leading text and captures the remainder.

	    A leading ``[Page P Line L]`` marker (the prefix ``extract_with_lines``
	    puts on every line) is ignored while learning and is optional in the
	    returned pattern, so the pattern also matches the same field on other
	    pages/lines.

	    Args:
	        sample_value: Example value to look for; surrounding whitespace is
	            ignored.
	        all_lines: Iterable of text lines to search.

	    Returns:
	        A compiled, case-insensitive pattern meant to be used with
	        ``.match()`` and whose group 1 is the extracted value, or ``None``
	        when no usable pattern can be derived.
	    """
	    sample = sample_value.strip()
	    if not sample:
	        # A blank sample is a substring of every line and would only
	        # produce a meaningless pattern.
	        return None

	    marker = r"\[Page \d+ Line \d+\]"
	    strip_marker = re.compile(r"^\s*" + marker + r"\s*")
	    # The pattern is anchored at line start by .match(), so the position
	    # marker has to be tolerated (but not required) up front.
	    lead = r"\s*(?:" + marker + r"\s*)?"

	    for line in all_lines:
	        content = strip_marker.sub("", line, count=1)
	        if sample not in content:
	            continue

	        # "<label>: <sample>" -- split on the first colon only, so values
	        # that themselves contain a colon (e.g. times) stay intact.
	        label, colon, value = content.partition(":")
	        if colon and value.strip() == sample:
	            return re.compile(
	                lead + re.escape(label.strip()) + r"\s*:\s*(.+)",
	                re.IGNORECASE,
	            )

	        # Fallback: use whatever text precedes the sample as the anchor.
	        head = content.partition(sample)[0].strip()
	        if head:
	            return re.compile(lead + re.escape(head) + r"\s*(.+)", re.IGNORECASE)
	    return None

	def extract_table_from_sample(raw_text, label, sample_value):
	    """Build a one-column table of values resembling the taught sample.

	    Args:
	        raw_text: The extracted text to scan, one candidate per line.
	        label: Column name for the extracted values.
	        sample_value: Example value used to infer what to extract.

	    Returns:
	        A DataFrame with a single ``label`` column of matches, or a
	        DataFrame with a single ``Error`` row when an input is missing or
	        nothing matches.
	    """
	    text_lines = raw_text.splitlines()
	    if not (label and sample_value):
	        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

	    pattern = guess_extraction_regex(sample_value, text_lines)
	    if pattern:
	        # A pattern was inferred: keep group 1 of every line it matches.
	        hits = (pattern.match(row) for row in text_lines)
	        values = [hit.group(1).strip() for hit in hits if hit]
	    else:
	        # No pattern: fall back to lines sharing the sample's first
	        # five characters.
	        head = sample_value[:5]
	        values = [row.strip() for row in text_lines if head in row]

	    if values:
	        return pd.DataFrame([{label: value} for value in values])
	    return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])

	def export_xlsx(df):
	    """Serialise a DataFrame into an in-memory Excel workbook.

	    Args:
	        df: The DataFrame to write; its index is not included.

	    Returns:
	        An ``io.BytesIO`` holding the workbook, positioned at the start.
	    """
	    workbook_bytes = io.BytesIO()
	    xlsx_writer = pd.ExcelWriter(workbook_bytes, engine="xlsxwriter")
	    with xlsx_writer:
	        df.to_excel(xlsx_writer, index=False)
	    # Rewind so a consumer reading the stream starts at the first byte.
	    workbook_bytes.seek(0)
	    return workbook_bytes

	# UI wiring: upload -> preview text -> teach a sample -> extract -> download.
	# NOTE(review): the nesting under each gr.Row() is reconstructed from a
	# flattened listing -- confirm the intended row grouping.
	with gr.Blocks() as demo:
	    gr.Markdown("# 🧑‍🏫 PDF Teach-&-Extract System\n1. Upload PDF → 2. Teach a sample field → 3. Preview all auto-extracted matches → 4. Download as Excel")

	    # Step 1: upload a PDF and preview its extracted, position-tagged text.
	    file_in = gr.File(label="Upload your PDF", file_count="single", type="filepath")
	    raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)
	    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)

	    # Step 2: name the field and give one example value from the preview.
	    with gr.Row():
	        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
	        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
	    teach_search = gr.Button("Show Context")
	    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
	    teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)

	    # Step 3: extract every value that resembles the example.
	    with gr.Row():
	        extract_btn = gr.Button("Extract All Similar Values")
	    results_table = gr.Dataframe(label="Extracted Results Table")

	    # Step 4: offer the results table as an Excel download.
	    download_btn = gr.Button("Download as Excel")
	    xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

	    def extract_and_preview(raw_text, teach_label, teach_sample):
	        """Return the table of values matching the taught sample."""
	        df = extract_table_from_sample(raw_text, teach_label, teach_sample)
	        return df

	    extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)

	    def save_xlsx(df):
	        """Write the results table to an .xlsx file and return its path.

	        The File output component is given a filesystem path rather than
	        an in-memory buffer, so the workbook is written to disk first.
	        """
	        # Function-scope imports keep this handler self-contained.
	        import os
	        import tempfile

	        workbook = export_xlsx(df)
	        # A fresh directory per export lets the download keep the name
	        # "results.xlsx" without overwriting an earlier export.
	        path = os.path.join(tempfile.mkdtemp(), "results.xlsx")
	        with open(path, "wb") as out:
	            out.write(workbook.getvalue())
	        return path

	    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)

	demo.launch()