Spaces:

Shirish15
/

keywords-indexing

Sleeping

App Files Files Community

keywords-indexing / app.py

Shirish15

Update app.py

99a7e99 verified 4 months ago

raw

history blame contribute delete

3.05 kB

	import gradio as gr
	import PyPDF2
	import pandas as pd
	import io
	import traceback
	import tempfile # Import the tempfile module
	import os # Import the os module

	def find_keywords_in_pdf(pdf_file, keywords_str):
	    """Search a PDF for keywords and write the matching page numbers to a CSV file.

	    Each comma-separated keyword is matched case-insensitively as a substring
	    of the text extracted from every page.

	    Args:
	        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source.
	        keywords_str: Comma-separated keywords. Surrounding whitespace is
	            stripped; blank entries and exact repeats are ignored.

	    Returns:
	        The path of a temporary ``.csv`` file with the columns
	        ``Species Name`` and ``Page Number`` (1-based page numbers joined by
	        commas), or a string starting with ``"Error:"`` when the input is
	        unusable or the PDF cannot be processed. The temporary file is
	        created with ``delete=False`` and is not removed by this function.
	    """
	    try:
	        # Drop blank entries: an empty keyword is a substring of every page
	        # and would report all pages. dict.fromkeys removes exact repeats
	        # (which would otherwise record each page twice) and keeps order.
	        stripped = (k.strip() for k in (keywords_str or "").split(","))
	        keywords = list(dict.fromkeys(k for k in stripped if k))
	        if not keywords:
	            return "Error: No keywords provided. Enter at least one comma-separated keyword."
	        if pdf_file is None:
	            return "Error: No PDF file provided."

	        keyword_page_numbers = {keyword: [] for keyword in keywords}
	        # Lower-case every keyword once instead of once per page.
	        needles = [(keyword, keyword.lower()) for keyword in keywords]

	        try:
	            pdf_reader = PyPDF2.PdfReader(pdf_file)
	            num_pages = len(pdf_reader.pages)

	            for page_number in range(num_pages):
	                # Fetch and extract inside the try so that one unreadable
	                # page is logged and skipped rather than aborting the search.
	                try:
	                    page = pdf_reader.pages[page_number]
	                    # Treat a page with no extractable text as empty.
	                    text = (page.extract_text() or "").lower()
	                except Exception as e:
	                    print(f"Error processing page {page_number + 1}: {e}")
	                    traceback.print_exc()
	                    continue

	                for keyword, needle in needles:
	                    if needle in text:
	                        keyword_page_numbers[keyword].append(page_number + 1)

	            rows = [
	                {'Species Name': keyword, 'Page Number': ','.join(map(str, page_numbers))}
	                for keyword, page_numbers in keyword_page_numbers.items()
	            ]
	            df = pd.DataFrame(rows, columns=['Species Name', 'Page Number'])

	            # Write straight into the temporary file. newline="" stops the
	            # text layer from translating the line endings pandas emits, and
	            # an explicit UTF-8 encoding keeps non-ASCII keywords writable
	            # regardless of the platform's default encoding.
	            with tempfile.NamedTemporaryFile(
	                mode="w", suffix=".csv", delete=False, encoding="utf-8", newline=""
	            ) as tmpfile:
	                df.to_csv(tmpfile, index=False)
	                csv_path = tmpfile.name
	            return csv_path

	        except PyPDF2.errors.PdfReadError as e:
	            return f"Error: Could not read PDF. The file might be corrupted or use an unsupported format. Details: {e}"
	        except Exception as e:
	            print("General PDF processing error:", e)
	            traceback.print_exc()
	            return f"Error: Could not process PDF. Details: {e}. Check console for traceback."

	    except Exception as e:
	        # Last-resort boundary: report anything unexpected as an error string.
	        print("Outer error:", e)
	        traceback.print_exc()
	        return f"Error: An unexpected error occurred. Details: {e}. Check console for traceback."


	def gradio_interface(pdf_file, keywords):
	    """Gradio callback: run the keyword search and return the CSV path.

	    Args:
	        pdf_file: The uploaded PDF as supplied by the ``gr.File`` input.
	        keywords: The comma-separated keyword string from the textbox.

	    Returns:
	        The path of the generated CSV file for the ``gr.File`` output.

	    Raises:
	        gr.Error: If the search reported a failure, so the message is shown
	            to the user.
	    """
	    result = find_keywords_in_pdf(pdf_file, keywords)
	    # find_keywords_in_pdf signals failure with an "Error: ..." string. The
	    # output component is a gr.File, which would treat that string as a file
	    # path, so surface it as a Gradio error instead.
	    if isinstance(result, str) and result.startswith("Error:"):
	        raise gr.Error(result)
	    return result


	# Web UI: a PDF upload and a keyword textbox go in, a downloadable CSV comes out.
	iface = gr.Interface(
	    title="PDF Keyword Page Finder",
	    description="Upload a PDF and enter keywords to find the page numbers where they appear. The output will be a CSV file.",
	    fn=gradio_interface,
	    inputs=[
	        gr.File(file_types=[".pdf"], label="Upload PDF File"),
	        gr.Textbox(
	            label="Enter Keywords (comma-separated)",
	            placeholder="e.g., Nepal, India, China",
	            lines=1,
	        ),
	    ],
	    outputs=gr.File(file_types=[".csv"], label="Download CSV of Keyword Page Numbers"),
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()