# Page-header residue from the file viewer (not code): Shirish15's picture
# Commit message: Update app.py
# Commit: 99a7e99 (verified)
import gradio as gr
import PyPDF2
import pandas as pd
import io
import traceback
import tempfile # Import the tempfile module
import os # Import the os module
def _parse_keywords(keywords_str):
    """Return the unique, non-empty keywords from a comma-separated string.

    Order of first appearance is kept. ``None`` is treated as an empty string.
    """
    stripped = (part.strip() for part in (keywords_str or "").split(","))
    # dict.fromkeys de-duplicates while preserving insertion order, so a
    # repeated keyword cannot record the same page twice.
    return list(dict.fromkeys(part for part in stripped if part))


def find_keywords_in_pdf(pdf_file, keywords_str):
    """Find the pages on which each keyword occurs and write them to a CSV file.

    Matching is a case-insensitive substring test against the text extracted
    from each page. The CSV has one row per keyword with the columns
    ``Species Name`` and ``Page Number`` (1-based page numbers joined by commas).

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its first argument
            (presumably the upload object/path supplied by the UI -- confirm
            against the installed Gradio version).
        keywords_str: Comma-separated keywords. Surrounding whitespace is
            stripped; blank entries and exact duplicates are ignored.

    Returns:
        The path of a temporary ``.csv`` file on success. On failure, a
        human-readable message starting with ``"Error:"``; this function does
        not raise.
    """
    try:
        if pdf_file is None:
            return "Error: No PDF file provided."

        keywords = _parse_keywords(keywords_str)
        if not keywords:
            # An empty keyword would match every page ("" is in any string),
            # so reject the request instead of producing a misleading CSV.
            return "Error: No keywords provided. Enter one or more comma-separated keywords."

        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            num_pages = len(pdf_reader.pages)

            # Lower-case each keyword once rather than once per page.
            lowered_keywords = [(keyword, keyword.lower()) for keyword in keywords]
            keyword_page_numbers = {keyword: [] for keyword in keywords}

            # Index loop on purpose: fetching the page happens inside the
            # per-page try, so one broken page does not abort the whole scan.
            for page_number in range(num_pages):
                try:
                    page = pdf_reader.pages[page_number]
                    # Guard against a page that yields no text at all.
                    text = (page.extract_text() or "").lower()
                    for keyword, needle in lowered_keywords:
                        if needle in text:
                            keyword_page_numbers[keyword].append(page_number + 1)
                except Exception as e:
                    # Best effort: report the page and continue with the next.
                    print(f"Error processing page {page_number + 1}: {e}")
                    traceback.print_exc()

            data = [
                {"Species Name": keyword, "Page Number": ",".join(map(str, page_numbers))}
                for keyword, page_numbers in keyword_page_numbers.items()
            ]
            df = pd.DataFrame(data)

            # delete=False keeps the file on disk after the handle is closed so
            # the caller can serve it. UTF-8 avoids encode errors for non-ASCII
            # keywords; newline="" lets pandas control the line terminator
            # instead of having it translated a second time by the text layer.
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".csv", delete=False, encoding="utf-8", newline=""
            ) as tmpfile:
                df.to_csv(tmpfile, index=False)
            return tmpfile.name
        except PyPDF2.errors.PdfReadError as e:
            return f"Error: Could not read PDF. The file might be corrupted or use an unsupported format. Details: {e}"
        except Exception as e:
            print("General PDF processing error:", e)
            traceback.print_exc()
            return f"Error: Could not process PDF. Details: {e}. Check console for traceback."
    except Exception as e:
        print("Outer error:", e)
        traceback.print_exc()
        return f"Error: An unexpected error occurred. Details: {e}. Check console for traceback."
def gradio_interface(pdf_file, keywords):
    """Run the keyword search for the Gradio UI and return the CSV file path.

    ``find_keywords_in_pdf`` reports failures by returning an ``"Error: ..."``
    message instead of a file path. The output component is a ``gr.File``,
    which expects a path, so a result that is not an existing file is raised
    as ``gr.Error`` to show the message to the user instead.

    Args:
        pdf_file: The uploaded PDF as delivered by the ``gr.File`` input.
        keywords: Comma-separated keywords from the textbox.

    Returns:
        Path of the generated CSV file.

    Raises:
        gr.Error: If the search returned an error message rather than a file.
    """
    result = find_keywords_in_pdf(pdf_file, keywords)
    if not os.path.isfile(result):
        raise gr.Error(result)
    return result
# Build the web UI: a PDF upload and a keyword box in, a downloadable CSV out.
iface = gr.Interface(
    title="PDF Keyword Page Finder",
    description="Upload a PDF and enter keywords to find the page numbers where they appear. The output will be a CSV file.",
    fn=gradio_interface,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF File"),
        gr.Textbox(
            label="Enter Keywords (comma-separated)",
            placeholder="e.g., Nepal, India, China",
            lines=1,
        ),
    ],
    outputs=gr.File(file_types=[".csv"], label="Download CSV of Keyword Page Numbers"),
)

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()