Spaces:
Sleeping
Sleeping
import gradio as gr | |
import PyPDF2 | |
import pandas as pd | |
import io | |
import traceback | |
import tempfile # Import the tempfile module | |
import os # Import the os module | |
def _parse_keywords(keywords_str):
    """Split a comma-separated string into unique, non-empty keywords.

    Whitespace around each entry is stripped. Blank entries (empty input,
    doubled or trailing commas) are dropped because an empty string is a
    substring of every page and would be reported on all of them. Exact
    duplicates are removed, keeping first-seen order, so a page is never
    listed twice for the same keyword.
    """
    stripped = (part.strip() for part in (keywords_str or "").split(","))
    return list(dict.fromkeys(part for part in stripped if part))


def _find_keyword_pages(pdf_file, keywords):
    """Map each keyword to the 1-based page numbers whose text contains it.

    Matching is a case-insensitive substring test. A page that fails to
    parse is logged to the console and skipped so one bad page does not
    abort the whole search. Errors from opening the PDF propagate to the
    caller.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    keyword_page_numbers = {keyword: [] for keyword in keywords}
    # Lower-case each keyword once instead of once per page.
    needles = [(keyword, keyword.lower()) for keyword in keywords]

    # Index-based loop on purpose: fetching the page happens inside the
    # per-page try so a broken page object is skipped like any other error.
    for page_index in range(num_pages):
        try:
            page = pdf_reader.pages[page_index]
            # Treat a missing extraction result as "no text on this page".
            text = (page.extract_text() or "").lower()
            for keyword, needle in needles:
                if needle in text:
                    keyword_page_numbers[keyword].append(page_index + 1)
        except Exception as e:
            print(f"Error processing page {page_index + 1}: {e}")
            traceback.print_exc()
    return keyword_page_numbers


def _write_keyword_csv(keyword_page_numbers):
    """Write the keyword -> pages mapping to a temporary CSV and return its path.

    The file is created with ``delete=False`` so it still exists after this
    function returns; the caller (or the OS temp cleaner) owns its removal.
    """
    rows = [
        {'Species Name': keyword, 'Page Number': ','.join(map(str, page_numbers))}
        for keyword, page_numbers in keyword_page_numbers.items()
    ]
    # Explicit columns keep the header row even when there are no rows.
    df = pd.DataFrame(rows, columns=['Species Name', 'Page Number'])
    # newline="" lets pandas control line endings (avoids doubled carriage
    # returns on Windows); utf-8 keeps non-ASCII keywords intact regardless
    # of the platform's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as tmpfile:
        df.to_csv(tmpfile, index=False)
        csv_path = tmpfile.name
    return csv_path


def find_keywords_in_pdf(pdf_file, keywords_str):
    """Find which pages of a PDF mention each keyword and export the result as CSV.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts (a path or a binary
            file-like object), or ``None`` when nothing was uploaded.
        keywords_str: Comma-separated keywords. Matching is a
            case-insensitive substring search; blank and duplicate entries
            are ignored.

    Returns:
        On success, the path of a temporary CSV file with the columns
        ``Species Name`` and ``Page Number`` (1-based page numbers joined by
        commas). On failure, a human-readable message starting with
        ``"Error:"``. This function does not raise.
    """
    try:
        if pdf_file is None:
            return "Error: No PDF file was provided."

        keywords = _parse_keywords(keywords_str)
        if not keywords:
            return "Error: No keywords were provided. Enter at least one comma-separated keyword."

        try:
            keyword_page_numbers = _find_keyword_pages(pdf_file, keywords)
            return _write_keyword_csv(keyword_page_numbers)
        except PyPDF2.errors.PdfReadError as e:
            return f"Error: Could not read PDF. The file might be corrupted or use an unsupported format. Details: {e}"
        except Exception as e:
            print("General PDF processing error:", e)
            traceback.print_exc()
            return f"Error: Could not process PDF. Details: {e}. Check console for traceback."
    except Exception as e:
        # Last-resort guard (e.g. a non-string keywords argument) so the
        # caller always gets a message instead of an exception.
        print("Outer error:", e)
        traceback.print_exc()
        return f"Error: An unexpected error occurred. Details: {e}. Check console for traceback."
def gradio_interface(pdf_file, keywords):
    """Gradio callback: run the keyword search and return the CSV path.

    Args:
        pdf_file: The uploaded PDF as delivered by the ``gr.File`` input.
        keywords: Comma-separated keywords from the textbox.

    Returns:
        Path of the generated CSV file, for the ``gr.File`` output.

    Raises:
        gr.Error: If the search reported a failure. The output component is
            a file widget that interprets a returned string as a file path,
            so an error message must be raised to reach the user rather
            than returned.
    """
    result = find_keywords_in_pdf(pdf_file, keywords)
    # find_keywords_in_pdf signals failure with a message starting "Error:".
    if isinstance(result, str) and result.startswith("Error:"):
        raise gr.Error(result)
    return result
# UI components, created in the same order as they appear in the interface:
# the two inputs first, then the downloadable result.
pdf_upload = gr.File(label="Upload PDF File", file_types=[".pdf"])
keyword_box = gr.Textbox(
    lines=1,
    label="Enter Keywords (comma-separated)",
    placeholder="e.g., Nepal, India, China",
)
csv_download = gr.File(
    label="Download CSV of Keyword Page Numbers",
    file_types=[".csv"],
)

# Wire the components to the search callback.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[pdf_upload, keyword_box],
    outputs=csv_download,
    title="PDF Keyword Page Finder",
    description="Upload a PDF and enter keywords to find the page numbers where they appear. The output will be a CSV file.",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()