# PDF-text-extra / app.py — Gradio app for page- and sentence-level PDF text extraction
# (Hugging Face Space by sadickam; commit 636fc98, verified)
import gradio as gr
import pandas as pd
import io
import tempfile
import os
from langchain_community.document_loaders import PyPDFLoader
import nltk
from nltk.tokenize import sent_tokenize
# Download NLTK's punkt tokenizer if not already downloaded.
# 'punkt_tab' is the tokenizer-data package required by sent_tokenize in
# recent NLTK releases (the older 'punkt' pickle format was retired).
nltk.download('punkt_tab')
# Create a temporary directory for storing download files.
# Held at module level so generated CSVs stay available for the lifetime
# of the app process; cleaned up automatically when the object is finalized.
temp_dir = tempfile.TemporaryDirectory()
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): Starting page number for extraction (1-based, inclusive).
        end_page (int, optional): Ending page number for extraction (1-based, inclusive).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.

    Raises:
        RuntimeError: If loading the PDF or tokenizing its text fails.
    """
    try:
        loader = PyPDFLoader(pdf_file_path)
        # BUG FIX: use load(), which returns exactly one Document per PDF page.
        # The previous load_and_split() ran a character-based text splitter, so
        # chunks did not map 1:1 to pages — page numbers and the start/end-page
        # slice below were wrong for any page longer than the chunk size.
        documents = loader.load()
        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Document name for the output rows

        # Validate and adjust the requested page range.
        if start_page is not None and end_page is not None:
            # Coerce to int (Gradio Number components deliver floats) so the
            # slice below works, then clamp to the document's valid range.
            start_page = int(start_page)
            end_page = int(end_page)
            start_page = max(start_page, 1)
            end_page = min(end_page, total_pages)
            if start_page > end_page:
                # Swap if the user entered the range out of order.
                start_page, end_page = end_page, start_page
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        # Accumulate one row per page and one row per sentence.
        page_data = []
        sentence_data = []
        for page_num, doc in enumerate(selected_docs, start=start_page):
            text = doc.page_content.strip()
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": text
            })
            # Sentence tokenization (requires the punkt_tab data downloaded above).
            for sentence in sent_tokenize(text):
                sentence = sentence.strip()
                if sentence:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)
        return page_df, sentence_df
    except Exception as e:
        # Chain the original exception so the full cause is preserved.
        raise RuntimeError(f"Error during PDF extraction: {e}") from e
def df_to_csv_bytes(df):
    """
    Serialize a DataFrame to CSV and return the result as UTF-8 bytes.

    Args:
        df (pd.DataFrame): The DataFrame to serialize.

    Returns:
        bytes: The CSV content (no index column) encoded as UTF-8.

    Raises:
        RuntimeError: If serialization fails.
    """
    try:
        # Render into an in-memory text buffer, then encode once at the end;
        # the context manager guarantees the buffer is closed.
        with io.StringIO() as buffer:
            df.to_csv(buffer, index=False)
            return buffer.getvalue().encode('utf-8')
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")
def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
    """
    Gradio callback: extract text from the uploaded PDF and write two CSVs.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        extraction_mode (str): "All Pages" or "Range of Pages".
        start_page (float): Starting page number (ignored for "All Pages").
        end_page (float): Ending page number (ignored for "All Pages").

    Returns:
        tuple:
            - page_csv_path (str or None): Path to the page-wise CSV file.
            - sentence_csv_path (str or None): Path to the sentence-wise CSV file.
            - status_message (str): Status of the extraction process.
    """
    # Guard clause: nothing to do without an uploaded file.
    if not pdf_file_path:
        return None, None, "No file uploaded."

    # A page range is only honoured in "Range of Pages" mode.
    use_range = extraction_mode != "All Pages"
    selected_start = start_page if use_range else None
    selected_end = end_page if use_range else None

    try:
        page_df, sentence_df = extract_text_with_py_pdf_loader(
            pdf_file_path,
            start_page=selected_start,
            end_page=selected_end
        )

        # Name the output CSVs after the source PDF (computed once).
        stem = os.path.splitext(os.path.basename(pdf_file_path))[0]
        page_csv_path = os.path.join(temp_dir.name, f"{stem}_pages.csv")
        sentence_csv_path = os.path.join(temp_dir.name, f"{stem}_sentences.csv")

        # Persist both CSV payloads into the shared temporary directory.
        outputs = (
            (page_csv_path, df_to_csv_bytes(page_df)),
            (sentence_csv_path, df_to_csv_bytes(sentence_df)),
        )
        for csv_path, payload in outputs:
            with open(csv_path, 'wb') as csv_file:
                csv_file.write(payload)

        return page_csv_path, sentence_csv_path, "Extraction successful!"
    except Exception as e:
        return None, None, f"Extraction failed: {e}"
# Build the Gradio UI: upload control, mode selector, optional page range,
# download slots for the two generated CSVs, and a status box.
# NOTE(review): the Markdown strings below contain mojibake ("πŸ“„", "❀️"),
# likely UTF-8 emoji decoded with the wrong codec — confirm the file's
# source encoding before changing them.
with gr.Blocks() as demo:
    gr.Markdown("# πŸ“„ PDF Text Extractor with Multiple Exports")
    with gr.Row():
        # type="filepath" makes Gradio hand on_extract a path string, which is
        # what PyPDFLoader expects.
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
            type="filepath",  # Ensure type is set to "filepath"
            interactive=True
        )
    with gr.Row():
        extraction_mode = gr.Radio(
            label="Extraction Mode",
            choices=["All Pages", "Range of Pages"],
            value="All Pages",
            interactive=True
        )
    with gr.Row():
        # Page-range inputs; precision=0 restricts the Numbers to integers.
        start_page = gr.Number(
            label="Start Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden; shown only in "Range of Pages" mode
        )
        end_page = gr.Number(
            label="End Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden; shown only in "Range of Pages" mode
        )
    # Toggle visibility of start_page and end_page based on extraction_mode:
    # both become visible only when "Range of Pages" is selected.
    extraction_mode.change(
        fn=lambda mode: (
            gr.update(visible=(mode == "Range of Pages")),
            gr.update(visible=(mode == "Range of Pages"))
        ),
        inputs=[extraction_mode],
        outputs=[start_page, end_page]
    )
    with gr.Row():
        extract_button = gr.Button("Extract and Download")
    with gr.Row():
        # Read-only download slots populated by on_extract's return values.
        page_csv_download = gr.File(
            label="Download Page-wise CSV",
            interactive=False
        )
        sentence_csv_download = gr.File(
            label="Download Sentence-wise CSV",
            interactive=False
        )
    with gr.Row():
        status_output = gr.Textbox(
            label="Status",
            interactive=False,
            lines=2
        )
    # Wire the button to the extraction callback; outputs map 1:1 to the
    # (page_csv_path, sentence_csv_path, status_message) tuple it returns.
    extract_button.click(
        fn=on_extract,
        inputs=[pdf_input, extraction_mode, start_page, end_page],
        outputs=[page_csv_download, sentence_csv_download, status_output]
    )
    gr.Markdown("""
    ---
    Developed with ❀️ using Gradio and LangChain.
    """)
# Launch the Gradio app (queue() enables request queuing for concurrent users).
demo.queue().launch()