# PDF-text-extra / app.py — Gradio app for page- and sentence-level PDF text extraction
# (Hugging Face Space by sadickam; commit 636fc98, verified)
import gradio as gr
import pandas as pd
import io
import tempfile
import os
from langchain_community.document_loaders import PyPDFLoader
import nltk
from nltk.tokenize import sent_tokenize
# Download NLTK's punkt tokenizer if not already downloaded.
# 'punkt_tab' is the tokenizer-data package required by sent_tokenize in
# recent NLTK releases (the older 'punkt' pickle format was retired).
nltk.download('punkt_tab')
# Create a temporary directory for storing download files.
# Held at module level so generated CSVs stay available for the lifetime
# of the app process; cleaned up automatically when the object is finalized.
temp_dir = tempfile.TemporaryDirectory()
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): Starting page number for extraction (1-based, inclusive).
        end_page (int, optional): Ending page number for extraction (1-based, inclusive).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.

    Raises:
        RuntimeError: If loading the PDF or tokenizing its text fails.
    """
    try:
        loader = PyPDFLoader(pdf_file_path)
        # BUG FIX: use load(), which returns exactly one Document per PDF page.
        # The previous load_and_split() ran a character-based text splitter, so
        # chunks did not map 1:1 to pages — page numbers and the start/end-page
        # slice below were wrong for any page longer than the chunk size.
        documents = loader.load()
        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Document name for the output rows

        # Validate and adjust the requested page range.
        if start_page is not None and end_page is not None:
            # Coerce to int (Gradio Number components deliver floats) so the
            # slice below works, then clamp to the document's valid range.
            start_page = int(start_page)
            end_page = int(end_page)
            start_page = max(start_page, 1)
            end_page = min(end_page, total_pages)
            if start_page > end_page:
                # Swap if the user entered the range out of order.
                start_page, end_page = end_page, start_page
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        # Accumulate one row per page and one row per sentence.
        page_data = []
        sentence_data = []
        for page_num, doc in enumerate(selected_docs, start=start_page):
            text = doc.page_content.strip()
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": text
            })
            # Sentence tokenization (requires the punkt_tab data downloaded above).
            for sentence in sent_tokenize(text):
                sentence = sentence.strip()
                if sentence:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)
        return page_df, sentence_df
    except Exception as e:
        # Chain the original exception so the full cause is preserved.
        raise RuntimeError(f"Error during PDF extraction: {e}") from e
def df_to_csv_bytes(df):
    """
    Serialize a DataFrame to CSV and return the result as UTF-8 bytes.

    Args:
        df (pd.DataFrame): The DataFrame to serialize.

    Returns:
        bytes: The CSV content (no index column) encoded as UTF-8.

    Raises:
        RuntimeError: If serialization fails.
    """
    try:
        # Render into an in-memory text buffer, then encode once at the end;
        # the context manager guarantees the buffer is closed.
        with io.StringIO() as buffer:
            df.to_csv(buffer, index=False)
            return buffer.getvalue().encode('utf-8')
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")
def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
    """
    Gradio callback: extract text from the uploaded PDF and write two CSVs.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        extraction_mode (str): "All Pages" or "Range of Pages".
        start_page (float): Starting page number (ignored for "All Pages").
        end_page (float): Ending page number (ignored for "All Pages").

    Returns:
        tuple:
            - page_csv_path (str or None): Path to the page-wise CSV file.
            - sentence_csv_path (str or None): Path to the sentence-wise CSV file.
            - status_message (str): Status of the extraction process.
    """
    # Guard clause: nothing to do without an uploaded file.
    if not pdf_file_path:
        return None, None, "No file uploaded."

    # A page range is only honoured in "Range of Pages" mode.
    use_range = extraction_mode != "All Pages"
    selected_start = start_page if use_range else None
    selected_end = end_page if use_range else None

    try:
        page_df, sentence_df = extract_text_with_py_pdf_loader(
            pdf_file_path,
            start_page=selected_start,
            end_page=selected_end
        )

        # Name the output CSVs after the source PDF (computed once).
        stem = os.path.splitext(os.path.basename(pdf_file_path))[0]
        page_csv_path = os.path.join(temp_dir.name, f"{stem}_pages.csv")
        sentence_csv_path = os.path.join(temp_dir.name, f"{stem}_sentences.csv")

        # Persist both CSV payloads into the shared temporary directory.
        outputs = (
            (page_csv_path, df_to_csv_bytes(page_df)),
            (sentence_csv_path, df_to_csv_bytes(sentence_df)),
        )
        for csv_path, payload in outputs:
            with open(csv_path, 'wb') as csv_file:
                csv_file.write(payload)

        return page_csv_path, sentence_csv_path, "Extraction successful!"
    except Exception as e:
        return None, None, f"Extraction failed: {e}"
# Build the Gradio UI: upload control, mode selector, optional page range,
# download slots for the two generated CSVs, and a status box.
# NOTE(review): the Markdown strings below contain mojibake ("πŸ“„", "❀️"),
# likely UTF-8 emoji decoded with the wrong codec — confirm the file's
# source encoding before changing them.
with gr.Blocks() as demo:
    gr.Markdown("# πŸ“„ PDF Text Extractor with Multiple Exports")
    with gr.Row():
        # type="filepath" makes Gradio hand on_extract a path string, which is
        # what PyPDFLoader expects.
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
            type="filepath",  # Ensure type is set to "filepath"
            interactive=True
        )
    with gr.Row():
        extraction_mode = gr.Radio(
            label="Extraction Mode",
            choices=["All Pages", "Range of Pages"],
            value="All Pages",
            interactive=True
        )
    with gr.Row():
        # Page-range inputs; precision=0 restricts the Numbers to integers.
        start_page = gr.Number(
            label="Start Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden; shown only in "Range of Pages" mode
        )
        end_page = gr.Number(
            label="End Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden; shown only in "Range of Pages" mode
        )
    # Toggle visibility of start_page and end_page based on extraction_mode:
    # both become visible only when "Range of Pages" is selected.
    extraction_mode.change(
        fn=lambda mode: (
            gr.update(visible=(mode == "Range of Pages")),
            gr.update(visible=(mode == "Range of Pages"))
        ),
        inputs=[extraction_mode],
        outputs=[start_page, end_page]
    )
    with gr.Row():
        extract_button = gr.Button("Extract and Download")
    with gr.Row():
        # Read-only download slots populated by on_extract's return values.
        page_csv_download = gr.File(
            label="Download Page-wise CSV",
            interactive=False
        )
        sentence_csv_download = gr.File(
            label="Download Sentence-wise CSV",
            interactive=False
        )
    with gr.Row():
        status_output = gr.Textbox(
            label="Status",
            interactive=False,
            lines=2
        )
    # Wire the button to the extraction callback; outputs map 1:1 to the
    # (page_csv_path, sentence_csv_path, status_message) tuple it returns.
    extract_button.click(
        fn=on_extract,
        inputs=[pdf_input, extraction_mode, start_page, end_page],
        outputs=[page_csv_download, sentence_csv_download, status_output]
    )
    gr.Markdown("""
    ---
    Developed with ❀️ using Gradio and LangChain.
    """)
# Launch the Gradio app (queue() enables request queuing for concurrent users).
demo.queue().launch()