# (Hugging Face Spaces page header removed — scrape artifact, not code.)
import gradio as gr
import pandas as pd
import io
import tempfile
import os

from langchain_community.document_loaders import PyPDFLoader
import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK's punkt tokenizer only when it is not already installed.
# (The original called nltk.download unconditionally on every startup,
# contradicting its own comment and hitting the network each launch.)
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

# Temporary directory for the generated CSV download files; it lives for
# the lifetime of the process and is cleaned up automatically on exit.
temp_dir = tempfile.TemporaryDirectory()
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.

    Raises:
        RuntimeError: If loading or tokenization fails for any reason.
    """
    try:
        loader = PyPDFLoader(pdf_file_path)
        # BUG FIX: use load(), which yields exactly one Document per PDF page.
        # The original used load_and_split(), whose character splitter can break
        # a page into several Documents, corrupting the page numbering below.
        documents = loader.load()
        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Document name for the output rows

        # Validate and normalize the requested page range (1-based, inclusive).
        if start_page is not None and end_page is not None:
            # Gradio Number inputs arrive as floats; coerce before slicing.
            start_page = int(start_page)
            end_page = int(end_page)
            start_page = max(start_page, 1)
            end_page = min(end_page, total_pages)
            if start_page > end_page:
                # Swap if the user entered the bounds out of order.
                start_page, end_page = end_page, start_page
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        page_data = []
        sentence_data = []
        for page_num, doc in enumerate(selected_docs, start=start_page):
            text = doc.page_content.strip()
            # One row per page with the full page text.
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": text
            })
            # One row per non-empty sentence on the page.
            for sentence in sent_tokenize(text):
                sentence = sentence.strip()
                if sentence:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)
        return page_df, sentence_df
    except Exception as e:
        # Surface a single error type to the UI layer, preserving the cause.
        raise RuntimeError(f"Error during PDF extraction: {e}") from e
def df_to_csv_bytes(df):
    """
    Serialize a DataFrame to CSV-formatted bytes (UTF-8, no index column).

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data in bytes.

    Raises:
        RuntimeError: If serialization fails.
    """
    try:
        # to_csv with no target returns the CSV text directly — the manual
        # StringIO buffer the original allocated is unnecessary.
        return df.to_csv(index=False).encode('utf-8')
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}") from e
def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
    """
    Extract text from the uploaded PDF and write two CSV files for download.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        extraction_mode (str): "All Pages" or "Range of Pages".
        start_page (float): Starting page number for extraction.
        end_page (float): Ending page number for extraction.

    Returns:
        tuple:
            - page_csv_path (str): Path to the page-wise CSV file (or None on failure).
            - sentence_csv_path (str): Path to the sentence-wise CSV file (or None on failure).
            - status_message (str): Status of the extraction process.
    """
    # Guard: nothing to do without an upload.
    if not pdf_file_path:
        return None, None, "No file uploaded."

    try:
        # A range of (None, None) tells the extractor to process every page.
        if extraction_mode == "All Pages":
            first_page, last_page = None, None
        else:
            first_page, last_page = start_page, end_page

        page_df, sentence_df = extract_text_with_py_pdf_loader(
            pdf_file_path,
            start_page=first_page,
            end_page=last_page
        )

        # Derive both output filenames from the PDF's base name (no extension)
        # and place them in the app-wide temporary directory.
        stem = os.path.splitext(os.path.basename(pdf_file_path))[0]
        page_csv_path = os.path.join(temp_dir.name, f"{stem}_pages.csv")
        sentence_csv_path = os.path.join(temp_dir.name, f"{stem}_sentences.csv")

        # Serialize each DataFrame and write it out as bytes.
        for target_path, frame in (
            (page_csv_path, page_df),
            (sentence_csv_path, sentence_df),
        ):
            with open(target_path, 'wb') as out_file:
                out_file.write(df_to_csv_bytes(frame))

        return page_csv_path, sentence_csv_path, "Extraction successful!"
    except Exception as e:
        return None, None, f"Extraction failed: {e}"
def _toggle_range_inputs(mode):
    # Show the page-number inputs only when a page range is requested.
    show = mode == "Range of Pages"
    return gr.update(visible=show), gr.update(visible=show)


with gr.Blocks() as demo:
    gr.Markdown("# π PDF Text Extractor with Multiple Exports")

    # --- Inputs -----------------------------------------------------------
    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
            type="filepath",  # Hand the callback a path, not file bytes
            interactive=True
        )
    with gr.Row():
        extraction_mode = gr.Radio(
            label="Extraction Mode",
            choices=["All Pages", "Range of Pages"],
            value="All Pages",
            interactive=True
        )
    with gr.Row():
        start_page = gr.Number(
            label="Start Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Hidden until "Range of Pages" is selected
        )
        end_page = gr.Number(
            label="End Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Hidden until "Range of Pages" is selected
        )

    # Reveal/hide the page-range inputs when the mode changes.
    extraction_mode.change(
        fn=_toggle_range_inputs,
        inputs=[extraction_mode],
        outputs=[start_page, end_page]
    )

    # --- Outputs ----------------------------------------------------------
    with gr.Row():
        extract_button = gr.Button("Extract and Download")
    with gr.Row():
        page_csv_download = gr.File(
            label="Download Page-wise CSV",
            interactive=False
        )
        sentence_csv_download = gr.File(
            label="Download Sentence-wise CSV",
            interactive=False
        )
    with gr.Row():
        status_output = gr.Textbox(
            label="Status",
            interactive=False,
            lines=2
        )

    # Wire the extraction callback to the button.
    extract_button.click(
        fn=on_extract,
        inputs=[pdf_input, extraction_mode, start_page, end_page],
        outputs=[page_csv_download, sentence_csv_download, status_output]
    )

    gr.Markdown("""
---
Developed with β€οΈ using Gradio and LangChain.
""")

# Launch the Gradio app
demo.queue().launch()