"""Helpers for extracting PDF text and answering questions about it."""

import os
import re

import pdfminer
import streamlit as st
import tensorflow as tf
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer

# Checkpoint used for extractive question answering.
QA_MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Matches every character that is neither a word character nor whitespace.
_NON_WORD_RE = re.compile(r'[^\w\s]')


def preprocess_text(element):
    """Return the normalised text of a PDF layout element.

    Args:
        element: A layout object yielded while iterating a pdfminer page.

    Returns:
        The element's text, stripped, with punctuation removed and converted
        to lowercase, when ``element`` is an ``LTTextBoxHorizontal``;
        otherwise an empty string.
    """
    if not isinstance(element, LTTextBoxHorizontal):
        return ""
    text = element.get_text().strip()
    # Drop everything except word characters and whitespace, then lowercase.
    return _NON_WORD_RE.sub('', text).lower()


@st.cache_resource
def _load_qa_model(model_name):
    """Load the QA tokenizer and model once and reuse them across reruns."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = TFBertForQuestionAnswering.from_pretrained(model_name)
    return tokenizer, qa_model


def answer_question(text, question, max_length=512):
    """Answer a question about ``text`` with a pre-trained extractive QA model.

    Args:
        text: Context the answer is extracted from.
        question: The question to answer.
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated by the tokenizer.

    Returns:
        The predicted answer span as a string, or ``"No answer found."`` when
        either input is blank or the predicted span contains no text.
    """
    if not text or not text.strip() or not question or not question.strip():
        return "No answer found."

    # Loading the checkpoint is expensive, so it is cached rather than
    # repeated on every call.
    tokenizer, qa_model = _load_qa_model(QA_MODEL_NAME)

    # Tokenize the (question, context) pair, adding special tokens.
    inputs = tokenizer(
        question,
        text,
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Model prediction
    outputs = qa_model(inputs)

    # Most likely start token and (exclusive) end token of the answer span.
    answer_start = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    answer_end = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0]) + 1

    # Decode from a plain list of ids and skip special tokens so that a span
    # made only of markers such as [CLS] is reported as "no answer" instead
    # of being shown to the user. An end before the start gives an empty
    # slice and therefore the same fallback.
    token_ids = inputs["input_ids"][0].numpy().tolist()
    answer = tokenizer.decode(
        token_ids[answer_start:answer_end], skip_special_tokens=True
    ).strip()

    return answer if answer else "No answer found."
# Streamlit app: upload a PDF, then summarize it and/or ask questions about it.
st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"

# NOTE(review): 142 is believed to be the default generation max_length that
# ships with facebook/bart-large-cnn -- confirm if the model is changed.
DEFAULT_MAX_SUMMARY_LENGTH = 142
# Tokens allowed beyond the requested minimum so generation can end cleanly.
SUMMARY_LENGTH_HEADROOM = 40


@st.cache_resource
def _load_summarizer(model_name):
    """Build the summarization pipeline once and reuse it across reruns."""
    return pipeline("summarization", model=model_name)


# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        chunks = (
            preprocess_text(element)
            for page_layout in extract_pages(uploaded_file)
            for element in page_layout
        )
        # Keep only chunks that contain text: otherwise the separators alone
        # make the result non-empty and the "no text" branch never runs.
        text = "\n".join(chunk for chunk in chunks if chunk.strip())

    if text:
        question = st.text_input("Ask a question about the PDF:")
        summarize_button = st.button("Generate Summary")

        if summarize_button:
            with st.spinner("Summarizing..."):
                summarizer = _load_summarizer(summarization_model)
                # The slider can request a minimum above the model's default
                # maximum, so keep max_length above the chosen minimum.
                max_summary_length = max(
                    DEFAULT_MAX_SUMMARY_LENGTH,
                    min_summary_length + SUMMARY_LENGTH_HEADROOM,
                )
                # truncation=True clips documents longer than the model's
                # input limit instead of failing on them.
                summary_response = summarizer(
                    text,
                    min_length=min_summary_length,
                    max_length=max_summary_length,
                    truncation=True,
                )
            st.subheader("Summary")
            st.write(summary_response[0]["summary_text"])

        if question:
            with st.spinner("Answering..."):
                answer = answer_question(text, question)
            st.subheader("Answer")
            st.write(answer)
    else:
        st.error("No text found in the PDF.")