"""Gradio app that answers questions about an uploaded PDF.

The PDF text is split into fixed-size word chunks, each chunk is embedded
with a SentenceTransformer model and stored in a FAISS L2 index.  A question
is answered by retrieving the nearest chunks and prompting GPT-2 with them.
"""

import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np

# ── Load Models ───────────────────────────────────────
embedder = SentenceTransformer("all-MiniLM-L6-v2")
qa_model = pipeline("text-generation", model="gpt2")

# ── Tunables ──────────────────────────────────────────
TOP_K = 3                  # number of chunks retrieved per question
MAX_CONTEXT_CHARS = 500    # keeps the prompt well inside gpt2's 1024-token limit
MAX_NEW_TOKENS = 100       # length cap for the generated answer
GPT2_EOS_TOKEN_ID = 50256  # gpt2 has no pad token; reuse EOS to suppress the warning

# ── Global storage ────────────────────────────────────
chunks = []   # text chunks of the currently loaded PDF, aligned with `index`
index = None  # FAISS index over the chunk embeddings, or None if nothing is loaded


# ── Step 1: Extract text from PDF ─────────────────────
def extract_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Pages are joined with a newline so the last word of one page does not
    fuse with the first word of the next.  A page for which the extractor
    yields nothing contributes an empty string.
    """
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)


# ── Step 2: Split text into chunks ────────────────────
def split_chunks(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns a list of strings; an empty or whitespace-only *text* gives ``[]``.

    Raises:
        ValueError: if *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


# ── Step 3: Create embeddings & store in FAISS ────────
def build_index(pdf_file):
    """Embed the uploaded PDF and replace the global ``chunks`` / ``index``.

    *pdf_file* is the value delivered by the ``gr.File`` input.  Returns a
    status message for the UI.
    """
    global chunks, index

    if pdf_file is None:
        return "⚠️ Please upload a PDF first."

    # Accept both an object exposing `.name` (what this code originally
    # expected) and a plain path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    new_chunks = split_chunks(extract_text(pdf_path))
    if not new_chunks:
        # Nothing to embed (e.g. an image-only PDF).  Drop any previously
        # loaded document so later questions are not answered from it.
        chunks, index = [], None
        return "⚠️ No extractable text was found in this PDF."

    embeddings = np.ascontiguousarray(embedder.encode(new_chunks), dtype="float32")
    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both globals together, only after the index was built, so they
    # can never describe two different documents.
    chunks, index = new_chunks, new_index
    return f"✅ PDF loaded! {len(chunks)} chunks created. You can now ask questions."


# ── Step 4: Retrieve + Answer ─────────────────────────
def answer_question(question):
    """Answer *question* from the currently indexed PDF.

    Retrieves up to ``TOP_K`` nearest chunks, builds a prompt from them and
    returns the text GPT-2 generates after the prompt.  Returns a warning
    message when no PDF is loaded or the question is empty.
    """
    if index is None or not chunks:
        return "⚠️ Please upload a PDF first."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # Retrieve the most relevant chunks.  Never ask for more neighbours than
    # there are vectors: FAISS pads missing results with -1, and chunks[-1]
    # would silently select the last chunk.
    question_embedding = np.ascontiguousarray(
        embedder.encode([question]), dtype="float32"
    )
    neighbours = min(TOP_K, len(chunks))
    _, indices = index.search(question_embedding, k=neighbours)
    retrieved = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    # NOTE(review): a full 300-word chunk is always longer than 500
    # characters, so in practice only the start of the top-ranked chunk
    # reaches the prompt.  Consider smaller chunks or a token-based budget.
    context = " ".join(retrieved)[:MAX_CONTEXT_CHARS]

    # Build prompt
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"

    result = qa_model(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        pad_token_id=GPT2_EOS_TOKEN_ID,
    )

    # The pipeline echoes the prompt; cut it off by length rather than
    # splitting on "Answer:", which would discard text whenever the model
    # itself emits another "Answer:".
    full_text = result[0]["generated_text"]
    if full_text.startswith(prompt):
        answer = full_text[len(prompt):]
    else:
        answer = full_text.split("Answer:")[-1]
    return answer.strip()


# ── Gradio UI ─────────────────────────────────────────
with gr.Blocks(title="PDF Question Answering") as demo:
    gr.Markdown("# 📄 PDF Question Answering System\nUpload a PDF and ask questions based on its content.")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Load PDF", variant="primary")
    upload_status = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        question_input = gr.Textbox(label="Ask a Question", placeholder="e.g. What is this document about?")
        ask_btn = gr.Button("Get Answer", variant="primary")
    answer_output = gr.Textbox(label="Answer", interactive=False)

    gr.Examples(
        examples=[["What is the main topic?"], ["Who are the authors?"], ["What is the conclusion?"]],
        inputs=question_input,
    )

    upload_btn.click(fn=build_index, inputs=pdf_input, outputs=upload_status)
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)

# Only start the server when executed as a script, so importing this module
# (tests, reload tooling) does not launch it.
if __name__ == "__main__":
    demo.launch()