import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np
# ── Load Models ───────────────────────────────────────
# Both models are loaded once at import time and reused by every request.
# Sentence-embedding model used for the PDF chunks and for the question.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Text-generation pipeline that writes the answer from the retrieved context.
qa_model = pipeline("text-generation", model="gpt2")
# ── Global storage ────────────────────────────────────
# Word chunks of the most recently loaded PDF; positions match the rows
# added to `index`.
chunks = []
# FAISS index over the chunk embeddings; None until build_index() has run.
index = None
# ── Step 1: Extract text from PDF ─────────────────────
def extract_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The extracted text of all pages, in page order, joined by newlines.
        A page that yields no text contributes an empty string.
    """
    reader = PdfReader(pdf_path)
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next (downstream chunking splits on whitespace).
    # ``or ""`` is a defensive guard for a page whose extraction yields
    # nothing usable.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
# ── Step 2: Split text into chunks ────────────────────
def split_chunks(text, chunk_size=300):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (spaces, tabs, newlines) is a separator and is normalised to a single
    space inside each chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order. Empty or whitespace-only
        text gives an empty list; the final chunk may be shorter than
        *chunk_size*.

    Raises:
        ValueError: If *chunk_size* is less than 1.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative one would silently drop the whole text, so reject both here.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
# ── Step 3: Create embeddings & store in FAISS ────────
def build_index(pdf_file):
    """Chunk and embed an uploaded PDF, then store it in the global index.

    Replaces the module-level ``chunks`` and ``index`` with the contents of
    the new document.

    Args:
        pdf_file: The value delivered by the Gradio file input. Either an
            object exposing the file path as ``.name`` or a plain path
            string is accepted; ``None`` means nothing was uploaded.

    Returns:
        A status message for the UI: a success line with the chunk count, or
        a warning when no file was given or no text could be extracted.
    """
    global chunks, index

    if pdf_file is None:
        return "⚠️ Please upload a PDF first."

    # Depending on Gradio version/configuration the upload arrives as a
    # wrapper with a ``.name`` path or as the path itself; handle both.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    new_chunks = split_chunks(extract_text(pdf_path))

    if not new_chunks:
        # E.g. a scanned, image-only PDF. Encoding an empty list would give
        # an array without a second dimension and crash on ``shape[1]``.
        # Clear the previous document so later questions are not answered
        # from a file the user has just replaced.
        chunks, index = [], None
        return "⚠️ No extractable text found in this PDF."

    embeddings = np.ascontiguousarray(
        embedder.encode(new_chunks), dtype="float32"
    )
    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both globals together, only after everything succeeded, so
    # ``chunks`` and ``index`` can never describe different documents.
    chunks, index = new_chunks, new_index
    return f"✅ PDF loaded! {len(chunks)} chunks created. You can now ask questions."
# ── Step 4: Retrieve + Answer ─────────────────────────
def answer_question(question):
    """Answer *question* from the currently indexed PDF.

    Retrieves the chunks closest to the question from the global FAISS
    index, builds a "Context / Question / Answer" prompt and lets
    ``qa_model`` continue it.

    Args:
        question: The user's question as typed in the UI.

    Returns:
        The generated answer text, or a warning message when no PDF has been
        indexed yet or the question is empty.
    """
    if index is None or not chunks:
        return "⚠️ Please upload a PDF first."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # Retrieve up to 3 relevant chunks. Never ask for more neighbours than
    # there are chunks: FAISS pads missing results with -1, and chunks[-1]
    # would silently repeat the last chunk.
    top_k = min(3, len(chunks))
    question_embedding = embedder.encode([question]).astype("float32")
    _, indices = index.search(question_embedding, k=top_k)
    context = " ".join(chunks[i] for i in indices[0] if i >= 0)

    # Keep context short to stay within gpt2 token limit (1024)
    context = context[:500]

    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    result = qa_model(
        prompt,
        max_new_tokens=100,
        do_sample=False,
        pad_token_id=50256,  # gpt2 has no pad token, this suppresses the warning
    )

    full_text = result[0]["generated_text"]
    # The pipeline output is the prompt followed by the continuation. Cut the
    # prompt off by length rather than splitting on "Answer:", which picks the
    # wrong segment whenever the context or the generation contains that
    # marker again.
    if full_text.startswith(prompt):
        answer = full_text[len(prompt):]
    else:
        answer = full_text.split("Answer:")[-1]
    return answer.strip()
# ── Gradio UI ─────────────────────────────────────────
with gr.Blocks(title="PDF Question Answering") as demo:
    # NOTE(review): the leading "π" in the heading looks like a mis-decoded
    # emoji — confirm the intended glyph.
    gr.Markdown(
        "# π PDF Question Answering System\n"
        "Upload a PDF and ask questions based on its content."
    )

    # Upload section: pick a file, press the button, read the status line.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Load PDF", variant="primary")
    upload_status = gr.Textbox(label="Status", interactive=False)

    # Question section: type a question, press the button, read the answer.
    with gr.Row():
        question_input = gr.Textbox(
            label="Ask a Question",
            placeholder="e.g. What is this document about?",
        )
        ask_btn = gr.Button("Get Answer", variant="primary")
    answer_output = gr.Textbox(label="Answer", interactive=False)

    # Clickable sample questions that fill the question box.
    gr.Examples(
        examples=[
            ["What is the main topic?"],
            ["Who are the authors?"],
            ["What is the conclusion?"],
        ],
        inputs=question_input,
    )

    # Wire the buttons to the indexing and answering callbacks.
    upload_btn.click(fn=build_index, inputs=pdf_input, outputs=upload_status)
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)

demo.launch()