File size: 3,863 Bytes
cbfee92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np

# ── Load Models ───────────────────────────────────────
# Both models are created once, when this module is executed, and are reused
# by the functions below. `embedder` is used to encode chunks and questions;
# `qa_model` is a GPT-2 text-generation pipeline used to produce the answer.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
qa_model = pipeline("text-generation", model="gpt2")

# ── Global storage ────────────────────────────────────
# Module-level state: written by build_index(), read by answer_question().
# `chunks` holds the text chunks of the loaded PDF; `index` holds the FAISS
# index built over their embeddings and stays None until a PDF is loaded.
chunks = []
index  = None

# ── Step 1: Extract text from PDF ─────────────────────
def extract_text(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Anything ``PdfReader`` accepts as its source (here, the
            path of the uploaded file).

    Returns:
        The extracted text of all pages joined with newlines. A page that
        yields no text contributes an empty string.
    """
    reader = PdfReader(pdf_path)
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next (the result is later split on whitespace).
    # `or ""` guards against an extractor returning None for an empty page.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# ── Step 2: Split text into chunks ────────────────────
def split_chunks(text, chunk_size=300):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (spaces, tabs, newlines) is a separator and is normalised to a single
    space inside each chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.

    Returns:
        A list of chunk strings in document order; empty if ``text``
        contains no words.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. (A negative step
            would otherwise silently produce no chunks at all.)
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# ── Step 3: Create embeddings & store in FAISS ────────
def build_index(pdf_file):
    """Extract, chunk and embed an uploaded PDF, then rebuild the FAISS index.

    On success the module-level ``chunks`` and ``index`` are replaced. On any
    early return they are left untouched, so a previously loaded PDF stays
    usable.

    Args:
        pdf_file: Value delivered by the Gradio ``File`` input: either an
            object exposing the file path as ``.name`` or the path string
            itself; ``None`` when nothing has been uploaded.

    Returns:
        A status message for the UI.
    """
    global chunks, index

    if pdf_file is None:
        # The button can be clicked before any file has been selected.
        return "⚠️ Please upload a PDF first."

    # Accept both a wrapper object with `.name` and a plain path string.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    new_chunks = split_chunks(extract_text(pdf_path))
    if not new_chunks:
        # With no chunks there is no 2-D embedding matrix to size the index.
        return "⚠️ No extractable text found in this PDF."

    embeddings = np.asarray(embedder.encode(new_chunks), dtype="float32")

    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both together, only after everything succeeded, so a failure
    # part-way never leaves `chunks` and `index` describing different PDFs.
    chunks, index = new_chunks, new_index

    return f"✅ PDF loaded! {len(chunks)} chunks created. You can now ask questions."

# ── Step 4: Retrieve + Answer ─────────────────────────
def answer_question(question):
    """Answer a question using the chunks of the currently loaded PDF.

    Retrieves the chunks nearest to the question from the module-level FAISS
    ``index``, builds a "Context / Question / Answer" prompt and lets the
    GPT-2 pipeline continue it.

    Args:
        question: The user's question text (may be empty or ``None``).

    Returns:
        The generated answer, or a warning message if no PDF is loaded or the
        question is blank.
    """
    if index is None or not chunks:
        return "⚠️ Please upload a PDF first."

    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # Retrieve up to 3 relevant chunks. Never request more neighbours than
    # there are chunks: FAISS pads missing results with -1, and chunks[-1]
    # would then silently select the last chunk.
    k = min(3, len(chunks))
    question_embedding = embedder.encode([question]).astype("float32")
    _, indices = index.search(question_embedding, k=k)
    context = " ".join(chunks[i] for i in indices[0] if i >= 0)

    # Keep context short to stay within gpt2 token limit (1024)
    context = context[:500]

    # Build prompt
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"

    result = qa_model(
        prompt,
        max_new_tokens=100,
        do_sample=False,
        pad_token_id=50256      # gpt2 has no pad token, this suppresses the warning
    )

    full_text = result[0]["generated_text"]

    # The generated text normally begins with the prompt; strip it by position
    # rather than splitting on "Answer:", which picks the wrong segment when
    # the PDF text or the continuation itself contains that marker.
    if full_text.startswith(prompt):
        answer = full_text[len(prompt):]
    else:
        answer = full_text.split("Answer:")[-1]

    # The model may go on to invent a further "Question:" turn; keep only the
    # answer to the question that was actually asked.
    answer = answer.split("\nQuestion:")[0]

    return answer.strip()

# ── Gradio UI ─────────────────────────────────────────
# Layout: upload row -> status box -> question row -> answer box -> examples.
with gr.Blocks(title="PDF Question Answering") as demo:
    gr.Markdown("# 📄 PDF Question Answering System\nUpload a PDF and ask questions based on its content.")

    # Row 1: choose a PDF and trigger indexing.
    with gr.Row():
        pdf_input  = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Load PDF", variant="primary")

    # Shows the message returned by build_index().
    upload_status = gr.Textbox(label="Status", interactive=False)

    # Row 2: type a question and trigger retrieval + generation.
    with gr.Row():
        question_input = gr.Textbox(label="Ask a Question", placeholder="e.g. What is this document about?")
        ask_btn        = gr.Button("Get Answer", variant="primary")

    # Shows the text returned by answer_question().
    answer_output = gr.Textbox(label="Answer", interactive=False)

    # Clickable sample questions that fill the question box.
    gr.Examples(
        examples=[["What is the main topic?"], ["Who are the authors?"], ["What is the conclusion?"]],
        inputs=question_input
    )

    # Wire the buttons to the backend functions.
    upload_btn.click(fn=build_index,     inputs=pdf_input,      outputs=upload_status)
    ask_btn.click(   fn=answer_question, inputs=question_input, outputs=answer_output)

demo.launch()