import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np
# Model identifiers, named once so they are easy to find and swap.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
GENERATION_MODEL_NAME = "gpt2"

# Both models are created once at import time and reused by every request.
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
qa_model = pipeline("text-generation", model=GENERATION_MODEL_NAME)

# State of the currently loaded document; build_index() assigns both.
chunks = []   # text chunks; answer_question() looks them up by search-result position
index = None  # FAISS index over the chunk embeddings; None until a PDF is loaded
def extract_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Pages are joined with a newline so the last word of one page and the
    first word of the next remain separate tokens when the text is later
    split on whitespace.

    Args:
        pdf_path: Passed straight to ``PdfReader``.

    Returns:
        The concatenated page text; an empty string when no page yields
        any text.
    """
    reader = PdfReader(pdf_path)
    # ``or ""`` keeps the join safe should extract_text() return None for a
    # page without a text layer.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
def split_chunks(text, chunk_size=300):
    """Break *text* into consecutive pieces of at most *chunk_size* words.

    Words are whatever ``str.split()`` yields, and each piece is re-joined
    with single spaces, so the original whitespace is not preserved. Only
    the final piece can be shorter than *chunk_size*.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per piece.

    Returns:
        The pieces in document order; an empty list when *text* contains
        no words.
    """
    tokens = text.split()
    starts = range(0, len(tokens), chunk_size)
    return [" ".join(tokens[start:start + chunk_size]) for start in starts]
def build_index(pdf_file):
    """Extract, chunk and embed an uploaded PDF, then build the search index.

    On success the module-level ``chunks`` and ``index`` are replaced
    together, so they always describe the same document. When the PDF
    yields no text, both are cleared so later questions are not answered
    from a previously loaded document.

    Args:
        pdf_file: Value delivered by the upload component: an object whose
            ``name`` attribute is the file path, a plain path string, or
            ``None`` when nothing has been uploaded.

    Returns:
        A status message for the UI.
    """
    global chunks, index

    # The button can be clicked before any file has been chosen.
    if pdf_file is None:
        return "⚠️ Please upload a PDF first."

    # Accept both a file wrapper exposing ``.name`` and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    new_chunks = split_chunks(extract_text(pdf_path))
    if not new_chunks:
        # Nothing to embed: encoding an empty list gives no second
        # dimension to size the index with.
        chunks, index = [], None
        return "⚠️ No extractable text found in this PDF."

    embeddings = np.array(embedder.encode(new_chunks)).astype("float32")

    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish only after everything succeeded, so a failure above cannot
    # leave new chunks paired with an old index.
    chunks, index = new_chunks, new_index

    return f"✅ PDF loaded! {len(chunks)} chunks created. You can now ask questions."
def answer_question(question):
    """Answer *question* from the currently indexed PDF.

    Retrieves up to three chunks nearest to the question embedding, builds
    a ``Context / Question / Answer`` prompt from them and returns the
    text the generation model produces after that prompt.

    Args:
        question: The user's question; ``None`` or a blank string is
            rejected with a warning message.

    Returns:
        The generated answer, or a warning message when no PDF is loaded
        or no question was entered.
    """
    if index is None or not chunks:
        return "⚠️ Please upload a PDF first."

    if not question or not question.strip():
        return "⚠️ Please enter a question."

    question_embedding = embedder.encode([question]).astype("float32")

    # Never request more neighbours than there are chunks: FAISS pads
    # missing results with -1, and chunks[-1] would silently pull in the
    # last chunk instead.
    top_k = min(3, len(chunks))
    _, indices = index.search(question_embedding, k=top_k)
    context = " ".join(chunks[i] for i in indices[0] if i >= 0)

    # Character cap that keeps the prompt short.
    context = context[:500]

    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"

    result = qa_model(
        prompt,
        max_new_tokens=100,
        do_sample=False,
        pad_token_id=50256,  # GPT-2's end-of-text token id, reused as padding
    )

    full_text = result[0]["generated_text"]
    if full_text.startswith(prompt):
        # Drop the echoed prompt by length; splitting on "Answer:" picks
        # the wrong segment when the marker also appears in the question,
        # the context or the generated text.
        answer = full_text[len(prompt):]
    else:
        answer = full_text.split("Answer:")[-1]

    return answer.strip()
# Gradio UI: one row to upload and index a PDF, one row to ask questions.
with gr.Blocks(title="PDF Question Answering") as demo:
    gr.Markdown(
        "# 📄 PDF Question Answering System\n"
        "Upload a PDF and ask questions based on its content."
    )

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Load PDF", variant="primary")

    upload_status = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        question_input = gr.Textbox(
            label="Ask a Question",
            placeholder="e.g. What is this document about?",
        )
        ask_btn = gr.Button("Get Answer", variant="primary")

    answer_output = gr.Textbox(label="Answer", interactive=False)

    # Canned questions offered for the question box.
    gr.Examples(
        examples=[
            ["What is the main topic?"],
            ["Who are the authors?"],
            ["What is the conclusion?"],
        ],
        inputs=question_input,
    )

    # Event wiring: each button feeds its input to a handler and shows the
    # returned string in the matching read-only textbox.
    upload_btn.click(fn=build_index, inputs=pdf_input, outputs=upload_status)
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)

# Start the web server for the app defined above.
demo.launch()