Spaces:

NETAL24
/

RAG

Sleeping

App Files Files Community

RAG / app.py

NETAL24

Create app.py

cbfee92 verified 19 days ago

raw

history blame contribute delete

3.86 kB

	import gradio as gr
	from pypdf import PdfReader
	from sentence_transformers import SentenceTransformer
	from transformers import pipeline
	import faiss
	import numpy as np

	# ── Load Models ───────────────────────────────────────
	# Both models are instantiated once, when this module is executed, and are
	# shared by the functions below.
	# Sentence embedder: encodes the PDF chunks (build_index) and the user's
	# question (answer_question).
	embedder = SentenceTransformer("all-MiniLM-L6-v2")
	# GPT-2 text-generation pipeline: called by answer_question on the built prompt.
	qa_model = pipeline("text-generation", model="gpt2")

	# ── Global storage ────────────────────────────────────
	# Module-level state rebound by build_index and read by answer_question.
	# Text chunks of the loaded PDF; list positions correspond to the order in
	# which their embeddings are added to `index`.
	chunks = []
	# FAISS index over the chunk embeddings; None means no PDF has been loaded yet.
	index = None

	# ── Step 1: Extract text from PDF ─────────────────────
	def extract_text(pdf_path):
	    """Return the text of every page in the PDF at *pdf_path*.

	    Page texts are joined with a newline so that the last word of one page
	    and the first word of the next stay separate tokens when the result is
	    later split on whitespace (plain concatenation fused them together).

	    Args:
	        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

	    Returns:
	        One string containing all page texts in page order. A page that
	        yields no text contributes an empty string.
	    """
	    reader = PdfReader(pdf_path)
	    # ``or ""`` keeps the join safe should a page yield None instead of a string.
	    return "\n".join(page.extract_text() or "" for page in reader.pages)

	# ── Step 2: Split text into chunks ────────────────────
	def split_chunks(text, chunk_size=300):
	    """Break *text* into consecutive chunks of at most *chunk_size* words.

	    Words are whatever ``str.split()`` yields (whitespace-delimited); each
	    chunk is its words re-joined with single spaces. The last chunk may be
	    shorter than *chunk_size*, and empty or whitespace-only text produces
	    an empty list.
	    """
	    tokens = text.split()
	    starts = range(0, len(tokens), chunk_size)
	    return [" ".join(tokens[start:start + chunk_size]) for start in starts]

	# ── Step 3: Create embeddings & store in FAISS ────────
	def build_index(pdf_file):
	    """Index an uploaded PDF for retrieval and return a status message.

	    The PDF text is extracted, split into word chunks, embedded with the
	    module-level ``embedder`` and stored in a new ``faiss.IndexFlatL2``.
	    On success the module-level ``chunks`` and ``index`` are replaced
	    together.

	    Args:
	        pdf_file: The value delivered by the upload component: an object
	            exposing ``.name`` (the file path), a plain path string, or
	            ``None`` when nothing was uploaded.

	    Returns:
	        A human-readable status string: a success message with the chunk
	        count, or a warning when no file was given or the PDF contains no
	        extractable text.
	    """
	    global chunks, index

	    # Clicking "Load PDF" without choosing a file delivers None.
	    if pdf_file is None:
	        return "⚠️ Please upload a PDF first."

	    # Accept both an object carrying the path in ``.name`` and a bare path string.
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    new_chunks = split_chunks(extract_text(pdf_path))
	    if not new_chunks:
	        # Nothing to embed (e.g. a PDF with no text layer): an empty
	        # embedding array has no second dimension to size the index with.
	        # Clear the old state so questions are not answered from a
	        # previously loaded document.
	        chunks, index = [], None
	        return "⚠️ No extractable text found in this PDF."

	    embeddings = np.asarray(embedder.encode(new_chunks), dtype="float32")

	    new_index = faiss.IndexFlatL2(embeddings.shape[1])
	    new_index.add(embeddings)

	    # Publish both only after the index is fully built, so ``chunks`` and
	    # ``index`` never describe different documents if a step above fails.
	    chunks, index = new_chunks, new_index

	    return f"✅ PDF loaded! {len(chunks)} chunks created. You can now ask questions."

	# ── Step 4: Retrieve + Answer ─────────────────────────
	def answer_question(question):
	    """Answer *question* from the currently indexed PDF.

	    The question is embedded, up to three nearest chunks are retrieved from
	    the module-level FAISS ``index``, and their text (cut to 500
	    characters) is placed in a prompt that the module-level ``qa_model``
	    continues.

	    Args:
	        question: The user's question text.

	    Returns:
	        The generated continuation following the prompt, stripped of
	        surrounding whitespace, or a warning string when no PDF is loaded
	        or the question is empty.
	    """
	    if index is None:
	        return "⚠️ Please upload a PDF first."

	    if not question or not question.strip():
	        return "⚠️ Please enter a question."

	    # Retrieve the top 3 relevant chunks, but never ask for more neighbours
	    # than exist: FAISS pads missing results with -1, which would silently
	    # select chunks[-1].
	    top_k = min(3, len(chunks))
	    question_embedding = embedder.encode([question]).astype("float32")
	    _, indices = index.search(question_embedding, k=top_k)
	    context = " ".join(chunks[i] for i in indices[0] if 0 <= i < len(chunks))

	    # Keep context short to stay within gpt2 token limit (1024)
	    context = context[:500]

	    # Build prompt
	    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"

	    result = qa_model(
	        prompt,
	        max_new_tokens=100,
	        do_sample=False,
	        pad_token_id=50256,  # gpt2 has no pad token, this suppresses the warning
	    )

	    full_text = result[0]["generated_text"]
	    if full_text.startswith(prompt):
	        # Drop the echoed prompt; everything after it is the model's answer,
	        # even if the model itself emits another "Answer:" later on.
	        answer = full_text[len(prompt):]
	    else:
	        # Fallback when the prompt is not echoed verbatim.
	        answer = full_text.split("Answer:")[-1]

	    return answer.strip()

	# ── Gradio UI ─────────────────────────────────────────
	with gr.Blocks(title="PDF Question Answering") as demo:
	    # Page heading followed by a one-line instruction.
	    gr.Markdown(
	        "# 📄 PDF Question Answering System\n"
	        "Upload a PDF and ask questions based on its content."
	    )

	    # Upload row: file picker next to the button that triggers indexing.
	    with gr.Row():
	        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
	        upload_btn = gr.Button("Load PDF", variant="primary")

	    # Read-only box showing the message returned by build_index.
	    upload_status = gr.Textbox(label="Status", interactive=False)

	    # Question row: free-text question next to the button that runs the query.
	    with gr.Row():
	        question_input = gr.Textbox(
	            label="Ask a Question",
	            placeholder="e.g. What is this document about?",
	        )
	        ask_btn = gr.Button("Get Answer", variant="primary")

	    # Read-only box showing the text returned by answer_question.
	    answer_output = gr.Textbox(label="Answer", interactive=False)

	    # Sample questions offered as examples for the question box.
	    gr.Examples(
	        examples=[
	            [sample]
	            for sample in (
	                "What is the main topic?",
	                "Who are the authors?",
	                "What is the conclusion?",
	            )
	        ],
	        inputs=question_input,
	    )

	    # Wire each button to its handler.
	    upload_btn.click(
	        fn=build_index,
	        inputs=pdf_input,
	        outputs=upload_status,
	    )
	    ask_btn.click(
	        fn=answer_question,
	        inputs=question_input,
	        outputs=answer_output,
	    )

	# Start the app as soon as the module is executed.
	demo.launch()