# rag-qa-document / app.py
# Author: Navneet Sai
# Last commit: "Switch from PyMuPDF to PyPDF for HF Compatibility" (376e7ad)
"""
RAG Document Q&A Assistant
Upload documents, ask questions, get answers with source citations.
"""
import os
import tempfile
from typing import Optional
import chromadb
from pypdf import PdfReader  # pure-Python reader; replaced PyMuPDF for HF Spaces compatibility
import gradio as gr
from chromadb.utils import embedding_functions
from openai import OpenAI
# OpenAI client used for answer generation; the key is read from the environment.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Sentence-transformer embedding function shared by every Chroma collection below.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
model_name="all-MiniLM-L6-v2"
)
# Global state for the current session (single-user demo; rebuilt on each upload)
chroma_client = None  # chromadb.Client, created fresh in process_document
collection = None  # active Chroma collection holding the current document's chunks
current_chunks = []  # list[dict] of the chunks most recently indexed
def extract_text_from_pdf(file_path: str) -> str:
    """Extract and concatenate the text of every page in a PDF.

    Pages that yield no text (``extract_text()`` returning ``None``, e.g.
    scanned images) contribute an empty string instead of raising.
    """
    reader = PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in reader.pages)
def extract_text_from_txt(file_path: str) -> str:
    """Return the entire contents of a plain-text file as a string.

    Undecodable bytes are silently dropped (errors="ignore") so that
    arbitrary user uploads never crash the pipeline.
    """
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
def chunk_fixed_size(text: str, chunk_size: int = 500, overlap: int = 100) -> list[dict]:
    """Split text into fixed-size character windows with overlap.

    Args:
        text: Raw document text.
        chunk_size: Maximum characters per chunk; must be positive and
            strictly greater than ``overlap``.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        List of dicts with keys "id", "text", "start", "end" (character
        offsets into ``text``; "end" never exceeds ``len(text)``).

    Raises:
        ValueError: If ``chunk_size`` <= 0 or ``overlap`` >= ``chunk_size``
            (the window would never advance, looping forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # Guard against an infinite loop: the window must advance each step.
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap
    chunks = []
    start = 0
    chunk_id = 0
    while start < len(text):
        # Clamp so the recorded end offset never points past the text.
        end = min(start + chunk_size, len(text))
        chunk_text = text[start:end].strip()
        if chunk_text:  # skip windows that are pure whitespace
            chunks.append({
                "id": f"chunk_{chunk_id}",
                "text": chunk_text,
                "start": start,
                "end": end
            })
            chunk_id += 1
        if end == len(text):
            # Final window consumed the tail; advancing further would only
            # re-chunk the last `overlap` characters.
            break
        start += step
    return chunks
def chunk_by_paragraph(text: str) -> list[dict]:
    """Split text on blank lines, keeping only paragraphs longer than 50 chars.

    Note: each chunk id encodes the paragraph's index among ALL non-empty
    paragraphs, so ids can be non-contiguous when short paragraphs are
    filtered out. "start"/"end" offsets are not tracked for this strategy.
    """
    stripped = (piece.strip() for piece in text.split("\n\n"))
    paragraphs = [piece for piece in stripped if piece]
    return [
        {"id": f"chunk_{idx}", "text": para, "start": 0, "end": 0}
        for idx, para in enumerate(paragraphs)
        if len(para) > 50
    ]
def process_document(file, chunking_strategy: str) -> str:
    """Process an uploaded document: extract text, chunk it, index into Chroma.

    Args:
        file: Gradio file object (exposes the temp path via ``.name``),
            or None when nothing was uploaded.
        chunking_strategy: UI dropdown label; "Fixed-size (500 chars)"
            selects fixed windows, anything else uses paragraph chunking.

    Returns:
        A Markdown status string — success stats, or a ❌-prefixed error.

    Side effects:
        Rebuilds the module-level ``chroma_client``, ``collection`` and
        ``current_chunks`` so subsequent questions query the new document.
    """
    global chroma_client, collection, current_chunks
    if file is None:
        return "❌ Please upload a document first."
    file_path = file.name
    file_ext = os.path.splitext(file_path)[1].lower()
    try:
        if file_ext == ".pdf":
            text = extract_text_from_pdf(file_path)
        elif file_ext in [".txt", ".md"]:
            text = extract_text_from_txt(file_path)
        else:
            return f"❌ Unsupported file type: {file_ext}. Please upload PDF or TXT."
    except Exception as e:
        # Surface parse errors to the UI rather than crashing the app.
        return f"❌ Error reading file: {str(e)}"
    if not text.strip():
        return "❌ No text could be extracted from the document."
    if chunking_strategy == "Fixed-size (500 chars)":
        current_chunks = chunk_fixed_size(text, chunk_size=500, overlap=100)
    else:
        current_chunks = chunk_by_paragraph(text)
    if not current_chunks:
        return "❌ No chunks could be created from the document."
    # Initialize fresh Chroma client and collection; dropping any stale
    # "documents" collection ensures re-processing never mixes old chunks.
    chroma_client = chromadb.Client()
    try:
        chroma_client.delete_collection(name="documents")
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed SystemExit/
        # KeyboardInterrupt). Missing collection on first run is expected;
        # any real problem will resurface on create_collection below.
        pass
    collection = chroma_client.create_collection(
        name="documents",
        embedding_function=embedding_func
    )
    collection.add(
        documents=[c["text"] for c in current_chunks],
        ids=[c["id"] for c in current_chunks]
    )
    return f"βœ… Document processed successfully!\n\nπŸ“Š **Stats:**\n- Characters: {len(text):,}\n- Chunks created: {len(current_chunks)}\n- Chunking strategy: {chunking_strategy}"
def retrieve_context(query: str, top_k: int = 3) -> list[dict]:
    """Return up to ``top_k`` chunks most relevant to ``query``.

    Each result dict carries "text", "similarity" (0–1, higher is closer)
    and a 1-based "rank". Returns [] when no document has been processed.
    """
    if collection is None:
        return []
    results = collection.query(
        query_texts=[query],
        n_results=top_k
    )
    docs = results["documents"][0]
    distances = results["distances"][0]
    retrieved = []
    for rank, (doc, dist) in enumerate(zip(docs, distances), start=1):
        # Map an unbounded distance into (0, 1]: distance 0 -> similarity 1.
        retrieved.append({
            "text": doc,
            "similarity": 1 / (1 + dist),
            "rank": rank
        })
    return retrieved
def generate_answer(query: str, context_docs: list[dict]) -> str:
    """Generate an answer with GPT-4o-mini grounded in the retrieved chunks.

    Args:
        query: The user's question.
        context_docs: Retrieval results (dicts with "rank", "similarity",
            "text"), ordered by relevance.

    Returns:
        The model's answer, or a fallback/error string on empty context or
        API failure.
    """
    if not context_docs:
        return "I don't have any context to answer this question. Please upload a document first."
    # Label each chunk so the model can cite sources by number.
    sections = [
        f"[Source {doc['rank']}] (relevance: {doc['similarity']:.0%})\n{doc['text']}"
        for doc in context_docs
    ]
    context = "\n\n".join(sections)
    prompt = f"""Answer the question based on the provided context.
If the context doesn't contain enough information to answer fully, say so.
Always reference which source(s) you used.
CONTEXT:
{context}
QUESTION: {query}
ANSWER:"""
    try:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and cite your sources."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        # Show API failures (auth, rate limits, network) in the UI.
        return f"❌ Error generating answer: {str(e)}"
def ask_question(query: str) -> tuple[str, str]:
    """Answer a user question against the indexed document.

    Returns a (answer_markdown, sources_markdown) pair; the sources panel
    shows each retrieved chunk truncated to 300 characters.
    """
    if not query.strip():
        return "Please enter a question.", ""
    if collection is None:
        return "Please upload and process a document first.", ""
    retrieved = retrieve_context(query, top_k=3)
    answer = generate_answer(query, retrieved)
    parts = ["\n\n---\n\n**πŸ“š Retrieved Sources:**\n\n"]
    for doc in retrieved:
        snippet = doc["text"][:300] + ("..." if len(doc["text"]) > 300 else "")
        parts.append(
            f"**[Source {doc['rank']}]** (relevance: {doc['similarity']:.0%})\n"
            f"```\n{snippet}\n```\n\n"
        )
    return answer, "".join(parts)
# Build Gradio interface: left column uploads/indexes a document, right
# column asks questions against the indexed chunks.
with gr.Blocks(title="RAG Document Q&A", theme=gr.themes.Soft()) as demo:
    # Intro banner explaining the RAG pipeline to the user.
    gr.Markdown("""
# πŸ“„ RAG Document Q&A Assistant
Upload a document (PDF or TXT), choose a chunking strategy, and ask questions!
**How it works:**
1. Your document is split into chunks using the selected strategy
2. Chunks are embedded using Sentence Transformers (all-MiniLM-L6-v2)
3. When you ask a question, relevant chunks are retrieved using semantic search
4. GPT-4o-mini generates an answer based on the retrieved context
---
""")
    with gr.Row():
        # Left column: document upload, chunking choice, processing status.
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“€ Step 1: Upload Document")
            file_input = gr.File(
                label="Upload PDF or TXT",
                file_types=[".pdf", ".txt", ".md"]
            )
            chunking_dropdown = gr.Dropdown(
                choices=["Fixed-size (500 chars)", "Paragraph-based"],
                value="Paragraph-based",
                label="Chunking Strategy"
            )
            process_btn = gr.Button("Process Document", variant="primary")
            process_output = gr.Markdown(label="Processing Status")
        # Right column: question input plus answer and sources panels.
        with gr.Column(scale=2):
            gr.Markdown("### πŸ’¬ Step 2: Ask Questions")
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What is this document about?",
                lines=2
            )
            ask_btn = gr.Button("Ask", variant="primary")
            answer_output = gr.Markdown(label="Answer")
            sources_output = gr.Markdown(label="Sources")
    # Footer with paper references.
    gr.Markdown("""
---
**πŸ“š References:**
- [RAG Original Paper (Lewis et al., 2020)](https://arxiv.org/abs/2005.11401)
- [RAG Survey (Gao et al., 2023)](https://arxiv.org/pdf/2312.10997)
- [Chunking Strategies for RAG (Merola & Singh, 2025)](https://arxiv.org/abs/2504.19754)
Built as part of an AI/ML Engineering portfolio project.
""")
    # Event wiring: the Process button (re)indexes the uploaded document ...
    process_btn.click(
        fn=process_document,
        inputs=[file_input, chunking_dropdown],
        outputs=[process_output]
    )
    # ... and both the Ask button and pressing Enter submit the question.
    ask_btn.click(
        fn=ask_question,
        inputs=[question_input],
        outputs=[answer_output, sources_output]
    )
    question_input.submit(
        fn=ask_question,
        inputs=[question_input],
        outputs=[answer_output, sources_output]
    )

if __name__ == "__main__":
    demo.launch()