Spaces:

AlmasKanwal19
/

rag-pdf-qa-almas

Sleeping

App Files Files Community

rag-pdf-qa-almas / app.py

AlmasKanwal19

Create app.py

ababda9 verified 8 months ago

raw

history blame contribute delete

2.69 kB

	import streamlit as st
	import faiss
	import numpy as np
	import torch
	from pypdf import PdfReader
	from transformers import AutoTokenizer, AutoModel, pipeline
	from langchain.text_splitter import CharacterTextSplitter

	# Load embedding and QA models
	@st.cache_resource
	def _load_models():
	    """Load the embedding tokenizer/model and the QA pipeline.

	    Wrapped in ``st.cache_resource`` so the (slow) loading happens once
	    instead of on every Streamlit script rerun.

	    Returns:
	        A ``(tokenizer, model, qa_pipeline)`` tuple.
	    """
	    embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
	    return (
	        AutoTokenizer.from_pretrained(embedding_model_name),
	        AutoModel.from_pretrained(embedding_model_name),
	        pipeline('question-answering', model="distilbert-base-uncased-distilled-squad"),
	    )


	# Streamlit re-executes this script top to bottom on each widget
	# interaction; the cached loader keeps these module-level names cheap to
	# rebind on every rerun.
	tokenizer, model, qa_pipeline = _load_models()

	# PDF text extraction and text chunking
	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page of a PDF as a single string.

	    Args:
	        pdf_file: Anything ``PdfReader`` accepts (here, the object returned
	            by the Streamlit file uploader).

	    Returns:
	        The page texts joined with newlines. Pages that yield no text
	        contribute an empty string.
	    """
	    reader = PdfReader(pdf_file)
	    # `or ""` guards against a page whose extraction yields None/empty
	    # (e.g. image-only pages), which would otherwise break concatenation.
	    page_texts = [page.extract_text() or "" for page in reader.pages]
	    # A newline between pages keeps the last word of one page from fusing
	    # with the first word of the next.
	    return "\n".join(page_texts)

	def split_text_into_chunks(text, chunk_size=500, overlap=50):
	    """Split ``text`` into overlapping chunks of roughly ``chunk_size`` characters.

	    Args:
	        text: The full document text.
	        chunk_size: Target maximum number of characters per chunk.
	        overlap: Number of characters shared between consecutive chunks.

	    Returns:
	        A list of chunk strings; empty when ``text`` is empty or only
	        whitespace.
	    """
	    # Imported locally so this function does not depend on a change to the
	    # file's top-level import block.
	    from langchain.text_splitter import RecursiveCharacterTextSplitter

	    if not text or not text.strip():
	        return []

	    # CharacterTextSplitter splits on a single separator only, so text
	    # without blank lines (typical of PDF extraction) would come back as
	    # one oversized chunk. The recursive splitter falls back through
	    # progressively finer separators to respect chunk_size.
	    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
	    return splitter.split_text(text)

	# Function to embed text using the embedding model
	def embed_text(text):
	    """Embed text with the module-level ``tokenizer`` and ``model``.

	    Token states are averaged using the attention mask, so padding
	    positions (present when several texts of different lengths are
	    tokenized together) do not dilute the embedding. For a single string
	    no padding is added and this equals a plain mean over tokens.

	    Args:
	        text: The input passed straight to ``tokenizer`` (a string here;
	            presumably a list of strings also works — confirm against
	            the tokenizer in use).

	    Returns:
	        A NumPy array with one embedding row per input text.
	    """
	    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
	    with torch.no_grad():
	        token_states = model(**inputs).last_hidden_state

	    # Broadcast the mask over the hidden dimension so padded tokens are
	    # zeroed out before summing.
	    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
	    summed = (token_states * mask).sum(dim=1)
	    # clamp avoids division by zero for a (degenerate) all-padding row.
	    token_counts = mask.sum(dim=1).clamp(min=1e-9)
	    return (summed / token_counts).numpy()

	# Function to create FAISS index
	def create_faiss_index(embeddings):
	    """Build a flat L2 FAISS index over ``embeddings`` and return it.

	    The index dimensionality is taken from the second axis of
	    ``embeddings``; all rows are added to the index before returning.
	    """
	    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
	    flat_index.add(embeddings)
	    return flat_index

	# Function to answer questions based on retrieved context
	def answer_question(question, index, chunks, top_k=3):
	    """Answer ``question`` from the chunks most similar to it.

	    Args:
	        question: The user's question.
	        index: A FAISS index whose row ``i`` corresponds to ``chunks[i]``.
	        chunks: The list of text chunks that were indexed.
	        top_k: Maximum number of chunks to retrieve as context.

	    Returns:
	        The ``'answer'`` field of the QA pipeline's result.

	    Raises:
	        ValueError: If ``chunks`` is empty, since there is no context to
	            answer from.
	    """
	    if not chunks:
	        raise ValueError("No text chunks are available to answer from.")

	    # Never ask for more neighbours than there are chunks: FAISS pads the
	    # result with -1, and chunks[-1] would silently pull in the last chunk.
	    k = min(top_k, len(chunks))
	    question_embedding = embed_text(question)
	    _, indices = index.search(question_embedding, k)

	    # Defensive filter in case the index and `chunks` are out of sync.
	    retrieved = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
	    context = " ".join(retrieved)
	    result = qa_pipeline(question=question, context=context)
	    return result['answer']

	# Streamlit app layout
	st.title("PDF Question-Answering Chatbot with RAG")
	st.write("Upload a PDF, and ask questions based on its content.")

	# File uploader
	pdf_file = st.file_uploader("Upload PDF", type="pdf")
	if pdf_file is not None:
	    # Extract, split, embed and index the PDF text
	    with st.spinner("Processing PDF..."):
	        text = extract_text_from_pdf(pdf_file)
	        chunks = split_text_into_chunks(text)

	        # Only embed when there is something to embed: np.vstack raises on
	        # an empty list (e.g. a scanned PDF with no extractable text).
	        if chunks:
	            embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
	            index = create_faiss_index(embeddings)

	    if not chunks:
	        st.error("No extractable text was found in this PDF.")
	    else:
	        st.success("PDF processed and indexed successfully!")
	        st.write("You can now ask questions based on the content of the PDF.")

	        # Input for user question
	        question = st.text_input("Ask a question:")
	        if question:
	            with st.spinner("Searching for the answer..."):
	                answer = answer_question(question, index, chunks)
	            st.write("Answer:", answer)