# AlmasKanwal19's picture
# Create app.py
# ababda9 verified
import faiss
import numpy as np
import streamlit as st
import torch
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from pypdf import PdfReader
from transformers import AutoModel, AutoTokenizer, pipeline
# Load embedding and QA models
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
QA_MODEL_NAME = "distilbert-base-uncased-distilled-squad"


@st.cache_resource
def _load_models():
    """Load the embedding tokenizer/model and the QA pipeline exactly once.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the loaded objects as a shared resource avoids
    re-instantiating all three models each time the user submits a question.

    Returns:
        A ``(tokenizer, model, qa_pipeline)`` tuple.
    """
    embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
    embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
    question_answerer = pipeline('question-answering', model=QA_MODEL_NAME)
    return embedding_tokenizer, embedding_model, question_answerer


# Keep the original module-level names so the functions below are unaffected.
tokenizer, model, qa_pipeline = _load_models()
# PDF text extraction and text chunking
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source (the app
            passes the object returned by ``st.file_uploader``).

    Returns:
        The extracted text of all pages in page order, separated by a
        newline. A page that yields no text contributes an empty string.
    """
    reader = PdfReader(pdf_file)
    # `or ""` guards against a page whose extraction yields None/empty
    # (e.g. image-only pages), which would otherwise break concatenation.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; join also avoids repeated string reallocation.
    return "\n".join(page_texts)
def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of roughly ``chunk_size`` characters.

    ``CharacterTextSplitter`` only splits on its separator (a blank line by
    default), so text without blank lines -- typical for PDF extraction --
    comes back as one oversized chunk. The recursive splitter falls back to
    progressively finer separators so the size limit is actually honoured.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length, in characters.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of chunk strings (empty when ``text`` contains nothing to
        split).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return splitter.split_text(text)
# Function to embed text using the embedding model
def embed_text(text):
    """Embed text with the module-level tokenizer and model.

    Token embeddings are averaged using the attention mask so that padding
    positions do not dilute the result. For a single string (no padding)
    this equals a plain mean over tokens; for a list of strings of differing
    length it keeps each row independent of how much padding it received.

    Args:
        text: A string, or a list of strings, to embed. Input longer than
            the tokenizer's limit is truncated.

    Returns:
        A 2-D NumPy array with one embedding row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state

    # (batch, seq) -> (batch, seq, 1) so it broadcasts over the hidden axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp avoids a division by zero should a row have no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()
# Function to create FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding the given embedding vectors.

    Args:
        embeddings: 2-D array of vectors; the second axis gives the vector
            dimensionality used to size the index.

    Returns:
        A ``faiss.IndexFlatL2`` with every row of ``embeddings`` added.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index
# Function to answer questions based on retrieved context
def answer_question(question, index, chunks, top_k=3):
    """Answer a question from the chunks most similar to it.

    Args:
        question: The user's question.
        index: FAISS index built over the embeddings of ``chunks``, in the
            same order.
        chunks: The list of text chunks that were indexed.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The answer string extracted by the QA pipeline.

    Raises:
        ValueError: If ``chunks`` is empty, so there is no context to search.
    """
    if not chunks:
        raise ValueError("No text chunks are available to answer the question.")

    question_embedding = embed_text(question)
    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    _, indices = index.search(question_embedding, k)

    # FAISS fills missing neighbours with -1; indexing chunks[-1] would
    # silently pull in the last chunk, so drop anything out of range.
    hits = [int(i) for i in indices[0] if 0 <= i < len(chunks)]
    context = " ".join(chunks[i] for i in hits)

    result = qa_pipeline(question=question, context=context)
    return result['answer']
# Streamlit app layout
st.title("PDF Question-Answering Chatbot with RAG")
st.write("Upload a PDF, and ask questions based on its content.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type="pdf")
if pdf_file is not None:
    # Extract and split text from PDF
    with st.spinner("Processing PDF..."):
        text = extract_text_from_pdf(pdf_file)
        chunks = split_text_into_chunks(text)

        # Embed and index the chunks. np.vstack raises on an empty list, so
        # only build the index when the PDF actually produced some text.
        index = None
        if chunks:
            embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
            index = create_faiss_index(embeddings)

    if index is None:
        # Typical for scanned/image-only PDFs, which have no text layer.
        st.error("No extractable text was found in this PDF.")
    else:
        st.success("PDF processed and indexed successfully!")
        st.write("You can now ask questions based on the content of the PDF.")

        # Input for user question
        question = st.text_input("Ask a question:")
        if question:
            with st.spinner("Searching for the answer..."):
                answer = answer_question(question, index, chunks)
            st.write("**Answer:**", answer)