# Hugging Face Space — status: Running
import streamlit as st | |
import faiss | |
import numpy as np | |
import torch | |
from pypdf import PdfReader | |
from transformers import AutoTokenizer, AutoModel, pipeline | |
from langchain.text_splitter import CharacterTextSplitter | |
# Load embedding and QA models | |
@st.cache_resource
def _load_models():
    """Load the tokenizer, embedding model and QA pipeline a single time.

    Streamlit re-executes this script from the top on every user
    interaction. Without caching, all three models would be re-created on
    each rerun; ``st.cache_resource`` keeps one shared copy instead.

    Returns:
        A ``(tokenizer, model, qa_pipeline)`` tuple.
    """
    embedding_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
    return (
        AutoTokenizer.from_pretrained(embedding_checkpoint),
        AutoModel.from_pretrained(embedding_checkpoint),
        pipeline('question-answering', model="distilbert-base-uncased-distilled-squad"),
    )


# Module-level names are kept because the functions below read them as globals.
tokenizer, model, qa_pipeline = _load_models()
# PDF text extraction and text chunking | |
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file: Passed straight to ``PdfReader``; the app supplies the
            object returned by ``st.file_uploader``.

    Returns:
        The extracted text of all pages, in page order, separated by
        newlines. Pages that yield no text contribute an empty string.
    """
    reader = PdfReader(pdf_file)
    # ``or ""`` guards against a page yielding None instead of a string.
    # Joining with a newline keeps the last word of one page from fusing
    # with the first word of the next, and builds the result in one pass.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks sized for embedding.

    Args:
        text: The full document text.
        chunk_size: Target maximum number of characters per chunk.
        overlap: Number of characters shared between neighbouring chunks.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty or
        contains only whitespace.
    """
    # Local import so this function does not depend on a change to the
    # file-level import block.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    if not text or not text.strip():
        return []

    # CharacterTextSplitter only splits on one separator (a blank line by
    # default), so text without blank lines came back as a single oversized
    # chunk. The recursive splitter falls back to progressively finer
    # separators so chunks stay near chunk_size.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return splitter.split_text(text)
# Function to embed text using the embedding model | |
def embed_text(text):
    """Embed text with the module-level tokenizer and model.

    Token vectors from the model's last hidden state are averaged into one
    vector per input, weighting by the attention mask so padding positions
    do not contribute. For a single string this matches a plain mean over
    tokens; for a list of strings it keeps shorter texts from being diluted
    by padding.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A 2-D NumPy array with one row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
        # Broadcast the mask over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)
        # clamp avoids a division by zero should a row have no real tokens.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = summed / token_counts
    return embeddings.numpy()
# Function to create FAISS index | |
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given vectors.

    Args:
        embeddings: A 2-D array-like with one vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``, in
        the same order, so row ``i`` has id ``i`` in search results.
    """
    # FAISS works on C-contiguous float32 data; coerce here so float64 or
    # non-contiguous input does not fail inside index.add.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index
# Function to answer questions based on retrieved context | |
def answer_question(question, index, chunks, top_k=3):
    """Answer a question from the chunks most similar to it.

    The question is embedded, the nearest chunks are looked up in the FAISS
    index, and their text is joined into the context handed to the
    extractive QA pipeline.

    Args:
        question: The user's question.
        index: FAISS index whose ids correspond to positions in ``chunks``.
        chunks: List of chunk strings that were indexed.
        top_k: Maximum number of chunks to use as context.

    Returns:
        The ``'answer'`` field of the QA pipeline result, or an empty string
        when there is no context to search (no chunks, ``top_k`` below 1, or
        only blank chunks retrieved).
    """
    # Never ask for more neighbours than there are chunks.
    neighbours = min(top_k, len(chunks))
    if neighbours < 1:
        return ""

    question_embedding = embed_text(question)
    _, indices = index.search(question_embedding, neighbours)

    # FAISS fills missing results with -1; without this filter chunks[-1]
    # would silently pull in the last chunk as if it were a match.
    retrieved = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    context = " ".join(retrieved)
    if not context.strip():
        return ""

    result = qa_pipeline(question=question, context=context)
    return result['answer']
# Streamlit app layout | |
st.title("PDF Question-Answering Chatbot with RAG")
st.write("Upload a PDF, and ask questions based on its content.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type="pdf")

if pdf_file is not None:
    # Streamlit reruns this script on every interaction (including each new
    # question), so the extracted chunks and index are kept in session state
    # and rebuilt only when a different file is uploaded.
    upload_key = (pdf_file.name, pdf_file.size)
    if st.session_state.get("rag_upload_key") != upload_key:
        with st.spinner("Processing PDF..."):
            # Extract and split text from PDF
            text = extract_text_from_pdf(pdf_file)
            chunks = split_text_into_chunks(text)
            # Embed and index the chunks; np.vstack raises on an empty list,
            # so a PDF with no extractable text gets no index at all.
            if chunks:
                embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
                index = create_faiss_index(embeddings)
            else:
                index = None
        st.session_state["rag_upload_key"] = upload_key
        st.session_state["rag_chunks"] = chunks
        st.session_state["rag_index"] = index

    chunks = st.session_state["rag_chunks"]
    index = st.session_state["rag_index"]

    if index is None:
        st.error("No extractable text was found in this PDF.")
    else:
        st.success("PDF processed and indexed successfully!")
        st.write("You can now ask questions based on the content of the PDF.")
        # Input for user question
        question = st.text_input("Ask a question:")
        if question:
            with st.spinner("Searching for the answer..."):
                answer = answer_question(question, index, chunks)
            st.write("**Answer:**", answer)