import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the lightweight Hugging Face transformer model for extractive QA
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=tokenizer)
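
# Each call to qa_pipeline returns a dict with the extracted answer span and a
# confidence score, e.g. (illustrative values, not from this app):
#   qa_pipeline(question="Who wrote it?", context="It was written by Ada.")
#   -> {'score': 0.97, 'start': 18, 'end': 21, 'answer': 'Ada'}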

# Load the SentenceTransformer model for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
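# all-MiniLM-L6-v2 encodes each text into a 384-dimensional dense vector;
# that dimensionality is what `embeddings.shape[1]` picks up for the FAISS
# index below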

# Upload a PDF file
st.header("Question and Answer Chatbot")
with st.sidebar:
    st.title("Turn your PDFs into a Q&A session. Upload a file and start asking questions")
    file = st.file_uploader("PDF file upload", type="pdf")

# Extract the text
if file is not None:
    pdf_reader = PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for pages with no extractable text
        text += page.extract_text() or ""

    # Break the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"],  # the splitter expects a list of separators
        chunk_size=800,
        chunk_overlap=150,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)

    # Generate embeddings for each chunk
    embeddings = embedding_model.encode(chunks)

    # Create a FAISS index and add the chunk embeddings
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
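
    # IndexFlatL2 does exact (brute-force) L2 search, which is fine at this
    # scale. A sketch of an alternative using cosine similarity instead:
    #   faiss.normalize_L2(embeddings)         # normalize vectors in place
    #   index = faiss.IndexFlatIP(dimension)   # inner product == cosine on unit vectors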

    # Get the user's question
    user_question = st.text_input("Type your question here")

    # Perform similarity search over the chunk embeddings
    if user_question:
        question_embedding = embedding_model.encode([user_question])
        # cap k in case the document produced fewer than 5 chunks;
        # D holds distances, I holds indices of the nearest chunks
        k = min(5, len(chunks))
        D, I = index.search(np.array(question_embedding), k)
        matched_texts = [chunks[i] for i in I[0]]

        # Run the lightweight transformer model over each retrieved chunk
        response = ""
        for context in matched_texts:
            result = qa_pipeline(question=user_question, context=context)
            response += result['answer'] + " "
        st.write(response)
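
        # A possible refinement (not applied above): rather than concatenating
        # one answer per chunk, rank the candidates by the pipeline's
        # confidence score and show only the best one:
        #   results = [qa_pipeline(question=user_question, context=c)
        #              for c in matched_texts]
        #   best = max(results, key=lambda r: r['score'])
        #   st.write(best['answer'])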