# intuiPy_chat / app.py
# Kazeemkz's picture
# Add Streamlit application and dependencies
# fe4fdc2
# raw
# history blame
# 2.24 kB
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Checkpoint used for extractive question answering.
model_name = "distilbert-base-uncased-distilled-squad"


@st.cache_resource
def _load_qa_components(name):
    """Load the tokenizer, QA model and question-answering pipeline.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects as a shared resource avoids re-instantiating the
    model on each rerun.

    Args:
        name: Hugging Face model identifier to load.

    Returns:
        A ``(tokenizer, model, pipeline)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForQuestionAnswering.from_pretrained(name)
    qa = pipeline(
        "question-answering",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
    )
    return loaded_tokenizer, loaded_model, qa


@st.cache_resource
def _load_embedding_model(name):
    """Load the SentenceTransformer used for embeddings, cached across reruns.

    Args:
        name: SentenceTransformer model identifier to load.

    Returns:
        The loaded ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(name)


# Module-level names are kept so the rest of the script is unaffected.
tokenizer, qa_model, qa_pipeline = _load_qa_components(model_name)
embedding_model = _load_embedding_model('all-MiniLM-L6-v2')
# Page header and sidebar with the PDF uploader.
st.header("Question and Answer Chatbot")
with st.sidebar:
    st.title("Turn your PDFs into a Q&A session. Upload a file and start asking questions")
    file = st.file_uploader("PDF file upload", type="pdf")

# Everything below only runs once a PDF has been uploaded.
if file is not None:
    # Extract the text of every page into one string.
    # `or ""` guards against a page yielding no text (presumably some
    # PyPDF2 versions return None for such pages -- TODO confirm).
    pdf_reader = PdfReader(file)
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Break the text into overlapping chunks, splitting on newlines.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"],
        chunk_size=800,
        chunk_overlap=150,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)

    if not chunks:
        # Without chunks there are no embeddings, and reading
        # embeddings.shape[1] below would fail.
        st.warning("No extractable text was found in this PDF.")
    else:
        # Generate one embedding per chunk; float32 is the dtype FAISS
        # indexes operate on.
        embeddings = np.asarray(embedding_model.encode(chunks), dtype="float32")

        # Create an exact L2 FAISS index and add the chunk embeddings.
        # NOTE(review): this is rebuilt each time the script runs for the
        # uploaded file; consider caching if large PDFs are expected.
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)

        # Get the user question.
        user_question = st.text_input("Type your question here")

        if user_question:
            # Similarity search: never ask for more neighbours than there
            # are chunks, and drop any -1 placeholder labels so that
            # chunks[-1] is not picked up by accident.
            question_embedding = np.asarray(
                embedding_model.encode([user_question]), dtype="float32"
            )
            top_k = min(5, len(chunks))
            _distances, neighbour_ids = index.search(question_embedding, k=top_k)
            matched_texts = [chunks[i] for i in neighbour_ids[0] if i >= 0]

            # Run extractive QA on each matched chunk and show the
            # answers joined by spaces.
            answers = [
                qa_pipeline(question=user_question, context=context)['answer']
                for context in matched_texts
            ]
            st.write(" ".join(answers))