import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the lightweight Hugging Face transformer model for extractive QA
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=tokenizer)

# Load the SentenceTransformer model for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Upload PDF files
st.header("Question and Answer Chatbot")

with st.sidebar:
    st.title("Turn your PDFs into a Q&A session. Upload a file and start asking questions")
    file = st.file_uploader("PDF file upload", type="pdf")

# Extract the text
if file is not None:
    pdf_reader = PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += page.extract_text() or ""

    # Break the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"],
        #chunk_size=1000,
        #chunk_overlap=500,
        chunk_size=800,
        chunk_overlap=150,
        length_function=len
    )
    chunks = text_splitter.split_text(text)

    # Generate embeddings for each chunk
    embeddings = embedding_model.encode(chunks)

    # Create a FAISS index and add the chunk embeddings
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))

    # Get the user's question
    user_question = st.text_input("Type your question here")

    # Perform a similarity search for the most relevant chunks
    if user_question:
        question_embedding = embedding_model.encode([user_question])
        D, I = index.search(np.array(question_embedding), k=5)
        matched_texts = [chunks[i] for i in I[0]]

        # Run the QA model against each matched chunk and concatenate the answers
        response = ""
        for context in matched_texts:
            result = qa_pipeline(question=user_question, context=context)
            response += result['answer'] + " "

        st.write(response)
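
# Usage note (assumes the script is saved as, e.g., app.py): install the packages
# the imports require (streamlit, PyPDF2, langchain, transformers, sentence-transformers,
# numpy, and faiss, typically available via the faiss-cpu package), then launch the
# app with Streamlit's CLI:
#   streamlit run app.py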