import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load a Hugging Face chat model (e.g., LLaMA or Falcon).
# NOTE: the original model name here ("mixedbread-ai/mxbai-embed-2d-large-v1") is an
# embedding model and cannot be loaded with AutoModelForCausalLM. A small chat model
# is used as a placeholder below -- replace it with your preferred model.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Replace with your preferred model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)


def get_pdf_text(pdf_docs):
    """Concatenate the text of every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""
    return text


# chunk_size=1000, chunk_overlap=200 works better for shorter PDFs
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
        # length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks


# Convert the chunks into a vector store (persisted locally as "faiss_index")
def get_vector_store(text_chunks):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")
    # return vector_store


def chat_with_huggingface(context, query):
    prompt_template = """
    Answer the query as detailed as possible from the provided context. If the answer is not in the context, just say, "Answer is not available in the provided documents".
    Context: {context}
    Query: {query}
    Answer:
    """
    # Fill the template before tokenizing (the raw, unformatted template was being sent before)
    prompt = prompt_template.format(context=context, query=query)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=500, do_sample=True, temperature=0.3)
    # Decode only the newly generated tokens, not the prompt
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)


def get_conversation_chain():
    def huggingface_chain(inputs):
        # Use the top-ranked chunk from the FAISS search as context
        context = inputs["input_documents"][0].page_content
        query = inputs["question"]
        return {"output_text": chat_with_huggingface(context, query)}
    return huggingface_chain


def user_input(user_question):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # Load the locally saved FAISS index
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)
    chain = get_conversation_chain()
    response = chain({"input_documents": docs, "question": user_question})
    print(response)  # Debug: log the full response to the console
    st.write("Reply: ", response["output_text"])


# Frontend page
def main():
    st.set_page_config(page_title="PDF Chatbot")
    st.header("PDF Chatbot made for Pooja")

    user_question = st.text_input("Ask something from your documents:")
    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here, then click 'Process'", accept_multiple_files=True)
        if st.button("Submit & Process"):
            with st.spinner("Hold on, reading..."):
                raw_text = get_pdf_text(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                get_vector_store(text_chunks)
                st.success("All documents read. Now ask your questions 😤")


if __name__ == '__main__':
    main()
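
# ---------------------------------------------------------------------------
# Usage sketch (assumptions, not from the original project): the filename
# app.py is hypothetical, and the dependency list is inferred from the imports
# above rather than taken from a pinned requirements file -- adjust both to
# your environment (newer LangChain releases may also need langchain-community).
#
#   pip install streamlit PyPDF2 langchain faiss-cpu sentence-transformers \
#       transformers torch
#   streamlit run app.py
#
# Upload PDFs in the sidebar and click "Submit & Process" to build the local
# "faiss_index" folder, then type a question in the main text box.
# ---------------------------------------------------------------------------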