# !pip install langchain
# !pip install sentence-transformers
# !pip install accelerate
# !pip install chromadb
# !pip install "unstructured[all-docs]"

import os

import streamlit as st
import torch
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline


def main_process(uploaded_file):
    file_name = uploaded_file.name

    # Create a temporary directory
    temp_dir = "temp"
    os.makedirs(temp_dir, exist_ok=True)

    # Save the uploaded file to the temporary directory
    temp_path = os.path.join(temp_dir, file_name)
    with open(temp_path, "wb") as temp_file:
        temp_file.write(uploaded_file.getvalue())

    # Process the uploaded file
    loader = UnstructuredFileLoader(temp_path)
    documents = loader.load()
    for document in documents:
        print(document.page_content)

    # We can't feed the whole PDF to the model at once, so we split it into chunks.
    # CharacterTextSplitter produces chunks of 1000 characters that overlap by
    # 400 characters (you can change these values to suit your needs).
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
    texts = text_splitter.split_documents(documents)

    # SentenceTransformerEmbeddings embeds the text chunks.
    # Embeddings are used to measure the similarity between the query and the chunks.
    # Here we use the multi-qa-mpnet-base-dot-v1 model.
    embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
    persist_directory = "/content/chroma/"

    # Chroma stores the embeddings: from_documents builds the vector store,
    # and persist_directory tells it where to save the index on disk.
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)

    # To save and reload the vector db later (if needed):
    # db.persist()
    # db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    checkpoint = "MBZUAI/LaMini-Flan-T5-783M"

    # Initialize the tokenizer and base model for text generation
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        device_map="auto",
        torch_dtype=torch.float32,
    )

    pipe = pipeline(
        "text2text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=512,
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
    )

    # Wrap the pipeline so LangChain can use it as a local language model
    local_llm = HuggingFacePipeline(pipeline=pipe)

    # Create a RetrievalQA chain: retrieve the 2 most similar chunks and
    # "stuff" them into the prompt as context
    qa_chain = RetrievalQA.from_chain_type(
        llm=local_llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
        return_source_documents=True,
    )
    return qa_chain


st.title("Document Chatbot")
st.write("Upload a PDF file to get started")

uploaded_file = st.file_uploader("Choose a file", type=["pdf"])

if uploaded_file is not None:
    qa_chain = main_process(uploaded_file)

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Accept user input
    if prompt := st.chat_input("What is up?"):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        # Display user message in chat message container
        with st.chat_message("user"):
            st.markdown(prompt)
        # Get response from chatbot
        with st.chat_message("assistant"):
            # With return_source_documents=True, the chain returns a dict:
            # "result" holds the answer text, "source_documents" the retrieved chunks.
            response = qa_chain(prompt)
            answer = response["result"]
            st.markdown(answer)
        st.session_state.messages.append({"role": "assistant", "content": answer})
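
# To try the app (assuming the script above is saved as app.py, and that
# persist_directory is changed to a local path if you are not on Colab):
#   streamlit run app.py
# Streamlit serves the app at http://localhost:8501 by default.
#
# A minimal sanity check of the chain outside the UI -- a hypothetical sketch,
# assuming a local file sample.pdf and using a tiny stand-in for Streamlit's
# UploadedFile (only .name and .getvalue() are needed by main_process):
#
# class _Upload:
#     name = "sample.pdf"
#     def getvalue(self):
#         with open("sample.pdf", "rb") as f:
#             return f.read()
#
# chain = main_process(_Upload())
# out = chain("What is this document about?")
# print(out["result"])
# for doc in out["source_documents"]:
#     print(doc.page_content[:100])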