File size: 4,473 Bytes
99cdfe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

# !pip install langchain
# !pip install sentence-transformers
# !pip install accelerate
# !pip install chromadb
# !pip install "unstructured[all-docs]"

from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
import streamlit as st
import os


def main_process(uploaded_file):
    """Build a RetrievalQA chain over the contents of one uploaded document.

    The upload is written to a local ``temp`` directory, parsed with
    ``UnstructuredFileLoader``, split into overlapping chunks, embedded into a
    Chroma vector store, and wired to a local LaMini-Flan-T5 text2text
    pipeline.

    Args:
        uploaded_file: Either a mapping of ``{file_name: file_bytes}`` (only
            the first entry is used), or a file-like upload object exposing
            ``.name`` and ``.getvalue()``.

    Returns:
        A ``RetrievalQA`` chain (``chain_type='stuff'``) that retrieves the 2
        most similar chunks per query and returns the source documents along
        with the answer.
    """
    # Accept both upload shapes: a {name: bytes} mapping, or an upload object
    # with .name / .getvalue(). The latter has no .keys().
    if hasattr(uploaded_file, "keys"):
        raw_name = list(uploaded_file.keys())[0]
        file_bytes = uploaded_file[raw_name]
    else:
        raw_name = uploaded_file.name
        file_bytes = uploaded_file.getvalue()

    # The name comes from the client: keep only its final component so it
    # cannot point outside the temporary directory.
    file_name = os.path.basename(raw_name) or "uploaded_file"

    # Create a temporary directory
    temp_dir = "temp"
    os.makedirs(temp_dir, exist_ok=True)

    # Save the uploaded file to the temporary directory, because the loader
    # below works from a path on disk.
    temp_path = os.path.join(temp_dir, file_name)
    with open(temp_path, "wb") as temp_file:
        temp_file.write(file_bytes)

    # Process the uploaded file
    loader = UnstructuredFileLoader(temp_path)
    documents = loader.load()
    for document in documents:
        print(document.page_content)

    # The whole document is not passed to the model at once; it is split into
    # chunks with CharacterTextSplitter. Each chunk targets 1000 characters
    # with a 400-character overlap (tune both to your needs).
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
    texts = text_splitter.split_documents(documents)

    # Embeddings are used to find the similarity between the query and the
    # text chunks; the multi-qa-mpnet-base-dot-v1 model produces them.
    embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
    # NOTE(review): "/content/..." looks like a Colab path -- confirm it is
    # writable in the deployment environment.
    persist_directory = "/content/chroma/"

    # Chroma stores the chunk embeddings; persist_directory tells it where to
    # keep them on disk.
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)

    checkpoint = "MBZUAI/LaMini-Flan-T5-783M"

    # Initialize the tokenizer and base model for text generation
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        device_map="auto",
        torch_dtype=torch.float32,
    )

    # Sampling is enabled with a low temperature; outputs are capped at 512
    # tokens.
    pipe = pipeline(
        'text2text-generation',
        model=base_model,
        tokenizer=tokenizer,
        max_length=512,
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
    )

    # Initialize a local language model pipeline
    local_llm = HuggingFacePipeline(pipeline=pipe)
    # Create a RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=local_llm,
        chain_type='stuff',
        retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
        return_source_documents=True,
    )
    return qa_chain

st.title("Document Chatbot")
st.write("Upload a pdf file to get started")

uploaded_file = st.file_uploader("Choose a file", type=["pdf"])

if uploaded_file is not None:
    # Streamlit re-executes this script on every interaction, so the QA chain
    # is kept in session state and rebuilt only when a different file is
    # uploaded. Otherwise every chat message would reload the model and
    # re-embed the document.
    upload_key = (uploaded_file.name, uploaded_file.size)
    if (
        "qa_chain" not in st.session_state
        or st.session_state.get("qa_chain_key") != upload_key
    ):
        # main_process takes a {file_name: file_bytes} mapping.
        st.session_state.qa_chain = main_process(
            {uploaded_file.name: uploaded_file.getvalue()}
        )
        # Record the key only after a successful build so a failure is retried.
        st.session_state.qa_chain_key = upload_key
    qa_chain = st.session_state.qa_chain

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Accept user input
    if prompt := st.chat_input("What is up?"):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        # Display user message in chat message container
        with st.chat_message("user"):
            st.markdown(prompt)
        # Get response from chatbot
        with st.chat_message("assistant"):
            response = qa_chain(prompt)
            # The chain returns a dict; the generated answer is under "result".
            # Show and store only that text, not the whole dict.
            answer = response["result"]
            st.markdown(answer)
            st.session_state.messages.append({"role": "assistant", "content": answer})