import time
import streamlit as st
from llama_index import ServiceContext, StorageContext, set_global_service_context, VectorStoreIndex, Document
from llama_index.prompts import PromptTemplate
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.chat_engine.condense_question import CondenseQuestionChatEngine
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from PyPDF2 import PdfReader
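
# NOTE: this app uses the legacy LlamaIndex API (ServiceContext /
# set_global_service_context, i.e. a pre-0.10 llama-index release) and expects
# the GGUF weights at models/phi-2.Q4_K_M.gguf. Assuming the file is saved as
# app.py, it can be launched with: streamlit run app.py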

# Model Specific Prompt


def modelspecific_prompt(promptmessage):
    # Phi-2 instruction format, as per https://huggingface.co/TheBloke/phi-2-GGUF
    return f"Instruct: {promptmessage}\nOutput:"


# PDF to Text
def extract_text_from_pdf(pdf):
    pdf_reader = PdfReader(pdf)
    # Return one text block per page (rather than splitting into single words),
    # so each Document handed to LlamaIndex carries meaningful context.
    return [page.extract_text() or "" for page in pdf_reader.pages]


def main():
    # LLM Initialization
    llm = LlamaCPP(
        model_url=None,  # We'll load locally.
        # A small quantized (Q4_K_M) build of an already small model
        model_path='models/phi-2.Q4_K_M.gguf',
        temperature=0.1,
        max_new_tokens=512,
        context_window=2048,  # Phi-2 has a 2K context window; a tight fit for RAG, since retrieved content must squeeze into it
        generate_kwargs={},
        # n_gpu_layers must be at least 1 to use the GPU; 32 offloads
        # essentially the whole of this small model
        model_kwargs={"n_gpu_layers": 32},
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True
    )
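    # Note: messages_to_prompt / completion_to_prompt from llama_utils target
    # the Llama-2 chat format; Phi-2 expects "Instruct: ... Output:" instead,
    # which modelspecific_prompt() applies to the user question further below.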

    # Embedding Initialization
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
    )
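    # bge-small-en-v1.5 is a compact embedding model (384-dimensional vectors)
    # that runs fine on CPU, so only the LLM benefits from GPU offloading here.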

    # Service Context
    service_context = ServiceContext.from_defaults(
        chunk_size=128,  # Number of tokens in each chunk
        chunk_overlap=20,
        # This should be picked up automatically from the model metadata, but we force it to make sure it is
        context_window=2048,
        num_output=512,  # Maximum output from the LLM; matches max_new_tokens so LlamaIndex reserves that space for generation
        llm=llm,
        embed_model=embed_model
    )
    set_global_service_context(service_context)
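    # Rough token budget: out of the 2048-token window, num_output tokens are
    # reserved for the answer, leaving roughly 1,300-1,500 tokens for the
    # question plus retrieved 128-token chunks (on the order of ten chunks per query).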

    # Storage Context
    storage_context = StorageContext.from_defaults()
    st.title("Llama-CPP Local LLM with RAG (Phi-2 RAG)")
    # Credits
    st.markdown(
        "Made with ❤️ by Danyaal Majid & Muhammad Bin Asif using [HF Spaces](https://huggingface.co/spaces/DanyaalMajid/NLP-Final-LocalLLM-RAG)")

    pdf = st.file_uploader("Upload a PDF file", type=["pdf"])

    if pdf is not None:
        text_list = extract_text_from_pdf(pdf)
        documents = [Document(text=t) for t in text_list]
        nodes = service_context.node_parser.get_nodes_from_documents(documents)
        storage_context.docstore.add_documents(nodes)
        # The service_context already carries the llm, so it is not passed again here.
        index = VectorStoreIndex.from_documents(
            documents, service_context=service_context, storage_context=storage_context)
        # chat_engine = index.as_chat_engine(chat_mode="simple", verbose=True)
        # Condense prompt: an empty template would discard the user's question,
        # so give the chat engine a minimal rewrite instruction instead.
        custom_prompt = PromptTemplate(
            "Given the chat history:\n{chat_history}\n"
            "Rewrite the follow-up message as a standalone question: {question}\n"
            "Standalone question:"
        )
        query_engine = index.as_query_engine()
        chat_engine = CondenseQuestionChatEngine.from_defaults(
            query_engine=query_engine,
            condense_question_prompt=custom_prompt,
            verbose=True,
        )
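        # CondenseQuestionChatEngine first asks the LLM to rewrite the latest
        # message plus the chat history into a standalone question (using
        # custom_prompt above), then runs that question through the query
        # engine over the indexed PDF.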
        # Initialize chat history
        if "messages" not in st.session_state:
            st.session_state.messages = []

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        # Accept user input
        if prompt := st.chat_input("Ask a question about the uploaded PDF"):
            # Add user message to chat history
            st.session_state.messages.append(
                {"role": "user", "content": prompt})
            # Display user message in chat message container
            with st.chat_message("user"):
                st.markdown(prompt)

            # Display assistant response in chat message container
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                full_response = ""
                assistant_response = chat_engine.chat(
                    modelspecific_prompt(str(prompt)))
                assistant_response = str(assistant_response)
                # Simulate a streaming response with a short delay between words
                for chunk in assistant_response.split():
                    full_response += chunk + " "
                    time.sleep(0.05)
                    # Add a blinking cursor to simulate typing
                    message_placeholder.markdown(full_response + "▌")
                message_placeholder.markdown(full_response)
            # Add assistant response to chat history
            st.session_state.messages.append(
                {"role": "assistant", "content": full_response})


if __name__ == "__main__":
    main()