import streamlit as st
from operator import itemgetter
from langchain.llms import LlamaCpp
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory
# from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from huggingface_hub import hf_hub_download
# StreamHandler to intercept streaming output from the LLM.
# This makes it appear that the Language Model is "typing"
# in realtime.
class StreamHandler(BaseCallbackHandler):
    """Callback handler that mirrors streamed LLM tokens into a UI container.

    Each new token is appended to the text accumulated so far and the whole
    text is re-rendered, which makes the response appear to be typed live.

    Args:
        container: Object exposing a ``markdown(text)`` method, for example
            the placeholder returned by ``st.empty()``.
        initial_text: Text to start from before the first token arrives.
    """

    def __init__(self, container, initial_text=""):
        self.container = container
        self.text = initial_text

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append ``token`` to the running text and refresh the container."""
        self.text += token
        # Re-render the full text so far; without this the tokens are only
        # accumulated and never shown to the user.
        self.container.markdown(self.text)
def create_chain(system_prompt, *,
                 repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
                 model_file_name="mistral-7b-instruct-v0.1.Q4_0.gguf"):
    """Build a conversational LangChain runnable backed by a local GGUF model.

    The returned runnable takes the user's prompt as a plain string, feeds it
    to the LLM together with the system prompt and the stored chat history,
    saves the exchange into the conversation memory and returns the model's
    response.

    Args:
        system_prompt: Instructions (e.g. a persona) placed in the "system"
            message of every request.
        repo_id: Hugging Face Hub repository to download the model from.
        model_file_name: Name of the GGUF file inside ``repo_id``.
            NOTE(review): the file name was not visible in the reviewed
            source; confirm the default matches the intended quantization.

    Returns:
        A runnable whose ``invoke(user_prompt)`` returns the AI response.
    """
    # --- Disabled ---
    # A stream handler to direct streaming output on the chat screen.
    # This will need to be handled somewhat differently.
    # But it demonstrates what potential it carries.
    # stream_handler = StreamHandler(st.empty())

    # --- Disabled ---
    # Callback manager is a way to intercept streaming output from the
    # LLM and take some action on it. Here we are giving it our custom
    # stream handler to make it appear as if the LLM is typing the
    # responses in real time.
    # callback_manager = CallbackManager([stream_handler])

    # Download the model file (or reuse the locally cached copy).
    model_path = hf_hub_download(repo_id=repo_id, filename=model_file_name)

    # Initialize the LlamaCpp llm model.
    # n_gpu_layers, n_batch, and n_ctx are for GPU support.
    # When not set, CPU will be used.
    # Set 1 for mac m2, and higher numbers based on your GPU support.
    # NOTE(review): sampling options (temperature, max_tokens, stop, ...) were
    # not visible in the reviewed source, so library defaults apply here;
    # tune them as needed.
    llm = LlamaCpp(
        model_path=model_path,
        # callback_manager=callback_manager,
        # n_gpu_layers=1,
        # n_batch=512,
        # n_ctx=4096,
    )

    # system_prompt will include instructions to the llm. This might also be
    # related to the persona that we desire the llm to assume.
    # We then add a placeholder for the chat history, naming the input
    # variable which we will use to pass the history into the template.
    # Next, we specify the placeholder for the user prompt as {human_input}.
    # Lastly, we include an empty "ai" prompt to indicate the end of user input
    # and start of ai response.
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{human_input}"),
        ("ai", ""),
    ])

    # Conversation buffer memory keeps track of the conversation in memory
    # under the "chat_history" key. return_messages=True makes it hand back a
    # list of messages, which is what MessagesPlaceholder expects.
    memory = ConversationBufferMemory(memory_key="chat_history",
                                      return_messages=True)

    def save_memory(inputs_outputs):
        """Store the latest user prompt and ai response in the memory."""
        inputs = {"human": inputs_outputs["human"]}
        outputs = {"ai": inputs_outputs["ai"]}
        memory.save_context(inputs, outputs)

    def debug_memory():
        """Print the current chat history to the console."""
        print("\n", "#"*10, "\n")
        print(memory.load_memory_variables({}))
        print("\n", "#"*10, "\n")

    def extract_response(chain_response):
        """Return only the ai response from the final chain output."""
        # debug_memory()
        return chain_response["ai"]

    # The internal llm chain takes our input and the chat history and wraps
    # them in a dictionary before passing it as input to our prompt. The
    # prompt is then passed to our llm to generate an ai response.
    llm_chain = {
        "human_input": RunnablePassthrough(),
        "chat_history": (
            RunnableLambda(memory.load_memory_variables) |
            itemgetter("chat_history")
        ),
    } | prompt | llm

    # Since we need to manually inject our inputs and ai response into the
    # memory, we need to keep track of the initial prompt that we send through
    # the chain so we can then save it to the memory with the generated ai
    # response. To do that, we run two branches in parallel: a passthrough
    # branch that carries our prompt, and "llm_chain", which generates the ai
    # response from the prompt and the chat history. We combine both results
    # in a dictionary and pass it to two more branches in parallel: the first
    # saves our prompt and the ai response to the chat history, and the
    # second extracts the ai response, which becomes the output of the chain.
    chain_with_memory = RunnablePassthrough() | {
        "human": RunnablePassthrough(),
        "ai": llm_chain,
    } | {
        "save_memory": RunnableLambda(save_memory),
        "ai": itemgetter("ai"),
    } | RunnableLambda(extract_response)

    return chain_with_memory
# Set the webpage title. This must be the first Streamlit command executed.
st.set_page_config(page_title="Your own Chat!")

# Create a header element
st.header("Your own Chat!")

# This sets the LLM's personality for each prompt.
# The initial personality provided is basic.
# Try something interesting and notice how the LLM responses are affected.
system_prompt = st.text_area(
    label="System Prompt",
    value="You are a helpful AI assistant who answers questions in short sentences.",
)

# Create the llm chain to use for our chat bot.
# Streamlit re-runs this script on every interaction, so the chain (and the
# conversation memory it owns) is kept in the session state; otherwise the
# model would be reloaded and the chat history lost on each message.
# The chain is rebuilt only when the system prompt changes, which also starts
# the conversation memory afresh.
if (
    "llm_chain" not in st.session_state
    or st.session_state.get("llm_chain_system_prompt") != system_prompt
):
    st.session_state.llm_chain = create_chain(system_prompt)
    st.session_state.llm_chain_system_prompt = system_prompt
llm_chain = st.session_state.llm_chain

# We store the conversation in the session state.
# This will be used to render the chat conversation.
# We initialize it with the first message we want to be greeted with.
if "messages" not in st.session_state:
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I help you today?"}
    ]

if "current_response" not in st.session_state:
    st.session_state.current_response = ""

# We loop through each message in the session state and render it as
# a chat message.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# We take questions/instructions from the chat input to pass to the LLM
if user_prompt := st.chat_input("Your message here", key="user_input"):
    # Add our input to the session state
    st.session_state.messages.append(
        {"role": "user", "content": user_prompt}
    )

    # Add our input to the chat window
    with st.chat_message("user"):
        st.markdown(user_prompt)

    # Pass our input to the llm chain and capture the final response.
    # We get the response here once the llm has finished generating it.
    response = llm_chain.invoke(user_prompt)

    # Add the response to the session state
    st.session_state.messages.append(
        {"role": "assistant", "content": response}
    )

    # Add the response to the chat window
    with st.chat_message("assistant"):
        st.markdown(response)