import streamlit as st
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
# from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.base import BaseCallbackHandler
from huggingface_hub import hf_hub_download


# StreamHandler to intercept streaming output from the LLM.
# This makes it appear that the Language Model is "typing"
# in real time.
class StreamHandler(BaseCallbackHandler):
    def __init__(self, container, initial_text=""):
        self.container = container
        self.text = initial_text

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        self.text += token
        self.container.markdown(self.text)
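

# A minimal usage sketch for StreamHandler (an illustration, not part of
# the app's flow below): with LCEL, a callback handler can be passed at
# invocation time via the `callbacks` config, so on_llm_new_token fires
# for each token as the model generates. `chain` is assumed to be an
# LCEL chain built around an LLM created with streaming=True.
#
#   placeholder = st.empty()
#   handler = StreamHandler(placeholder)
#   chain.invoke({"question": "Hello!"},
#                config={"callbacks": [handler]})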


def create_chain(system_prompt):
    # A stream handler to direct streaming output onto the chat screen.
    # Integrating it cleanly with Streamlit's rerun model takes some
    # extra work, but it demonstrates the potential of streaming output.
    # stream_handler = StreamHandler(st.empty())

    # A callback manager is a way to intercept streaming output from the
    # LLM and take some action on it. Here we would hand it our custom
    # stream handler to make it appear that the LLM is typing the
    # responses in real time.
    # callback_manager = CallbackManager([stream_handler])

    (repo_id, model_file_name) = ("TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
                                  "mistral-7b-instruct-v0.1.Q4_0.gguf")

    # Download the GGUF weights from the Hugging Face Hub; the file is
    # cached locally, so the download only happens on the first run.
    model_path = hf_hub_download(repo_id=repo_id,
                                 filename=model_file_name,
                                 repo_type="model")

    # Initialize the LlamaCpp LLM.
    # n_gpu_layers, n_batch, and n_ctx enable GPU support; when they are
    # left unset, the CPU is used. Set n_gpu_layers to 1 on Apple Silicon
    # (e.g. an M2) and to higher values depending on your GPU.
    llm = LlamaCpp(
        model_path=model_path,
        temperature=0,
        max_tokens=512,
        top_p=1,
        # callback_manager=callback_manager,
        # n_gpu_layers=1,
        # n_batch=512,
        # n_ctx=4096,
        stop=["[INST]"],
        verbose=False,
        streaming=True,
    )

    # Template used to structure the user input before converting it
    # into a prompt. It first injects the personality we want to give
    # the LLM, in the form of the system_prompt, and then appends the
    # actual prompt from the user. Note that this chatbot doesn't have
    # any memory of the conversation, so we inject the system prompt
    # with every message.
    template = """
    <s>[INST]{}[/INST]</s>
    [INST]{}[/INST]
    """.format(system_prompt, "{question}")

    # We create a prompt from the template so we can use it with LangChain.
    prompt = PromptTemplate(template=template, input_variables=["question"])

    # We chain our prompt and LLM together.
    # llm_chain = LLMChain(prompt=prompt, llm=llm)  # Legacy
    llm_chain = prompt | llm  # LCEL
    return llm_chain
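

# A possible optimization, not in the original: Streamlit reruns this
# script on every interaction, which rebuilds the chain and reloads the
# model each time. Decorating create_chain with st.cache_resource would
# keep one loaded model per system prompt across reruns:
#
#   @st.cache_resource
#   def create_chain(system_prompt):
#       ...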


# Set the webpage title.
st.set_page_config(
    page_title="Your own aiChat!"
)

# Create a header element.
st.header("Your own aiChat!")

# This sets the LLM's personality for each prompt.
# The initial personality provided is basic.
# Try something interesting and notice how the LLM's responses change.
system_prompt = st.text_area(
    label="System Prompt",
    value="You are a helpful AI assistant who answers questions in short sentences.",
    key="system_prompt")

# Create the LLM chain to use for our chatbot.
llm_chain = create_chain(system_prompt)

# We store the conversation in the session state.
# It is used to render the chat history on every rerun.
# We initialize it with the greeting we want to open with.
if "messages" not in st.session_state:
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I help you today?"}
    ]

# current_response would hold the partially streamed answer; it stays
# unused until the streaming handler above is wired in.
if "current_response" not in st.session_state:
    st.session_state.current_response = ""

# We loop through each message in the session state and render it as
# a chat message.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# We take questions/instructions from the chat input to pass to the LLM.
if user_prompt := st.chat_input("Your message here", key="user_input"):
    # Add our input to the session state.
    st.session_state.messages.append(
        {"role": "user", "content": user_prompt}
    )

    # Add our input to the chat window.
    with st.chat_message("user"):
        st.markdown(user_prompt)

    # Pass our input to the LLM chain and capture the final response.
    # If the stream handler were enabled, it would already be receiving
    # the tokens while the LLM generates; here we only get the response
    # once the LLM has finished generating it in full.
    response = llm_chain.invoke({"question": user_prompt})

    # Add the response to the session state.
    st.session_state.messages.append(
        {"role": "assistant", "content": response}
    )

    # Add the response to the chat window.
    with st.chat_message("assistant"):
        st.markdown(response)
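
# A possible streaming variant (an assumption, not the original code):
# recent Streamlit releases provide st.write_stream, which can consume
# the LCEL chain's .stream() generator directly, rendering tokens as
# they arrive and returning the full text, in place of StreamHandler:
#
#   with st.chat_message("assistant"):
#       response = st.write_stream(
#           llm_chain.stream({"question": user_prompt})
#       )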