from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp
import gradio as gr
import os

# Enable the faster hf_transfer backend for Hugging Face Hub downloads
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

REPO = "TheBloke/Llama-2-7B-Chat-GGUF"
MODEL_NAME = "llama-2-7b-chat.Q5_K_M.gguf"
MODEL_PATH = MODEL_NAME
DOWNLOAD_MODEL = (
    f"huggingface-cli download {REPO} {MODEL_NAME} "
    "--local-dir . --local-dir-use-symlinks False"
)

# Download the quantized model once; skip if it is already on disk
if not os.path.exists(MODEL_PATH):
    os.system(DOWNLOAD_MODEL)

TEMPLATE = """You are a helpful AI Assistant created by Mohammed Vasim. Mohammed Vasim is an AI Engineer.

Question: {question}

Answer: """

prompt = PromptTemplate.from_template(TEMPLATE)

# Callbacks support token-wise streaming (printed to stdout as tokens arrive)
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=MODEL_PATH,
    temperature=0.75,
    max_tokens=2000,
    n_ctx=2048,  # default context window is 512 tokens; raise it so max_tokens=2000 fits
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # verbose is required to pass to the callback manager
)

llm_chain = LLMChain(prompt=prompt, llm=llm)

# Example one-off query:
# question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
# llm_chain.invoke({"question": question})

title = "Welcome to Open Source LLM"
description = "A chatbot running the quantized Llama-2-7B-Chat GGUF model locally via llama.cpp."


def answer_query(message, history):
    # LLMChain.invoke returns a dict of inputs and outputs;
    # the generated completion lives under the "text" key.
    response = llm_chain.invoke({"question": message})
    return response["text"]


# Gradio chat interface
gr.ChatInterface(
    fn=answer_query,
    title=title,
    description=description,
    examples=[
        "What is a Large Language Model?",
        "What's 9+2-1?",
        "Write Python code to print the Fibonacci sequence",
    ],
).queue().launch(server_name="0.0.0.0")
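
# ---------------------------------------------------------------------------
# Running the app: a minimal sketch of the assumed setup. The package list and
# the filename `app.py` are assumptions, adjust them to your environment.
#
#   pip install langchain langchain-community llama-cpp-python gradio \
#       "huggingface_hub[cli]" hf_transfer
#   python app.py
#
# huggingface_hub provides the `huggingface-cli` command used above, and the
# hf_transfer package backs the HF_HUB_ENABLE_HF_TRANSFER=1 fast-download path.
# Gradio serves the chat UI on http://0.0.0.0:7860 by default; streamed tokens
# also appear in the terminal via StreamingStdOutCallbackHandler.
# ---------------------------------------------------------------------------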