import os

import torch
import gradio as gr
from transformers import AutoTokenizer, pipeline

token = os.getenv("HF_TOKEN")

model = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model, token=token)

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=token,  # Llama 2 is a gated model, so the pipeline needs the token as well
)

# Special tokens and delimiters of the Llama 2 chat prompt template
BOS = "<s>"
EOS = "</s>"
BINS = "[INST] "
EINS = " [/INST]"
BSYS = "<<SYS>>\n"
ESYS = "\n<</SYS>>\n\n"

SYSTEM_PROMPT = BOS + BINS + BSYS + """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, just say you don't know, please don't share false information.""" + ESYS


def message_format(msg: str, history: list, history_lim: int = 5):
    """Build a Llama 2 chat prompt from the new message and the recent history."""
    # Keep only the most recent history_lim turns of the conversation
    history = history[-min(len(history), history_lim):]
    if len(history) == 0:
        return SYSTEM_PROMPT + f"{msg}{EINS}"
    # history is a list of (user_query, model_response) pairs
    query = SYSTEM_PROMPT + f"{history[0][0]}{EINS} {history[0][1]} {EOS}"
    for user_query, model_response in history[1:]:
        query += f"{BOS}{BINS}{user_query}{EINS} {model_response} {EOS}"
    query += f"{BOS}{BINS}{msg}{EINS}"
    return query


def response(msg: str, history: list):
    query = message_format(msg, history)
    sequences = llama_pipeline(
        query,
        do_sample=True,  # randomly sample from the most likely tokens for diversity in the generated text
        top_k=10,  # consider only the 10 most likely tokens at each step
        num_return_sequences=1,  # return a single generated sequence
        eos_token_id=tokenizer.eos_token_id,  # stop once the end-of-sequence token is generated
        max_length=1024,  # cap the total length of prompt plus generated text
    )
    generated_text = sequences[0]["generated_text"]
    answer = generated_text[len(query):].strip()  # strip the prompt from the output
    print(f"AI Agent: {answer}")
    return answer


app = gr.ChatInterface(
    fn=response,
    examples=["hello", "bonjour", "xin chao"],
    title="Llama 2 Chat",
)
app.launch(share=True)
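
# For reference, a minimal sketch of the prompt string that message_format
# produces for one prior exchange (the user and assistant texts below are
# made-up illustrations, not model output):
#
#   <s>[INST] <<SYS>>
#   You are a helpful, respectful and honest assistant. ...
#   <</SYS>>
#
#   hello [/INST] Hi! How can I help you today? </s><s>[INST] bonjour [/INST]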