# -*- coding: utf-8 -*-
"""Llama chat bot.

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1BBWPVOOR0790ZALdTJy_pYV5bFf8KaGp

Serves a Gradio chat UI backed by the Llama-2 7B chat model. The Hugging Face
access token is read from the ``HF_READ_TOKEN`` environment variable.
"""

# One-time environment setup (run in a shell, not in Python):
#   curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh
#   pip install transformers torch accelerate
#   pip install --upgrade gradio
#   pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

import functools
import os

# Hugging Face Hub id of the chat model to load.
model = "meta-llama/Llama-2-7b-chat-hf"

# Opening of the first turn: the system prompt wrapped in Llama-2's
# <<SYS>> ... <</SYS>> delimiters. It intentionally leaves the first [INST]
# open so the first user message can be appended directly after it.
SYSTEM_PROMPT = """[INST] <<SYS>>
You are a helpful bot. Your answers are clear and concise.
<</SYS>>

"""


def format_message(message: str, history: list, memory_limit: int = 3) -> str:
    """Format the message and history as a Llama-2 chat prompt.

    Parameters:
        message (str): Current message to send.
        history (list): Past conversation as ``(user_msg, model_answer)``
            pairs, oldest first.
        memory_limit (int): Maximum number of most recent past exchanges to
            include. Zero or a negative value includes none.

    Returns:
        str: Prompt starting with ``SYSTEM_PROMPT`` and ending with the
        current message followed by ``[/INST]``.
    """
    # A plain history[-memory_limit:] would return the *whole* list when
    # memory_limit is 0 (because -0 == 0), so non-positive limits are
    # handled explicitly.
    recent = history[-memory_limit:] if memory_limit > 0 else []

    # SYSTEM_PROMPT already opens the first [INST]; each finished exchange
    # closes its instruction, appends the answer and opens the next one.
    past_turns = "".join(
        f"{user_msg} [/INST] {model_answer} [INST] "
        for user_msg, model_answer in recent
    )
    return f"{SYSTEM_PROMPT}{past_turns}{message} [/INST]"


@functools.lru_cache(maxsize=1)
def _load_llama():
    """Log in to the Hugging Face Hub and build the tokenizer and pipeline.

    The result is cached, so the model is loaded at most once per process.
    Heavy third-party imports are deferred to this function so that the pure
    prompt-formatting helper can be imported without them.

    Returns:
        tuple: ``(tokenizer, llama_pipeline)``.

    Raises:
        KeyError: If the ``HF_READ_TOKEN`` environment variable is not set.
    """
    import huggingface_hub
    from transformers import AutoTokenizer, pipeline

    hf_token = os.environ["HF_READ_TOKEN"]

    # Never print the token itself: it is a secret and would end up in logs.
    # The Python API replaces the notebook-only "!huggingface-cli" commands,
    # which are a syntax error in a plain .py file.
    huggingface_hub.login(token=hf_token)
    account = huggingface_hub.whoami(token=hf_token)
    print("Logged in to Hugging Face as:", account.get("name", "<unknown>"))

    tokenizer = AutoTokenizer.from_pretrained(model, token=hf_token)
    llama_pipeline = pipeline(
        "text-generation",
        model=model,
        torch_dtype="auto",
        device_map="auto",
    )
    return tokenizer, llama_pipeline


def get_llama_response(message: str, history: list) -> str:
    """Generate a conversational response from the Llama model.

    Parameters:
        message (str): User's input message.
        history (list): Past conversation history as
            ``(user_msg, model_answer)`` pairs.

    Returns:
        str: The generated reply with the prompt removed and surrounding
        whitespace stripped.
    """
    tokenizer, llama_pipeline = _load_llama()
    query = format_message(message, history)

    sequences = llama_pipeline(
        query,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=1024,
    )

    # The pipeline output starts with the prompt; keep only what follows it.
    generated_text = sequences[0]["generated_text"]
    response = generated_text[len(query):].strip()

    print("Chatbot:", response)
    return response


def main() -> None:
    """Load the model, then launch the Gradio chat interface."""
    import gradio as gr

    # Load eagerly so a missing token or model fails before the UI starts.
    _load_llama()
    gr.ChatInterface(get_llama_response).launch()


if __name__ == "__main__":
    main()