# -*- coding: utf-8 -*-
# Copyright (c) Louis Brulé Naudet. All Rights Reserved.
# This software may be used and distributed according to the terms of the License Agreement.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))


def setup(
    model_id: str,
    description: str
) -> tuple:
    """
    Set up the model and tokenizer for a given pre-trained model ID.

    Parameters
    ----------
    model_id : str
        The ID of the pre-trained model to load.

    description : str
        A string containing additional description information.

    Returns
    -------
    tuple
        A tuple containing the loaded model, tokenizer, and updated description.
        If an error occurs during setup, model and tokenizer are None, and an
        error message is appended to the description.
    """
    if not torch.cuda.is_available():
        description += "\n\nRunning on CPU 🥶 This demo does not work on CPU."

        return None, None, description

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto"
        )

        tokenizer = AutoTokenizer.from_pretrained(
            model_id
        )

        tokenizer.use_default_system_prompt = False

        # Update the description once loading succeeded.
        description += "\n\nModel and tokenizer set up successfully."

        return model, tokenizer, description

    except Exception as e:
        # If an error occurs during setup, append the error message to the description.
        description += f"\n\nError occurred during model setup: {str(e)}"

        return None, None, description
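# Illustrative usage sketch for setup() (comments only, never executed; the
# Space itself calls setup() with this same checkpoint right after DESCRIPTION
# below). The short description string here is only a placeholder value.
#
# model, tokenizer, description = setup(
#     model_id="louisbrulenaudet/Pearl-7B-0211-ties",
#     description="# Pearl-7B demo",
# )
# if model is None:
#     # No GPU available (or loading failed): the warning text is in `description`.
#     print(description)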
" # return None, None, description DESCRIPTION = """\ # Pearl-7B-0211-ties, an xtraordinary 7B model This space showcases the [Pearl-7B-0211-ties](https://huggingface.co/louisbrulenaudet/Pearl-7B-0211-ties) model by Louis Brulé Naudet, a language model with 7.24 billion parameters that achieves a score exceeding 75.10 on the Open LLM Leaderboard (average). **03-22-2024 - To date, louisbrulenaudet/Pearl-34B-ties is the "Best 🤝 base merges and moerges model of around 30B" on the Open LLM Leaderboard.** """ model, tokenizer, description = setup( model_id="louisbrulenaudet/Pearl-7B-0211-ties", description=DESCRIPTION ) def format_prompt( message, history ) -> str: """ Format a prompt for dialogue generation using historical conversation data. Parameters ---------- message : str The user's current message or prompt. history : list of tuple A list of tuples representing past interactions, where each tuple contains a user prompt and a corresponding bot response. Returns ------- str Formatted prompt for dialogue generation, including the user's current message and historical conversation data. Examples -------- >>> message = "How are you?" >>> history = [("Hi there!", "Hello!"), ("What's up?", "Not much.")] >>> format_prompt(message, history) '[INST] Hi there! [/INST] Hello! [INST] What\'s up? [/INST] Not much. [INST] How are you? [/INST]' """ prompt = "" for user_prompt, bot_response in history: prompt += f"[INST] {user_prompt} [/INST]" prompt += f" {bot_response} " prompt += f"[INST] {message} [/INST]" return prompt @spaces.GPU def generate( message: str, history: list, max_new_tokens: int = 2048, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1, ) -> Iterator[str]: """ Generate a response to a given message within a conversation context. This function utilizes a pre-trained language model to generate a response to a given message, considering the conversation context provided in the chat history. Parameters ---------- message : str The user's message for which a response is generated. chat_history : list A list containing tuples representing the conversation history. Each tuple should consist of two elements: the user's message and the assistant's response. max_new_tokens : int, optional The maximum number of tokens to generate for the response (default is 1024). temperature : float, optional The temperature parameter controlling the randomness of token generation (default is 0.6). top_p : float, optional The cumulative probability cutoff for token generation (default is 0.9). top_k : int, optional The number of top tokens to consider for token generation (default is 50). repetition_penalty : float, optional The repetition penalty controlling the likelihood of repeating tokens in the generated sequence (default is 1). Yields ------ str A generated response to the given message. Notes ----- - This function requires a GPU for efficient processing and may not work properly on CPU. - The conversation history should be provided in the form of a list of tuples, where each tuple represents a user message followed by the assistant's response. 
""" global tokenizer global model conversation = format_prompt( message=message, history=history ) input_ids = tokenizer.apply_chat_template( conversation, return_tensors="pt", add_generation_prompt=True ) input_ids = input_ids.to( torch.device("cuda") ) streamer = TextIteratorStreamer( tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True ) generate_kwargs = { "input_ids": input_ids, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": False, "num_beams": 1, "repetition_penalty": repetition_penalty, "eos_token_id": tokenizer.eos_token_id, "pad_token_id": tokenizer.eos_token_id } t = Thread( target=model.generate, kwargs=generate_kwargs ) t.start() outputs = [] for text in streamer: outputs.append(text) yield "".join(outputs) return "".join(outputs) chatbot = gr.Chatbot( height=400, show_copy_button=True ) chat_interface = gr.ChatInterface( fn=generate, chatbot=chatbot, additional_inputs=[ gr.Slider( label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=MAX_MAX_NEW_TOKENS, ), gr.Slider( label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9, ), gr.Slider( label="Top-k", minimum=1, maximum=1000, step=1, value=50, ), gr.Slider( label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1, ), ], fill_height=True, examples=[ ["Implement snake game using pygame"], ["Can you explain briefly to me what is the Python programming language?"], ["Write a program to find the factorial of a number"], ], ) with gr.Blocks() as demo: gr.Markdown( value=DESCRIPTION ) gr.DuplicateButton() chat_interface.render() if __name__ == "__main__": demo.queue().launch( show_api=False )