import spaces
import json
import subprocess
import os
import sys
import torch
import gradio as gr
from huggingface_hub import snapshot_download
from jinja2 import Template, Environment, BaseLoader
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler
import flash_attn

model = None
cache = None

snapshot_download(
    repo_id="bartowski/Mistral-7B-Instruct-v0.3-exl2",
    revision="8_0",
    local_dir="./models/Mistral-7B-instruct-exl2"
)
snapshot_download(
    repo_id="turboderp/Mistral-Nemo-Instruct-12B-exl2",
    revision="3.0bpw",
    local_dir="./models/Mistral-Nemo-Instruct-12B-exl2"
)
# snapshot_download(
#     repo_id="MikeRoz/mistralai_Mistral-Small-24B-Instruct-2501-6.0bpw-h6-exl2",
#     revision="main",
#     local_dir="./models/Mistral-Small-24B-Instruct-exl2"
# )

css = """
.bubble-wrap {
    padding-top: calc(var(--spacing-xl) * 3) !important;
}
.message-row {
    justify-content: space-evenly !important;
    width: 100% !important;
    margin: calc(var(--spacing-xl)) 0 !important;
    padding: 0 calc(var(--spacing-xl) * 3) !important;
}
.message.user {
    border-bottom-right-radius: var(--radius-xl) !important;
}
.message.bot {
    text-align: right;
    width: 100%;
    padding: 10px;
    border-radius: 10px;
}
.message-bubble-border {
    border-radius: 6px !important;
}
.message-buttons {
    justify-content: flex-end !important;
}
.message-buttons-bot, .message-buttons-user {
    right: 10px !important;
    left: auto !important;
    bottom: 2px !important;
}
.dark.message-bubble-border {
    border-color: #343140 !important;
}
.dark.user {
    background: #1e1c26 !important;
}
.dark.assistant.dark, .dark.pending.dark {
    background: #16141c !important;
}
"""

# Jinja2 chat templates for conversation formatting. They are kept on single lines
# so that no extra whitespace leaks into the rendered prompt.
CHAT_TEMPLATE_MISTRAL = """{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] | trim + '\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{{ system_message}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] | trim + eos_token }}{% endif %}{% endfor %}"""

CHAT_TEMPLATE_LLAMA_3 = """{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n' }}{% endif %}"""


def load_tokenizer_config(model_path):
    """Load the tokenizer_config.json shipped with the model."""
    config_path = os.path.join(model_path, "tokenizer_config.json")
    with open(config_path, 'r') as f:
        config = json.load(f)
    return config


def create_jinja_environment():
    """Create a Jinja2 environment that exposes raise_exception to templates."""
    def raise_exception(message):
        raise ValueError(message)

    env = Environment(loader=BaseLoader())
    env.globals['raise_exception'] = raise_exception
    return env


def format_conversation(system_message, history, new_message, tokenizer):
    """Render the system message, chat history and new user message into a prompt."""
    env = create_jinja_environment()
    template = env.from_string(CHAT_TEMPLATE_MISTRAL)

    messages = []
    if system_message:
        messages.append({"role": "system", "content": f'{system_message}'})
    for msn in history:
        messages.append({"role": "user", "content": msn[0]})
        messages.append({"role": "assistant", "content": msn[1]})
    messages.append({"role": "user", "content": new_message})
    print(messages)

    # try:
    conversation = template.render(
        messages=messages,
        bos_token=tokenizer.get('bos_token'),
        eos_token=tokenizer.get('eos_token'),
        add_generation_prompt=True
    )
    # except ValueError as e:
    #     print(f"Error in template rendering: {str(e)}")
    #     # Fall back to a simple format if template rendering fails
    #     conversation = f"{tokenizer.get('bos_token', '')}" + "".join(
    #         [f"[INST] {msg['content']} [/INST]" if msg['role'] == 'user' else msg['content'] for msg in messages]
    #     )

    return conversation


@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    global model
    global cache

    model_path = "models/Mistral-7B-instruct-exl2/"

    # Set up the model configuration
    config = ExLlamaV2Config(model_path)

    if model is None or cache is None:
        # Initialize the model with the configuration
        model = ExLlamaV2(config)
        # Create a cache for the model, with lazy initialization
        cache = ExLlamaV2Cache(model, lazy=True)
        # Load the model weights, automatically splitting them if necessary
        model.load_autosplit(cache)

    # Initialize the tokenizer with the model configuration
    tokenizer = ExLlamaV2Tokenizer(config)

    # Create a generator for text generation
    generator = ExLlamaV2DynamicGenerator(model, cache, tokenizer)

    # Load the full tokenizer config
    tokenizer_config = load_tokenizer_config(model_path)

    # Initialize an empty context tensor to store the conversation history
    context_ids = torch.empty((1, 0), dtype=torch.long)

    # Format the entire conversation
    conversation = format_conversation(system_message, history, message, tokenizer_config)

    # Tokenize the conversation
    instruction_ids = tokenizer.encode(conversation, add_bos=True, encode_special_tokens=True)
    context_ids = torch.cat([context_ids, instruction_ids], dim=-1)

    print(conversation)

    # Create and enqueue a new generation job
    generator.enqueue(
        ExLlamaV2DynamicJob(
            input_ids=context_ids,
            max_new_tokens=max_tokens,
            gen_settings=ExLlamaV2Sampler.Settings(
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                token_repetition_penalty=repeat_penalty,
            ),
            stop_conditions=[tokenizer.eos_token_id],
        )
    )

    outputs = ""
    eos = False

    # Generate and stream the response
    while not eos:
        results = generator.iterate()
        for result in results:
            if result["stage"] == "streaming":
                eos = result["eos"]
                if "text" in result:
                    # Append the newly generated text and stream it to the UI
                    outputs += result["text"]
                    yield outputs
                if "token_ids" in result:
                    # Add the generated tokens to the context
                    context_ids = torch.cat([context_ids, result["token_ids"]], dim=-1)


PLACEHOLDER = """
<div style="display: flex; flex-direction: column; align-items: center; text-align: center;">
    <img src="https://huggingface.co/spaces/pabloce/llama-cpp-agent/resolve/main/llama.jpg" alt="Logo" style="width: 80%; max-width: 550px; border-radius: 10px;">
    <h2>ExLlama V2</h2>
    <p>ExLlamaV2 is an inference library for running local LLMs on modern consumer GPUs. Supports paged attention via Flash Attention.</p>
    <p>Mistral Instruct 7B v3 &middot; Meta Llama 3 70B Instruct</p>
    <p>Discord &middot; GitHub</p>
</div>
""" demo = gr.ChatInterface( respond, additional_inputs=[ gr.Textbox(value="You are a helpful assistant.", label="System message"), gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"), gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), gr.Slider( minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p", ), gr.Slider( minimum=0, maximum=100, value=40, step=1, label="Top-k", ), gr.Slider( minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty", ), ], theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet", neutral_hue="gray",font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]).set( body_background_fill_dark="#16141c", block_background_fill_dark="#16141c", block_border_width="1px", block_title_background_fill_dark="#1e1c26", input_background_fill_dark="#16141c", button_secondary_background_fill_dark="#24212b", border_color_accent_dark="#343140", border_color_primary_dark="#343140", background_fill_secondary_dark="#16141c", color_accent_soft_dark="transparent", code_background_fill_dark="#292733", ), css=css, # retry_btn="Retry", # undo_btn="Undo", # clear_btn="Clear", # submit_btn="Send", description="Exllama: Chat with exl2 [repo](https://github.com/pabl-o-ce/hf-exllama)", chatbot=gr.Chatbot( scale=1, placeholder=PLACEHOLDER, # likeable=False, show_copy_button=True ) ) if __name__ == "__main__": demo.launch(allowed_paths=["https://huggingface.co/spaces/pabloce/llama-cpp-agent/resolve/main/llama.jpg"])