import os
import torch
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
# Environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1" # Enable synchronous CUDA operations

# Model and tokenizer are cached in module-level globals so they are only
# loaded once per process.
model = None
tokenizer = None


def load_model_and_tokenizer(model_name, dtype, kv_bits):
    global model, tokenizer
    if model is None or tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        special_tokens = {"pad_token": "<PAD>"}
        tokenizer.add_special_tokens(special_tokens)

        config = AutoConfig.from_pretrained(model_name)
        if kv_bits != "unquantized":
            quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
            setattr(config, "quantizer_path", quantizer_path)

        # Resolve the short dtype name explicitly: "fp16" is not an attribute
        # of torch, so torch.__dict__.get("fp16") would silently fall back to
        # float32 and load the model in full precision.
        dtype_map = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}
        torch_dtype = dtype_map.get(dtype, getattr(torch, dtype, torch.float32))

        model = AutoModelForCausalLM.from_pretrained(
            model_name, config=config, torch_dtype=torch_dtype, device_map="auto"
        )

        # Grow the embedding matrix if the added <PAD> token pushed the vocab
        # size past the model's original embedding count.
        if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
            model.resize_token_embeddings(len(tokenizer))

        tokenizer.padding_side = "left"
        model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer
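
# Usage note: kv_bits selects a codebook at codebooks/<model>_<kv_bits>bit.xmad
# (see quantizer_path above); pass "unquantized" to skip KV-cache quantization.
# A hypothetical example loading the plain half-precision model:
#
#   load_model_and_tokenizer("NousResearch/Hermes-2-Theta-Llama-3-8B", "fp16", "unquantized")
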
# Initialize model and tokenizer
load_model_and_tokenizer("NousResearch/Hermes-2-Theta-Llama-3-8B", "fp16", "1")

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Rebuild the conversation as role/content messages: system prompt first,
    # then the prior [user, assistant] pairs Gradio keeps in `history`.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Prepare the input prompt via the model's chat template.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    tokenized_input_prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    response = ""
    try:
        # Stream the reply one token at a time: generate a single token,
        # append it to the running prompt, and yield the partial response.
        for _ in range(max_tokens):
            with torch.no_grad():
                output = model.generate(
                    tokenized_input_prompt_ids,
                    max_new_tokens=1,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    return_dict_in_generate=True,
                    output_scores=True,
                )

            next_token_id = output.sequences[0, -1].unsqueeze(0).unsqueeze(0)
            if next_token_id.item() >= len(tokenizer):
                raise ValueError(
                    f"Next token ID {next_token_id.item()} is out of bounds "
                    f"for vocab size {len(tokenizer)}"
                )

            tokenized_input_prompt_ids = torch.cat([tokenized_input_prompt_ids, next_token_id], dim=-1)

            token = tokenizer.decode(next_token_id.squeeze().tolist(), skip_special_tokens=True)
            response += token
            yield response

            # Compare the scalar id, not the (1, 1) tensor, against EOS.
            if next_token_id.item() == tokenizer.eos_token_id:
                break
    except Exception as e:
        yield f"Error: {str(e)}"

# Build the Gradio ChatInterface around the streaming respond() generator.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    theme="default",
    title="1bit llama3 by xMAD.ai",
    description="""
Welcome to the future of AI with xMAD.ai's 1bit Llama3, a breakthrough in Large Language Model (LLM) quantization and efficiency. Our cutting-edge technology offers:

1. **Unmatched Speed**: Achieve an impressive 800 tokens per second on NVIDIA V100 and 1200 tokens per second on NVIDIA A100.
2. **Cost Efficiency**: Slash your cloud hosting expenses by up to 90% with our highly optimized models, delivering significant savings for enterprises.
3. **Scalability**: Support up to 10x the number of concurrent users without compromising performance, ensuring seamless user experiences.
4. **Memory Savings**: Experience 7x memory reduction, allowing you to run powerful LLMs on standard hardware.

Our Llama3 model is the first in the industry to achieve 1-bit quantization without any loss in model performance. This innovation enables businesses to deploy robust AI solutions locally or in the cloud with minimal overhead.

Explore the potential of Llama3 with our interactive demo, where you can see real-time text generation and understand how our technology can transform your operations. Whether you are looking to enhance your chatbot capabilities, streamline your operations, or cut down on AI deployment costs, xMAD.ai offers a solution that scales with your needs.

Join us in redefining AI efficiency and cost-effectiveness. Try the demo now and see the difference! For an enterprise demo, reach out to support@xmad.ai!
""",
    css=".scrollable { height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc; }",
)
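
# Note (an assumption, depending on the Gradio version pinned for this Space):
# on Gradio 3.x, generator-based streaming requires the request queue to be
# enabled. If partial responses do not stream incrementally, try:
#
#   demo.queue()
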
if __name__ == "__main__":
    demo.launch(share=False)