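"""Hugging Face Space: Gradio chat demo for xMAD.ai's 1-bit quantized Llama3.

Loads NousResearch/Hermes-2-Theta-Llama-3-8B with a codebook-based quantizer
config (read from a local `codebooks/` directory) and streams generated tokens
back to the chat UI one at a time.
"""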
import os
import torch
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
# Environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Enable synchronous CUDA operations
# Model and tokenizer are created lazily by load_model_and_tokenizer()
model = None
tokenizer = None
def load_model_and_tokenizer(model_name, dtype, kv_bits):
    global model, tokenizer
    if model is None or tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        special_tokens = {"pad_token": "<PAD>"}
        tokenizer.add_special_tokens(special_tokens)

        config = AutoConfig.from_pretrained(model_name)
        if kv_bits != "unquantized":
            # Point the config at the quantizer codebook for this model/bit-width
            quantizer_path = os.path.join("codebooks", f"{model_name.split('/')[-1]}_{kv_bits}bit.xmad")
            setattr(config, "quantizer_path", quantizer_path)

        # Map shorthand dtype names to torch dtypes; torch has no attribute named
        # "fp16", so a bare torch.__dict__ lookup would silently fall back to float32
        dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
        torch_dtype = dtype_map.get(dtype) or torch.__dict__.get(dtype, torch.float32)

        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=torch_dtype, device_map="auto")

        # Grow the embedding matrix if adding <PAD> pushed the vocab past its size
        if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
            model.resize_token_embeddings(len(tokenizer))

        tokenizer.padding_side = "left"
        model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer
# Initialize model and tokenizer
load_model_and_tokenizer("NousResearch/Hermes-2-Theta-Llama-3-8B", "fp16", "1")
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # history arrives as (user, assistant) pairs from gr.ChatInterface
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Prepare input prompt via the model's chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    tokenized_input_prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
response = "" | |
try: | |
with torch.no_grad(): | |
while len(response.split()) < max_tokens: | |
output = model.generate( | |
tokenized_input_prompt_ids, | |
max_new_tokens=1, | |
temperature=temperature, | |
top_p=top_p, | |
do_sample=True, | |
eos_token_id=tokenizer.eos_token_id, | |
pad_token_id=tokenizer.pad_token_id, | |
return_dict_in_generate=True, | |
output_scores=True, | |
) | |
next_token_id = output.sequences[:, -1:] | |
tokenized_input_prompt_ids = torch.cat([tokenized_input_prompt_ids, next_token_id], dim=1) | |
token = tokenizer.decode(next_token_id[0], skip_special_tokens=True) | |
response += token | |
yield response | |
if tokenizer.eos_token_id in next_token_id: | |
break | |
except Exception as e: | |
yield f"Error: {str(e)}" | |
# Initialize Gradio ChatInterface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    theme="default",
    title="1bit Llama3 by xMAD.ai",
description=""" | |
Welcome to the future of AI with xMAD.ai's 1bit Llama3, a breakthrough in Large Language Model (LLM) quantization and efficiency. Our cutting-edge technology offers: | |
1. **Unmatched Speed**: Achieve an impressive 800 tokens per second on NVIDIA V100 and 1200 tokens per second on NVIDIA A100. | |
2. **Cost Efficiency**: Slash your cloud hosting expenses by up to 90% with our highly optimized models, delivering significant savings for enterprises. | |
3. **Scalability**: Support for up to 10x the number of concurrent users without compromising performance, ensuring seamless user experiences. | |
4. **Memory Savings**: Experience 7x memory reduction, allowing you to run powerful LLMs on standard hardware. | |
5. **Democratization of AI**: Make advanced LLMs accessible for various applications, from customer service to content creation, all while maintaining high accuracy and reliability. | |
Our Llama3 model is the first in the industry to achieve 1-bit quantization without any loss in model performance. This innovation enables businesses to deploy robust AI solutions locally or in the cloud with minimal overhead. | |
Explore the potential of Llama3 with our interactive demo, where you can see real-time text generation and understand how our technology can transform your operations. Whether you are looking to enhance your chatbot capabilities, streamline your operations, or cut down on AI deployment costs, xMAD.ai offers a solution that scales with your needs. | |
Join us in redefining AI efficiency and cost-effectiveness. Try the demo now and see the difference! | |
""", | |
css=".scrollable { height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc; }" | |
) | |
if __name__ == "__main__": | |
demo.launch(share=False) | |