llama-2-13b-chat-transformers

Runtime error

App Files Files Community

llama-2-13b-chat-transformers / model.py

freddyaboulton HF staff

add code

e34e07c 11 months ago

raw history blame contribute delete

No virus

2.77 kB

	from threading import Thread
	import os
	import torch
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

	# Hub identifier of the checkpoint used for both the model and the tokenizer.
	model_id = 'meta-llama/Llama-2-13b-chat-hf'

	# Hosting flags derived from the SPACE_ID environment variable:
	#   is_spaces    -- the variable is present at all
	#   is_shared_ui -- its value contains the name of the shared demo Space
	# An absent variable is read as '' so the substring test is simply False.
	is_spaces = "SPACE_ID" in os.environ
	is_shared_ui = "gradio-discord-bots/llama-2-13b-chat-transformers" in os.environ.get('SPACE_ID', '')
	is_gpu_associated = torch.cuda.is_available()

	# Weights are loaded only when CUDA is available and this is not the shared
	# UI; otherwise both handles stay None and callers must check before use.
	if is_shared_ui or not is_gpu_associated:
	    model = None
	    tokenizer = None
	else:
	    config = AutoConfig.from_pretrained(model_id)
	    # NOTE(review): presumably selects the plain (non tensor-parallel) code
	    # path in the Llama implementation -- confirm against transformers docs.
	    config.pretraining_tp = 1
	    model = AutoModelForCausalLM.from_pretrained(
	        model_id,
	        config=config,
	        torch_dtype=torch.float16,
	        load_in_4bit=True,
	        device_map='auto',
	    )
	    tokenizer = AutoTokenizer.from_pretrained(model_id)


	def get_prompt(message: str, chat_history: list[tuple[str, str]],
	               system_prompt: str) -> str:
	    """Assemble the full chat prompt string for one generation request.

	    The prompt opens with the system prompt wrapped in ``<<SYS>>`` markers,
	    followed by every past ``(user_input, response)`` pair from
	    ``chat_history`` and finally the new ``message``.

	    Whitespace handling: the very first user text in the conversation is
	    inserted verbatim, while every later user text is stripped. Responses
	    are always stripped. So ``message`` is stripped only when at least one
	    history turn precedes it.

	    Args:
	        message: The new user message to answer.
	        chat_history: Earlier turns as ``(user_input, response)`` pairs.
	        system_prompt: Text placed inside the ``<<SYS>>`` section.

	    Returns:
	        The concatenated prompt, ending in ``[/INST]``.
	    """
	    pieces = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']

	    # Counts history turns as they are consumed; stays 0 for empty history.
	    turn_no = 0
	    for turn_no, (user_input, response) in enumerate(chat_history, start=1):
	        if turn_no > 1:
	            # Only the opening user text of the conversation is kept as-is.
	            user_input = user_input.strip()
	        pieces.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')

	    final_message = message.strip() if turn_no else message
	    pieces.append(f'{final_message} [/INST]')
	    return ''.join(pieces)


	def get_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> int:
	    """Return how many tokens the assembled prompt occupies.

	    The prompt is built with ``get_prompt`` and encoded by the module-level
	    ``tokenizer`` as a single-item batch, without adding special tokens. The
	    result is the size of the last axis of the ``input_ids`` array.

	    Args:
	        message: The new user message.
	        chat_history: Earlier turns as ``(user_input, response)`` pairs.
	        system_prompt: The system prompt text.

	    Returns:
	        The length of the last dimension of the encoded ``input_ids``.
	    """
	    encoded = tokenizer(
	        [get_prompt(message, chat_history, system_prompt)],
	        return_tensors='np',
	        add_special_tokens=False,
	    )
	    return encoded['input_ids'].shape[-1]


	def run(message: str,
	        chat_history: list[tuple[str, str]],
	        system_prompt: str,
	        max_new_tokens: int = 1024,
	        temperature: float = 0.8,
	        top_p: float = 0.95,
	        top_k: int = 50) -> str:
	    """Generate a reply to ``message`` and return it as one string.

	    The prompt is built with ``get_prompt``, encoded by the module-level
	    ``tokenizer`` and moved to ``'cuda'``. ``model.generate`` runs on a
	    background thread and feeds a ``TextIteratorStreamer``; this function
	    consumes the streamer to the end and joins the pieces.

	    Args:
	        message: The new user message.
	        chat_history: Earlier turns as ``(user_input, response)`` pairs.
	        system_prompt: The system prompt text.
	        max_new_tokens: Forwarded to ``model.generate``.
	        temperature: Forwarded to ``model.generate``.
	        top_p: Forwarded to ``model.generate``.
	        top_k: Forwarded to ``model.generate``.

	    Returns:
	        Every text chunk yielded by the streamer, concatenated.
	    """
	    encoded = tokenizer(
	        [get_prompt(message, chat_history, system_prompt)],
	        return_tensors='pt',
	        add_special_tokens=False,
	    ).to('cuda')

	    # NOTE(review): timeout=10. presumably bounds the wait for each chunk so a
	    # failed generate() call cannot hang the consumer -- confirm in the docs.
	    streamer = TextIteratorStreamer(
	        tokenizer,
	        timeout=10.,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )

	    # Encoded inputs first, then the sampling settings layered on top.
	    generate_kwargs = {
	        **encoded,
	        'streamer': streamer,
	        'max_new_tokens': max_new_tokens,
	        'do_sample': True,
	        'top_p': top_p,
	        'top_k': top_k,
	        'temperature': temperature,
	        'num_beams': 1,
	    }

	    worker = Thread(target=model.generate, kwargs=generate_kwargs)
	    worker.start()

	    # Iterating the streamer blocks until generation signals completion.
	    return "".join(streamer)