llama-2-7b-chat-ecom / ChatService.py
import os

import torch
import transformers
from huggingface_hub import login

# Restrict execution to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
class ChatService:
    def __init__(self):
        pass

    @staticmethod
    def load_model(model_name=""):
        global tokenizer, pipeline
        print("Loading " + model_name + "...")

        # Report how many GPUs are visible before building the pipeline.
        gpu_count = torch.cuda.device_count()
        print('gpu_count', gpu_count)

        # Authenticate with the Hugging Face Hub (needed for gated Llama 2 weights).
        login(os.environ["HF_TOKEN"])

        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        pipeline = transformers.pipeline(
            task="text-generation",
            model=model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    @staticmethod
    def generate_message(req):
        history = req["chat"]
        assistant_name = req["assistant_name"] + ": "

        # Optional request fields with defaults.
        system_message = req.get("system_message") if req.get("system_message") is not None else ""
        temperature = req.get("temperature") if req.get("temperature") is not None else 1
        top_p = req.get("top_p") if req.get("top_p") is not None else 1
        top_k = req.get("top_k") if req.get("top_k") is not None else 10
        max_length = req.get("max_length") if req.get("max_length") is not None else 1000

        # Build a Llama 2 chat-style prompt: system message, chat history, then the
        # assistant name prefix the model should continue from.
        ending_tag = "[/INST]"
        fulltext = "[INST] <<SYS>>" + system_message + "<</SYS>>" + "\n\n".join(
            history) + "\n\n" + assistant_name + ending_tag

        sequences = pipeline(
            fulltext,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=max_length,
        )

        # Keep only the text generated after the prompt's closing tag, drop the
        # assistant name prefix if the model echoed it, and trim whitespace.
        response = sequences[0]['generated_text'].split(ending_tag)[1].split(assistant_name)
        response = response[1] if len(response) > 1 else response[0]
        response = response.strip()
        return response
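

# Example usage (a minimal sketch, not part of the original service): the model id
# and the request fields below are assumptions inferred from how load_model and
# generate_message read their inputs, and HF_TOKEN must be set in the environment
# for the Hub login to succeed.
if __name__ == "__main__":
    ChatService.load_model("meta-llama/Llama-2-7b-chat-hf")  # assumed model id
    reply = ChatService.generate_message({
        "chat": ["Customer: Hi, what is your return policy?"],
        "assistant_name": "Assistant",
        "system_message": "You are a helpful e-commerce support assistant.",
        "temperature": 0.7,
        "max_length": 512,
    })
    print(reply)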