from transformers import GPT2Tokenizer


class CustomGPT2Tokenizer(GPT2Tokenizer):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # ChatML-style Jinja template: prepends the Securitron system prompt when
        # the conversation does not already start with one, wraps each turn in
        # <|im_start|>/<|im_end|> markers, and optionally opens an assistant turn.
        self.chat_template = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful AI assistant named Securitron, trained by Aquilax.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

    def apply_chat_template(self, messages, add_system_prompt=True, add_generation_prompt=True):
        """
        Applies the chat template to the provided messages.

        Args:
            messages (list): List of message dictionaries with 'role' and 'content'.
            add_system_prompt (bool): If True, prepends the system prompt when the
                conversation does not already start with a system message.
            add_generation_prompt (bool): If True, appends an opening assistant
                turn so the model generates from it.

        Returns:
            str: Formatted text with the chat template applied.
        """
        formatted_messages = []
        for message in messages:
            role = message.get('role', '')
            content = message.get('content', '')
            # Skip malformed entries that are missing a role or content.
            if role and content:
                formatted_messages.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        # Prepend the system prompt unless the conversation already opens with one.
        if add_system_prompt and (not messages or messages[0].get('role') != 'system'):
            formatted_messages.insert(0, "<|im_start|>system\nYou are a helpful AI assistant named Securitron, trained by Aquilax.<|im_end|>\n")

        formatted_text = "".join(formatted_messages)

        # Open an assistant turn so generation continues from it (mirrors the
        # `add_generation_prompt` branch of the Jinja template above).
        if add_generation_prompt:
            formatted_text += "<|im_start|>assistant\n"

        return formatted_text

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        # Rebind the returned instance to this subclass so the custom
        # apply_chat_template override is available on it.
        tokenizer.__class__ = cls
        return tokenizer
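

# A minimal usage sketch, not part of the tokenizer itself. It assumes the
# standard "gpt2" checkpoint from the Hugging Face Hub; note that <|im_start|>
# and <|im_end|> are not in the base GPT-2 vocabulary, so they are registered
# here as additional special tokens before the formatted text is tokenized.
if __name__ == "__main__":
    tokenizer = CustomGPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_start|>", "<|im_end|>"]})

    messages = [{"role": "user", "content": "What does Securitron do?"}]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    print(prompt)
    # Expected output:
    # <|im_start|>system
    # You are a helpful AI assistant named Securitron, trained by Aquilax.<|im_end|>
    # <|im_start|>user
    # What does Securitron do?<|im_end|>
    # <|im_start|>assistant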