from transformers import GPT2Tokenizer


class CustomGPT2Tokenizer(GPT2Tokenizer):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # ChatML-style Jinja template: prepends the Securitron system prompt when
        # the conversation does not already start with one, wraps each turn in
        # <|im_start|>/<|im_end|> markers, and optionally opens an assistant turn.
        self.chat_template = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful AI assistant named Securitron, trained by Aquilax.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

    def apply_chat_template(self, messages, add_system_prompt=True, add_generation_prompt=True):
        """
        Applies the chat template to the provided messages.

        Args:
            messages (list): List of message dictionaries with 'role' and 'content'.
            add_system_prompt (bool): If True, prepends the system prompt when the
                conversation does not already start with a system message.
            add_generation_prompt (bool): If True, appends an opening assistant
                turn so the model generates from it.

        Returns:
            str: Formatted text with the chat template applied.
        """
        formatted_messages = []
        for message in messages:
            role = message.get('role', '')
            content = message.get('content', '')
            # Skip malformed entries that are missing a role or content.
            if role and content:
                formatted_messages.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        # Prepend the system prompt unless the conversation already opens with one.
        if add_system_prompt and (not messages or messages[0].get('role') != 'system'):
            formatted_messages.insert(0, "<|im_start|>system\nYou are a helpful AI assistant named Securitron, trained by Aquilax.<|im_end|>\n")

        formatted_text = "".join(formatted_messages)

        # Open an assistant turn so generation continues from it (mirrors the
        # `add_generation_prompt` branch of the Jinja template above).
        if add_generation_prompt:
            formatted_text += "<|im_start|>assistant\n"

        return formatted_text

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        # Rebind the returned instance to this subclass so the custom
        # apply_chat_template override is available on it.
        tokenizer.__class__ = cls
        return tokenizer
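

# A minimal usage sketch, not part of the tokenizer itself. It assumes the
# standard "gpt2" checkpoint from the Hugging Face Hub; note that <|im_start|>
# and <|im_end|> are not in the base GPT-2 vocabulary, so they are registered
# here as additional special tokens before the formatted text is tokenized.
if __name__ == "__main__":
    tokenizer = CustomGPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_start|>", "<|im_end|>"]})

    messages = [{"role": "user", "content": "What does Securitron do?"}]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    print(prompt)
    # Expected output:
    # <|im_start|>system
    # You are a helpful AI assistant named Securitron, trained by Aquilax.<|im_end|>
    # <|im_start|>user
    # What does Securitron do?<|im_end|>
    # <|im_start|>assistant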