Spaces:
Runtime error
Runtime error
File size: 2,871 Bytes
970607e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import os
from unittest import TestCase
from llama.tokenizer import ChatFormat, Tokenizer
# TOKENIZER_PATH=<path> python -m unittest llama/test_tokenizer.py
class TokenizerTests(TestCase):
def setUp(self):
self.tokenizer = Tokenizer(os.environ["TOKENIZER_PATH"])
self.format = ChatFormat(self.tokenizer)
def test_special_tokens(self):
self.assertEqual(
self.tokenizer.special_tokens["<|begin_of_text|>"],
128000,
)
def test_encode(self):
self.assertEqual(
self.tokenizer.encode(
"This is a test sentence.",
bos=True,
eos=True
),
[128000, 2028, 374, 264, 1296, 11914, 13, 128001],
)
def test_decode(self):
self.assertEqual(
self.tokenizer.decode(
[128000, 2028, 374, 264, 1296, 11914, 13, 128001],
),
"<|begin_of_text|>This is a test sentence.<|end_of_text|>",
)
def test_encode_message(self):
message = {
"role": "user",
"content": "This is a test sentence.",
}
self.assertEqual(
self.format.encode_message(message),
[
128006, # <|start_header_id|>
882, # "user"
128007, # <|end_of_header|>
271, # "\n\n"
2028, 374, 264, 1296, 11914, 13, # This is a test sentence.
128009, # <|eot_id|>
]
)
def test_encode_dialog(self):
dialog = [
{
"role": "system",
"content": "This is a test sentence.",
},
{
"role": "user",
"content": "This is a response.",
}
]
self.assertEqual(
self.format.encode_dialog_prompt(dialog),
[
128000, # <|begin_of_text|>
128006, # <|start_header_id|>
9125, # "system"
128007, # <|end_of_header|>
271, # "\n\n"
2028, 374, 264, 1296, 11914, 13, # "This is a test sentence."
128009, # <|eot_id|>
128006, # <|start_header_id|>
882, # "user"
128007, # <|end_of_header|>
271, # "\n\n"
2028, 374, 264, 2077, 13, # "This is a response.",
128009, # <|eot_id|>
128006, # <|start_header_id|>
78191, # "assistant"
128007, # <|end_of_header|>
271, # "\n\n"
]
)
|