# File size: 964 Bytes
# 830833d
import tiktoken
# Hebrew sample sentence that every encoding below is asked to tokenize.
test_string = "האיש האחרון עלי אדמות ישב לבד בחדרו, כשלפتע נשמעה דפיקה בדלת"
print(f'Test string = "{test_string}"')

# Encodings to compare, in the order their results are printed.
# NOTE(review): "gpt-hebrew-tokenizer" does not appear to be a stock tiktoken
# encoding; presumably it is registered by a separately installed plugin
# package -- confirm it is available before running.
for encoding_name in ("cl100k_base", "gpt2", "gpt-hebrew-tokenizer"):
    enc = tiktoken.get_encoding(encoding_name)
    encoded_text = enc.encode(test_string)
    print(f'num of characters = {len(test_string)} encoded length = {len(encoded_text)} ({encoding_name})')

    # Round-trip check: decoding the tokens must reproduce the input exactly.
    # Raised explicitly (instead of a bare `assert`) so the check still runs
    # when Python is started with -O.
    decoded_text = enc.decode(encoded_text)
    if decoded_text != test_string:
        raise AssertionError(
            f"{encoding_name}: decoded text does not match the original string"
        )