# (page-scrape residue removed: "Spaces: Running on Zero")
# Copyright (c) Meta Platforms, Inc. and affiliates.
import json

from bytelatent.constants import BLT_DATA
from bytelatent.tokenizers.blt_tokenizer import BltTokenizer
from bytelatent.tokenizers.build_tokenizer import TokenizerArgs
def test_tokenizer_bytes():
    """Check BltTokenizer in byte mode against golden fixture encodings.

    Loads paired texts and expected token-id lists from the JSON fixture and
    asserts that encoding each text reproduces the recorded tokens exactly.
    """
    with open("fixtures/tokenizer_data.json") as f:
        data = json.load(f)
    examples: list[str] = data["texts"]
    examples_tokens: list[list[int]] = data["tokens"]
    # Guard against a malformed fixture: a shorter tokens list would
    # otherwise silently skip the trailing examples.
    assert len(examples) == len(examples_tokens)
    tokenizer = BltTokenizer(bpe_delim=False)
    # Pair each text with its expected encoding instead of indexing by range.
    for text, expected in zip(examples, examples_tokens):
        assert tokenizer.encode(text) == expected
def test_tokenizer_bpe():
    """Check BltTokenizer in BPE-delimited mode against golden fixtures.

    Same structure as test_tokenizer_bytes, but with bpe_delim=True and the
    matching bpe-delim fixture file.
    """
    with open("fixtures/tokenizer_data_bpe_delim.json") as f:
        data = json.load(f)
    examples: list[str] = data["texts"]
    examples_tokens: list[list[int]] = data["tokens"]
    # Guard against a malformed fixture: a shorter tokens list would
    # otherwise silently skip the trailing examples.
    assert len(examples) == len(examples_tokens)
    tokenizer = BltTokenizer(bpe_delim=True)
    # Pair each text with its expected encoding instead of indexing by range.
    for text, expected in zip(examples, examples_tokens):
        assert tokenizer.encode(text) == expected
def test_build_tokenizer_from_args():
    """Smoke-test that TokenizerArgs.build() yields a working tokenizer.

    Builds a "blt" tokenizer from args pointing at the bundled BPE model file
    and checks that encoding a sample string returns something non-None.
    """
    init_kwargs = {
        "bpe_tokenizer_path": BLT_DATA / "tokenizer_final_32k.minus_inf_ws.model"
    }
    args = TokenizerArgs(name="blt", init_kwargs=init_kwargs)
    built = args.build()
    assert built.encode("test text") is not None