from transformers import LlamaTokenizerFast

# The Ultra-FineWeb classifier was built on the "deepseek-ai/DeepSeek-V2"
# tokenizer; a local copy is loaded here instead of pulling from the Hub.
# path = "deepseek-ai/DeepSeek-V2"
path = "local_tokenizer"
tokenizer = LlamaTokenizerFast.from_pretrained(path, trust_remote_code=True)

# Smoke test: encode a sample sentence (no BOS/EOS) and show the raw ids.
content = "MiniCPM4: Ultra-Efficient LLMs on End Devices"
token_ids = tokenizer.encode(content, add_special_tokens=False)
print(token_ids)

# Print each id's decoded text so the sub-word split is visible.
for token_id in token_ids:
    print(tokenizer.decode([token_id]))