Spaces:
Running
Running
#!/usr/bin/env python3
"""Test script to verify /no_think tag handling in SmolLM3."""

import os
import sys

# Put this script's own directory on the import path before the project
# import below, so modules located next to the script can be resolved no
# matter which working directory the script is launched from.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from transformers import AutoTokenizer

from data import SmolLM3Dataset
def _preview(text, limit=200):
    """Return *text* as-is, or its first *limit* characters plus "..." if longer."""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def test_no_think_tag():
    """Print how the /no_think tag shows up in SmolLM3 prompts.

    Steps, in order:

    1. Load the ``HuggingFaceTB/SmolLM3-3B`` tokenizer.
    2. Construct ``SmolLM3Dataset`` twice, once with
       ``no_think_system_message`` set to ``True`` and once to ``False``.
    3. Render the same user/assistant exchange through the tokenizer's
       chat template, first without and then with a system message that
       ends in ``/no_think``, and print up to 200 characters of each.

    The function only prints; it makes no assertions and returns ``None``.
    """
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")

    # NOTE(review): the dataset is pointed at the path "test_data"; nothing
    # in this script creates it, so it presumably has to exist beforehand.
    # Confirm against SmolLM3Dataset. An in-memory sample list that used to
    # be built here was never handed to the dataset and has been removed.
    dataset_runs = (
        ("=== Testing with no_think_system_message=True ===", True),
        ("\n=== Testing with no_think_system_message=False ===", False),
    )
    for banner, no_think in dataset_runs:
        print(banner)
        # Built only for whatever the constructor does; the instance itself
        # is not used afterwards.
        SmolLM3Dataset(
            data_path="test_data",
            tokenizer=tokenizer,
            max_seq_length=4096,
            use_chat_template=True,
            chat_template_kwargs={
                "add_generation_prompt": True,
                "no_think_system_message": no_think,
            },
        )

    print("\n=== Manual chat template test ===")
    conversation = [
        {"role": "user", "content": "What is machine learning?"},
        {"role": "assistant", "content": "Machine learning is a subset of AI..."},
    ]

    # Baseline: no system message, so no /no_think marker is supplied.
    text_without = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    print("Without /no_think:")
    print(_preview(text_without))

    # Same exchange, preceded by a system message carrying the marker.
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant. /no_think",
    }
    text_with = tokenizer.apply_chat_template(
        [system_turn, *conversation],
        tokenize=False,
        add_generation_prompt=True,
    )
    print("\nWith /no_think:")
    print(_preview(text_with))
# Run the check when this file is executed directly as a script.
if __name__ == "__main__":
    test_no_think_tag()