Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	File size: 2,657 Bytes
			
			32fca7d  | 
								1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86  | 
								#!/usr/bin/env python3
"""
Test script to verify /no_think tag handling in SmolLM3
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from transformers import AutoTokenizer
from data import SmolLM3Dataset
def test_no_think_tag():
    """Print SmolLM3 chat-template output with and without the /no_think tag.

    Loads the "HuggingFaceTB/SmolLM3-3B" tokenizer, constructs a
    SmolLM3Dataset once with ``no_think_system_message`` enabled and once with
    it disabled, and finally renders the same conversation through
    ``tokenizer.apply_chat_template`` with and without a system message that
    carries "/no_think". Nothing is asserted; the output is meant to be
    inspected manually.
    """

    def conversation():
        # Fresh dicts on every call so no message object is shared between
        # the separate template invocations.
        return [
            {"role": "user", "content": "What is machine learning?"},
            {"role": "assistant", "content": "Machine learning is a subset of AI..."},
        ]

    def preview(rendered):
        # Keep console output short: show at most the first 200 characters.
        if len(rendered) > 200:
            return rendered[:200] + "..."
        return rendered

    smollm3_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")

    # NOTE(review): this sample record is never written to disk or handed to
    # SmolLM3Dataset; the datasets below are pointed at the "test_data" path
    # instead -- confirm that path is expected to exist.
    sample_records = [{"messages": conversation()}]

    # Build one dataset per flag value, announcing each before construction.
    banners = (
        (True, "=== Testing with no_think_system_message=True ==="),
        (False, "\n=== Testing with no_think_system_message=False ==="),
    )
    datasets_by_flag = {}
    for no_think_flag, banner in banners:
        print(banner)
        datasets_by_flag[no_think_flag] = SmolLM3Dataset(
            data_path="test_data",
            tokenizer=smollm3_tokenizer,
            max_seq_length=4096,
            use_chat_template=True,
            chat_template_kwargs={
                "add_generation_prompt": True,
                "no_think_system_message": no_think_flag,
            },
        )

    # Render the conversation directly through the tokenizer's chat template.
    print("\n=== Manual chat template test ===")

    # Baseline: no system message at all.
    rendered_plain = smollm3_tokenizer.apply_chat_template(
        conversation(),
        tokenize=False,
        add_generation_prompt=True,
    )
    print("Without /no_think:")
    print(preview(rendered_plain))

    # Same conversation, preceded by a system message carrying the tag.
    system_turn = {"role": "system", "content": "You are a helpful assistant. /no_think"}
    rendered_tagged = smollm3_tokenizer.apply_chat_template(
        [system_turn, *conversation()],
        tokenize=False,
        add_generation_prompt=True,
    )
    print("\nWith /no_think:")
    print(preview(rendered_tagged))
if __name__ == "__main__":
    # Run the check directly when this file is executed as a script.
    test_no_think_tag()