#!/usr/bin/env python
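"""Compare tokenization of a JSONL dataset with the Phi-4 and DeepSeek tokenizers.

Counts tokens per entry with each tokenizer, prints summary statistics, and
saves a histogram comparison to tokenization_analysis.png.
"""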

import json
from transformers import AutoTokenizer
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

def load_tokenizers():
    """Load both tokenizers."""
    print("Loading tokenizers...")
    phi_tokenizer = AutoTokenizer.from_pretrained(
        "unsloth/phi-4-unsloth-bnb-4bit",
        trust_remote_code=True
    )
    deepseek_tokenizer = AutoTokenizer.from_pretrained(
        "deepseek-ai/deepseek-llama-7b-base",
        trust_remote_code=True
    )
    return phi_tokenizer, deepseek_tokenizer

def analyze_token_counts(jsonl_path, phi_tokenizer, deepseek_tokenizer, sample_size=100):
    """Analyze token count differences between tokenizers."""
    token_counts = {
        'phi': [],
        'deepseek': [],
        'differences': []
    }
    
    print(f"Analyzing token counts from {jsonl_path}")
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    
    # Take a random sample (without replacement) if sample_size is specified.
    # Sample indices rather than the dicts themselves so the entries stay plain Python objects.
    if sample_size and sample_size < len(data):
        indices = np.random.choice(len(data), sample_size, replace=False)
        data = [data[i] for i in indices]
    
    for item in tqdm(data, desc="Processing entries"):
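        # Entries may store the raw string under either 'text' or 'content'.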
        text = item.get('text', '') or item.get('content', '')
        
        # Count tokens with each tokenizer (encode() also includes any special
        # tokens the tokenizer adds by default, e.g. BOS/EOS).
        phi_tokens = len(phi_tokenizer.encode(text))
        deepseek_tokens = len(deepseek_tokenizer.encode(text))
        
        token_counts['phi'].append(phi_tokens)
        token_counts['deepseek'].append(deepseek_tokens)
        token_counts['differences'].append(phi_tokens - deepseek_tokens)
    
    return token_counts

def plot_comparison(token_counts):
    """Create visualization of token count differences."""
    plt.figure(figsize=(12, 6))
    
    # Plot token count distributions
    plt.subplot(1, 2, 1)
    plt.hist([token_counts['phi'], token_counts['deepseek']],
             label=['Phi-4', 'DeepSeek'], alpha=0.6)
    plt.title('Token Count Distribution')
    plt.xlabel('Number of Tokens')
    plt.ylabel('Frequency')
    plt.legend()
    
    # Plot differences
    plt.subplot(1, 2, 2)
    plt.hist(token_counts['differences'], bins=30)
    plt.title('Token Count Differences\n(Phi-4 minus DeepSeek)')
    plt.xlabel('Difference in Tokens')
    plt.ylabel('Frequency')
    
    plt.tight_layout()
    plt.savefig('tokenization_analysis.png')
    plt.close()
    print("Saved visualization to tokenization_analysis.png")

def main():
    # Load tokenizers
    phi_tokenizer, deepseek_tokenizer = load_tokenizers()
    
    # Analyze token counts
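    # sample_size defaults to 100 entries; pass sample_size=None to analyze every entry.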
    token_counts = analyze_token_counts(
        "../../../../data_processing/data/training_data.jsonl",
        phi_tokenizer,
        deepseek_tokenizer
    )
    
    # Calculate statistics
    phi_mean = np.mean(token_counts['phi'])
    deepseek_mean = np.mean(token_counts['deepseek'])
    diff_mean = np.mean(token_counts['differences'])
    diff_std = np.std(token_counts['differences'])
    
    print("\nAnalysis Results:")
    print(f"Phi-4 average tokens: {phi_mean:.1f}")
    print(f"DeepSeek average tokens: {deepseek_mean:.1f}")
    print(f"Average difference: {diff_mean:.1f} ± {diff_std:.1f}")
    print(f"Max Phi-4 tokens: {max(token_counts['phi'])}")
    print(f"Max DeepSeek tokens: {max(token_counts['deepseek'])}")
    
    # Create visualization
    plot_comparison(token_counts)

if __name__ == "__main__":
    main()