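"""Decoupled Qwen2.5 -> Qwen3 conversion (v5.3).

Builds a Qwen3-architecture 72B checkpoint by combining Qwen2.5-72B-Instruct
weights with Qwen3-32B donor components (q_norm/k_norm weights, tokenizer, and
architecture-defining config values), then smoke-tests the saved model.
"""
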
import torch
import os
import json
import re  # used to parse layer indices out of state-dict keys
from datetime import datetime, timezone
from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import Qwen3Config, Qwen3ForCausalLM
from collections import Counter

# --- Helper Functions (Definitive Version) ---

def create_vocab_mapping(s_tok, t_tok):
    s_vocab, t_vocab = s_tok.get_vocab(), t_tok.get_vocab()
    # get_vocab() already returns token -> id, so the source vocab is usable directly.
    mapping = {t_id: s_vocab.get(t, -1) for t, t_id in t_vocab.items()}
    matches = sum(1 for v in mapping.values() if v != -1)
    print(f"Vocabulary overlap: {matches}/{len(t_vocab)} tokens ({matches/len(t_vocab)*100:.1f}%) will be transferred.")
    return mapping
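
# Illustrative sketch with hypothetical vocabularies: given a source vocab
# {"the": 0, "cat": 1} and a target vocab {"the": 0, "dog": 1}, the mapping is
# {0: 0, 1: -1}; target id 1 ("dog") has no source counterpart and is later
# mean-initialized by create_hybrid_matrix.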

def verify_special_tokens(s_tok, t_tok, mapping):
    print("\nVerifying special token mappings...")
    for name, token_value in t_tok.special_tokens_map.items():
        def _process_token(token_str):
            if token_str and token_str in t_tok.get_vocab():
                t_id = t_tok.convert_tokens_to_ids(token_str)
                s_id = mapping.get(t_id, -1)
                status = f"Mapped (T: {t_id} -> S: {s_id})" if s_id != -1 else "NOT FOUND in source (initialized with mean)"
                print(f"  ✓ ('{token_str}'): {status}")
        if isinstance(token_value, str): _process_token(token_value)
        elif isinstance(token_value, list):
            for token_str_in_list in token_value: _process_token(token_str_in_list)
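
# Note: special_tokens_map values are either a single string (e.g. an eos_token
# such as '<|im_end|>' in Qwen chat tokenizers) or a list (e.g.
# additional_special_tokens), which is why _process_token handles both shapes.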

def create_hybrid_matrix(s_matrix, mapping, shape):
    print("  -> Calculating mean embedding from source model for new token initialization...")
    mean_embedding = s_matrix.mean(dim=0)  # 1-D (hidden_size,) row for unmapped target tokens
    hybrid = torch.zeros(shape, dtype=s_matrix.dtype, device='cpu')
    for t_id, s_id in mapping.items():
        hybrid[t_id] = s_matrix[s_id] if s_id != -1 else mean_embedding
    return hybrid.to(s_matrix.device)
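
# A vectorized alternative to the row-by-row loop above (untested sketch; note
# that ids absent from `mapping`, e.g. padded vocab rows, would get the mean
# here rather than staying zero):
#   ids = torch.tensor([mapping.get(i, -1) for i in range(shape[0])])
#   hybrid = s_matrix[ids.clamp(min=0)].clone()
#   hybrid[ids == -1] = mean_embedding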

def save_config_diff(s_conf, t_conf, path):
    s_dict, t_dict = s_conf.to_dict(), t_conf.to_dict()
    diff = {'changed': {}, 'added': {}, 'removed': {}}
    for k in set(s_dict.keys()) | set(t_dict.keys()):
        if s_dict.get(k) != t_dict.get(k):
            if k in s_dict and k in t_dict: diff['changed'][k] = {'from': s_dict[k], 'to': t_dict[k]}
            elif k in t_dict: diff['added'][k] = t_dict[k]
            else: diff['removed'][k] = s_dict[k]
    with open(os.path.join(path, "config_diff.json"), "w") as f: json.dump(diff, f, indent=2)
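
# The resulting config_diff.json has the shape (values elided):
#   {"changed": {"<key>": {"from": <src>, "to": <tgt>}}, "added": {...}, "removed": {...}}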

def validate_model(path):
    print("\n[Step 6/6] Validating final model (smoke test)...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", torch_dtype=torch.bfloat16)
        model.eval()
        prompt = "The theory of relativity states that"
        print(f"\nValidation Prompt: '{prompt}'")
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=25, do_sample=False, pad_token_id=tokenizer.eos_token_id)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated Response: '{response}'")
        assert len(response) > len(prompt), "Model did not generate new tokens."
        print("\n  ✓ Validation successful: Model loads and generates coherent text using standard transformers.")
    except Exception as e:
        print(f"\n  ✗ Validation FAILED: {e}")

# --- Main Conversion Logic ---
def convert_qwen2_to_qwen3_decoupled():
    source_model_id, donor_model_id = "Qwen/Qwen2.5-72B-Instruct", "Qwen/Qwen3-32B"
    target_model_path = "./Qwen3-72B-Instruct"
    print("Starting DECOUPLED conversion process (v5.3)...")

    # --- 1. Pre-flight Checks ---
    print("\n[Step 1/6] Running pre-flight architectural checks...")
    s_config = AutoConfig.from_pretrained(source_model_id)
    d_config = AutoConfig.from_pretrained(donor_model_id)
    assert s_config.hidden_act == d_config.hidden_act, f"FATAL: Hidden activation mismatch! Source: {s_config.hidden_act}, Donor: {d_config.hidden_act}."
    print("  ✓ Hidden activation functions match.")
    if s_config.rope_theta != d_config.rope_theta:
        print(f"  ✓ RoPE Theta: Using donor value {d_config.rope_theta} (source was {s_config.rope_theta})")
    
    # --- 2. Load Models & Tokenizers using AutoModel ---
    print("\n[Step 2/6] Loading models & tokenizers using standard AutoClasses...")
    dtype = torch.bfloat16
    s_model = AutoModelForCausalLM.from_pretrained(source_model_id, torch_dtype=dtype, device_map="auto")
    d_model = AutoModelForCausalLM.from_pretrained(donor_model_id, torch_dtype=dtype, device_map="auto")
    s_tokenizer = AutoTokenizer.from_pretrained(source_model_id)
    t_tokenizer = AutoTokenizer.from_pretrained(donor_model_id)

    # --- 3. Create Target Config & Initialize ---
    print("\n[Step 3/6] Creating target Qwen3 72B config & initializing model shell...")
    t_config = Qwen3Config(
        # geometry carried over from the Qwen2.5-72B source
        hidden_size=s_config.hidden_size,
        intermediate_size=s_config.intermediate_size,
        num_hidden_layers=s_config.num_hidden_layers,
        num_attention_heads=s_config.num_attention_heads,
        num_key_value_heads=s_config.num_key_value_heads,
        max_position_embeddings=s_config.max_position_embeddings,
        max_window_layers=s_config.max_window_layers,
        sliding_window=s_config.sliding_window,
        # architecture-defining values taken from the Qwen3 donor
        attention_bias=d_config.attention_bias,
        hidden_act=d_config.hidden_act,
        initializer_range=d_config.initializer_range,
        rms_norm_eps=d_config.rms_norm_eps,
        rope_theta=d_config.rope_theta,
        vocab_size=d_config.vocab_size,
        tie_word_embeddings=True,
    )
    with torch.device("meta"): t_model = Qwen3ForCausalLM(t_config)
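    # The "meta" device allocates no real storage for the 72B shell; concrete
    # tensors are attached later via load_state_dict(..., assign=True).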

    # --- 4. Convert and Transfer Weights ---
    print("\n[Step 4/6] Converting weights (memory-safe)...")
    s_state_dict = {k: v.to('cpu', dtype=dtype) for k, v in tqdm(s_model.state_dict().items(), desc="Source state dict to CPU")}
    d_state_dict = {k: v.to('cpu', dtype=dtype) for k, v in tqdm(d_model.state_dict().items(), desc="Donor state dict to CPU")}
    
    vocab_mapping = create_vocab_mapping(s_tokenizer, t_tokenizer)
    verify_special_tokens(s_tokenizer, t_tokenizer, vocab_mapping)
    
    new_state_dict = {}
    num_donor_layers = d_config.num_hidden_layers
    
    for key in tqdm(t_model.state_dict().keys(), desc="Transferring weights"):
        if "q_norm" in key or "k_norm" in key:
            # --- FIX: Implement Cyclical Grafting for Norm Layers ---
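            # Qwen3 adds per-head q_norm/k_norm weights with no Qwen2.5
            # counterpart, and the 72B target has more layers than the 32B
            # donor, so donor norms are reused cyclically. For example, with a
            # 64-layer donor, target layer 70 takes its norms from donor layer
            # 70 % 64 = 6.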
            match = re.search(r'layers\.(\d+)\.', key)
            if match:
                target_layer_idx = int(match.group(1))
                donor_layer_idx = target_layer_idx % num_donor_layers
                donor_key = key.replace(f'layers.{target_layer_idx}.', f'layers.{donor_layer_idx}.')
                new_state_dict[key] = d_state_dict[donor_key].clone()
            else:
                print(f"  ⚠️ Could not parse layer index for norm key: {key}. Skipping.")
        elif "model.embed_tokens.weight" in key: new_state_dict[key] = create_hybrid_matrix(s_state_dict[key], vocab_mapping, (t_config.vocab_size, t_config.hidden_size))
        elif "lm_head.weight" in key: new_state_dict[key] = create_hybrid_matrix(s_state_dict[key], vocab_mapping, (t_config.vocab_size, t_config.hidden_size))
        elif key in s_state_dict: new_state_dict[key] = s_state_dict[key].clone()
        else: print(f"  ⚠️ Unhandled key: {key} (not in source, skipping)")

    t_model.load_state_dict(new_state_dict, strict=True, assign=True)
    t_model = t_model.to(dtype)

    # --- 5. Save Final Model & Metadata ---
    print("\n[Step 5/6] Saving final model and supporting files...")
    os.makedirs(target_model_path, exist_ok=True)
    t_model.save_pretrained(target_model_path, safe_serialization=True)
    t_tokenizer.save_pretrained(target_model_path)
    save_config_diff(s_config, t_config, target_model_path)
    metadata = {"conversion_date_utc": datetime.utcnow().isoformat(), "source_model": source_model_id, "donor_model": donor_model_id,
                "warnings": ["This is a community-created model merge. Its behavior may be unpredictable.", "Sliding window config inherited from Qwen2.5 with Qwen3 RoPE theta - long context behavior MUST be validated.", "Post-conversion evaluation is highly recommended for numerical stability, quantization, and safety alignment."]}
    with open(os.path.join(target_model_path, "conversion_metadata.json"), "w") as f: json.dump(metadata, f, indent=2)
    print(f"✅ Model saved to: {target_model_path}")
    
    # --- 6. Final Validation ---
    del s_model, d_model, s_state_dict, d_state_dict, new_state_dict, t_model
    torch.cuda.empty_cache()
    validate_model(target_model_path)

if __name__ == "__main__":
    convert_qwen2_to_qwen3_decoupled()