#!/usr/bin/env python3 import subprocess import json import os import shutil import sys import argparse def run_huggingface_download(model_name): """Run huggingface-cli download and return the model path.""" try: # Run the huggingface-cli download command env = os.environ.copy() result = subprocess.run( ['huggingface-cli', 'download', model_name], capture_output=True, text=True, env=env, check=True ) # The path is typically the last line of output model_path = result.stdout.strip().split('\n')[-1] print(f"Model downloaded to: {model_path}") return model_path except subprocess.CalledProcessError as e: print(f"Error downloading model: {e}") print(f"Error output: {e.stderr}") sys.exit(1) def backup_and_modify_tokenizer_config(model_path, revert=False): """Backup tokenizer_config.json and remove specified keys.""" tokenizer_config_path = os.path.join(model_path, 'tokenizer_config.json') backup_path = os.path.join(model_path, 'tokenizer_config.json.old') # Check if tokenizer_config.json exists if not os.path.exists(tokenizer_config_path): print(f"Warning: tokenizer_config.json not found in {model_path}") return # Create backup try: # Remove existing backup if it exists if os.path.exists(backup_path): os.remove(backup_path) print(f"Removed existing backup: {backup_path}") # Create new backup shutil.copy2(tokenizer_config_path, backup_path) print(f"Backup created: {backup_path}") except Exception as e: print(f"Error creating backup: {e}") print(f"Attempting to continue without backup...") # Don't exit, just warn and continue # Load and modify the JSON try: with open(tokenizer_config_path, 'r', encoding='utf-8') as f: config = json.load(f) # Check if added_tokens_decoder exists if 'added_tokens_decoder' not in config: print("Warning: 'added_tokens_decoder' key not found in tokenizer_config.json") return # Remove the specified keys keys_to_remove = ["151667", "151668"] removed_keys = [] if revert: config['added_tokens_decoder']['151667'] = { "content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": False } config['added_tokens_decoder']['151668'] = { "content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": False } else: for key in keys_to_remove: if key in config['added_tokens_decoder']: del config['added_tokens_decoder'][key] removed_keys.append(key) if removed_keys: print(f"Removed keys from added_tokens_decoder: {removed_keys}") elif revert: print("Reverted tokenizer config to the original") else: print("Keys 151667 and 151668 not found in added_tokens_decoder") # Write the modified config back with open(tokenizer_config_path, 'w', encoding='utf-8') as f: json.dump(config, f, indent=2, ensure_ascii=False) print(f"Modified tokenizer_config.json saved") except json.JSONDecodeError as e: print(f"Error parsing JSON: {e}") sys.exit(1) except Exception as e: print(f"Error modifying tokenizer config: {e}") sys.exit(1) def main(): parser = argparse.ArgumentParser(description='Download HuggingFace model and fix tokenizer config') parser.add_argument('--model_name', help='HuggingFace model name (e.g., Qwen/Qwen3-4B-Base)') parser.add_argument('--model_path', help='Direct path to already downloaded model directory') parser.add_argument('--revert', action='store_true', help='Revert the tokenizer config to the original') args = parser.parse_args() if args.model_path: # Use existing model path model_path = args.model_path print(f"Using existing model path: {model_path}") elif args.model_name: # Download model print(f"Downloading model: {args.model_name}") model_path = run_huggingface_download(args.model_name) else: print("Error: Either --model_name or --model_path must be provided") sys.exit(1) print(f"Processing tokenizer config in: {model_path}") backup_and_modify_tokenizer_config(model_path, args.revert) print("Done!") if __name__ == "__main__": main()