# src/upload_to_hf.py
import json

import torch
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast

from model import TransformerModel
from utils import load_vocab

# Configuration
MODEL_PATH = 'models/3ed0k4_model_epoch10.pth'  # Update this path
VOCAB_PATH = 'vocab.json'
TOKENIZER_DIR = 'tokenizer'
HF_MODEL_REPO = '3ed0k4/3ed0k4'  # Replace with your Hugging Face repo


# Initialize tokenizer
def init_tokenizer(vocab):
    # PreTrainedTokenizerFast cannot be instantiated without a backend
    # tokenizer (tokenizer_file=None raises a ValueError), so build a
    # word-level backend directly from the vocabulary instead of calling
    # add_tokens(). Assumes vocab maps token -> id; adjust unk_token if
    # your vocabulary uses a different unknown-token symbol.
    backend = Tokenizer(WordLevel(vocab, unk_token='<unk>'))
    backend.pre_tokenizer = Whitespace()
    tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend, unk_token='<unk>')
    tokenizer.save_pretrained(TOKENIZER_DIR)
    print(f"Tokenizer saved to {TOKENIZER_DIR}/")


# Prepare model
def prepare_model(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout, model_path):
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=embed_size,
        num_heads=num_heads,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout,
    )
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()

    # TransformerModel is a plain torch.nn.Module, so it has no
    # save_pretrained(); write the state dict directly under the filename
    # the Hub expects for PyTorch weights.
    torch.save(model.state_dict(), 'pytorch_model.bin')
    print("Model weights saved as pytorch_model.bin")


# Create config.json
def create_config(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout):
    config = {
        "vocab_size": vocab_size,
        "embed_size": embed_size,
        "num_heads": num_heads,
        "hidden_dim": hidden_dim,
        "num_layers": num_layers,
        "dropout": dropout,
    }
    with open('config.json', 'w') as f:
        json.dump(config, f, indent=4)
    print("Config saved as config.json")


if __name__ == "__main__":
    # Load vocabulary
    vocab = load_vocab(VOCAB_PATH)
    vocab_size = len(vocab)

    # Initialize tokenizer
    init_tokenizer(vocab)

    # Model parameters (must match the values used during training)
    embed_size = 256
    num_heads = 8
    hidden_dim = 512
    num_layers = 4
    dropout = 0.1

    # Prepare and save model
    prepare_model(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout, MODEL_PATH)

    # Create config.json
    create_config(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout)

    print("Model preparation for Hugging Face completed.")
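

# --- Uploading (sketch, not part of the original script) ---
# Nothing above actually pushes anything to the Hub even though
# HF_MODEL_REPO is defined. A minimal sketch of that missing step, assuming
# huggingface_hub is installed and you are authenticated (e.g. via
# `huggingface-cli login` or an HF_TOKEN environment variable):
def upload_to_hub(repo_id: str = HF_MODEL_REPO, folder: str = '.') -> None:
    from huggingface_hub import HfApi, create_repo

    # Create the repo if it does not exist yet, then push everything in
    # `folder` (config.json, pytorch_model.bin, and the tokenizer/ directory
    # produced by the preparation steps above).
    create_repo(repo_id, repo_type="model", exist_ok=True)
    HfApi().upload_folder(folder_path=folder, repo_id=repo_id, repo_type="model")

# Example usage, after the preparation steps have run (move the definition
# above the __main__ guard if you call it from there):
#   upload_to_hub()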