Is it possible to get 1.1B and 3B versions?

#3
by LaferriereJC - opened

Would love to train one from scratch.

Would this work?
"""Build a down-sized (~3.75B-parameter) Mistral model, zero all of its
weights, and save it to the directory given as the first CLI argument.

Usage: python make_model.py <output_dir>
"""
from transformers import MistralConfig, AutoModelForCausalLM

import torch

import sys

# Mistral-7B dimensions but with 16 hidden layers instead of 32,
# which brings the parameter count down to ~3.75B.
config = MistralConfig(
    hidden_size=4096,
    intermediate_size=14336,
    num_hidden_layers=16,
    num_attention_heads=32,
    num_key_value_heads=8,
)

model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

print(f'Created a new model with {model.num_parameters()} parameters.')

# This config results in ~3.75B parameters.
# no_grad: weight edits here must not be tracked by autograd.
with torch.no_grad():
    for name, param in model.named_parameters():
        # Zero in place rather than allocating a fresh tensor per parameter
        # (`param.data = torch.zeros(...)` doubles peak memory transiently).
        param.zero_()

model.save_pretrained(sys.argv[1])

LaferriereJC changed discussion status to closed

Sign up or log in to comment