Alexandru Gherghescu committed on
Commit
bbb5d39
1 Parent(s): 8b5602c

Add original model weights + conversion script

README.md CHANGED
@@ -36,3 +36,9 @@ See `preprocessing.py` on how the data was preprocessed and tokenized.
  See `pre_training.py` on how the model was pre-trained.

  See `inference.py` for an example.
+
+ ## Converted model
+
+ Inside `gpt1-converted-weights/` is the converted safetensors model from the
+ original weights, which can be used directly with the code inside this repo. The
+ conversion script and original weights can also be found there.
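
As a quick illustration of the README note above, here is one way the converted checkpoint could be loaded — a minimal sketch, not part of the commit. It assumes `torch` and `transformers` are installed and that `modeling_gpt1.py`/`configuration_gpt1.py` from `gpt1-converted-weights/` are importable (e.g. copied next to the script). The tokenizer is an assumption as well: the converted folder ships no tokenizer, so the Hub's `openai-gpt` tokenizer is used here on the assumption that it matches the original 40478-token BPE vocabulary in `original_gpt1_params/`.

```python
import torch
from transformers import AutoTokenizer

# Custom model code committed in gpt1-converted-weights/ (assumed importable).
from modeling_gpt1 import GPT1ForCausalLM

# Load the converted safetensors weights from the local folder.
model = GPT1ForCausalLM.from_pretrained("gpt1-converted-weights/")
model.eval()

# Assumption: the "openai-gpt" tokenizer on the Hub uses the same BPE
# vocabulary (40478 tokens) as the files in original_gpt1_params/.
tokenizer = AutoTokenizer.from_pretrained("openai-gpt")

input_ids = tokenizer("the book is on the", return_tensors="pt").input_ids
with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=10, do_sample=False)
print(tokenizer.decode(output[0]))
```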
gpt1-converted-weights/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "architectures": [
+     "GPT1ForCausalLM"
+   ],
+   "attention_dropout": 0.1,
+   "auto_map": {
+     "AutoConfig": "configuration_gpt1.GPT1Config",
+     "AutoModelForCausalLM": "modeling_gpt1.GPT1ForCausalLM"
+   },
+   "embd_pdrop": 0.1,
+   "hidden_act": "gelu",
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 512,
+   "model_type": "gpt1",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "resid_pdrop": 0.1,
+   "torch_dtype": "float32",
+   "transformers_version": "4.38.1",
+   "vocab_size": 40478
+ }
gpt1-converted-weights/configuration_gpt1.py ADDED
@@ -0,0 +1,42 @@
+ """ GPT1 model configuration """
+
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class GPT1Config(PretrainedConfig):
+     model_type = "gpt1"
+
+     def __init__(
+         self,
+         vocab_size=40478,
+         hidden_size=768,
+         intermediate_size=3072,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         resid_pdrop=0.1,
+         embd_pdrop=0.1,
+         attention_dropout=0.1,
+         hidden_act="gelu",
+         max_position_embeddings=512,
+         initializer_range=0.02,
+         layer_norm_eps=1e-5,
+         tie_word_embeddings=True,
+         **kwargs
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.resid_pdrop = resid_pdrop
+         self.embd_pdrop = embd_pdrop
+         self.attention_dropout = attention_dropout
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
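
For orientation, the defaults above reproduce the original GPT-1 hyperparameters, and a rough parameter count can be derived from them directly. The sketch below is illustrative arithmetic only; it assumes the layer layout defined in `modeling_gpt1.py` further down (four attention projections, a two-layer MLP and two LayerNorms per block, tied token embeddings, plus an untied `lm_head` bias) and that `configuration_gpt1.py` is importable from the working directory.

```python
from configuration_gpt1 import GPT1Config

cfg = GPT1Config()

token_emb = cfg.vocab_size * cfg.hidden_size                 # tied with lm_head.weight
pos_emb = cfg.max_position_embeddings * cfg.hidden_size
attn = 4 * (cfg.hidden_size * cfg.hidden_size + cfg.hidden_size)   # q/k/v/o weights + biases
mlp = 2 * cfg.hidden_size * cfg.intermediate_size + cfg.intermediate_size + cfg.hidden_size
norms = 2 * 2 * cfg.hidden_size                              # two LayerNorms (weight + bias)
lm_head_bias = cfg.vocab_size

total = token_emb + pos_emb + cfg.num_hidden_layers * (attn + mlp + norms) + lm_head_bias
print(f"{total:,} parameters")      # ~116.6M, the usual "117M" GPT-1 figure
print(f"{total * 4 / 1e6:.0f} MB")  # ~466 MB in float32, matching model.safetensors below
```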
gpt1-converted-weights/generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.38.1"
+ }
gpt1-converted-weights/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc19245dd9599204701492aecf9b89d5b130001085743adb249409040390ec02
+ size 466321576
gpt1-converted-weights/modeling_gpt1.py ADDED
@@ -0,0 +1,237 @@
+ """ PyTorch GPT1 model."""
+
+ import math
+
+ import torch
+ from torch import nn
+ from transformers import PreTrainedModel
+ from transformers.modeling_outputs import (
+     BaseModelOutput,
+     CausalLMOutput,
+ )
+ from transformers.activations import get_activation
+
+ from configuration_gpt1 import GPT1Config
+
+
+ class GPT1MLP(nn.Module):
+     def __init__(self, config: GPT1Config):
+         super().__init__()
+         self.activation_fn = get_activation(config.hidden_act)
+         self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+         self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+     def forward(self, hidden_state):
+         hidden_state = self.fc1(hidden_state)
+         hidden_state = self.activation_fn(hidden_state)
+         hidden_state = self.fc2(hidden_state)
+         return hidden_state
+
+
+ class GPT1Attention(nn.Module):
+     def __init__(self, config: GPT1Config):
+         """
+         Multi-head attention layer.
+         """
+         super().__init__()
+
+         assert config.hidden_size % config.num_attention_heads == 0
+         self.hidden_size = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.head_dim = self.hidden_size // self.num_heads
+         self.attn_dropout = nn.Dropout(p=config.attention_dropout)
+
+         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim)
+         self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim)
+         self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size)
+
+     def forward(self, hidden_state, attn_mask):
+         bs, seq_len, _ = hidden_state.size() # (batch_size, seq_len, dim)
+
+         # linearly project the inputs
+         Q = self.q_proj(hidden_state) # (batch_size, seq_len, n_heads * head_dim)
+         K = self.k_proj(hidden_state)
+         V = self.v_proj(hidden_state)
+
+         # split into n_heads to compute attention
+         queries = Q.view(bs, seq_len, self.num_heads, self.head_dim).transpose(1, 2) # (batch_size, n_heads, seq_len, head_dim)
+         keys = K.view(bs, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+         values = V.view(bs, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+         # compute attention matmul
+         keys = keys.transpose(2, 3) # (batch_size, n_heads, head_dim, seq_len)
+         attn_scores = queries @ keys # (batch_size, n_heads, seq_len, seq_len)
+
+         # scale
+         attn_scores = attn_scores / math.sqrt(self.head_dim)
+
+         # mask
+         if attn_mask is not None:
+             attn_scores = attn_scores + attn_mask
+
+         # softmax (attention probabilities) + dropout
+         attn_probs = nn.functional.softmax(attn_scores, dim=-1, dtype=torch.float32).to(Q.dtype)
+         attn_probs = self.attn_dropout(attn_probs)
+
+         # matmul
+         attn_output = attn_probs @ values # (batch_size, n_heads, seq_len, head_dim)
+
+         attn_output = attn_output.transpose(1, 2).contiguous()
+         attn_output = attn_output.reshape(bs, seq_len, self.hidden_size) # (batch_size, seq_len, n_heads * head_dim)
+
+         # final linear
+         attn_output = self.o_proj(attn_output)
+         return attn_output
+
+
+ class GPT1DecoderLayer(nn.Module):
+     def __init__(self, config: GPT1Config):
+         super().__init__()
+         self.attention = GPT1Attention(config)
+         self.mlp = GPT1MLP(config)
+
+         self.attention_norm = nn.LayerNorm(normalized_shape=config.hidden_size,
+                                            eps=config.layer_norm_eps)
+         self.mlp_norm = nn.LayerNorm(normalized_shape=config.hidden_size,
+                                      eps=config.layer_norm_eps)
+
+         self.res_dropout = nn.Dropout(p=config.resid_pdrop)
+
+     def forward(self, hidden_state, attn_mask):
+         # attention
+         residual = hidden_state
+         hidden_state = self.attention(hidden_state, attn_mask)
+         hidden_state = self.res_dropout(hidden_state)
+         hidden_state = residual + hidden_state
+         hidden_state = self.attention_norm(hidden_state)
+
+         # feed forward fully connected
+         residual = hidden_state
+         hidden_state = self.mlp(hidden_state)
+         hidden_state = self.res_dropout(hidden_state)
+         hidden_state = residual + hidden_state
+         hidden_state = self.mlp_norm(hidden_state)
+
+         return hidden_state
+
+
+ class GPT1PreTrainedModel(PreTrainedModel):
+     config_class = GPT1Config
+     supports_gradient_checkpointing = False
+
+     def _init_weights(self, module):
+         std = self.config.initializer_range
+         if isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+
+
+ class GPT1Model(GPT1PreTrainedModel):
+
+     def __init__(self, config: GPT1Config):
+         super().__init__(config)
+
+         # embeddings
+         self.embs = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.embs_dropout = nn.Dropout(p=config.embd_pdrop)
+
+         # positional encoding (learned)
+         self.pos_emb = nn.Embedding(config.max_position_embeddings,
+                                     config.hidden_size)
+
+         self.layers = nn.ModuleList(
+             [GPT1DecoderLayer(config) for _ in range(config.num_hidden_layers)]
+         )
+
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.embs
+
+     def set_input_embeddings(self, value):
+         self.embs = value
+
+     def forward(self, input_ids, *args, **kwargs):
+         position_ids = torch.arange(input_ids.size(-1),
+                                     dtype=torch.long,
+                                     device=input_ids.device).unsqueeze_(0)
+
+         input_embeds = self.embs(input_ids) # (bs, seq_len, dim)
+         position_embeds = self.pos_emb(position_ids)
+         hidden_state = self.embs_dropout(input_embeds) + position_embeds
+
+         seq_len = input_ids.size(-1)
+         attn_mask = torch.full((seq_len, seq_len), fill_value=float('-inf'))
+         attn_mask = torch.triu(attn_mask, diagonal=1)
+
+         causal_mask = attn_mask.to(dtype=input_embeds.dtype,
+                                    device=input_embeds.device)
+
+         for layer in self.layers:
+             hidden_state = layer(hidden_state, attn_mask=causal_mask)
+
+         return BaseModelOutput(
+             last_hidden_state=hidden_state
+         )
+
+
+ class GPT1ForCausalLM(GPT1PreTrainedModel):
+     _tied_weights_keys = ["lm_head.weight"]
+
+     def __init__(self, config: GPT1Config):
+         super().__init__(config)
+         self.model = GPT1Model(config)
+         self.vocab_size = config.vocab_size
+
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
+
+         # initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.model.embs
+
+     def set_input_embeddings(self, value):
+         self.model.embs = value
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     def get_decoder(self):
+         return self.model
+
+     def set_decoder(self, decoder):
+         self.model = decoder
+
+     def forward(self, input_ids, labels=None, *args, **kwargs):
+         output = self.model(input_ids)
+
+         hidden_state = output[0]
+         logits = self.lm_head(hidden_state).float()
+
+         loss = None
+         if labels is not None:
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+
+             loss_fn = torch.nn.CrossEntropyLoss()
+             shift_logits = shift_logits.view(-1, self.config.vocab_size)
+             shift_labels = shift_labels.view(-1)
+             loss = loss_fn(shift_logits, shift_labels)
+
+         return CausalLMOutput(
+             loss=loss,
+             logits=logits
+         )
+
+     def prepare_inputs_for_generation(self, input_ids, *args, **kwargs):
+         return { 'input_ids': input_ids }
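
As a quick smoke test of the modeling code above (not part of the commit), a randomly initialized model can be run on dummy token ids to confirm the expected output shapes and the shifted cross-entropy loss path. This assumes `configuration_gpt1.py` and `modeling_gpt1.py` sit next to the test script.

```python
import torch

from configuration_gpt1 import GPT1Config
from modeling_gpt1 import GPT1ForCausalLM

config = GPT1Config()
model = GPT1ForCausalLM(config)  # random init, ~117M parameters
model.eval()

batch_size, seq_len = 2, 16
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))

with torch.no_grad():
    out = model(input_ids, labels=input_ids)

# One row of logits per position over the full vocabulary.
assert out.logits.shape == (batch_size, seq_len, config.vocab_size)
# Labels are shifted internally, so the loss is a scalar next-token loss.
print(out.loss)  # roughly ln(40478) ≈ 10.6 for an untrained model
```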
original_gpt1_params/.ipynb_checkpoints/encoder_bpe_40000-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
original_gpt1_params/.ipynb_checkpoints/params_shapes-checkpoint.json ADDED
@@ -0,0 +1 @@
+ [[512, 768], [40478, 768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768]]
original_gpt1_params/.ipynb_checkpoints/vocab_40000-checkpoint.bpe ADDED
The diff for this file is too large to render. See raw diff
 
original_gpt1_params/encoder_bpe_40000.json ADDED
The diff for this file is too large to render. See raw diff
 
original_gpt1_params/params_0.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d9cd095b901dfbfbe0ce5e01d151dfe0b791e955d71149969ba65a6eab4480f
+ size 46614044
original_gpt1_params/params_1.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca074893c040fa69cbf2fc95c06feda45a4e1492d03b645e2076e89ccf7ddd9f
+ size 46614044
original_gpt1_params/params_2.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:966c25fbd632f0df18c4d4380ba57f23410f43311a96616f00b3d05ae6592f58
+ size 46614044
original_gpt1_params/params_3.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40df0d328f5d3d1b2bec768855a5d2eeeaf2b2124758ef98116f76a02526fd92
+ size 46614044
original_gpt1_params/params_4.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:139f098dcd620ccf0200530e9ce9ff1c342714ff881a0c7258ac9faac4a06e6a
+ size 46614040
original_gpt1_params/params_5.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad27b5cb245db9a29657270ff637d3ff1c15fd9df3683324a2936674cef8c3c5
+ size 46614040
original_gpt1_params/params_6.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:af5bb5c76ddfea50683e0b9895fe704ae689853ed8bb3f1b3fee4daff2f27d45
+ size 46614040
original_gpt1_params/params_7.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27f55501d895ce1adb9b254aa762519a242edf2bcd2b43298b89538b5591566c
+ size 46614040
original_gpt1_params/params_8.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:17a2b695128ea0aae98a360351b92769b879bc0f2835862949b6405b0ce88569
+ size 46614040
original_gpt1_params/params_9.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1355fcd519db223f65db7fa7b79dcaf9b4c653915ffe4bd417d87f7903225c1
+ size 46614040
original_gpt1_params/params_shapes.json ADDED
@@ -0,0 +1 @@
+ [[512, 768], [40478, 768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768]]
original_gpt1_params/vocab_40000.bpe ADDED
The diff for this file is too large to render. See raw diff
 
tf_weights_to_hf.py ADDED
@@ -0,0 +1,85 @@
+ import json
+
+ import torch
+ import numpy as np
+
+ from modeling_gpt1 import GPT1ForCausalLM, GPT1Model
+ from configuration_gpt1 import GPT1Config
+
+
+ GPT1Config.register_for_auto_class()
+ GPT1Model.register_for_auto_class('AutoModel')
+ GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')
+
+ def lists_are_equal(list1, list2):
+     for i, j in zip(list1, list2):
+         if i != j:
+             return False
+     return True
+
+ # get the original weights from the GPT1 params.npy files
+ def get_weights_from_tf_model():
+
+     shapes = json.load(open('original_gpt1_params/params_shapes.json'))
+     offsets = np.cumsum([np.prod(shape) for shape in shapes])
+
+     init_params = [np.load('original_gpt1_params/params_{}.npy'.format(n)) for n in range(10)]
+     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
+     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
+
+     config = GPT1Config()
+     model = GPT1ForCausalLM(config)
+
+     # print(shapes[:15])
+     # print([k for k, v in model.named_parameters()][:10])
+
+     # embs layer
+     model.model.embs.weight.data = torch.from_numpy(init_params[1])
+
+     # pos enc layer
+     model.model.pos_emb.weight.data = torch.from_numpy(init_params[0])
+
+     layers = model.model.layers
+
+     for i in range(0, 12):
+
+         idx = 12 * i + 2
+
+         # attention q, k, v projections
+         init_params[idx] = np.squeeze(init_params[idx], axis=0)
+         q, k, v = torch.split(torch.tensor(init_params[idx]), 768, dim=-1)
+         layers[i].attention.q_proj.weight.data = q.detach().clone().transpose(-1, -2).contiguous()
+         layers[i].attention.k_proj.weight.data = k.detach().clone().transpose(-1, -2).contiguous()
+         layers[i].attention.v_proj.weight.data = v.detach().clone().transpose(-1, -2).contiguous()
+
+         # attention q, k, v biases
+         q_bias, k_bias, v_bias = torch.split(torch.tensor(init_params[idx + 1]), 768, dim=-1)
+         layers[i].attention.q_proj.bias.data = q_bias.detach().clone().contiguous()
+         layers[i].attention.k_proj.bias.data = k_bias.detach().clone().contiguous()
+         layers[i].attention.v_proj.bias.data = v_bias.detach().clone().contiguous()
+
+         # attention output proj + bias
+         init_params[idx + 2] = np.squeeze(init_params[idx + 2], axis=0)
+         layers[i].attention.o_proj.weight.data = torch.from_numpy(init_params[idx + 2]).transpose(-1, -2).contiguous()
+         layers[i].attention.o_proj.bias.data = torch.from_numpy(init_params[idx + 3])
+
+         # attention norm + bias
+         layers[i].attention_norm.weight.data = torch.from_numpy(init_params[idx + 4])
+         layers[i].attention_norm.bias.data = torch.from_numpy(init_params[idx + 5])
+
+         # mlp layer
+         init_params[idx + 6] = np.squeeze(init_params[idx + 6], axis=0)
+         layers[i].mlp.fc1.weight.data = torch.from_numpy(init_params[idx + 6]).transpose(-1, -2).contiguous()
+         layers[i].mlp.fc1.bias.data = torch.from_numpy(init_params[idx + 7])
+         init_params[idx + 8] = np.squeeze(init_params[idx + 8], axis=0)
+         layers[i].mlp.fc2.weight.data = torch.from_numpy(init_params[idx + 8]).transpose(-1, -2).contiguous()
+         layers[i].mlp.fc2.bias.data = torch.from_numpy(init_params[idx + 9])
+
+         # mlp norm + bias
+         layers[i].mlp_norm.weight.data = torch.from_numpy(init_params[idx + 10])
+         layers[i].mlp_norm.bias.data = torch.from_numpy(init_params[idx + 11])
+
+     model.save_pretrained('gpt1-converted-weights/')
+
+
+ get_weights_from_tf_model()
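
A possible follow-up check, not part of the script above: once the conversion has produced `gpt1-converted-weights/`, the checkpoint can be reloaded to confirm that the embedding shapes line up with `params_shapes.json` and that the output head stays tied to the token embeddings (as `tie_word_embeddings=True` in the config is expected to enforce).

```python
import json

from modeling_gpt1 import GPT1ForCausalLM

# Reload the converted checkpoint from disk.
model = GPT1ForCausalLM.from_pretrained('gpt1-converted-weights/')

shapes = json.load(open('original_gpt1_params/params_shapes.json'))
# Entry 0 is the positional embedding, entry 1 the token embedding.
assert list(model.model.pos_emb.weight.shape) == shapes[0]  # [512, 768]
assert list(model.model.embs.weight.shape) == shapes[1]     # [40478, 768]

# With tied word embeddings, lm_head.weight should share storage with embs.weight.
assert model.lm_head.weight.data_ptr() == model.model.embs.weight.data_ptr()
```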