File size: 1,609 Bytes
802062f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
---
license: mit
datasets:
- wikipedia
language:
- en
tags:
- research
---
This model is significantly undertrained and designed for research purposes only.
To use this model with the `transformers` library:
```python
from transformers import AutoTokenizer, GPT2Model
import torch.nn as nn
import torch
class RMSLayerNorm(nn.Module):
    """Root-mean-square normalisation over the last axis of the input.

    Every vector along the final axis is divided by
    ``sqrt(mean(x ** 2) + eps)``. No mean is subtracted and no bias is added.
    When ``affine`` is true, the result is multiplied by one learnable scalar
    ``weight`` (created with shape ``()`` and initialised to one).

    Args:
        normalized_shape: Stored on the module unchanged. ``forward`` does
            not read it and always reduces over the last axis.
        eps: Constant added to the mean square before the square root.
        affine: Whether to create and apply the learnable scalar gain.
    """

    def __init__(self, normalized_shape, eps=1e-8, affine=True):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.affine = affine
        if not affine:
            # NOTE(review): both placeholders are assumed to belong to the
            # non-affine branch (the same layout nn.LayerNorm uses) - confirm.
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        else:
            # A single scalar gain, not one gain per feature.
            self.weight = nn.Parameter(torch.ones(()))

    def forward(self, x):
        """Return ``x`` divided by its RMS along the last axis, times the gain if any."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        scaled = x / (mean_square + self.eps).sqrt()
        return scaled * self.weight if self.affine else scaled
def replace(model):
    """Swap every ``nn.LayerNorm`` beneath ``model`` for an ``RMSLayerNorm``, in place.

    The module tree is walked recursively. Each ``LayerNorm`` child is
    replaced by a newly constructed ``RMSLayerNorm`` that reuses the child's
    ``normalized_shape`` and ``eps``; the child's own parameters are not
    copied across. Any other child is searched for nested ``LayerNorm``
    modules.

    Args:
        model: The module whose children are rewritten.

    Returns:
        The same ``model`` object that was passed in.
    """
    for child_name, submodule in model.named_children():
        if not isinstance(submodule, nn.LayerNorm):
            # Not a LayerNorm itself, so look for LayerNorms nested inside it.
            replace(submodule)
            continue
        rms_norm = RMSLayerNorm(submodule.normalized_shape, eps=submodule.eps, affine=True)
        setattr(model, child_name, rms_norm)
    return model
class GPTR2Model(GPT2Model):
    """``GPT2Model`` variant that runs ``replace`` on itself after construction.

    ``replace`` swaps each ``nn.LayerNorm`` submodule for an ``RMSLayerNorm``.
    """

    def __init__(self, config):
        """Build the parent model from ``config``, then rewrite its norm layers."""
        # The parent constructor must run first so the LayerNorm children exist.
        super().__init__(config)
        replace(self)
# Instantiate GPTR2Model from the checkpoint identified by the id below.
model = GPTR2Model.from_pretrained("George-Ogden/gptr2-nano-with-momentum-without-weight-decay")
# The tokenizer is loaded under the "gpt2" identifier, not the model's own id.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
```
For more details and example usage, see https://github.com/George-Ogden/residual-streams