---
license: mit
datasets:
- wikipedia
language:
- en
tags:
- research
---

This model is significantly undertrained and designed for research purposes only.

For use in `transformers`:

```python
from transformers import AutoTokenizer, GPT2Model
import torch
import torch.nn as nn


class RMSLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-8, affine=True):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.affine = affine

        if self.affine:
            # A single scalar gain, shared across all features
            self.weight = nn.Parameter(torch.ones(()))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)

    def forward(self, x):
        # Normalize by the root mean square over the last dimension
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        x_normalized = x / rms
        if self.affine:
            x_normalized = x_normalized * self.weight
        return x_normalized


def replace(model):
    # Recursively swap every nn.LayerNorm in the model for RMSLayerNorm
    for name, child in model.named_children():
        if isinstance(child, nn.LayerNorm):
            setattr(model, name, RMSLayerNorm(child.normalized_shape, eps=child.eps, affine=True))
        else:
            replace(child)
    return model


class GPTR2Model(GPT2Model):
    def __init__(self, config):
        super().__init__(config)
        replace(self)


model = GPTR2Model.from_pretrained("George-Ogden/gptr2-nano-with-momentum-without-weight-decay")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
```
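
Once loaded, the model behaves like any other `GPT2Model`. A minimal usage sketch (the input text below is illustrative, not part of the model card):

```python
# Encode a sample sentence and run a forward pass
inputs = tokenizer("Hello, world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# GPT2Model returns the final hidden states: one vector per input token
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```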

For more details and example usage, see https://github.com/George-Ogden/residual-streams