This model is significantly undertrained and designed for research purposes only.
To load the model with the Hugging Face `transformers` library:

from transformers import AutoTokenizer, GPT2Model

import torch.nn as nn
import torch

class RMSLayerNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    Every vector along the final axis of the input is divided by
    ``sqrt(mean(x ** 2) + eps)``. No mean is subtracted and no bias is
    added. When ``affine`` is true the normalised output is multiplied by
    a single learnable scalar ``weight`` (initialised to 1).

    Args:
        normalized_shape: Stored on the module as-is; the computation
            itself always reduces over the last dimension only.
        eps: Constant added to the mean square before the square root.
        affine: Whether to create and apply the learnable scalar gain.
    """

    def __init__(self, normalized_shape, eps=1e-8, affine=True):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.affine = affine

        if not affine:
            # Register explicit ``None`` slots so the attributes exist
            # even though nothing is learned.
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        else:
            # Zero-dimensional (scalar) gain shared by every feature.
            self.weight = nn.Parameter(torch.ones(()))

    def forward(self, x):
        """Return ``x`` scaled to unit RMS along its last dimension."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        scaled = x / torch.sqrt(mean_square + self.eps)
        return scaled * self.weight if self.affine else scaled


def replace(model):
    """Swap every ``nn.LayerNorm`` inside ``model`` for an ``RMSLayerNorm``.

    The module tree is walked recursively and modified in place. Each
    LayerNorm child is replaced by a freshly constructed ``RMSLayerNorm``
    that reuses the LayerNorm's ``normalized_shape`` and ``eps``; the
    LayerNorm's own parameters are not copied across. Any other child is
    descended into.

    Args:
        model: The module whose descendants should be rewritten.

    Returns:
        The same ``model`` object that was passed in.
    """
    for attr, submodule in model.named_children():
        if not isinstance(submodule, nn.LayerNorm):
            replace(submodule)
            continue
        rms_norm = RMSLayerNorm(submodule.normalized_shape, eps=submodule.eps, affine=True)
        setattr(model, attr, rms_norm)
    return model


class GPTR2Model(GPT2Model):
    """``GPT2Model`` whose LayerNorm submodules are swapped for RMS norms.

    Args:
        config: Configuration object forwarded unchanged to ``GPT2Model``.
    """

    def __init__(self, config):
        super().__init__(config)
        # The parent constructor has to build the standard module tree
        # first; only then can its LayerNorm children be found and replaced.
        replace(model=self)

# Instantiate the RMS-normalised GPT-2 variant defined above from the
# published checkpoint of that name.
model = GPTR2Model.from_pretrained("George-Ogden/gptr2-nano-without-momentum-with-weight-decay")
# The tokenizer is loaded under the stock "gpt2" name rather than from the
# model's own repository.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

For more details and example usage, see https://github.com/George-Ogden/residual-streams

Downloads last month
43
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.
The model cannot be deployed to the HF Inference API: The model has no pipeline_tag.

Dataset used to train George-Ogden/gptr2-nano-without-momentum-with-weight-decay