Fal7acy committed

Commit e58f28a
1 Parent(s): fc99f44

Upload model

Files changed (4):
  1. config.json +1 -1
  2. language.py +6 -6
  3. language_config.py +43 -0
  4. pytorch_model.bin +1 -1
config.json CHANGED
@@ -4,7 +4,7 @@
   ],
   "attention_probs_dropout_prob": 0.1,
   "auto_map": {
-    "AutoConfig": "config.BigBrainConfig",
+    "AutoConfig": "language_config.BigBrainLanguageConfig",
     "AutoModel": "language.BigBrainLanguageModel"
   },
   "hidden_act": "gelu",
language.py CHANGED
@@ -6,7 +6,7 @@ from torch.nn import functional as f
 from transformers import PreTrainedModel
 from transformers.activations import ACT2FN
 
-from config import BigBrainConfig
+from language_config import BigBrainLanguageConfig
 
 
 def _make_casual_mask(size: int) -> torch.Tensor:
@@ -26,7 +26,7 @@ class RootMeanSquareNorm(nn.Module):
 
 
 class MultiLayerPerceptron(nn.Module):
-    def __init__(self, config: BigBrainConfig):
+    def __init__(self, config: BigBrainLanguageConfig):
         super().__init__()
         self.config = config
         self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
@@ -72,7 +72,7 @@ class RotaryPositionalEmbedding(nn.Module):
 
 
 class RotaryMultiHeadAttention(nn.Module):
-    def __init__(self, config: BigBrainConfig):
+    def __init__(self, config: BigBrainLanguageConfig):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -113,7 +113,7 @@ class RotaryMultiHeadAttention(nn.Module):
 
 
 class BigBrainDecoderLayer(nn.Module):
-    def __init__(self, config: BigBrainConfig):
+    def __init__(self, config: BigBrainLanguageConfig):
         super().__init__()
         self.config = config
         self.self_attn = RotaryMultiHeadAttention(config)
@@ -131,10 +131,10 @@ class BigBrainDecoderLayer(nn.Module):
 
 
 class BigBrainLanguageModel(PreTrainedModel):
-    config_class = BigBrainConfig
+    config_class = BigBrainLanguageConfig
     base_model_prefix = 'big-brain-lm'
 
-    def __init__(self, config: BigBrainConfig = BigBrainConfig()):
+    def __init__(self, config: BigBrainLanguageConfig):
         super().__init__(config)
         self.config = config
         self.tok_embed = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
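
A minimal sketch of constructing the model offline, assuming language.py and language_config.py sit on the Python path as committed here; note that __init__ no longer supplies a default config, so one must be passed explicitly:

from language_config import BigBrainLanguageConfig
from language import BigBrainLanguageModel

# Defaults from BigBrainLanguageConfig: 12 layers, hidden_size=768, vocab_size=50265.
config = BigBrainLanguageConfig()
model = BigBrainLanguageModel(config)  # explicit config now required
print(sum(p.numel() for p in model.parameters()))  # rough parameter count
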
language_config.py ADDED
@@ -0,0 +1,43 @@
+from transformers import PretrainedConfig
+
+
+class BigBrainLanguageConfig(PretrainedConfig):
+    model_type = 'big-brain-lm'
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act='gelu',
+        hidden_dropout_probability=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        rope_theta=10000,
+        sos_token_id=0,
+        pad_token_id=1,
+        eos_token_id=2,
+        unk_token_id=3,
+        **kwargs
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_probability = hidden_dropout_probability
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.rope_theta = rope_theta
+        self.sos_token_id = sos_token_id
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = eos_token_id
+        self.unk_token_id = unk_token_id
+        super().__init__(**kwargs)
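
A short sketch of how the new config round-trips, assuming the defaults above; save_pretrained and from_pretrained come from PretrainedConfig, and the output directory name is arbitrary:

from language_config import BigBrainLanguageConfig

cfg = BigBrainLanguageConfig(num_hidden_layers=6)   # override any default at construction
cfg.save_pretrained("./big-brain-lm")               # writes config.json with model_type 'big-brain-lm'
reloaded = BigBrainLanguageConfig.from_pretrained("./big-brain-lm")
assert reloaded.num_hidden_layers == 6
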
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4bc2647de306177bc7c1e1266bb7a6a98f8aee907e18e2b8d0d4457f4c1806d1
+oid sha256:c93547b3cc53ceeeaec4e5754fe86e144c1b90d9e8bbf30e82b9fcb2b53caf85
 size 774713018
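
The pointer above is a Git LFS stub, not the weights themselves; once the blob is pulled, the local file can be checked against the new oid. A hedged sketch, with the local path assumed for illustration:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file so the ~775 MB checkpoint never sits fully in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "c93547b3cc53ceeeaec4e5754fe86e144c1b90d9e8bbf30e82b9fcb2b53caf85"
assert sha256_of("pytorch_model.bin") == expected  # path assumed; matches the new LFS oid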