DanielHesslow committed
Commit 1f88152
1 Parent(s): bb221cf

Files changed (4):
  1. config.json +6 -7
  2. pytorch_model.bin +2 -2
  3. rita_configuration.py +5 -8
  4. rita_modeling.py +4 -5
config.json CHANGED
@@ -1,23 +1,22 @@
 {
+  "_name_or_path": "nz/RITA_xl",
   "architectures": [
     "RITAModel"
   ],
   "auto_map": {
     "AutoConfig": "rita_configuration.RITAConfig",
-    "AutoModel": "rita_modeling.RITAModel"
+    "AutoModel": "rita_modeling.RITAModel",
+    "AutoModelForCausalLM": "rita_modeling.RITAModel"
   },
-  "bos_token_id": [
-    50256
-  ],
   "d_feedforward": 8192,
   "d_model": 2048,
   "dropout": 0.0,
   "eos_token_id": 50256,
   "max_seq_len": 1024,
-  "model_type": "codegen",
+  "model_type": "rita",
   "num_heads": 32,
   "num_layers": 24,
-  "torch_dtype": "float32",
+  "torch_dtype": "float16",
   "transformers_version": "4.18.0",
-  "vocab_size": 128
+  "vocab_size": 26
 }
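
With "AutoModelForCausalLM" now listed in auto_map and model_type set to "rita", the checkpoint can be loaded through the Transformers Auto classes with remote code enabled. A minimal loading sketch, assuming the repository id matches the _name_or_path shown above (nz/RITA_xl):

from transformers import AutoModelForCausalLM

# trust_remote_code=True lets Transformers import rita_configuration.py and
# rita_modeling.py from this repository to build the custom RITAModel.
model = AutoModelForCausalLM.from_pretrained("nz/RITA_xl", trust_remote_code=True)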
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bd3eab5b0d211f648b4dc7ab99291186c6866ff533ebdd94b1ddd0334c118f5d
-size 4836636593
+oid sha256:295ec5633c129e34d2a83ffb6adced9dcdb6cdae3f42534c85b0e4ed7adbfb21
+size 2417438283
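
The new weight file is roughly half the size of the old one, consistent with the torch_dtype change from float32 to float16 in config.json. A hedged sketch of how such a half-precision checkpoint could be produced (not necessarily the exact steps used for this commit):

from transformers import AutoModelForCausalLM

# Load the full-precision checkpoint, cast the weights to float16, and
# save a new pytorch_model.bin of roughly half the original size.
model = AutoModelForCausalLM.from_pretrained("nz/RITA_xl", trust_remote_code=True)
model = model.half()
model.save_pretrained("RITA_xl_fp16")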
rita_configuration.py CHANGED
@@ -1,26 +1,24 @@
-
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
 class RITAConfig(PretrainedConfig):
-    model_type = "codegen"
+    model_type = "rita"
 
     def __init__(
         self,
-        vocab_size=128,
+        vocab_size=26,
         d_model=768,
         num_layers=12,
         max_seq_len=1024,
         num_heads=12,
         dropout=0.,
         ff_ratio=4,
-        bos_token_id=50256, # TODO
-        eos_token_id=50256, # TODO
+        eos_token_id=2,
         **kwargs,
     ):
-        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(eos_token_id=eos_token_id, **kwargs)
         self.vocab_size = vocab_size
         self.d_model = d_model
         self.num_heads = num_heads
@@ -28,5 +26,4 @@ class RITAConfig(PretrainedConfig):
         self.num_layers = num_layers
         self.max_seq_len=max_seq_len
         self.dropout = dropout
-        self.bos_token_id=bos_token_id,
-        self.eos_token_id=eos_token_id
+        self.eos_token_id=eos_token_id
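
The configuration defaults now match the committed config.json: model_type "rita", a 26-token vocabulary (presumably the amino-acid alphabet plus special tokens), and eos_token_id 2. A quick instantiation sketch; the XL checkpoint overrides the size-related defaults through its config.json:

from rita_configuration import RITAConfig

# Default configuration now carries the updated values.
config = RITAConfig()
print(config.model_type, config.vocab_size, config.eos_token_id)  # rita 26 2

# The XL checkpoint's config.json overrides the architecture sizes,
# e.g. d_model=2048, num_layers=24, num_heads=32, max_seq_len=1024.
xl_config = RITAConfig(d_model=2048, num_layers=24, num_heads=32, max_seq_len=1024)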
rita_modeling.py CHANGED
@@ -222,10 +222,10 @@ class RITAModel(PreTrainedModel):
         self.final_norm = nn.LayerNorm(config.d_model)
         self.projector = nn.Linear(config.d_model, config.vocab_size, bias = False)
 
-    def forward(self, ids, attn_mask=None, padding_mask=None, return_hidden=False) -> torch.FloatTensor:
-        x = self.embedding(ids) # N x L x D
+    def forward(self, input_ids, attn_mask=None, padding_mask=None, return_hidden=False) -> torch.FloatTensor:
+        x = self.embedding(input_ids) # N x L x D
         if attn_mask == None:
-            attn_mask = (torch.triu(torch.ones(ids.size(1), ids.size(1))) == 0).transpose(0, 1).contiguous()
+            attn_mask = (torch.triu(torch.ones(input_ids.size(1), input_ids.size(1))) == 0).transpose(0, 1).contiguous().to(input_ids.device)
         for layer in self.layers:
             x = layer(x, attn_mask=attn_mask, padding_mask=padding_mask)
         x = self.final_norm(x) # N x L x D
@@ -246,5 +246,4 @@ class RITAModel(PreTrainedModel):
         return self.projector
 
     def set_output_embeddings(self, new_projector):
-        return new_projector
-
+        self.projector = new_projector
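
Two behavioural fixes here: the default causal mask is now created on the same device as input_ids, avoiding a CPU/CUDA mismatch when the model runs on a GPU, and set_output_embeddings now actually replaces the projection layer instead of just returning its argument. A standalone sketch of the mask construction (seq_len is a hypothetical value, not part of the model code):

import torch

seq_len = 5  # hypothetical sequence length for illustration
# Boolean mask where True marks the strictly-future positions a token must
# not attend to; same expression as in RITAModel.forward().
attn_mask = (torch.triu(torch.ones(seq_len, seq_len)) == 0).transpose(0, 1).contiguous()

# The fix in this commit: move the mask to the device of the inputs so a
# GPU forward pass does not mix CPU and CUDA tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
attn_mask = attn_mask.to(device)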