Alexandru Gherghescu committed on
Commit 030a9e9
1 Parent(s): 9dde569

Fix couple of issues with inference, dataset folder

Files changed (4):
  1. inference.py +1 -1
  2. modeling_gpt1.py +19 -6
  3. pre_training.py +2 -6
  4. preprocessing.py +1 -1
inference.py CHANGED
@@ -8,7 +8,7 @@ prompt = 'The mastermind behind the plan was, all along, '
 inputs = tokenizer(prompt, return_tensors='pt')
 
 generate_ids = model.generate(inputs.input_ids,
-                              max_length=50,
+                              max_new_tokens=40,
                               num_beams=1,
                               do_sample=True,
                               top_p=0.9,
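Note on the inference.py change: max_length caps the total sequence (prompt plus continuation), so a long prompt can leave little or no room for generated text, while max_new_tokens caps only the newly generated tokens. A minimal end-to-end sketch of the updated call, assuming the checkpoint lives in a local 'gpt1' folder and is loaded with trust_remote_code (the path is illustrative, not from the repo):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# illustrative path; point this at wherever the GPT1 checkpoint is stored
tokenizer = AutoTokenizer.from_pretrained('gpt1')
model = AutoModelForCausalLM.from_pretrained('gpt1', trust_remote_code=True)

prompt = 'The mastermind behind the plan was, all along, '
inputs = tokenizer(prompt, return_tensors='pt')

# max_new_tokens counts only the generated continuation, not the prompt
generate_ids = model.generate(inputs.input_ids,
                              max_new_tokens=40,
                              num_beams=1,
                              do_sample=True,
                              top_p=0.9)

print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0])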
modeling_gpt1.py CHANGED
@@ -163,7 +163,7 @@ class GPT1Model(GPT1PreTrainedModel):
     def set_input_embeddings(self, value):
         self.embs = value
 
-    def forward(self, input_ids, *args, **kwargs):
+    def forward(self, input_ids, attention_mask=None, *args, **kwargs):
         position_ids = torch.arange(input_ids.size(-1),
                                     dtype=torch.long,
                                     device=input_ids.device).unsqueeze_(0)
@@ -172,8 +172,12 @@ class GPT1Model(GPT1PreTrainedModel):
         position_embeds = self.pos_emb(position_ids)
         hidden_state = self.embs_dropout(input_embeds) + position_embeds
 
-        causal_mask = self.causal_mask.to(dtype=input_embeds.dtype,
-                                          device=input_embeds.device)
+        if attention_mask is not None:
+            causal_mask = attention_mask.to(dtype=input_embeds.dtype,
+                                            device=input_embeds.device)
+        else:
+            causal_mask = self.causal_mask.to(dtype=input_embeds.dtype,
+                                              device=input_embeds.device)
 
         for layer in self.layers:
             hidden_state = layer(hidden_state, attn_mask=causal_mask)
@@ -214,8 +218,9 @@ class GPT1ForCausalLM(GPT1PreTrainedModel):
     def set_decoder(self, decoder):
         self.model = decoder
 
-    def forward(self, input_ids, labels = None, *args, **kwargs):
-        output = self.model(input_ids)
+    def forward(self, input_ids, labels=None, attention_mask=None,
+                *args, **kwargs):
+        output = self.model(input_ids, attention_mask)
 
         hidden_state = output[0]
         logits = self.lm_head(hidden_state).float()
@@ -236,4 +241,12 @@ class GPT1ForCausalLM(GPT1PreTrainedModel):
         )
 
     def prepare_inputs_for_generation(self, input_ids, *args, **kwargs):
-        return { 'input_ids': input_ids }
+        seq_len = input_ids.size(1)
+
+        attn_mask = torch.full((1, seq_len, seq_len), fill_value=float('-inf'))
+        attn_mask = torch.triu(attn_mask, diagonal=1)
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attn_mask
+        }
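Note on the modeling_gpt1.py change: prepare_inputs_for_generation now builds an additive causal mask sized to the current input, so generation no longer relies on the fixed self.causal_mask buffer. A small standalone sketch (not part of the repo) of what that mask looks like and how an additive mask behaves under softmax; the exact broadcasting against the per-head scores is an assumption about how the attention layer consumes attn_mask:

import torch

# rebuild the mask exactly as the new prepare_inputs_for_generation does
seq_len = 4
attn_mask = torch.full((1, seq_len, seq_len), fill_value=float('-inf'))
attn_mask = torch.triu(attn_mask, diagonal=1)
print(attn_mask[0])
# row i has 0. up to column i and -inf afterwards, e.g. first row:
# [0., -inf, -inf, -inf]

# an additive mask is added to raw attention scores before softmax, so every
# -inf entry becomes a zero attention weight: a query position cannot attend
# to later key positions
scores = torch.randn(1, seq_len, seq_len)            # stand-in attention scores
weights = torch.softmax(scores + attn_mask, dim=-1)  # future positions get weight 0
print(weights[0])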
pre_training.py CHANGED
@@ -11,17 +11,13 @@ from datasets import load_from_disk
 from configuration_gpt1 import GPT1Config
 from modeling_gpt1 import GPT1Model, GPT1ForCausalLM
 
-# a few more things to try to get the model to train (in this order)
-# actually manually check the input (the books), and the tokenizer output (i
-# don't know if it tokenizes correctly, if it adds eos_token etc.)
-
 
 GPT1Config.register_for_auto_class()
 GPT1Model.register_for_auto_class('AutoModel')
 GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')
 
-# load the already tokenized dataset (see training_preprocessing.py)
-tokenized_datasets = load_from_disk('tokenized_bookcorpusopen')
+# load the already tokenized dataset (see preprocessing.py)
+tokenized_datasets = load_from_disk('data')
 
 # shuffle for good measure
 tokenized_datasets = tokenized_datasets.shuffle(seed=42)
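Note on the pre_training.py change: the script now reads the tokenized dataset from the plain 'data' folder written by preprocessing.py. A quick sanity-check sketch before launching a training run; the 'train' split and the 'input_ids' column are assumptions about what the preprocessing map produced:

from datasets import load_from_disk

tokenized_datasets = load_from_disk('data')
print(tokenized_datasets)                     # splits, columns, row counts

sample = tokenized_datasets['train'][0]
print(len(sample['input_ids']), sample['input_ids'][:10])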
preprocessing.py CHANGED
@@ -26,4 +26,4 @@ tokenized_datasets = raw_datasets.map(
     remove_columns=raw_datasets['train'].column_names,
 )
 
-tokenized_datasets.save_to_disk('tokenized_bookcorpusopen')
+tokenized_datasets.save_to_disk('data')
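Note on the preprocessing.py change: the output folder is renamed to 'data', matching the load_from_disk('data') call in pre_training.py. A rough sketch of the surrounding flow under stated assumptions: the source dataset, tokenizer, and tokenize() body below are guesses (the old folder name 'tokenized_bookcorpusopen' only suggests bookcorpusopen was the source), not the repo's actual code:

from datasets import load_dataset
from transformers import AutoTokenizer

# assumed source dataset and tokenizer, for illustration only
raw_datasets = load_dataset('bookcorpusopen')
tokenizer = AutoTokenizer.from_pretrained('openai-gpt')

def tokenize(examples):
    return tokenizer(examples['text'])

tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
)

# pre_training.py reloads this exact folder with load_from_disk('data')
tokenized_datasets.save_to_disk('data')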