Alexandru Gherghescu committed
Commit: 030a9e9
Parent(s): 9dde569

Fix a couple of issues with inference, dataset folder
Files changed:
- inference.py (+1, -1)
- modeling_gpt1.py (+19, -6)
- pre_training.py (+2, -6)
- preprocessing.py (+1, -1)
inference.py CHANGED

@@ -8,7 +8,7 @@ prompt = 'The mastermind behind the plan was, all along, '
 inputs = tokenizer(prompt, return_tensors='pt')
 
 generate_ids = model.generate(inputs.input_ids,
-
+                              max_new_tokens=40,
                               num_beams=1,
                               do_sample=True,
                               top_p=0.9,
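For context, a minimal end-to-end sketch of the generation call this hunk fixes. The checkpoint path and the loading code are assumptions (only the generate() arguments appear in the diff); max_new_tokens bounds the tokens generated after the prompt, whereas a max_length-style limit counts the prompt as well.

from transformers import AutoTokenizer, AutoModelForCausalLM

# 'gpt1-checkpoint' is a placeholder path, not from the commit;
# trust_remote_code is needed because the model classes are registered
# through register_for_auto_class() in pre_training.py.
tokenizer = AutoTokenizer.from_pretrained('gpt1-checkpoint')
model = AutoModelForCausalLM.from_pretrained('gpt1-checkpoint',
                                             trust_remote_code=True)

prompt = 'The mastermind behind the plan was, all along, '
inputs = tokenizer(prompt, return_tensors='pt')

# max_new_tokens counts only generated tokens, so a long prompt
# can no longer eat the whole generation budget
generate_ids = model.generate(inputs.input_ids,
                              max_new_tokens=40,
                              num_beams=1,
                              do_sample=True,
                              top_p=0.9)

print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0])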
modeling_gpt1.py CHANGED

@@ -163,7 +163,7 @@ class GPT1Model(GPT1PreTrainedModel):
     def set_input_embeddings(self, value):
         self.embs = value
 
-    def forward(self, input_ids, *args, **kwargs):
+    def forward(self, input_ids, attention_mask=None, *args, **kwargs):
         position_ids = torch.arange(input_ids.size(-1),
                                     dtype=torch.long,
                                     device=input_ids.device).unsqueeze_(0)
@@ -172,8 +172,12 @@ class GPT1Model(GPT1PreTrainedModel):
         position_embeds = self.pos_emb(position_ids)
         hidden_state = self.embs_dropout(input_embeds) + position_embeds
 
-        causal_mask = self.causal_mask.to(dtype=input_embeds.dtype,
-                                          device=input_embeds.device)
+        if attention_mask is not None:
+            causal_mask = attention_mask.to(dtype=input_embeds.dtype,
+                                            device=input_embeds.device)
+        else:
+            causal_mask = self.causal_mask.to(dtype=input_embeds.dtype,
+                                              device=input_embeds.device)
 
         for layer in self.layers:
             hidden_state = layer(hidden_state, attn_mask=causal_mask)
@@ -214,8 +218,9 @@ class GPT1ForCausalLM(GPT1PreTrainedModel):
     def set_decoder(self, decoder):
         self.model = decoder
 
-    def forward(self, input_ids, labels=None, *args, **kwargs):
-        output = self.model(input_ids)
+    def forward(self, input_ids, labels=None, attention_mask=None,
+                *args, **kwargs):
+        output = self.model(input_ids, attention_mask)
 
         hidden_state = output[0]
         logits = self.lm_head(hidden_state).float()
@@ -236,4 +241,12 @@ class GPT1ForCausalLM(GPT1PreTrainedModel):
         )
 
     def prepare_inputs_for_generation(self, input_ids, *args, **kwargs):
-
+        seq_len = input_ids.size(1)
+
+        attn_mask = torch.full((1, seq_len, seq_len), fill_value=float('-inf'))
+        attn_mask = torch.triu(attn_mask, diagonal=1)
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attn_mask
+        }
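The net effect of these hunks: GPT1Model.forward() now accepts an attention_mask and falls back to the model's buffered causal mask when none is passed, and prepare_inputs_for_generation() hands generate() an additive causal mask. A self-contained sketch of what that mask does (the toy scores below are illustrative, not from the model):

import torch

# Rebuild the mask exactly as prepare_inputs_for_generation() does:
# float('-inf') strictly above the diagonal, 0 on and below it.
seq_len = 4
attn_mask = torch.full((1, seq_len, seq_len), fill_value=float('-inf'))
attn_mask = torch.triu(attn_mask, diagonal=1)

# Adding the mask to attention scores before softmax removes all
# weight from future positions (toy all-zero scores for illustration).
scores = torch.zeros(1, seq_len, seq_len)
weights = torch.softmax(scores + attn_mask, dim=-1)

print(weights[0])
# Row i is uniform over positions 0..i and exactly 0 beyond i:
# row 0 -> [1, 0, 0, 0], row 3 -> [0.25, 0.25, 0.25, 0.25]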
pre_training.py CHANGED

@@ -11,17 +11,13 @@ from datasets import load_from_disk
 from configuration_gpt1 import GPT1Config
 from modeling_gpt1 import GPT1Model, GPT1ForCausalLM
 
-# a few more things to try to get the model to train (in this order)
-# actually manually check the input (the books), and the tokenizer output (i
-# don't know if it tokenizes correctly, if it adds eos_token etc.)
-
 
 GPT1Config.register_for_auto_class()
 GPT1Model.register_for_auto_class('AutoModel')
 GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')
 
-# load the already tokenized dataset (see
-tokenized_datasets = load_from_disk('
+# load the already tokenized dataset (see preprocessing.py)
+tokenized_datasets = load_from_disk('data')
 
 # shuffle for good measure
 tokenized_datasets = tokenized_datasets.shuffle(seed=42)
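Since the commit message mentions the dataset folder, a quick sanity-check sketch that the load side now lines up with preprocessing.py; the 'input_ids' column name is an assumption about the tokenizer output, not something shown in the diff:

from datasets import load_from_disk

# load what preprocessing.py saved into 'data' and shuffle,
# as pre_training.py now does
tokenized_datasets = load_from_disk('data')
tokenized_datasets = tokenized_datasets.shuffle(seed=42)

print(tokenized_datasets)  # splits and row counts
# 'input_ids' is assumed to be the column the tokenizer produced
print(tokenized_datasets['train'][0]['input_ids'][:10])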
preprocessing.py CHANGED

@@ -26,4 +26,4 @@ tokenized_datasets = raw_datasets.map(
     remove_columns=raw_datasets['train'].column_names,
 )
 
-tokenized_datasets.save_to_disk('
+tokenized_datasets.save_to_disk('data')
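For completeness, a hypothetical reconstruction of the pipeline around this hunk; the corpus, tokenizer, and tokenize() body are assumptions (only remove_columns=... and save_to_disk('data') appear in the diff):

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset('bookcorpus')          # placeholder corpus
tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder tokenizer

def tokenize(examples):
    # append eos_token so documents stay separated during training
    return tokenizer([text + tokenizer.eos_token for text in examples['text']])

tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
)

# write to the same 'data' folder that pre_training.py loads from
tokenized_datasets.save_to_disk('data')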