shng2025 committed on
Commit
3a29ced
1 Parent(s): 4d63b0c
.ipynb_checkpoints/gptesla_training-checkpoint.py ADDED
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "gpt2",
+  "_name_or_path": "./",
   "activation_function": "gelu_new",
   "architectures": [
     "GPT2LMHeadModel"
gptesla_training.py ADDED
@@ -0,0 +1,278 @@
import os

import datasets, transformers
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, set_seed
from transformers.optimization import get_scheduler
from datasets import load_dataset, DownloadConfig

import torch
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.optim import AdamW

import logging
import wandb
from huggingface_hub import Repository, create_branch
from accelerate import Accelerator
from argparse import Namespace


# Disable tokenizer parallelism to avoid fork warnings from the dataloader workers
os.environ["TOKENIZERS_PARALLELISM"] = "false"

class ConstantLengthDataset(IterableDataset):
    """Packs a streamed text dataset into fixed-length blocks of token ids,
    joining individual examples with the tokenizer's EOS token."""

    def __init__(
        self,
        tokenizer,
        dataset,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        # target number of raw characters to buffer before each tokenization pass
        self.input_characters = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    m = f"Buffer full: {buffer_len}>={self.input_characters:.0f}"
                    # print(m)
                    break
                try:
                    m = f"Fill buffer: {buffer_len}<{self.input_characters:.0f}"
                    # print(m)
                    buffer.append(next(iterator)["content"])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    # iterator = iter(self.dataset)
                    more_examples = False
                    break

            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for tokenized_input in tokenized_inputs["input_ids"]:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])

            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

def setup_logging(project_name):
    logger = logging.getLogger(__name__)

    dir_name = "./log"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print(f"Directory '{dir_name}' was created.")
    else:
        print(f"Directory '{dir_name}' already exists.")

    # log to a per-process file under ./log as well as to the console
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.FileHandler(f"log/debug_{accelerator.process_index}.log"),
            logging.StreamHandler(),
        ],
    )
    if accelerator.is_main_process:  # We only want to set up logging once
        wandb.init(project=project_name, config=args, dir="../wandb")
        run_name = wandb.run.name
        tb_writer = SummaryWriter()
        tb_writer.add_hparams(vars(args), {"0": 0})
        logger.setLevel(logging.INFO)
        datasets.utils.logging.set_verbosity_debug()
        transformers.utils.logging.set_verbosity_info()
    else:
        tb_writer = None
        run_name = ""
        logger.setLevel(logging.ERROR)
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    return logger, tb_writer, run_name

def create_dataloaders(dataset_name):
    train_data = load_dataset(dataset_name + "-train", split="train", streaming=True)
    train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed)
    valid_data = load_dataset(
        dataset_name + "-valid", split="validation", streaming=True
    )

    train_dataset = ConstantLengthDataset(
        tokenizer, train_data, seq_length=args.seq_length
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer, valid_data, seq_length=args.seq_length
    )

    train_dataloader = DataLoader(
        train_dataset, batch_size=args.train_batch_size, num_workers=96
    )
    eval_dataloader = DataLoader(
        valid_dataset, batch_size=args.valid_batch_size, num_workers=1
    )
    return train_dataloader, eval_dataloader


def log_metrics(step, metrics):
    logger.info(f"Step {step}: {metrics}")
    if accelerator.is_main_process:
        wandb.log(metrics)
        for k, v in metrics.items():
            tb_writer.add_scalar(k, v, step)


def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": args.weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        loss = outputs.loss.repeat(args.valid_batch_size)
        losses.append(accelerator.gather(loss))
        if args.max_eval_steps > 0 and step >= args.max_eval_steps:
            break
    loss = torch.mean(torch.cat(losses))

    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = torch.tensor(float("inf"))

    return loss.item(), perplexity.item()

# Accelerator
accelerator = Accelerator(dispatch_batches=True)
acc_state = {str(k): str(v) for k, v in accelerator.state.__dict__.items()}

project_name = "shng2025/gptesla-small"
dataset_name = "shng2025/gptesla"

# GPTesla ~111M parameter setup; the inline comments show the original,
# heavier values before the run was scaled down to a lighter configuration
config = {
    "train_batch_size": 12,  # 12
    "valid_batch_size": 12,  # 12
    "weight_decay": 0.1,
    "shuffle_buffer": 1000,
    "learning_rate": 5e-4,  # 5e-4
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 700,  # 2000
    "gradient_accumulation_steps": 1,  # 1
    "max_train_steps": 50000,  # 150000
    "max_eval_steps": 10,
    "seq_length": 1024,
    "seed": 1,
    "save_checkpoint_steps": 50,  # 15000
}

args = Namespace(**config, **acc_state)
samples_per_step = accelerator.state.num_processes * args.train_batch_size
set_seed(args.seed)

# Logging
logger, tb_writer, run_name = setup_logging(project_name.split("/")[1])
logger.info(accelerator.state)

# Load model and tokenizer
if accelerator.is_main_process:
    new_branch_name = run_name
    create_branch(project_name, repo_type="model", branch=new_branch_name)
    hf_repo = Repository("./", clone_from=project_name, revision=run_name)

model = AutoModelForCausalLM.from_pretrained("./")  # , gradient_checkpointing=True)
tokenizer = AutoTokenizer.from_pretrained("./")

# Load dataset and dataloader
train_dataloader, eval_dataloader = create_dataloaders(dataset_name)

# Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.max_train_steps,
)


def get_lr():
    return optimizer.param_groups[0]["lr"]


# Prepare everything with our `accelerator` (order of args is not important)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# Train model
model.train()
completed_steps = 0
for step, batch in enumerate(train_dataloader, start=1):
    loss = model(batch, labels=batch).loss
    log_metrics(
        step,
        {
            "lr": get_lr(),
            "samples": step * samples_per_step,
            "steps": completed_steps,
            "loss/train": loss.item(),
        },
    )
    loss = loss / args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % args.gradient_accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
    if step % args.save_checkpoint_steps == 0:
        logger.info("Evaluating and saving model checkpoint")
        eval_loss, perplexity = evaluate()
        log_metrics(step, {"loss/eval": eval_loss, "perplexity": perplexity})
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            unwrapped_model.save_pretrained("./")
            hf_repo.push_to_hub(commit_message=f"step {step}")
        model.train()
    if completed_steps >= args.max_train_steps:
        break


# Evaluate and save the last checkpoint
logger.info("Evaluating and saving model after training")
eval_loss, perplexity = evaluate()
log_metrics(step, {"loss/eval": eval_loss, "perplexity": perplexity})
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
if accelerator.is_main_process:
    unwrapped_model.save_pretrained("./")
    hf_repo.push_to_hub(commit_message="final model")
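The core of the script is ConstantLengthDataset: each fill buffers roughly seq_length * chars_per_token * num_of_sequences = 1024 * 3.6 * 1024 ≈ 3.8 million characters of raw text, tokenizes the whole buffer at once, joins the examples with the EOS token, and cuts the token stream into fixed 1024-token blocks, dropping the final partial block. A minimal standalone sketch of that packing step, with a toy in-memory dataset and a tiny seq_length chosen only for illustration:

from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")
examples = [
    {"content": "def add(a, b):\n    return a + b\n"},
    {"content": "print('hello world')\n"},
]

seq_length = 8  # the script uses 1024
all_token_ids = []
for example in examples:
    ids = tokenizer(example["content"])["input_ids"]
    all_token_ids.extend(ids + [tokenizer.eos_token_id])  # join with EOS, as in __iter__

# keep only full-length blocks, exactly like the final loop in __iter__
blocks = [
    torch.tensor(all_token_ids[i : i + seq_length])
    for i in range(0, len(all_token_ids), seq_length)
    if len(all_token_ids[i : i + seq_length]) == seq_length
]
print(f"{len(all_token_ids)} tokens -> {len(blocks)} blocks of {seq_length}")

With the real GPT-2 tokenizer, the 3.6 characters-per-token estimate keeps each buffer close to 1024 full sequences per fill.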
log/debug_0.log CHANGED
@@ -0,0 +1,590 @@
1
+ 07/24/2024 16:20:48 - INFO - __main__ - Distributed environment: MULTI_GPU Backend: nccl
2
+ Num processes: 4
3
+ Process index: 0
4
+ Local process index: 0
5
+ Device: cuda:0
6
+
7
+ Mixed precision type: fp16
8
+
9
+ 07/24/2024 16:22:30 - INFO - __main__ - Distributed environment: MULTI_GPU Backend: nccl
10
+ Num processes: 4
11
+ Process index: 0
12
+ Local process index: 0
13
+ Device: cuda:0
14
+
15
+ Mixed precision type: fp16
16
+
17
+ 07/24/2024 16:22:31 - WARNING - huggingface_hub.repository - /dli/gptesla-small/./ is already a clone of https://huggingface.co/shng2025/gptesla-small. Make sure you pull the latest changes with `repo.git_pull()`.
18
+ 07/24/2024 16:22:31 - WARNING - huggingface_hub.repository - Revision `dashing-violet-117` does not exist. Created and checked out branch `dashing-violet-117`.
19
+ 07/24/2024 16:22:31 - WARNING - huggingface_hub.repository -
20
+ 07/24/2024 16:22:32 - DEBUG - datasets.utils._dataset_viewer - Dataset info for shng2025/gptesla-train is not completely ready yet.
21
+ 07/24/2024 16:22:32 - INFO - datasets.builder - No config specified, defaulting to the single config: gptesla-train/default
22
+ 07/24/2024 16:22:32 - INFO - datasets.info - Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json
23
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#0, ': Starting to iterate over 2/183 shards.
24
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#1, ': Starting to iterate over 2/183 shards.
25
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#2, ': Starting to iterate over 2/183 shards.
26
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#4, ': Starting to iterate over 2/183 shards.
27
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#3, ': Starting to iterate over 2/183 shards.
28
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#5, ': Starting to iterate over 2/183 shards.
29
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#6, ': Starting to iterate over 2/183 shards.
30
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#7, ': Starting to iterate over 2/183 shards.
31
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#8, ': Starting to iterate over 2/183 shards.
32
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#12, ': Starting to iterate over 2/183 shards.
33
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#21, ': Starting to iterate over 2/183 shards.
34
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#10, ': Starting to iterate over 2/183 shards.
35
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#13, ': Starting to iterate over 2/183 shards.
36
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#19, ': Starting to iterate over 2/183 shards.
37
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#11, ': Starting to iterate over 2/183 shards.
38
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#22, ': Starting to iterate over 2/183 shards.
39
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#15, ': Starting to iterate over 2/183 shards.
40
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#24, ': Starting to iterate over 2/183 shards.
41
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#23, ': Starting to iterate over 2/183 shards.
42
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#9, ': Starting to iterate over 2/183 shards.
43
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#17, ': Starting to iterate over 2/183 shards.
44
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#18, ': Starting to iterate over 2/183 shards.
45
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#14, ': Starting to iterate over 2/183 shards.
46
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#16, ': Starting to iterate over 2/183 shards.
47
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#20, ': Starting to iterate over 2/183 shards.
48
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#26, ': Starting to iterate over 2/183 shards.
49
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#28, ': Starting to iterate over 2/183 shards.
50
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#27, ': Starting to iterate over 2/183 shards.
51
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#25, ': Starting to iterate over 2/183 shards.
52
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#30, ': Starting to iterate over 2/183 shards.
53
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#29, ': Starting to iterate over 2/183 shards.
54
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#31, ': Starting to iterate over 2/183 shards.
55
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#32, ': Starting to iterate over 2/183 shards.
56
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#33, ': Starting to iterate over 2/183 shards.
57
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#34, ': Starting to iterate over 2/183 shards.
58
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#35, ': Starting to iterate over 2/183 shards.
59
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#36, ': Starting to iterate over 2/183 shards.
60
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#37, ': Starting to iterate over 2/183 shards.
61
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#38, ': Starting to iterate over 2/183 shards.
62
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#39, ': Starting to iterate over 2/183 shards.
63
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#40, ': Starting to iterate over 2/183 shards.
64
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#41, ': Starting to iterate over 2/183 shards.
65
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#42, ': Starting to iterate over 2/183 shards.
66
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#43, ': Starting to iterate over 2/183 shards.
67
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#44, ': Starting to iterate over 2/183 shards.
68
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#45, ': Starting to iterate over 2/183 shards.
69
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#46, ': Starting to iterate over 2/183 shards.
70
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#47, ': Starting to iterate over 2/183 shards.
71
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#48, ': Starting to iterate over 2/183 shards.
72
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#49, ': Starting to iterate over 2/183 shards.
73
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#51, ': Starting to iterate over 2/183 shards.
74
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#50, ': Starting to iterate over 2/183 shards.
75
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#52, ': Starting to iterate over 2/183 shards.
76
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#53, ': Starting to iterate over 2/183 shards.
77
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#54, ': Starting to iterate over 2/183 shards.
78
+ 07/24/2024 16:22:38 - DEBUG - datasets.iterable_dataset - dataloader worker#56, ': Starting to iterate over 2/183 shards.
79
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#55, ': Starting to iterate over 2/183 shards.
80
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#57, ': Starting to iterate over 2/183 shards.
81
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#58, ': Starting to iterate over 2/183 shards.
82
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#60, ': Starting to iterate over 2/183 shards.
83
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#61, ': Starting to iterate over 2/183 shards.
84
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#62, ': Starting to iterate over 2/183 shards.
85
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#63, ': Starting to iterate over 2/183 shards.
86
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#59, ': Starting to iterate over 2/183 shards.
87
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#64, ': Starting to iterate over 2/183 shards.
88
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#65, ': Starting to iterate over 2/183 shards.
89
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#66, ': Starting to iterate over 2/183 shards.
90
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#67, ': Starting to iterate over 2/183 shards.
91
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#68, ': Starting to iterate over 2/183 shards.
92
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#69, ': Starting to iterate over 2/183 shards.
93
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#70, ': Starting to iterate over 2/183 shards.
94
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#71, ': Starting to iterate over 2/183 shards.
95
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#72, ': Starting to iterate over 2/183 shards.
96
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#73, ': Starting to iterate over 2/183 shards.
97
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#74, ': Starting to iterate over 2/183 shards.
98
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#75, ': Starting to iterate over 2/183 shards.
99
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#76, ': Starting to iterate over 2/183 shards.
100
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#77, ': Starting to iterate over 2/183 shards.
101
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#78, ': Starting to iterate over 2/183 shards.
102
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#80, ': Starting to iterate over 2/183 shards.
103
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#79, ': Starting to iterate over 2/183 shards.
104
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#81, ': Starting to iterate over 2/183 shards.
105
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#82, ': Starting to iterate over 2/183 shards.
106
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#83, ': Starting to iterate over 2/183 shards.
107
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#84, ': Starting to iterate over 2/183 shards.
108
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#85, ': Starting to iterate over 2/183 shards.
109
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#86, ': Starting to iterate over 2/183 shards.
110
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#88, ': Starting to iterate over 1/183 shards.
111
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#87, ': Starting to iterate over 1/183 shards.
112
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#89, ': Starting to iterate over 1/183 shards.
113
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#91, ': Starting to iterate over 1/183 shards.
114
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#90, ': Starting to iterate over 1/183 shards.
115
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#92, ': Starting to iterate over 1/183 shards.
116
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#93, ': Starting to iterate over 1/183 shards.
117
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#94, ': Starting to iterate over 1/183 shards.
118
+ 07/24/2024 16:22:39 - DEBUG - datasets.iterable_dataset - dataloader worker#95, ': Starting to iterate over 1/183 shards.
119
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486023 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
120
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486023 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
121
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497218 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
122
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485912 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
123
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485912 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
124
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10668116 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
125
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485847 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
126
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10512203 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
127
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10495973 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
128
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488098 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
129
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485842 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
130
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10487725 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
131
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486172 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
132
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10522596 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
133
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10949076 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
134
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488651 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
135
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10536479 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
136
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10489575 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
137
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485918 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
138
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488651 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
139
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10610581 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
140
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10511500 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
141
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486276 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
142
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488385 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
143
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10487790 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
144
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10509262 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
145
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10492861 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
146
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10492861 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
147
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10498167 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
148
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10491327 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
149
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10525688 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
150
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10553677 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
151
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10489599 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
152
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 11115863 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
153
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486801 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
154
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10640425 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
155
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10487482 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
156
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10530453 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
157
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10492277 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
158
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10553677 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
159
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10863935 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
160
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10686322 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
161
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10492554 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
162
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10493913 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
163
+ 07/24/2024 16:22:39 - DEBUG - datasets.packaged_modules.json.json - Batch of 10489635 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
164
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10500290 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
165
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10495520 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
166
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497335 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
167
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486397 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
168
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10676628 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
169
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10495520 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
170
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 11286262 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
171
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10751338 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
172
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 11286262 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
173
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10491889 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
174
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10621496 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
175
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10500930 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
176
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10487097 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
177
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10598254 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
178
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497111 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
179
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486616 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
180
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497062 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
181
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497111 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
182
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486616 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
183
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10515063 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
184
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10499607 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
185
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10511515 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
186
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488608 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
187
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10562022 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
188
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10491272 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
189
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10525926 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
190
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488150 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
191
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10499106 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
192
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10509286 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
193
+ 07/24/2024 16:22:40 - DEBUG - datasets.packaged_modules.json.json - Batch of 10501535 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
194
+ 07/24/2024 16:22:41 - DEBUG - datasets.packaged_modules.json.json - Batch of 10509286 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
195
+ 07/24/2024 16:22:41 - DEBUG - datasets.packaged_modules.json.json - Batch of 10552417 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
196
+ 07/24/2024 16:22:41 - DEBUG - datasets.packaged_modules.json.json - Batch of 10491547 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
197
+ 07/24/2024 16:22:41 - DEBUG - datasets.packaged_modules.json.json - Batch of 10552417 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
198
+ 07/24/2024 16:22:41 - DEBUG - datasets.packaged_modules.json.json - Batch of 10511604 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
199
+ 07/24/2024 16:22:57 - INFO - __main__ - Step 1: {'lr': 0.0, 'samples': 48, 'steps': 0, 'loss/train': 10.554669380187988}
200
+ 07/24/2024 16:22:58 - INFO - __main__ - Step 2: {'lr': 7.142857142857143e-07, 'samples': 96, 'steps': 1, 'loss/train': 10.494059562683105}
201
+ 07/24/2024 16:22:58 - INFO - __main__ - Step 3: {'lr': 1.4285714285714286e-06, 'samples': 144, 'steps': 2, 'loss/train': 10.507988929748535}
202
+ 07/24/2024 16:22:58 - INFO - __main__ - Step 4: {'lr': 2.142857142857143e-06, 'samples': 192, 'steps': 3, 'loss/train': 10.415447235107422}
203
+ 07/24/2024 16:22:58 - INFO - __main__ - Step 5: {'lr': 2.8571428571428573e-06, 'samples': 240, 'steps': 4, 'loss/train': 10.345850944519043}
204
+ 07/24/2024 16:22:59 - INFO - __main__ - Step 6: {'lr': 3.5714285714285714e-06, 'samples': 288, 'steps': 5, 'loss/train': 10.195524215698242}
205
+ 07/24/2024 16:22:59 - INFO - __main__ - Step 7: {'lr': 4.285714285714286e-06, 'samples': 336, 'steps': 6, 'loss/train': 10.09341812133789}
206
+ 07/24/2024 16:22:59 - INFO - __main__ - Step 8: {'lr': 5e-06, 'samples': 384, 'steps': 7, 'loss/train': 9.965239524841309}
207
+ 07/24/2024 16:22:59 - INFO - __main__ - Step 9: {'lr': 5.7142857142857145e-06, 'samples': 432, 'steps': 8, 'loss/train': 9.698853492736816}
208
+ 07/24/2024 16:23:00 - INFO - __main__ - Step 10: {'lr': 6.428571428571429e-06, 'samples': 480, 'steps': 9, 'loss/train': 9.80683708190918}
209
+ 07/24/2024 16:23:00 - INFO - __main__ - Step 11: {'lr': 7.142857142857143e-06, 'samples': 528, 'steps': 10, 'loss/train': 9.633079528808594}
210
+ 07/24/2024 16:23:00 - INFO - __main__ - Step 12: {'lr': 7.857142857142858e-06, 'samples': 576, 'steps': 11, 'loss/train': 9.700591087341309}
211
+ 07/24/2024 16:23:01 - INFO - __main__ - Step 13: {'lr': 8.571428571428573e-06, 'samples': 624, 'steps': 12, 'loss/train': 9.603139877319336}
212
+ 07/24/2024 16:23:01 - INFO - __main__ - Step 14: {'lr': 9.285714285714286e-06, 'samples': 672, 'steps': 13, 'loss/train': 9.30308723449707}
213
+ 07/24/2024 16:23:01 - INFO - __main__ - Step 15: {'lr': 1e-05, 'samples': 720, 'steps': 14, 'loss/train': 9.333526611328125}
214
+ 07/24/2024 16:23:01 - INFO - __main__ - Step 16: {'lr': 1.0714285714285714e-05, 'samples': 768, 'steps': 15, 'loss/train': 8.336181640625}
215
+ 07/24/2024 16:23:02 - INFO - __main__ - Step 17: {'lr': 1.1428571428571429e-05, 'samples': 816, 'steps': 16, 'loss/train': 9.075631141662598}
216
+ 07/24/2024 16:23:02 - INFO - __main__ - Step 18: {'lr': 1.2142857142857142e-05, 'samples': 864, 'steps': 17, 'loss/train': 9.18478012084961}
217
+ 07/24/2024 16:23:02 - INFO - __main__ - Step 19: {'lr': 1.2857142857142857e-05, 'samples': 912, 'steps': 18, 'loss/train': 8.96328353881836}
218
+ 07/24/2024 16:23:03 - INFO - __main__ - Step 20: {'lr': 1.3571428571428572e-05, 'samples': 960, 'steps': 19, 'loss/train': 9.45018196105957}
219
+ 07/24/2024 16:23:03 - INFO - __main__ - Step 21: {'lr': 1.4285714285714285e-05, 'samples': 1008, 'steps': 20, 'loss/train': 8.517333984375}
220
+ 07/24/2024 16:23:03 - INFO - __main__ - Step 22: {'lr': 1.5e-05, 'samples': 1056, 'steps': 21, 'loss/train': 9.207684516906738}
221
+ 07/24/2024 16:23:03 - INFO - __main__ - Step 23: {'lr': 1.5714285714285715e-05, 'samples': 1104, 'steps': 22, 'loss/train': 8.681092262268066}
222
+ 07/24/2024 16:23:04 - INFO - __main__ - Step 24: {'lr': 1.642857142857143e-05, 'samples': 1152, 'steps': 23, 'loss/train': 8.316036224365234}
223
+ 07/24/2024 16:23:04 - INFO - __main__ - Step 25: {'lr': 1.7142857142857145e-05, 'samples': 1200, 'steps': 24, 'loss/train': 8.944169044494629}
224
+ 07/24/2024 16:23:04 - INFO - __main__ - Step 26: {'lr': 1.7857142857142855e-05, 'samples': 1248, 'steps': 25, 'loss/train': 8.878201484680176}
225
+ 07/24/2024 16:23:05 - INFO - __main__ - Step 27: {'lr': 1.8571428571428572e-05, 'samples': 1296, 'steps': 26, 'loss/train': 9.158102989196777}
226
+ 07/24/2024 16:23:05 - INFO - __main__ - Step 28: {'lr': 1.9285714285714285e-05, 'samples': 1344, 'steps': 27, 'loss/train': 9.14354419708252}
227
+ 07/24/2024 16:23:05 - INFO - __main__ - Step 29: {'lr': 2e-05, 'samples': 1392, 'steps': 28, 'loss/train': 8.860624313354492}
228
+ 07/24/2024 16:23:05 - INFO - __main__ - Step 30: {'lr': 2.0714285714285715e-05, 'samples': 1440, 'steps': 29, 'loss/train': 8.876450538635254}
229
+ 07/24/2024 16:23:06 - INFO - __main__ - Step 31: {'lr': 2.1428571428571428e-05, 'samples': 1488, 'steps': 30, 'loss/train': 8.425738334655762}
230
+ 07/24/2024 16:23:06 - INFO - __main__ - Step 32: {'lr': 2.214285714285714e-05, 'samples': 1536, 'steps': 31, 'loss/train': 8.942279815673828}
231
+ 07/24/2024 16:23:06 - INFO - __main__ - Step 33: {'lr': 2.2857142857142858e-05, 'samples': 1584, 'steps': 32, 'loss/train': 8.757084846496582}
232
+ 07/24/2024 16:23:06 - INFO - __main__ - Step 34: {'lr': 2.3571428571428575e-05, 'samples': 1632, 'steps': 33, 'loss/train': 8.699286460876465}
233
+ 07/24/2024 16:23:07 - INFO - __main__ - Step 35: {'lr': 2.4285714285714285e-05, 'samples': 1680, 'steps': 34, 'loss/train': 8.857367515563965}
234
+ 07/24/2024 16:23:07 - INFO - __main__ - Step 36: {'lr': 2.5e-05, 'samples': 1728, 'steps': 35, 'loss/train': 8.830195426940918}
235
+ 07/24/2024 16:23:07 - INFO - __main__ - Step 37: {'lr': 2.5714285714285714e-05, 'samples': 1776, 'steps': 36, 'loss/train': 8.944982528686523}
236
+ 07/24/2024 16:23:08 - INFO - __main__ - Step 38: {'lr': 2.642857142857143e-05, 'samples': 1824, 'steps': 37, 'loss/train': 8.670278549194336}
237
+ 07/24/2024 16:23:08 - INFO - __main__ - Step 39: {'lr': 2.7142857142857144e-05, 'samples': 1872, 'steps': 38, 'loss/train': 8.710525512695312}
238
+ 07/24/2024 16:23:08 - INFO - __main__ - Step 40: {'lr': 2.7857142857142858e-05, 'samples': 1920, 'steps': 39, 'loss/train': 7.902089595794678}
239
+ 07/24/2024 16:23:08 - INFO - __main__ - Step 41: {'lr': 2.857142857142857e-05, 'samples': 1968, 'steps': 40, 'loss/train': 8.400484085083008}
240
+ 07/24/2024 16:23:09 - INFO - __main__ - Step 42: {'lr': 2.9285714285714288e-05, 'samples': 2016, 'steps': 41, 'loss/train': 8.789310455322266}
241
+ 07/24/2024 16:23:09 - INFO - __main__ - Step 43: {'lr': 3e-05, 'samples': 2064, 'steps': 42, 'loss/train': 8.754344940185547}
242
+ 07/24/2024 16:23:09 - INFO - __main__ - Step 44: {'lr': 3.071428571428572e-05, 'samples': 2112, 'steps': 43, 'loss/train': 8.84192943572998}
243
+ 07/24/2024 16:23:10 - INFO - __main__ - Step 45: {'lr': 3.142857142857143e-05, 'samples': 2160, 'steps': 44, 'loss/train': 8.784793853759766}
244
+ 07/24/2024 16:23:10 - INFO - __main__ - Step 46: {'lr': 3.214285714285714e-05, 'samples': 2208, 'steps': 45, 'loss/train': 8.67403793334961}
245
+ 07/24/2024 16:23:10 - INFO - __main__ - Step 47: {'lr': 3.285714285714286e-05, 'samples': 2256, 'steps': 46, 'loss/train': 8.51427173614502}
246
+ 07/24/2024 16:23:10 - INFO - __main__ - Step 48: {'lr': 3.357142857142857e-05, 'samples': 2304, 'steps': 47, 'loss/train': 8.48193073272705}
247
+ 07/24/2024 16:23:11 - INFO - __main__ - Step 49: {'lr': 3.428571428571429e-05, 'samples': 2352, 'steps': 48, 'loss/train': 8.518038749694824}
248
+ 07/24/2024 16:23:11 - INFO - __main__ - Step 50: {'lr': 3.5000000000000004e-05, 'samples': 2400, 'steps': 49, 'loss/train': 8.63569450378418}
249
+ 07/24/2024 16:23:11 - INFO - __main__ - Evaluating and saving model checkpoint
250
+ 07/24/2024 16:23:14 - WARNING - datasets.iterable_dataset - Too many dataloader workers: 96 (max is dataset.n_shards=1). Stopping 95 dataloader workers.
251
+ 07/24/2024 16:23:14 - INFO - datasets.iterable_dataset - To parallelize data loading, we give each process some shards (or data sources) to process. Therefore it's unnecessary to have a number of workers greater than dataset.n_shards=1. To enable more parallelism, please split the dataset in more files than 1.
252
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#0, ': Starting to iterate over 1/1 shards.
253
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#3, ': Stopping... Number of dataset shards < num_workers (1<96).
254
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#2, ': Stopping... Number of dataset shards < num_workers (1<96).
255
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#1, ': Stopping... Number of dataset shards < num_workers (1<96).
256
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#4, ': Stopping... Number of dataset shards < num_workers (1<96).
257
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#5, ': Stopping... Number of dataset shards < num_workers (1<96).
258
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#6, ': Stopping... Number of dataset shards < num_workers (1<96).
259
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#7, ': Stopping... Number of dataset shards < num_workers (1<96).
260
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#8, ': Stopping... Number of dataset shards < num_workers (1<96).
261
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#9, ': Stopping... Number of dataset shards < num_workers (1<96).
262
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#10, ': Stopping... Number of dataset shards < num_workers (1<96).
263
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#11, ': Stopping... Number of dataset shards < num_workers (1<96).
264
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#12, ': Stopping... Number of dataset shards < num_workers (1<96).
265
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#13, ': Stopping... Number of dataset shards < num_workers (1<96).
266
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#14, ': Stopping... Number of dataset shards < num_workers (1<96).
267
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#15, ': Stopping... Number of dataset shards < num_workers (1<96).
268
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#17, ': Stopping... Number of dataset shards < num_workers (1<96).
269
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#16, ': Stopping... Number of dataset shards < num_workers (1<96).
270
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#18, ': Stopping... Number of dataset shards < num_workers (1<96).
271
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#19, ': Stopping... Number of dataset shards < num_workers (1<96).
272
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#20, ': Stopping... Number of dataset shards < num_workers (1<96).
273
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#21, ': Stopping... Number of dataset shards < num_workers (1<96).
274
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#22, ': Stopping... Number of dataset shards < num_workers (1<96).
275
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#23, ': Stopping... Number of dataset shards < num_workers (1<96).
276
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#24, ': Stopping... Number of dataset shards < num_workers (1<96).
277
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#26, ': Stopping... Number of dataset shards < num_workers (1<96).
278
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#27, ': Stopping... Number of dataset shards < num_workers (1<96).
279
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#28, ': Stopping... Number of dataset shards < num_workers (1<96).
280
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#29, ': Stopping... Number of dataset shards < num_workers (1<96).
281
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#25, ': Stopping... Number of dataset shards < num_workers (1<96).
282
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#30, ': Stopping... Number of dataset shards < num_workers (1<96).
283
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#31, ': Stopping... Number of dataset shards < num_workers (1<96).
284
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#32, ': Stopping... Number of dataset shards < num_workers (1<96).
285
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#33, ': Stopping... Number of dataset shards < num_workers (1<96).
286
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#34, ': Stopping... Number of dataset shards < num_workers (1<96).
287
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#35, ': Stopping... Number of dataset shards < num_workers (1<96).
288
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#36, ': Stopping... Number of dataset shards < num_workers (1<96).
289
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#37, ': Stopping... Number of dataset shards < num_workers (1<96).
290
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#38, ': Stopping... Number of dataset shards < num_workers (1<96).
291
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#39, ': Stopping... Number of dataset shards < num_workers (1<96).
292
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#40, ': Stopping... Number of dataset shards < num_workers (1<96).
293
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#41, ': Stopping... Number of dataset shards < num_workers (1<96).
294
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#42, ': Stopping... Number of dataset shards < num_workers (1<96).
295
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#43, ': Stopping... Number of dataset shards < num_workers (1<96).
296
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#44, ': Stopping... Number of dataset shards < num_workers (1<96).
297
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#45, ': Stopping... Number of dataset shards < num_workers (1<96).
298
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#46, ': Stopping... Number of dataset shards < num_workers (1<96).
299
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#47, ': Stopping... Number of dataset shards < num_workers (1<96).
300
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#48, ': Stopping... Number of dataset shards < num_workers (1<96).
301
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#49, ': Stopping... Number of dataset shards < num_workers (1<96).
302
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#50, ': Stopping... Number of dataset shards < num_workers (1<96).
303
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#51, ': Stopping... Number of dataset shards < num_workers (1<96).
304
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#52, ': Stopping... Number of dataset shards < num_workers (1<96).
305
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#53, ': Stopping... Number of dataset shards < num_workers (1<96).
306
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#54, ': Stopping... Number of dataset shards < num_workers (1<96).
307
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#55, ': Stopping... Number of dataset shards < num_workers (1<96).
308
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#56, ': Stopping... Number of dataset shards < num_workers (1<96).
309
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#57, ': Stopping... Number of dataset shards < num_workers (1<96).
310
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#58, ': Stopping... Number of dataset shards < num_workers (1<96).
311
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#59, ': Stopping... Number of dataset shards < num_workers (1<96).
312
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#60, ': Stopping... Number of dataset shards < num_workers (1<96).
313
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#61, ': Stopping... Number of dataset shards < num_workers (1<96).
314
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#62, ': Stopping... Number of dataset shards < num_workers (1<96).
315
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#63, ': Stopping... Number of dataset shards < num_workers (1<96).
316
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#64, ': Stopping... Number of dataset shards < num_workers (1<96).
317
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#66, ': Stopping... Number of dataset shards < num_workers (1<96).
318
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#65, ': Stopping... Number of dataset shards < num_workers (1<96).
319
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#67, ': Stopping... Number of dataset shards < num_workers (1<96).
320
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#68, ': Stopping... Number of dataset shards < num_workers (1<96).
321
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#69, ': Stopping... Number of dataset shards < num_workers (1<96).
322
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#70, ': Stopping... Number of dataset shards < num_workers (1<96).
323
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#71, ': Stopping... Number of dataset shards < num_workers (1<96).
324
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#72, ': Stopping... Number of dataset shards < num_workers (1<96).
325
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#73, ': Stopping... Number of dataset shards < num_workers (1<96).
326
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#74, ': Stopping... Number of dataset shards < num_workers (1<96).
327
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#75, ': Stopping... Number of dataset shards < num_workers (1<96).
328
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#76, ': Stopping... Number of dataset shards < num_workers (1<96).
329
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#77, ': Stopping... Number of dataset shards < num_workers (1<96).
330
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#78, ': Stopping... Number of dataset shards < num_workers (1<96).
331
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#80, ': Stopping... Number of dataset shards < num_workers (1<96).
332
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#79, ': Stopping... Number of dataset shards < num_workers (1<96).
333
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#81, ': Stopping... Number of dataset shards < num_workers (1<96).
334
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#82, ': Stopping... Number of dataset shards < num_workers (1<96).
335
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#83, ': Stopping... Number of dataset shards < num_workers (1<96).
336
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#84, ': Stopping... Number of dataset shards < num_workers (1<96).
337
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#85, ': Stopping... Number of dataset shards < num_workers (1<96).
338
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#86, ': Stopping... Number of dataset shards < num_workers (1<96).
339
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#87, ': Stopping... Number of dataset shards < num_workers (1<96).
340
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#88, ': Stopping... Number of dataset shards < num_workers (1<96).
341
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#89, ': Stopping... Number of dataset shards < num_workers (1<96).
342
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#90, ': Stopping... Number of dataset shards < num_workers (1<96).
343
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#91, ': Stopping... Number of dataset shards < num_workers (1<96).
344
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#92, ': Stopping... Number of dataset shards < num_workers (1<96).
345
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#93, ': Stopping... Number of dataset shards < num_workers (1<96).
346
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#94, ': Stopping... Number of dataset shards < num_workers (1<96).
347
+ 07/24/2024 16:23:14 - DEBUG - datasets.iterable_dataset - dataloader worker#95, ': Stopping... Number of dataset shards < num_workers (1<96).
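The warning and the long run of "Stopping..." lines above come from the evaluation dataloader: it was created with 96 workers, but the validation split is a single shard, so datasets shuts 95 of them down. A minimal sketch of the two remedies the library suggests, assuming a streaming validation set (the file names, batch size, and worker count are placeholders, not values from the script):

from datasets import load_dataset
from torch.utils.data import DataLoader

# Placeholder streaming dataset; in the real run this is the gptesla validation split,
# which is then wrapped in ConstantLengthDataset before reaching the DataLoader.
valid_ds = load_dataset("json", data_files="valid.jsonl", split="train", streaming=True)

# Remedy 1: cap the worker count at the number of shards so no worker sits idle.
num_workers = min(96, valid_ds.n_shards)   # n_shards == 1 here, so a single worker

# Remedy 2 (alternative): store the split as several files, e.g.
# data_files=["valid-00.jsonl", "valid-01.jsonl", ...], so n_shards grows and
# more workers can actually do useful work.

loader = DataLoader(valid_ds, batch_size=12, num_workers=num_workers)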
348
+ 07/24/2024 16:24:38 - INFO - __main__ - Distributed environment: MULTI_GPU Backend: nccl
349
+ Num processes: 4
350
+ Process index: 0
351
+ Local process index: 0
352
+ Device: cuda:0
353
+
354
+ Mixed precision type: fp16
355
+
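The block just above is the repr of accelerate's process state that the script logs at start-up: a 4-process NCCL run with fp16 mixed precision. A hedged sketch of how such a state comes about; the launch command in the comment is an assumption about how the job was started, not copied from the repo:

from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="fp16")
print(accelerator.state)   # prints the same summary: distributed type, backend, num processes, device
# Typically launched with something like:
#   accelerate launch --multi_gpu --num_processes 4 gptesla_training.py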
356
+ 07/24/2024 16:24:39 - WARNING - huggingface_hub.repository - /dli/gptesla-small/./ is already a clone of https://huggingface.co/shng2025/gptesla-small. Make sure you pull the latest changes with `repo.git_pull()`.
357
+ 07/24/2024 16:24:39 - WARNING - huggingface_hub.repository - Revision `faithful-thunder-118` does not exist. Created and checked out branch `faithful-thunder-118`.
358
+ 07/24/2024 16:24:39 - WARNING - huggingface_hub.repository -
359
+ 07/24/2024 16:24:40 - DEBUG - datasets.utils._dataset_viewer - Dataset info for shng2025/gptesla-train is not completely ready yet.
360
+ 07/24/2024 16:24:40 - INFO - datasets.builder - No config specified, defaulting to the single config: gptesla-train/default
361
+ 07/24/2024 16:24:40 - INFO - datasets.info - Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json
362
+ 07/24/2024 16:24:45 - DEBUG - datasets.iterable_dataset - dataloader worker#0, ': Starting to iterate over 2/183 shards.
363
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#5, ': Starting to iterate over 2/183 shards.
364
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#2, ': Starting to iterate over 2/183 shards.
365
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#6, ': Starting to iterate over 2/183 shards.
366
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#9, ': Starting to iterate over 2/183 shards.
367
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#10, ': Starting to iterate over 2/183 shards.
368
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#7, ': Starting to iterate over 2/183 shards.
369
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#18, ': Starting to iterate over 2/183 shards.
370
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#4, ': Starting to iterate over 2/183 shards.
371
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#23, ': Starting to iterate over 2/183 shards.
372
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#1, ': Starting to iterate over 2/183 shards.
373
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#21, ': Starting to iterate over 2/183 shards.
374
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#27, ': Starting to iterate over 2/183 shards.
375
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#8, ': Starting to iterate over 2/183 shards.
376
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#30, ': Starting to iterate over 2/183 shards.
377
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#12, ': Starting to iterate over 2/183 shards.
378
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#33, ': Starting to iterate over 2/183 shards.
379
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#24, ': Starting to iterate over 2/183 shards.
380
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#31, ': Starting to iterate over 2/183 shards.
381
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#32, ': Starting to iterate over 2/183 shards.
382
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#19, ': Starting to iterate over 2/183 shards.
383
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#39, ': Starting to iterate over 2/183 shards.
384
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#11, ': Starting to iterate over 2/183 shards.
385
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#20, ': Starting to iterate over 2/183 shards.
386
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#22, ': Starting to iterate over 2/183 shards.
387
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#25, ': Starting to iterate over 2/183 shards.
388
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#14, ': Starting to iterate over 2/183 shards.
389
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#40, ': Starting to iterate over 2/183 shards.
390
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#15, ': Starting to iterate over 2/183 shards.
391
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#13, ': Starting to iterate over 2/183 shards.
392
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#16, ': Starting to iterate over 2/183 shards.
393
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#41, ': Starting to iterate over 2/183 shards.
394
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#17, ': Starting to iterate over 2/183 shards.
395
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#29, ': Starting to iterate over 2/183 shards.
396
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#28, ': Starting to iterate over 2/183 shards.
397
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#37, ': Starting to iterate over 2/183 shards.
398
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#36, ': Starting to iterate over 2/183 shards.
399
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#35, ': Starting to iterate over 2/183 shards.
400
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#3, ': Starting to iterate over 2/183 shards.
401
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#26, ': Starting to iterate over 2/183 shards.
402
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#38, ': Starting to iterate over 2/183 shards.
403
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#34, ': Starting to iterate over 2/183 shards.
404
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#42, ': Starting to iterate over 2/183 shards.
405
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#43, ': Starting to iterate over 2/183 shards.
406
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#45, ': Starting to iterate over 2/183 shards.
407
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#46, ': Starting to iterate over 2/183 shards.
408
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#44, ': Starting to iterate over 2/183 shards.
409
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#47, ': Starting to iterate over 2/183 shards.
410
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#48, ': Starting to iterate over 2/183 shards.
411
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#49, ': Starting to iterate over 2/183 shards.
412
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#50, ': Starting to iterate over 2/183 shards.
413
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#52, ': Starting to iterate over 2/183 shards.
414
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#51, ': Starting to iterate over 2/183 shards.
415
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#53, ': Starting to iterate over 2/183 shards.
416
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#54, ': Starting to iterate over 2/183 shards.
417
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#55, ': Starting to iterate over 2/183 shards.
418
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#56, ': Starting to iterate over 2/183 shards.
419
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#57, ': Starting to iterate over 2/183 shards.
420
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#58, ': Starting to iterate over 2/183 shards.
421
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#59, ': Starting to iterate over 2/183 shards.
422
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#60, ': Starting to iterate over 2/183 shards.
423
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#61, ': Starting to iterate over 2/183 shards.
424
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#62, ': Starting to iterate over 2/183 shards.
425
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#64, ': Starting to iterate over 2/183 shards.
426
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#63, ': Starting to iterate over 2/183 shards.
427
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#65, ': Starting to iterate over 2/183 shards.
428
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#66, ': Starting to iterate over 2/183 shards.
429
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#67, ': Starting to iterate over 2/183 shards.
430
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#68, ': Starting to iterate over 2/183 shards.
431
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#69, ': Starting to iterate over 2/183 shards.
432
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#70, ': Starting to iterate over 2/183 shards.
433
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#72, ': Starting to iterate over 2/183 shards.
434
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#71, ': Starting to iterate over 2/183 shards.
435
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#73, ': Starting to iterate over 2/183 shards.
436
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#74, ': Starting to iterate over 2/183 shards.
437
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#75, ': Starting to iterate over 2/183 shards.
438
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#76, ': Starting to iterate over 2/183 shards.
439
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#77, ': Starting to iterate over 2/183 shards.
440
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#78, ': Starting to iterate over 2/183 shards.
441
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#80, ': Starting to iterate over 2/183 shards.
442
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#79, ': Starting to iterate over 2/183 shards.
443
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#81, ': Starting to iterate over 2/183 shards.
444
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#82, ': Starting to iterate over 2/183 shards.
445
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#83, ': Starting to iterate over 2/183 shards.
446
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#84, ': Starting to iterate over 2/183 shards.
447
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#85, ': Starting to iterate over 2/183 shards.
448
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#86, ': Starting to iterate over 2/183 shards.
449
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#87, ': Starting to iterate over 1/183 shards.
450
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#88, ': Starting to iterate over 1/183 shards.
451
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#89, ': Starting to iterate over 1/183 shards.
452
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#90, ': Starting to iterate over 1/183 shards.
453
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#91, ': Starting to iterate over 1/183 shards.
454
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#92, ': Starting to iterate over 1/183 shards.
455
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#93, ': Starting to iterate over 1/183 shards.
456
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#94, ': Starting to iterate over 1/183 shards.
457
+ 07/24/2024 16:24:46 - DEBUG - datasets.iterable_dataset - dataloader worker#95, ': Starting to iterate over 1/183 shards.
458
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486616 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
459
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486023 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
460
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10500930 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
461
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497111 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
462
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497062 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
463
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10499607 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
464
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486397 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
465
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10511500 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
466
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10525688 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
467
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486616 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
468
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10522596 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
469
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10489599 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
470
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10512203 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
471
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497218 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
472
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10949076 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
473
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497111 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
474
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486023 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
475
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10562022 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
476
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10491327 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
477
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488098 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
478
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10489575 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
479
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10668116 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
480
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10553677 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
481
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10553677 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
482
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485847 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
483
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10511604 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
484
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10487790 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
485
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10489635 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
486
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485912 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
487
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10552417 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
488
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488651 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
489
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485912 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
490
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10491272 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
491
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488651 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
492
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10552417 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
493
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10487097 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
494
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10509262 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
495
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10515063 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
496
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10598254 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
497
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10621496 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
498
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10501535 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
499
+ 07/24/2024 16:24:46 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488150 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
500
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10495973 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
501
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10492554 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
502
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10487725 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
503
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485918 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
504
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10499106 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
505
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10492277 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
506
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486276 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
507
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10491889 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
508
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 11286262 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
509
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486801 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
510
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 11286262 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
511
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10509286 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
512
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488385 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
513
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10509286 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
514
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10536479 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
515
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10488608 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
516
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10485842 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
517
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10500290 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
518
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10492861 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
519
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10492861 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
520
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10511515 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
521
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10491547 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
522
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10676628 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
523
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10497335 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
524
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10610581 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
525
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10486172 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
526
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10495520 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
527
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10495520 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
528
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10487482 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
529
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10686322 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
530
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 11115863 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
531
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10498167 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
532
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10530453 bytes couldn't be parsed with block_size=655360. Retrying with block_size=1310720.
533
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10493913 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
534
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10525926 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
535
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10863935 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
536
+ 07/24/2024 16:24:47 - DEBUG - datasets.packaged_modules.json.json - Batch of 10640425 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
537
+ 07/24/2024 16:24:48 - DEBUG - datasets.packaged_modules.json.json - Batch of 10751338 bytes couldn't be parsed with block_size=327680. Retrying with block_size=655360.
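The burst of DEBUG messages above is the JSON reader recovering from records that straddle a read block: each time a batch fails to parse, the block size is doubled (327680 -> 655360 -> 1310720) and the read is retried. A rough sketch of that retry pattern using pyarrow directly; this mirrors the behaviour described in the log, not the datasets library's internal code, and the size cap is an assumption:

import io
import pyarrow as pa
import pyarrow.json as paj

def read_json_block(raw: bytes, block_size: int = 327_680, max_block_size: int = 16 * 1024 * 1024) -> pa.Table:
    while True:
        try:
            return paj.read_json(io.BytesIO(raw), read_options=paj.ReadOptions(block_size=block_size))
        except pa.ArrowInvalid:
            if block_size >= max_block_size:
                raise
            block_size *= 2   # same doubling the log reports: 327680 -> 655360 -> 1310720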
538
+ 07/24/2024 16:25:03 - INFO - __main__ - Step 1: {'lr': 0.0, 'samples': 48, 'steps': 0, 'loss/train': 10.554669380187988}
539
+ 07/24/2024 16:25:04 - INFO - __main__ - Step 2: {'lr': 7.142857142857143e-07, 'samples': 96, 'steps': 1, 'loss/train': 10.494059562683105}
540
+ 07/24/2024 16:25:04 - INFO - __main__ - Step 3: {'lr': 1.4285714285714286e-06, 'samples': 144, 'steps': 2, 'loss/train': 10.507988929748535}
541
+ 07/24/2024 16:25:04 - INFO - __main__ - Step 4: {'lr': 2.142857142857143e-06, 'samples': 192, 'steps': 3, 'loss/train': 10.415447235107422}
542
+ 07/24/2024 16:25:05 - INFO - __main__ - Step 5: {'lr': 2.8571428571428573e-06, 'samples': 240, 'steps': 4, 'loss/train': 10.345850944519043}
543
+ 07/24/2024 16:25:05 - INFO - __main__ - Step 6: {'lr': 3.5714285714285714e-06, 'samples': 288, 'steps': 5, 'loss/train': 10.195524215698242}
544
+ 07/24/2024 16:25:05 - INFO - __main__ - Step 7: {'lr': 4.285714285714286e-06, 'samples': 336, 'steps': 6, 'loss/train': 10.09341812133789}
545
+ 07/24/2024 16:25:06 - INFO - __main__ - Step 8: {'lr': 5e-06, 'samples': 384, 'steps': 7, 'loss/train': 9.965239524841309}
546
+ 07/24/2024 16:25:06 - INFO - __main__ - Step 9: {'lr': 5.7142857142857145e-06, 'samples': 432, 'steps': 8, 'loss/train': 9.698853492736816}
547
+ 07/24/2024 16:25:06 - INFO - __main__ - Step 10: {'lr': 6.428571428571429e-06, 'samples': 480, 'steps': 9, 'loss/train': 9.80683708190918}
548
+ 07/24/2024 16:25:06 - INFO - __main__ - Step 11: {'lr': 7.142857142857143e-06, 'samples': 528, 'steps': 10, 'loss/train': 9.633079528808594}
549
+ 07/24/2024 16:25:07 - INFO - __main__ - Step 12: {'lr': 7.857142857142858e-06, 'samples': 576, 'steps': 11, 'loss/train': 9.700591087341309}
550
+ 07/24/2024 16:25:07 - INFO - __main__ - Step 13: {'lr': 8.571428571428573e-06, 'samples': 624, 'steps': 12, 'loss/train': 9.603139877319336}
551
+ 07/24/2024 16:25:07 - INFO - __main__ - Step 14: {'lr': 9.285714285714286e-06, 'samples': 672, 'steps': 13, 'loss/train': 9.30308723449707}
552
+ 07/24/2024 16:25:08 - INFO - __main__ - Step 15: {'lr': 1e-05, 'samples': 720, 'steps': 14, 'loss/train': 9.333526611328125}
553
+ 07/24/2024 16:25:08 - INFO - __main__ - Step 16: {'lr': 1.0714285714285714e-05, 'samples': 768, 'steps': 15, 'loss/train': 8.336181640625}
554
+ 07/24/2024 16:25:08 - INFO - __main__ - Step 17: {'lr': 1.1428571428571429e-05, 'samples': 816, 'steps': 16, 'loss/train': 9.075631141662598}
555
+ 07/24/2024 16:25:08 - INFO - __main__ - Step 18: {'lr': 1.2142857142857142e-05, 'samples': 864, 'steps': 17, 'loss/train': 9.18478012084961}
556
+ 07/24/2024 16:25:09 - INFO - __main__ - Step 19: {'lr': 1.2857142857142857e-05, 'samples': 912, 'steps': 18, 'loss/train': 8.96328353881836}
557
+ 07/24/2024 16:25:09 - INFO - __main__ - Step 20: {'lr': 1.3571428571428572e-05, 'samples': 960, 'steps': 19, 'loss/train': 9.45018196105957}
558
+ 07/24/2024 16:25:09 - INFO - __main__ - Step 21: {'lr': 1.4285714285714285e-05, 'samples': 1008, 'steps': 20, 'loss/train': 8.517333984375}
559
+ 07/24/2024 16:25:09 - INFO - __main__ - Step 22: {'lr': 1.5e-05, 'samples': 1056, 'steps': 21, 'loss/train': 9.207684516906738}
560
+ 07/24/2024 16:25:10 - INFO - __main__ - Step 23: {'lr': 1.5714285714285715e-05, 'samples': 1104, 'steps': 22, 'loss/train': 8.681092262268066}
561
+ 07/24/2024 16:25:10 - INFO - __main__ - Step 24: {'lr': 1.642857142857143e-05, 'samples': 1152, 'steps': 23, 'loss/train': 8.316036224365234}
562
+ 07/24/2024 16:25:10 - INFO - __main__ - Step 25: {'lr': 1.7142857142857145e-05, 'samples': 1200, 'steps': 24, 'loss/train': 8.944169044494629}
563
+ 07/24/2024 16:25:11 - INFO - __main__ - Step 26: {'lr': 1.7857142857142855e-05, 'samples': 1248, 'steps': 25, 'loss/train': 8.878201484680176}
564
+ 07/24/2024 16:25:11 - INFO - __main__ - Step 27: {'lr': 1.8571428571428572e-05, 'samples': 1296, 'steps': 26, 'loss/train': 9.158102989196777}
565
+ 07/24/2024 16:25:11 - INFO - __main__ - Step 28: {'lr': 1.9285714285714285e-05, 'samples': 1344, 'steps': 27, 'loss/train': 9.14354419708252}
566
+ 07/24/2024 16:25:11 - INFO - __main__ - Step 29: {'lr': 2e-05, 'samples': 1392, 'steps': 28, 'loss/train': 8.860624313354492}
567
+ 07/24/2024 16:25:12 - INFO - __main__ - Step 30: {'lr': 2.0714285714285715e-05, 'samples': 1440, 'steps': 29, 'loss/train': 8.876450538635254}
568
+ 07/24/2024 16:25:12 - INFO - __main__ - Step 31: {'lr': 2.1428571428571428e-05, 'samples': 1488, 'steps': 30, 'loss/train': 8.425738334655762}
569
+ 07/24/2024 16:25:12 - INFO - __main__ - Step 32: {'lr': 2.214285714285714e-05, 'samples': 1536, 'steps': 31, 'loss/train': 8.942279815673828}
570
+ 07/24/2024 16:25:13 - INFO - __main__ - Step 33: {'lr': 2.2857142857142858e-05, 'samples': 1584, 'steps': 32, 'loss/train': 8.757084846496582}
571
+ 07/24/2024 16:25:13 - INFO - __main__ - Step 34: {'lr': 2.3571428571428575e-05, 'samples': 1632, 'steps': 33, 'loss/train': 8.699286460876465}
572
+ 07/24/2024 16:25:13 - INFO - __main__ - Step 35: {'lr': 2.4285714285714285e-05, 'samples': 1680, 'steps': 34, 'loss/train': 8.857367515563965}
573
+ 07/24/2024 16:25:13 - INFO - __main__ - Step 36: {'lr': 2.5e-05, 'samples': 1728, 'steps': 35, 'loss/train': 8.830195426940918}
574
+ 07/24/2024 16:25:14 - INFO - __main__ - Step 37: {'lr': 2.5714285714285714e-05, 'samples': 1776, 'steps': 36, 'loss/train': 8.944982528686523}
575
+ 07/24/2024 16:25:14 - INFO - __main__ - Step 38: {'lr': 2.642857142857143e-05, 'samples': 1824, 'steps': 37, 'loss/train': 8.670278549194336}
576
+ 07/24/2024 16:25:14 - INFO - __main__ - Step 39: {'lr': 2.7142857142857144e-05, 'samples': 1872, 'steps': 38, 'loss/train': 8.710525512695312}
577
+ 07/24/2024 16:25:14 - INFO - __main__ - Step 40: {'lr': 2.7857142857142858e-05, 'samples': 1920, 'steps': 39, 'loss/train': 7.902089595794678}
578
+ 07/24/2024 16:25:15 - INFO - __main__ - Step 41: {'lr': 2.857142857142857e-05, 'samples': 1968, 'steps': 40, 'loss/train': 8.400484085083008}
579
+ 07/24/2024 16:25:15 - INFO - __main__ - Step 42: {'lr': 2.9285714285714288e-05, 'samples': 2016, 'steps': 41, 'loss/train': 8.789310455322266}
580
+ 07/24/2024 16:25:15 - INFO - __main__ - Step 43: {'lr': 3e-05, 'samples': 2064, 'steps': 42, 'loss/train': 8.754344940185547}
581
+ 07/24/2024 16:25:16 - INFO - __main__ - Step 44: {'lr': 3.071428571428572e-05, 'samples': 2112, 'steps': 43, 'loss/train': 8.84192943572998}
582
+ 07/24/2024 16:25:16 - INFO - __main__ - Step 45: {'lr': 3.142857142857143e-05, 'samples': 2160, 'steps': 44, 'loss/train': 8.784793853759766}
583
+ 07/24/2024 16:25:16 - INFO - __main__ - Step 46: {'lr': 3.214285714285714e-05, 'samples': 2208, 'steps': 45, 'loss/train': 8.67403793334961}
584
+ 07/24/2024 16:25:16 - INFO - __main__ - Step 47: {'lr': 3.285714285714286e-05, 'samples': 2256, 'steps': 46, 'loss/train': 8.51427173614502}
585
+ 07/24/2024 16:25:17 - INFO - __main__ - Step 48: {'lr': 3.357142857142857e-05, 'samples': 2304, 'steps': 47, 'loss/train': 8.48193073272705}
586
+ 07/24/2024 16:25:17 - INFO - __main__ - Step 49: {'lr': 3.428571428571429e-05, 'samples': 2352, 'steps': 48, 'loss/train': 8.518038749694824}
587
+ 07/24/2024 16:25:17 - INFO - __main__ - Step 50: {'lr': 3.5000000000000004e-05, 'samples': 2400, 'steps': 49, 'loss/train': 8.63569450378418}
588
+ 07/24/2024 16:25:17 - INFO - __main__ - Evaluating and saving model checkpoint
589
+ 07/24/2024 16:25:18 - DEBUG - datasets.iterable_dataset - dataloader worker#0, ': Starting to iterate over 1/1 shards.
590
+ 07/24/2024 16:25:21 - INFO - __main__ - Step 50: {'loss/eval': 8.551246643066406, 'perplexity': 5173.19970703125}
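The final evaluation line reports both the eval loss and perplexity; the latter is simply the exponential of the former, which is easy to confirm:

import math

eval_loss = 8.551246643066406
print(math.exp(eval_loss))   # ~5173.2, matching the logged 'perplexity': 5173.19970703125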
log/debug_1.log ADDED
File without changes
log/debug_2.log ADDED
File without changes
log/debug_3.log ADDED
File without changes
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3e5bfbaf0ff37c9cbbbf37dfdf848aaee9b060a6a19f3c79d815806112c6c2f1
+ oid sha256:7eb13d61f8d3f9cb945838abd62b274c030a665710e11a81e40523fd167676e8
  size 444048000
runs/Jul24_16-20-48_lab/1721838048.2686536/events.out.tfevents.1721838048.lab.4218.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:facfdf2c3e0b75285a572d57f60dccd04b0da7628fe0d7ee3edbf792f86aec9c
+ size 1702
runs/Jul24_16-20-48_lab/events.out.tfevents.1721838048.lab.4218.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c5ae1e09bc1bd05e1631230336ed65fbb1438108cc749375436a78ebcb9d586
+ size 88
runs/Jul24_16-22-30_lab/1721838150.6920006/events.out.tfevents.1721838150.lab.4428.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93b9c0e640f1010cdec2a74742559254caff20d48bfe2c663a311a87e0d98d51
+ size 1702
runs/Jul24_16-22-30_lab/events.out.tfevents.1721838150.lab.4428.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01a1d6befcf3fa6cf43782909cf53836226eabf5ee6f94726ef2d92ace277cc5
+ size 8888
runs/Jul24_16-24-38_lab/1721838278.570936/events.out.tfevents.1721838278.lab.56535.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e78429e33247b62288e7325a0d71f9fbdea950562fa19331ac798d77e2dbad7
+ size 1702
runs/Jul24_16-24-38_lab/events.out.tfevents.1721838278.lab.56535.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8c21884bb840d99643a3170e86e82a585cb5a06aa164511e08c9e903a04adca
+ size 8983