lvwerra (HF staff) committed
Commit d787f60
1 Parent(s): ea70f93

step 50000

.gitattributes CHANGED
@@ -15,3 +15,6 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/debug-internal.log filter=lfs diff=lfs merge=lfs -text
+wandb/run-20210920_142810-36cw69uv/logs/debug-internal.log filter=lfs diff=lfs merge=lfs -text
+wandb/run-20210920_142810-36cw69uv/run-36cw69uv.wandb filter=lfs diff=lfs merge=lfs -text
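
(For reference, each added pattern is the line that `git lfs track "<path>"` writes into .gitattributes, so the large wandb log and run files are stored via Git LFS rather than as regular git objects.)
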
codeparrot_training.py CHANGED
@@ -12,24 +12,22 @@ from argparse import Namespace
 import torch
 import logging
 import wandb
-import time
-

 class ConstantLengthDataset(IterableDataset):
+
     def __init__(self, tokenizer, dataset, seq_length=1024,
                  num_of_sequences=1024, chars_per_token=3.6):
         self.tokenizer = tokenizer
-        self.concatenation_token_id = tokenizer.bos_token_id
+        self.concat_token_id = tokenizer.bos_token_id
         self.dataset = dataset
         self.seq_length = seq_length
         self.input_characters = seq_length * chars_per_token * num_of_sequences
-        self.produced_samples = 0
+
     def __iter__(self):
         iterator = iter(self.dataset)
         more_examples = True
         while more_examples:
-            buffer = []
-            buffer_len = 0
+            buffer, buffer_len = [], 0
             while True:
                 if buffer_len >= self.input_characters:
                     break
@@ -42,7 +40,7 @@ class ConstantLengthDataset(IterableDataset):
             tokenized_inputs = tokenizer(buffer, truncation=False)['input_ids']
             all_token_ids = []
             for tokenized_input in tokenized_inputs:
-                all_token_ids.extend(tokenized_input + [self.concatenation_token_id])
+                all_token_ids.extend(tokenized_input + [self.concat_token_id])
             for i in range(0, len(all_token_ids), self.seq_length):
                 input_ids = all_token_ids[i : i + self.seq_length]
                 if len(input_ids) == self.seq_length:
@@ -52,14 +50,16 @@ def setup_logging(project_name):
     logger = logging.getLogger(__name__)
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,)
+        datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, handlers=[
+        logging.FileHandler(f"log/debug_{accelerator.process_index}.log"),
+        logging.StreamHandler()])
     if accelerator.is_main_process: # we only want to setup logging once
         wandb.init(project=project_name, config=args)
         run_name = wandb.run.name
         tb_writer = SummaryWriter()
         tb_writer.add_hparams(vars(args), {'0': 0})
         logger.setLevel(logging.INFO)
-        datasets.utils.logging.set_verbosity_warning()
+        datasets.utils.logging.set_verbosity_info()
         transformers.utils.logging.set_verbosity_info()
     else:
         tb_writer = None
@@ -69,13 +69,12 @@ def setup_logging(project_name):
         transformers.utils.logging.set_verbosity_error()
     return logger, tb_writer, run_name

-def create_dataloaders(dataset_name):
-    train_data = load_dataset(dataset_name+'-train', split="train",
-                              streaming=True)
+def create_dataloaders(dataset_name, args):
+    ds_kwargs = {"streaming":True, "chunksize":40<<20, "error_bad_chunk":False}
+    train_data = load_dataset(dataset_name+'-train', split='train', **ds_kwargs)
     train_data = train_data.shuffle(buffer_size=args.shuffle_buffer,
                                     seed=args.seed)
-    valid_data = load_dataset(dataset_name+'-valid', split="train",
-                              streaming=True)
+    valid_data = load_dataset(dataset_name+'-valid', split="train", **ds_kwargs)
     train_dataset = ConstantLengthDataset(tokenizer, train_data,
                                           seq_length=args.seq_length)
     valid_dataset = ConstantLengthDataset(tokenizer, valid_data,
@@ -84,7 +83,7 @@ def create_dataloaders(dataset_name):
     eval_dataloader=DataLoader(valid_dataset, batch_size=args.valid_batch_size)
     return train_dataloader, eval_dataloader

-def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
+def get_grouped_params(model, args, no_decay=["bias", "LayerNorm.weight"]):
     params_with_wd, params_without_wd = [], []
     for n, p in model.named_parameters():
         if any(nd in n for nd in no_decay): params_without_wd.append(p)
@@ -98,7 +97,7 @@ def log_metrics(step, metrics):
         wandb.log(metrics)
         [tb_writer.add_scalar(k, v, step) for k, v in metrics.items()]

-def evaluate():
+def evaluate(args):
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
@@ -112,44 +111,44 @@ def evaluate():
    except OverflowError: perplexity = float("inf")
    return loss.item(), perplexity.item()

+# Accelerator
+accelerator = Accelerator(dispatch_batches=True)
+acc_state = {str(k): str(v) for k, v in accelerator.state.__dict__.items()}
 # Hyperparameters
 project_name = 'transformersbook/codeparrot'
-dataset_name = 'transformersbook/codeparrot'
-config = {"train_batch_size": 4,
-          "valid_batch_size": 4,
+dataset_name = '../codeparrot'
+config = {"train_batch_size": 2,
+          "valid_batch_size": 2,
           "weight_decay": 0.1,
-          "shuffle_buffer": 1000,
-          "learning_rate": 5e-4,
+          "shuffle_buffer": 1_000,
+          "learning_rate": 2e-4,
           "lr_scheduler_type": "cosine",
-          "num_warmup_steps": 1000,
-          "gradient_accumulation_steps": 2,
-          "max_train_steps": 24_000,
-          "max_eval_steps": 500,
+          "num_warmup_steps": 750,
+          "gradient_accumulation_steps": 16,
+          "max_train_steps": 50_000,
+          "max_eval_steps": -1,
           "seq_length": 1024,
           "seed": 1,
-          "save_checkpoint_steps":6_000,}
-args = Namespace(**config)
-set_seed(args.seed)
-
-# Accelerator
-accelerator = Accelerator()
+          "save_checkpoint_steps": 50_000}
+args = Namespace(**config, **acc_state)
 samples_per_step = accelerator.state.num_processes * args.train_batch_size
+set_seed(args.seed)

 # Logging
 logger, tb_writer, run_name = setup_logging(project_name.split("/")[1])
 logger.info(accelerator.state)

 # Load model and tokenizer
-if accelerator.is_main_process: # we only want to setup logging once
+if accelerator.is_main_process:
     hf_repo = Repository("./", clone_from=project_name, revision=run_name)
-model = GPT2LMHeadModel.from_pretrained("./")
+model = GPT2LMHeadModel.from_pretrained("./", gradient_checkpointing=True)
 tokenizer = AutoTokenizer.from_pretrained("./")

 # Load dataset and dataloader
-train_dataloader, eval_dataloader = create_dataloaders(dataset_name)
+train_dataloader, eval_dataloader = create_dataloaders(dataset_name, args)

 # Prepare the optimizer and learning rate scheduler
-optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
+optimizer = AdamW(get_grouped_params(model, args), lr=args.learning_rate)
 lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                              num_warmup_steps=args.num_warmup_steps,
                              num_training_steps=args.max_train_steps,)
@@ -162,24 +161,21 @@ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
 # Train model
 model.train()
 completed_steps = 0
-t0 = time.time()
 for step, batch in enumerate(train_dataloader, start=1):
-    t1 = time.time()
-    loss = model(batch, labels=batch).loss
-    t2 = time.time()
+    loss = model(batch, labels=batch, use_cache=False).loss
     log_metrics(step, {'lr': get_lr(), 'samples': step*samples_per_step,
                        'steps': completed_steps, 'loss/train': loss.item()})
     loss = loss / args.gradient_accumulation_steps
    accelerator.backward(loss)
-    t3 = time.time()
     if step % args.gradient_accumulation_steps == 0:
+        accelerator.clip_grad_norm_(model.parameters(), 1.0)
         optimizer.step()
         lr_scheduler.step()
         optimizer.zero_grad()
         completed_steps += 1
     if step % args.save_checkpoint_steps == 0:
         logger.info('Evaluating and saving model checkpoint')
-        eval_loss, perplexity = evaluate()
+        eval_loss, perplexity = evaluate(args)
         log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})
         accelerator.wait_for_everyone()
         unwrapped_model = accelerator.unwrap_model(model)
@@ -189,17 +185,13 @@ for step, batch in enumerate(train_dataloader, start=1):
         model.train()
     if completed_steps >= args.max_train_steps:
         break
-    t4 = time.time()
-    #logger.info(f'ITER: {t1-t0:.3f}, FRWD: {t2-t1:.3f}, BKWD: {t3-t2:.3f}, OPT: {t4-t3:.3f}, ALL: {t4-t0}')
-    t0 = time.time()

 # Evaluate and save the last checkpoint
 logger.info('Evaluating and saving model after training')
-eval_loss, perplexity = evaluate()
+eval_loss, perplexity = evaluate(args)
 log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})
 accelerator.wait_for_everyone()
 unwrapped_model = accelerator.unwrap_model(model)
 if accelerator.is_main_process:
     unwrapped_model.save_pretrained("./")
-    try: hf_repo.push_to_hub(commit_message=f'final model')
-    except: logger.info('No changes to previously saved model.')
+    hf_repo.push_to_hub(commit_message=f'final model')
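
Side note on the new update pattern: the loop above now scales each loss by gradient_accumulation_steps, clips gradients, and only steps the optimizer every 16 micro-batches. A minimal, self-contained sketch of that pattern (using a stand-in linear model and random batches rather than the repository's GPT-2 and data) looks roughly like this:

import torch
from torch import nn

ACCUM_STEPS = 16                        # mirrors "gradient_accumulation_steps" above
model = nn.Linear(8, 1)                 # stand-in for the GPT-2 model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

for step in range(1, 4 * ACCUM_STEPS + 1):
    x = torch.randn(2, 8)               # stand-in micro-batch (train_batch_size=2)
    loss = nn.functional.mse_loss(model(x), torch.zeros(2, 1))
    (loss / ACCUM_STEPS).backward()     # scale so accumulated grads average over micro-batches
    if step % ACCUM_STEPS == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # same 1.0 max-norm as the diff
        optimizer.step()
        optimizer.zero_grad()
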
 
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "_name_or_path": "./",
   "activation_function": "gelu_new",
   "architectures": [
     "GPT2LMHeadModel"
@@ -7,7 +8,7 @@
   "bos_token_id": 50256,
   "embd_pdrop": 0.1,
   "eos_token_id": 50256,
-  "gradient_checkpointing": false,
+  "gradient_checkpointing": true,
   "initializer_range": 0.02,
   "layer_norm_epsilon": 1e-05,
   "model_type": "gpt2",
log/debug_0.log ADDED
The diff for this file is too large to render. See raw diff
 
log/debug_1.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_10.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_11.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_12.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_13.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_14.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_15.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_2.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_3.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_4.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_5.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_6.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_7.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_8.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
log/debug_9.log ADDED
@@ -0,0 +1 @@
+09/20/2021 14:29:09 - INFO - root - Reducer buckets have been rebuilt in this iteration.
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f700cb9a50ef29578ed0b7d885e8b49208fbca9bdb9247ba9852ea6866d92e8b
+oid sha256:00ec35b14b049e5188c1ba8fd432ffa094b481d96393f02052b1c9a9fa4fdc2a
 size 6169094681
requirements.txt CHANGED
@@ -3,5 +3,5 @@ wandb
 tensorboard
 git+https://github.com/huggingface/huggingface_hub.git
 git+https://github.com/huggingface/transformers.git
-git+https://github.com/huggingface/datasets.git@load_dataset-no-dataset-script
+git+https://github.com/huggingface/datasets.git@json-dont-raise
 git+https://github.com/huggingface/accelerate.git
runs/Sep20_14-28-12_leandro-16x-v100/1632148092.8874874/events.out.tfevents.1632148092.leandro-16x-v100.8660.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:222b05fb22ccb39b7d43f507f7c672d8c741e4281e65c71c12d98b19c1d3ff1f
+size 1373
runs/Sep20_14-28-12_leandro-16x-v100/events.out.tfevents.1632148092.leandro-16x-v100.8660.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:349e549f0e23501888f84c37ff54aff187c6c97313a732fe502a7cf7c77c3a64
+size 9134099
wandb/debug-internal.log ADDED
@@ -0,0 +1 @@
+run-20210920_142810-36cw69uv/logs/debug-internal.log
wandb/debug.log ADDED
@@ -0,0 +1 @@
+run-20210920_142810-36cw69uv/logs/debug.log
wandb/latest-run ADDED
@@ -0,0 +1 @@
+run-20210920_142810-36cw69uv
wandb/run-20210920_142810-36cw69uv/files/conda-environment.yaml ADDED
@@ -0,0 +1,131 @@
+name: codeparrot
+channels:
+  - pytorch
+  - nvidia
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - blas=1.0=mkl
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2021.7.5=h06a4308_1
+  - certifi=2021.5.30=py38h06a4308_0
+  - cudatoolkit=11.1.74=h6bb024c_0
+  - ffmpeg=4.3=hf484d3e_0
+  - freetype=2.10.4=h5ab3b9f_0
+  - gmp=6.2.1=h2531618_2
+  - gnutls=3.6.15=he1e5248_0
+  - intel-openmp=2021.3.0=h06a4308_3350
+  - jpeg=9b=h024ee3a_2
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.35.1=h7274673_9
+  - libffi=3.3=he6710b0_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
+  - libiconv=1.15=h63c8f33_5
+  - libidn2=2.3.2=h7f8727e_0
+  - libpng=1.6.37=hbc83047_0
+  - libstdcxx-ng=9.3.0=hd4cf53a_17
+  - libtasn1=4.16.0=h27cfd23_0
+  - libtiff=4.2.0=h85742a9_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libuv=1.40.0=h7b6447c_0
+  - libwebp-base=1.2.0=h27cfd23_0
+  - lz4-c=1.9.3=h295c915_1
+  - mkl=2021.3.0=h06a4308_520
+  - mkl-service=2.4.0=py38h7f8727e_0
+  - mkl_fft=1.3.0=py38h42c9631_2
+  - mkl_random=1.2.2=py38h51133e4_0
+  - ncurses=6.2=he6710b0_1
+  - nettle=3.7.3=hbbd107a_1
+  - numpy=1.20.3=py38hf144106_0
+  - numpy-base=1.20.3=py38h74d4b33_0
+  - olefile=0.46=pyhd3eb1b0_0
+  - openh264=2.1.0=hd408876_0
+  - openjpeg=2.4.0=h3ad879b_0
+  - openssl=1.1.1l=h7f8727e_0
+  - pillow=8.3.1=py38h2c7a002_0
+  - pip=21.0.1=py38h06a4308_0
+  - python=3.8.11=h12debd9_0_cpython
+  - pytorch=1.9.0=py3.8_cuda11.1_cudnn8.0.5_0
+  - readline=8.1=h27cfd23_0
+  - setuptools=52.0.0=py38h06a4308_0
+  - six=1.16.0=pyhd3eb1b0_0
+  - sqlite=3.36.0=hc218d9a_0
+  - tk=8.6.10=hbc83047_0
+  - torchaudio=0.9.0=py38
+  - torchvision=0.10.0=py38_cu111
+  - typing_extensions=3.10.0.0=pyhca03da5_0
+  - wheel=0.37.0=pyhd3eb1b0_1
+  - xz=5.2.5=h7b6447c_0
+  - zlib=1.2.11=h7b6447c_3
+  - zstd=1.4.9=haebb681_0
+  - pip:
+    - absl-py==0.13.0
+    - accelerate==0.5.0.dev0
+    - aiohttp==3.7.4.post0
+    - async-timeout==3.0.1
+    - attrs==21.2.0
+    - cachetools==4.2.2
+    - chardet==4.0.0
+    - charset-normalizer==2.0.5
+    - click==8.0.1
+    - configparser==5.0.2
+    - datasets==1.10.3.dev0
+    - deepspeed==0.5.2
+    - dill==0.3.4
+    - docker-pycreds==0.4.0
+    - filelock==3.0.12
+    - fsspec==2021.8.1
+    - gitdb==4.0.7
+    - gitpython==3.1.18
+    - google-auth==1.35.0
+    - google-auth-oauthlib==0.4.6
+    - grpcio==1.40.0
+    - huggingface-hub==0.0.17
+    - idna==3.2
+    - joblib==1.0.1
+    - markdown==3.3.4
+    - multidict==5.1.0
+    - multiprocess==0.70.12.2
+    - ninja==1.10.2
+    - oauthlib==3.1.1
+    - packaging==21.0
+    - pandas==1.3.3
+    - pathtools==0.1.2
+    - promise==2.3
+    - protobuf==3.18.0
+    - psutil==5.8.0
+    - pyarrow==5.0.0
+    - pyasn1==0.4.8
+    - pyasn1-modules==0.2.8
+    - pyparsing==2.4.7
+    - python-dateutil==2.8.2
+    - pytz==2021.1
+    - pyyaml==5.4.1
+    - regex==2021.8.28
+    - requests==2.26.0
+    - requests-oauthlib==1.3.0
+    - rsa==4.7.2
+    - sacremoses==0.0.45
+    - sentry-sdk==1.3.1
+    - shortuuid==1.0.1
+    - smmap==4.0.0
+    - subprocess32==3.5.4
+    - tensorboard==2.6.0
+    - tensorboard-data-server==0.6.1
+    - tensorboard-plugin-wit==1.8.0
+    - tensorboardx==1.8
+    - termcolor==1.1.0
+    - tokenizers==0.10.3
+    - tqdm==4.62.2
+    - transformers==4.11.0.dev0
+    - triton==1.0.0
+    - urllib3==1.26.6
+    - wandb==0.12.2
+    - werkzeug==2.0.1
+    - xxhash==2.0.2
+    - yarl==1.6.3
+    - yaspin==2.1.0
+prefix: /home/leandro/miniconda3/envs/codeparrot
wandb/run-20210920_142810-36cw69uv/files/config.yaml ADDED
@@ -0,0 +1,89 @@
+wandb_version: 1
+
+_wandb:
+  desc: null
+  value:
+    cli_version: 0.12.2
+    framework: huggingface
+    huggingface_version: 4.11.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    python_version: 3.8.11
+    start_time: 1632148090
+    t:
+      1:
+      - 1
+      - 11
+      3:
+      - 16
+      4: 3.8.11
+      5: 0.12.2
+      6: 4.11.0.dev0
+      8:
+      - 5
+backend:
+  desc: null
+  value: nccl
+deepspeed_plugin:
+  desc: null
+  value: None
+device:
+  desc: null
+  value: cuda:0
+distributed_type:
+  desc: null
+  value: DistributedType.MULTI_GPU
+gradient_accumulation_steps:
+  desc: null
+  value: 16
+initialized:
+  desc: null
+  value: 'True'
+learning_rate:
+  desc: null
+  value: 0.0002
+local_process_index:
+  desc: null
+  value: '0'
+lr_scheduler_type:
+  desc: null
+  value: cosine
+max_eval_steps:
+  desc: null
+  value: -1
+max_train_steps:
+  desc: null
+  value: 50000
+num_processes:
+  desc: null
+  value: '16'
+num_warmup_steps:
+  desc: null
+  value: 750
+process_index:
+  desc: null
+  value: '0'
+save_checkpoint_steps:
+  desc: null
+  value: 50000
+seed:
+  desc: null
+  value: 1
+seq_length:
+  desc: null
+  value: 1024
+shuffle_buffer:
+  desc: null
+  value: 1000
+train_batch_size:
+  desc: null
+  value: 2
+use_fp16:
+  desc: null
+  value: 'True'
+valid_batch_size:
+  desc: null
+  value: 2
+weight_decay:
+  desc: null
+  value: 0.1
wandb/run-20210920_142810-36cw69uv/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20210920_142810-36cw69uv/files/requirements.txt ADDED
@@ -0,0 +1,81 @@
+absl-py==0.13.0
+accelerate==0.5.0.dev0
+aiohttp==3.7.4.post0
+async-timeout==3.0.1
+attrs==21.2.0
+cachetools==4.2.2
+certifi==2021.5.30
+chardet==4.0.0
+charset-normalizer==2.0.5
+click==8.0.1
+configparser==5.0.2
+datasets==1.10.3.dev0
+deepspeed==0.5.2
+dill==0.3.4
+docker-pycreds==0.4.0
+filelock==3.0.12
+fsspec==2021.8.1
+gitdb==4.0.7
+gitpython==3.1.18
+google-auth-oauthlib==0.4.6
+google-auth==1.35.0
+grpcio==1.40.0
+huggingface-hub==0.0.17
+idna==3.2
+joblib==1.0.1
+markdown==3.3.4
+mkl-fft==1.3.0
+mkl-random==1.2.2
+mkl-service==2.4.0
+multidict==5.1.0
+multiprocess==0.70.12.2
+ninja==1.10.2
+numpy==1.20.3
+oauthlib==3.1.1
+olefile==0.46
+packaging==21.0
+pandas==1.3.3
+pathtools==0.1.2
+pillow==8.3.1
+pip==21.0.1
+promise==2.3
+protobuf==3.18.0
+psutil==5.8.0
+pyarrow==5.0.0
+pyasn1-modules==0.2.8
+pyasn1==0.4.8
+pyparsing==2.4.7
+python-dateutil==2.8.2
+pytz==2021.1
+pyyaml==5.4.1
+regex==2021.8.28
+requests-oauthlib==1.3.0
+requests==2.26.0
+rsa==4.7.2
+sacremoses==0.0.45
+sentry-sdk==1.3.1
+setuptools==52.0.0.post20210125
+shortuuid==1.0.1
+six==1.16.0
+smmap==4.0.0
+subprocess32==3.5.4
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.0
+tensorboard==2.6.0
+tensorboardx==1.8
+termcolor==1.1.0
+tokenizers==0.10.3
+torch==1.9.0
+torchaudio==0.9.0a0+33b2469
+torchvision==0.10.0
+tqdm==4.62.2
+transformers==4.11.0.dev0
+triton==1.0.0
+typing-extensions==3.10.0.0
+urllib3==1.26.6
+wandb==0.12.2
+werkzeug==2.0.1
+wheel==0.37.0
+xxhash==2.0.2
+yarl==1.6.3
+yaspin==2.1.0
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-1052-gcp-x86_64-with-glibc2.17",
3
+ "python": "3.8.11",
4
+ "heartbeatAt": "2021-09-20T14:28:11.537999",
5
+ "startedAt": "2021-09-20T14:28:10.785470",
6
+ "docker": null,
7
+ "gpu": "NVIDIA A100-SXM4-40GB",
8
+ "gpu_count": 16,
9
+ "cpu_count": 96,
10
+ "cuda": "10.1.243",
11
+ "args": [],
12
+ "state": "running",
13
+ "program": "codeparrot_training.py",
14
+ "codePath": "codeparrot_training.py",
15
+ "git": {
16
+ "remote": "https://huggingface.co/transformersbook/codeparrot",
17
+ "commit": "ea70f93cfbf64eb723d41b350d14827e68b0a6c3"
18
+ },
19
+ "email": "leandro.vonwerra@gmail.com",
20
+ "root": "/home/leandro/codeparrot",
21
+ "host": "leandro-16x-v100",
22
+ "username": "leandro",
23
+ "executable": "/home/leandro/miniconda3/envs/codeparrot/bin/python"
24
+ }
wandb/run-20210920_142810-36cw69uv/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+{"lr": 0.00019885557166781018, "samples": 1600000, "steps": 3124, "loss/train": 1.4012274742126465, "_runtime": 40644, "_timestamp": 1632188734, "_step": 50000, "loss/eval": 1.7745720148086548, "perplexity": 5.897756576538086}
wandb/run-20210920_142810-36cw69uv/logs/debug-internal.log ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4b2174d10551473549ba5ce66d8348228b83bacf57bbc30dca02a46c5e0319c
+size 26678411
wandb/run-20210920_142810-36cw69uv/logs/debug.log ADDED
@@ -0,0 +1,22 @@
+2021-09-20 14:28:10,787 INFO MainThread:8660 [wandb_setup.py:_flush():69] setting env: {}
+2021-09-20 14:28:10,787 INFO MainThread:8660 [wandb_setup.py:_flush():69] setting login settings: {}
+2021-09-20 14:28:10,787 INFO MainThread:8660 [wandb_init.py:_log_setup():348] Logging user logs to /home/leandro/codeparrot/wandb/run-20210920_142810-36cw69uv/logs/debug.log
+2021-09-20 14:28:10,787 INFO MainThread:8660 [wandb_init.py:_log_setup():349] Logging internal logs to /home/leandro/codeparrot/wandb/run-20210920_142810-36cw69uv/logs/debug-internal.log
+2021-09-20 14:28:10,788 INFO MainThread:8660 [wandb_init.py:init():381] calling init triggers
+2021-09-20 14:28:10,788 INFO MainThread:8660 [wandb_init.py:init():386] wandb.init called with sweep_config: {}
+config: {'train_batch_size': 2, 'valid_batch_size': 2, 'weight_decay': 0.1, 'shuffle_buffer': 1000, 'learning_rate': 0.0002, 'lr_scheduler_type': 'cosine', 'num_warmup_steps': 750, 'gradient_accumulation_steps': 16, 'max_train_steps': 50000, 'max_eval_steps': -1, 'seq_length': 1024, 'seed': 1, 'save_checkpoint_steps': 50000, 'backend': 'nccl', 'deepspeed_plugin': 'None', 'distributed_type': 'DistributedType.MULTI_GPU', 'num_processes': '16', 'process_index': '0', 'local_process_index': '0', 'device': 'cuda:0', 'use_fp16': 'True', 'initialized': 'True'}
+2021-09-20 14:28:10,788 INFO MainThread:8660 [wandb_init.py:init():430] starting backend
+2021-09-20 14:28:10,788 INFO MainThread:8660 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2021-09-20 14:28:10,805 INFO MainThread:8660 [backend.py:ensure_launched():135] starting backend process...
+2021-09-20 14:28:10,816 INFO MainThread:8660 [backend.py:ensure_launched():139] started backend process with pid: 9038
+2021-09-20 14:28:10,818 INFO MainThread:8660 [wandb_init.py:init():435] backend started and connected
+2021-09-20 14:28:10,825 INFO MainThread:8660 [wandb_init.py:init():494] updated telemetry
+2021-09-20 14:28:10,826 INFO MainThread:8660 [wandb_init.py:init():517] communicating current version
+2021-09-20 14:28:11,406 INFO MainThread:8660 [wandb_init.py:init():522] got version response
+2021-09-20 14:28:11,406 INFO MainThread:8660 [wandb_init.py:init():530] communicating run to backend with 30 second timeout
+2021-09-20 14:28:11,486 INFO MainThread:8660 [wandb_init.py:init():557] starting run threads in backend
+2021-09-20 14:28:12,872 INFO MainThread:8660 [wandb_run.py:_console_start():1605] atexit reg
+2021-09-20 14:28:12,873 INFO MainThread:8660 [wandb_run.py:_redirect():1479] redirect: SettingsConsole.REDIRECT
+2021-09-20 14:28:12,873 INFO MainThread:8660 [wandb_run.py:_redirect():1484] Redirecting console.
+2021-09-20 14:28:12,876 INFO MainThread:8660 [wandb_run.py:_redirect():1540] Redirects installed.
+2021-09-20 14:28:12,876 INFO MainThread:8660 [wandb_init.py:init():582] run started, returning control to user process
wandb/run-20210920_142810-36cw69uv/run-36cw69uv.wandb ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3dea2a070cd3c7d6079d138e9461283968f789819a375be8fd99762250f9064
+size 20083529