Tom Jobbins committed on
Commit
48434be
1 Parent(s): 396a7a7

Debug tokenization output: Add ability to output text only (no tokens), and/or specify num samples to see (#511)

Browse files
scripts/finetune.py CHANGED
@@ -246,9 +246,14 @@ def load_datasets(
246
  LOG.info("check_dataset_labels...")
247
  check_dataset_labels(
248
  train_dataset.select(
249
- [random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec
 
 
 
250
  ),
251
  tokenizer,
 
 
252
  )
253
 
254
  return TrainDatasetMeta(
 
246
  LOG.info("check_dataset_labels...")
247
  check_dataset_labels(
248
  train_dataset.select(
249
+ [
250
+ random.randrange(0, len(train_dataset) - 1) # nosec
251
+ for _ in range(cli_args.debug_num_examples)
252
+ ]
253
  ),
254
  tokenizer,
255
+ num_examples=cli_args.debug_num_examples,
256
+ text_only=cli_args.debug_text_only,
257
  )
258
 
259
  return TrainDatasetMeta(
src/axolotl/common/cli.py CHANGED
@@ -21,6 +21,8 @@ class TrainerCliArgs:
21
  """
22
 
23
  debug: bool = field(default=False)
 
 
24
  inference: bool = field(default=False)
25
  merge_lora: bool = field(default=False)
26
  prepare_ds_only: bool = field(default=False)
 
21
  """
22
 
23
  debug: bool = field(default=False)
24
+ debug_text_only: bool = field(default=False)
25
+ debug_num_examples: int = field(default=5)
26
  inference: bool = field(default=False)
27
  merge_lora: bool = field(default=False)
28
  prepare_ds_only: bool = field(default=False)
src/axolotl/utils/tokenization.py CHANGED
@@ -8,13 +8,13 @@ from termcolor import colored
8
  LOG = logging.getLogger("axolotl")
9
 
10
 
11
- def check_dataset_labels(dataset, tokenizer):
12
  # the dataset is already shuffled, so let's just check the first 5 elements
13
- for idx in range(5):
14
- check_example_labels(dataset[idx], tokenizer)
15
 
16
 
17
- def check_example_labels(example, tokenizer):
18
  # Get the input_ids, labels, and attention_mask from the dataset
19
  input_ids = example["input_ids"]
20
  labels = example["labels"]
@@ -29,8 +29,10 @@ def check_example_labels(example, tokenizer):
29
  decoded_input_token = tokenizer.decode(input_id)
30
  # Choose the color based on whether the label has the ignore value or not
31
  color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
32
- colored_token = colored(decoded_input_token, color) + colored(
33
- f"({label_id}, {mask}, {input_id})", "white"
 
 
34
  )
35
  colored_tokens.append(colored_token)
36
 
 
8
  LOG = logging.getLogger("axolotl")
9
 
10
 
11
+ def check_dataset_labels(dataset, tokenizer, num_examples=5, text_only=False):
12
  # the dataset is already shuffled, so let's just check the first 5 elements
13
+ for idx in range(num_examples):
14
+ check_example_labels(dataset[idx], tokenizer, text_only=text_only)
15
 
16
 
17
+ def check_example_labels(example, tokenizer, text_only=False):
18
  # Get the input_ids, labels, and attention_mask from the dataset
19
  input_ids = example["input_ids"]
20
  labels = example["labels"]
 
29
  decoded_input_token = tokenizer.decode(input_id)
30
  # Choose the color based on whether the label has the ignore value or not
31
  color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
32
+ colored_token = colored(decoded_input_token, color) + (
33
+ not text_only
34
+ and colored(f"({label_id}, {mask}, {input_id})", "white")
35
+ or ""
36
  )
37
  colored_tokens.append(colored_token)
38