Tom Jobbins committed
Commit 48434be
Parent(s): 396a7a7

Debug tokenization output: Add ability to output text only (no tokens), and/or specify num samples to see (#511)

Files changed:
- scripts/finetune.py +6 -1
- src/axolotl/common/cli.py +2 -0
- src/axolotl/utils/tokenization.py +8 -6
scripts/finetune.py CHANGED

@@ -246,9 +246,14 @@ def load_datasets(
         LOG.info("check_dataset_labels...")
         check_dataset_labels(
             train_dataset.select(
-                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
+                [
+                    random.randrange(0, len(train_dataset) - 1)  # nosec
+                    for _ in range(cli_args.debug_num_examples)
+                ]
             ),
             tokenizer,
+            num_examples=cli_args.debug_num_examples,
+            text_only=cli_args.debug_text_only,
         )
 
     return TrainDatasetMeta(
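For reference, here is a minimal standalone sketch of the sampling pattern used in the hunk above: draw debug_num_examples random row indices with random.randrange and hand them to Dataset.select. The toy dataset and variable names are illustrative only, not taken from the repo.

import random

from datasets import Dataset

# Toy tokenized dataset standing in for axolotl's real train_dataset.
train_dataset = Dataset.from_dict({"input_ids": [[1, 2], [3, 4], [5, 6], [7, 8]]})
debug_num_examples = 2

# Same pattern as the diff: sample N random indices (upper bound is
# len(dataset) - 1, exclusive, as in the original code) and select those rows.
sample = train_dataset.select(
    [random.randrange(0, len(train_dataset) - 1) for _ in range(debug_num_examples)]  # nosec
)
print(sample["input_ids"])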
src/axolotl/common/cli.py CHANGED

@@ -21,6 +21,8 @@ class TrainerCliArgs:
     """
 
     debug: bool = field(default=False)
+    debug_text_only: bool = field(default=False)
+    debug_num_examples: int = field(default=5)
     inference: bool = field(default=False)
     merge_lora: bool = field(default=False)
     prepare_ds_only: bool = field(default=False)
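These new dataclass fields surface as command-line options once TrainerCliArgs is handed to an argument parser. Below is a hedged sketch assuming HfArgumentParser-style wiring from transformers; DemoCliArgs is a hypothetical stand-in, not axolotl's actual parser setup.

from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class DemoCliArgs:
    # Mirror of the fields touched by this commit, on a hypothetical stand-in class.
    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
    debug_num_examples: int = field(default=5)


parser = HfArgumentParser(DemoCliArgs)
# Boolean fields default to False and flip to True when the bare flag is passed.
(args,) = parser.parse_args_into_dataclasses(
    ["--debug", "--debug_text_only", "--debug_num_examples", "3"]
)
print(args.debug, args.debug_text_only, args.debug_num_examples)  # True True 3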
src/axolotl/utils/tokenization.py CHANGED

@@ -8,13 +8,13 @@ from termcolor import colored
 LOG = logging.getLogger("axolotl")
 
 
-def check_dataset_labels(dataset, tokenizer):
+def check_dataset_labels(dataset, tokenizer, num_examples=5, text_only=False):
     # the dataset is already shuffled, so let's just check the first 5 elements
-    for idx in range(5):
-        check_example_labels(dataset[idx], tokenizer)
+    for idx in range(num_examples):
+        check_example_labels(dataset[idx], tokenizer, text_only=text_only)
 
 
-def check_example_labels(example, tokenizer):
+def check_example_labels(example, tokenizer, text_only=False):
     # Get the input_ids, labels, and attention_mask from the dataset
     input_ids = example["input_ids"]
     labels = example["labels"]
@@ -29,8 +29,10 @@ def check_example_labels(example, tokenizer):
         decoded_input_token = tokenizer.decode(input_id)
         # Choose the color based on whether the label has the ignore value or not
         color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
-        colored_token = colored(decoded_input_token, color) + colored(
-            f"({label_id}, {mask}, {input_id})", "white"
+        colored_token = colored(decoded_input_token, color) + (
+            not text_only
+            and colored(f"({label_id}, {mask}, {input_id})", "white")
+            or ""
         )
         colored_tokens.append(colored_token)
 
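The "not text_only and ... or ''" expression added above is the classic and/or idiom; because colored() always returns a non-empty string here, it behaves like a conditional expression that drops the (label, mask, id) suffix when text_only is set. A small self-contained illustration of the effect, using a hypothetical render_token helper rather than the repo's functions:

from termcolor import colored


def render_token(decoded_input_token, label_id, mask, input_id, text_only=False):
    # Same color rule as check_example_labels: red for ignored labels (-100),
    # yellow for label 0, green otherwise.
    color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
    return colored(decoded_input_token, color) + (
        not text_only
        and colored(f"({label_id}, {mask}, {input_id})", "white")
        or ""
    )


print(render_token("Hello", -100, 1, 15043))                  # token plus (label, mask, id) suffix
print(render_token("Hello", -100, 1, 15043, text_only=True))  # token text only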