prepared dataset caching, other misc fixes (#665)

* prepared dataset caching, other misc fixes
* also don't load from disk cache unless explicit
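
The net effect of these two changes is that the on-disk prepared-dataset cache becomes opt-in: it is read or written only when `dataset_prepared_path` is explicitly set, which is why every example config below blanks the old value. A minimal sketch of the behavior, condensed from the `src/axolotl/utils/data.py` hunks below — the `load_or_prepare` wrapper, the `prepare_fn` argument, and the `"last_run_prepared"` fallback name are illustrative assumptions, not the PR's exact code; `load_from_disk`/`save_to_disk` are the Hugging Face `datasets` APIs the code already uses:

    from pathlib import Path

    from datasets import Dataset, load_from_disk

    def load_or_prepare(cfg, prepare_fn) -> Dataset:
        # Assumed fallback directory name; only an explicit setting
        # enables the caching branches below.
        prepared_ds_path = Path(cfg.dataset_prepared_path or "last_run_prepared")
        # Read the cache only when dataset_prepared_path is explicitly set;
        # an empty YAML value arrives as None and short-circuits the glob.
        if cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
            return load_from_disk(str(prepared_ds_path))
        dataset = prepare_fn()
        # Writing the cache is likewise opt-in, and only rank 0 writes.
        if cfg.local_rank == 0 and cfg.dataset_prepared_path:
            dataset.save_to_disk(str(prepared_ds_path))
        return dataset
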
- examples/cerebras/qlora.yml +1 -1
- examples/code-llama/13b/lora.yml +1 -1
- examples/code-llama/13b/qlora.yml +1 -1
- examples/code-llama/34b/lora.yml +1 -1
- examples/code-llama/34b/qlora.yml +1 -1
- examples/code-llama/7b/lora.yml +1 -1
- examples/code-llama/7b/qlora.yml +1 -1
- examples/falcon/config-7b-lora.yml +1 -1
- examples/falcon/config-7b-qlora.yml +1 -1
- examples/falcon/config-7b.yml +1 -1
- examples/gptj/qlora.yml +1 -1
- examples/jeopardy-bot/config.yml +1 -1
- examples/llama-2/gptq-lora.yml +1 -1
- examples/llama-2/lora.yml +1 -1
- examples/llama-2/qlora.yml +1 -1
- examples/llama-2/relora.yml +1 -1
- examples/llama-2/tiny-llama.yml +1 -1
- examples/mistral/config.yml +1 -1
- examples/mpt-7b/config.yml +1 -1
- examples/openllama-3b/config.yml +1 -1
- examples/openllama-3b/lora.yml +1 -1
- examples/openllama-3b/qlora.yml +1 -1
- examples/phi/phi-ft.yml +1 -1
- examples/phi/phi-qlora.yml +1 -1
- examples/pythia-12b/config.yml +1 -1
- examples/pythia/lora.yml +1 -1
- examples/redpajama/config-3b.yml +1 -1
- examples/replit-3b/config-lora.yml +1 -1
- examples/xgen-7b/xgen-7b-8k-qlora.yml +1 -1
- src/axolotl/cli/__init__.py +1 -1
- src/axolotl/utils/data.py +3 -3
- src/axolotl/utils/tokenization.py +2 -1
examples/cerebras/qlora.yml
@@ -7,7 +7,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
examples/code-llama/13b/lora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/code-llama/13b/qlora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./qlora-out
 
examples/code-llama/34b/lora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/code-llama/34b/qlora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./qlora-out
 
examples/code-llama/7b/lora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/code-llama/7b/qlora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./qlora-out
 
examples/falcon/config-7b-lora.yml
@@ -12,7 +12,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca:chat
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: lora
 lora_model_dir:
examples/falcon/config-7b-qlora.yml
@@ -18,7 +18,7 @@ datasets:
     data_files:
       - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
     type: "alpaca:chat"
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 # enable QLoRA
 adapter: qlora
examples/falcon/config-7b.yml
@@ -12,7 +12,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca:chat
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter:
 lora_model_dir:
examples/gptj/qlora.yml
@@ -7,7 +7,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
examples/jeopardy-bot/config.yml
@@ -6,7 +6,7 @@ load_in_8bit: false
 datasets:
   - path: openaccess-ai-collective/jeopardy
     type: jeopardy
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter:
 lora_model_dir:
examples/llama-2/gptq-lora.yml
@@ -15,7 +15,7 @@ hf_use_auth_token: true
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: lora
 lora_model_dir:
examples/llama-2/lora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/llama-2/qlora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./qlora-out
 
examples/llama-2/relora.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./relora-out
 
examples/llama-2/tiny-llama.yml
@@ -12,7 +12,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/mistral/config.yml
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./out
 
examples/mpt-7b/config.yml
@@ -6,7 +6,7 @@ load_in_8bit: false
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter:
 lora_model_dir:
examples/openllama-3b/config.yml
@@ -9,7 +9,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter:
 lora_model_dir:
examples/openllama-3b/lora.yml
@@ -9,7 +9,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter: lora
 lora_model_dir:
examples/openllama-3b/qlora.yml
@@ -9,7 +9,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
examples/phi/phi-ft.yml
@@ -13,7 +13,7 @@ datasets:
   - path: garage-bAInd/Open-Platypus
     type: alpaca
 
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./phi-sft-out
 
examples/phi/phi-qlora.yml
@@ -13,7 +13,7 @@ datasets:
   - path: garage-bAInd/Open-Platypus
     type: alpaca
 
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./phi-sft-out
 
examples/pythia-12b/config.yml
@@ -10,7 +10,7 @@ device_map: auto
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 adapter:
 lora_model_dir:
examples/pythia/lora.yml
@@ -4,7 +4,7 @@ load_in_8bit: true
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
examples/redpajama/config-3b.yml
@@ -7,7 +7,7 @@ load_in_8bit: false
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter:
 lora_model_dir:
examples/replit-3b/config-lora.yml
@@ -5,7 +5,7 @@ load_in_8bit: false
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -16,7 +16,7 @@ datasets:
     data_files:
       - openassistant_best_replies_train.jsonl
     type: "completion"
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 # enable QLoRA
 adapter: qlora
src/axolotl/cli/__init__.py
@@ -51,7 +51,7 @@ def print_axolotl_text_art(suffix=None):
 
 
 def get_multi_line_input() -> Optional[str]:
-    print("Give me an instruction (Ctrl + D to
+    print("Give me an instruction (Ctrl + D to submit): ")
     instruction = ""
     for line in sys.stdin:
         instruction += line  # pylint: disable=consider-using-join
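
For context, `get_multi_line_input` accumulates stdin until EOF; a self-contained sketch of the same pattern, where the trailing return handling is an assumption consistent with the `Optional[str]` annotation rather than something shown in the hunk:

    import sys
    from typing import Optional

    def get_multi_line_input() -> Optional[str]:
        print("Give me an instruction (Ctrl + D to submit): ")
        instruction = ""
        for line in sys.stdin:  # iterates until EOF (Ctrl + D on Unix)
            instruction += line
        return instruction or None  # assumed: empty input maps to None
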
src/axolotl/utils/data.py
@@ -122,7 +122,7 @@ def load_tokenized_prepared_datasets(
 
     if dataset:
         ...
-    elif any(prepared_ds_path.glob("*")):
+    elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
         LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
         dataset = load_from_disk(str(prepared_ds_path))
         LOG.info("Prepared dataset loaded from disk...")
@@ -357,7 +357,7 @@ def load_tokenized_prepared_datasets(
         if len(datasets) > 1:
             LOG.info("shuffle merged datasets")
             dataset = dataset.shuffle(seed=seed)
-        if cfg.local_rank == 0:
+        if cfg.local_rank == 0 and cfg.dataset_prepared_path:
             LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
             dataset.save_to_disk(prepared_ds_path)
             if cfg.push_dataset_to_hub:
@@ -425,7 +425,7 @@ def load_prepare_datasets(
 
     if dataset:
         ...
-    elif any(prepared_ds_path.glob("*")):
+    elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
         LOG.info(
             f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
         )
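
Both load guards put `cfg.dataset_prepared_path` first so the condition short-circuits: when no path is configured, the glob over the cache directory never runs, and a stale cache left by an earlier run is no longer picked up silently. A one-liner illustrating why a blanked config value disables both branches (PyYAML parses a bare `dataset_prepared_path:` as None, which is falsy):

    import yaml

    cfg = yaml.safe_load("dataset_prepared_path:\n")
    print(cfg["dataset_prepared_path"])        # None
    print(bool(cfg["dataset_prepared_path"]))  # False -> cache neither read nor written
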
src/axolotl/utils/tokenization.py
@@ -31,7 +31,8 @@ def check_example_labels(example, tokenizer, text_only=False):
         )
         colored_tokens.append(colored_token)
 
-    LOG.info(" ".join(colored_tokens))
+    delimiter = "" if text_only else " "
+    LOG.info(delimiter.join(colored_tokens))
     LOG.info("\n\n\n")
     print(" ".join(colored_tokens))
 
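
The fix makes the token-debug output respect `text_only`. A hypothetical `join_tokens` helper showing the same delimiter choice; the rationale, that decoded fragments already carry the tokenizer's own spacing in text-only mode, is an inference rather than something stated in the diff:

    def join_tokens(colored_tokens, text_only=False):
        # text_only: decoded fragments concatenate back into the original
        # text, so no extra delimiter is inserted; otherwise a space
        # separates the "token(label, id)" debug entries.
        delimiter = "" if text_only else " "
        return delimiter.join(colored_tokens)

    print(join_tokens(["Hello", " world"], text_only=True))  # Hello world
    print(join_tokens(["Hello(0, 15496)", "world(1, 995)"]))  # space-separated debug view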