Upload folder using huggingface_hub

Files changed:
- .ipynb_checkpoints/training-checkpoint.log +15 -34
- pytorch_model.bin +1 -1
- training.log +0 -0
.ipynb_checkpoints/training-checkpoint.log CHANGED

@@ -1,34 +1,15 @@
-[2023-12-
-[2023-12-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    from dschat.utils.data.data_utils import create_prompt_dataset
-ModuleNotFoundError: No module named 'dschat'
-Traceback (most recent call last):
-  File "/home/t-sokumar/llama2_finetune/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py", line 24, in <module>
-    from dschat.utils.data.data_utils import create_prompt_dataset
-ModuleNotFoundError: No module named 'dschat'
-Traceback (most recent call last):
-  File "/home/t-sokumar/llama2_finetune/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py", line 24, in <module>
-    from dschat.utils.data.data_utils import create_prompt_dataset
-ModuleNotFoundError: No module named 'dschat'
-Traceback (most recent call last):
-  File "/home/t-sokumar/llama2_finetune/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py", line 24, in <module>
-    from dschat.utils.data.data_utils import create_prompt_dataset
-ModuleNotFoundError: No module named 'dschat'
-[2023-12-06 04:08:12,905] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 2459201
-[2023-12-06 04:08:12,905] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 2459202
-[2023-12-06 04:08:12,931] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 2459203
-[2023-12-06 04:08:12,949] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 2459204
-[2023-12-06 04:08:12,967] [ERROR] [launch.py:321:sigkill_handler] ['/home/t-sokumar/miniconda3/envs/ft/bin/python', '-u', 'main.py', '--local_rank=3', '--data_path', 'local/jsonfile', '--data_split', '2,4,4', '--model_name_or_path', 'meta-llama/Llama-2-7b-hf', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '4', '--gradient_accumulation_steps', '1', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '3', '--deepspeed', '--lora_dim', '128', '--lora_module_name', 'layers.', '--output_dir', './output_step1_llama2_7b_lora'] exits with return code = 1
+[2023-12-11 05:30:52,558] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-12-11 05:30:54,351] [WARNING] [runner.py:203:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
+Detected CUDA_VISIBLE_DEVICES=1,2,3: setting --include=localhost:1,2,3
+Traceback (most recent call last):
+  File "/home/t-sokumar/miniconda3/envs/ft/bin/deepspeed", line 6, in <module>
+    main()
+  File "/home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/deepspeed/launcher/runner.py", line 430, in main
+    active_resources = parse_inclusion_exclusion(resource_pool, args.include, args.exclude)
+                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/deepspeed/launcher/runner.py", line 351, in parse_inclusion_exclusion
+    return parse_resource_filter(active_resources, include_str=inclusion, exclude_str=exclusion)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/deepspeed/launcher/runner.py", line 303, in parse_resource_filter
+    raise ValueError(f"No slot '{slot}' specified on host '{hostname}'")
+ValueError: No slot '3' specified on host 'localhost'
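The 34 removed lines are the earlier run from 2023-12-06: every rank died on "from dschat.utils.data.data_utils import create_prompt_dataset" with ModuleNotFoundError: No module named 'dschat', after which the launcher's sigkill_handler killed the remaining subprocesses and the job exited with return code 1. Below is a minimal sketch of the usual remedy, assuming the dschat package lives at the DeepSpeed-Chat directory named in the traceback and is simply not installed into the active environment; the DSCHAT_ROOT name is only illustrative, and an editable "pip install -e ." from that directory (if it ships a setup.py) achieves the same thing.

    # Sketch only: put the in-repo dschat package on sys.path before main.py
    # tries to import it. DSCHAT_ROOT is illustrative and assumes the package
    # sits at the DeepSpeed-Chat directory shown in the traceback.
    import sys

    DSCHAT_ROOT = "/home/t-sokumar/llama2_finetune/DeepSpeedExamples/applications/DeepSpeed-Chat"
    sys.path.insert(0, DSCHAT_ROOT)

    from dschat.utils.data.data_utils import create_prompt_dataset  # the import that failed above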
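The 15 added lines are the replacement run from 2023-12-11, which fails earlier, in the launcher itself: CUDA_VISIBLE_DEVICES=1,2,3 is auto-translated into --include=localhost:1,2,3, but with only three devices visible the process re-indexes them as 0, 1 and 2, so slot '3' does not exist and parse_resource_filter raises the ValueError. The snippet below only illustrates that re-indexing; it is not the launcher's code, and it assumes the four-GPU host implied by the earlier run.

    import os

    # Must be set before CUDA is initialised; physical GPUs 1, 2, 3 then show
    # up to this process as logical devices 0, 1, 2.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

    import torch

    print(torch.cuda.device_count())  # 3 -> the only valid local slots are 0, 1, 2
    # --include=localhost:1,2,3 therefore asks for slot '3', one past the last
    # visible device, which is exactly the "No slot '3' specified on host
    # 'localhost'" error in the log above.

One common workaround is to leave CUDA_VISIBLE_DEVICES unset and pass --include=localhost:1,2,3 to the deepspeed launcher explicitly, so the slot ids refer to the full set of physical devices.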
pytorch_model.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e79895167c822572d1cf89779c4a10a05433bd2d166150e2c85d5d321054d016
 size 13477321262
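pytorch_model.bin is tracked with Git LFS, so the repository stores only this three-line pointer: the oid line is the SHA-256 digest of the actual 13,477,321,262-byte checkpoint, and this commit swaps in the digest of the newly trained weights. A small sketch (file name and chunk size are just illustrative) of checking a downloaded copy against the pointer's oid:

    import hashlib

    def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
        # Stream in chunks so the ~13 GB checkpoint never has to fit in memory.
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            while block := f.read(chunk_size):
                digest.update(block)
        return digest.hexdigest()

    # Should match the oid recorded in the pointer above:
    # e79895167c822572d1cf89779c4a10a05433bd2d166150e2c85d5d321054d016
    print(sha256_of("pytorch_model.bin"))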
training.log CHANGED

The diff for this file is too large to render. See the raw diff.