jondurbin committed on
Commit 5907bd4
1 Parent(s): 20f5c52

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_code.jsonl filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_creative.jsonl filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_general.jsonl filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_qa.jsonl filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_reasoning.jsonl filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,5 @@
  ---
- license: llama2
+ license: other
  ---
+
+ https://github.com/jondurbin/airoboros#lmoe
adapters/code/README.md ADDED
@@ -0,0 +1,32 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+ - PEFT 0.4.0
+
+ - PEFT 0.4.0
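The adapter README above only records the `bitsandbytes` settings PEFT saw during training. As a point of reference, here is a minimal sketch of the same quantization settings expressed as a `transformers` `BitsAndBytesConfig` (illustrative only; the actual training entry point is `scripts/tune.sh` further down, and nothing in this commit constructs the config this way):

```python
# Illustrative sketch, not part of this commit: the quantization settings
# listed in the adapter README, rebuilt as a transformers BitsAndBytesConfig.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load_in_4bit: True, load_in_8bit: False
    bnb_4bit_quant_type="nf4",              # bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant=True,         # bnb_4bit_use_double_quant: True
    bnb_4bit_compute_dtype=torch.bfloat16,  # bnb_4bit_compute_dtype: bfloat16
)
```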
adapters/code/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16.0,
+   "lora_dropout": 0.1,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": [
+     "v_proj",
+     "down_proj",
+     "up_proj",
+     "k_proj",
+     "o_proj",
+     "gate_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
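For readers reproducing this setup programmatically, the JSON above maps directly onto a `peft` `LoraConfig`. A minimal sketch (illustrative, not part of this commit):

```python
# Illustrative sketch: the same LoRA hyperparameters as adapter_config.json.
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,
    lora_alpha=16.0,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
```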
adapters/code/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3191b3c1cfe3af2377501ca83f683ca3c86576092039651dc7d08eafd2064eb6
+ size 1657155077
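The .bin files are Git LFS pointers to the LoRA weights themselves (roughly 1.7 GB per expert). A minimal sketch of attaching one expert adapter to the base model with PEFT (illustrative; this is not the airoboros serving code, and the paths are placeholders for local copies of the base model and this repo):

```python
# Illustrative sketch: load the base model, then attach one expert adapter.
# Paths are placeholders; point them at local copies of the base model and
# the adapters/code directory from this repo.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("llama-2-70b-hf", device_map="auto")
model = PeftModel.from_pretrained(base, "adapters/code")
```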
adapters/creative/README.md ADDED
@@ -0,0 +1,20 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+
+ - PEFT 0.4.0
adapters/creative/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16.0,
+   "lora_dropout": 0.1,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": [
+     "o_proj",
+     "up_proj",
+     "k_proj",
+     "v_proj",
+     "q_proj",
+     "down_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
adapters/creative/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7706c60de732463422ae4fd2d2eaf39def04170a5f5481f65c5e3cb2d057906a
+ size 1657155077
adapters/function/README.md ADDED
@@ -0,0 +1,47 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+ - PEFT 0.4.0
+
+ - PEFT 0.4.0
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+ - PEFT 0.4.0
+
+ - PEFT 0.4.0
adapters/function/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16.0,
+   "lora_dropout": 0.1,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": [
+     "gate_proj",
+     "down_proj",
+     "o_proj",
+     "k_proj",
+     "v_proj",
+     "q_proj",
+     "up_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
adapters/function/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2cce2878d176d9e64aaf65637e3303c8ba5d12c7a74bf86eeb7c9f68d6f1adb6
+ size 1657155077
routing_data/expert_code.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_creative.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_function.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_general.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_qa.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_reasoning.jsonl ADDED
The diff for this file is too large to render. See raw diff
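These routing files hold one flattened "system prompt + instruction" string per example (see how they are written at the end of scripts/segment_dataset.py below); they exist so a router can decide which expert adapter should handle a given prompt. The router itself is not part of this commit (it lives in the airoboros repo linked from the README), but here is a minimal sketch of one plausible embedding-similarity approach, assuming some sentence-embedding function `embed(texts) -> np.ndarray` that is not defined here:

```python
# Minimal routing sketch, NOT the airoboros implementation. Assumes an external
# embedding function `embed(list_of_str) -> np.ndarray` of shape (n, dim).
import glob
import json
import numpy as np

def load_centroids(embed):
    # One mean embedding ("centroid") per expert, built from its routing data.
    centroids = {}
    for path in glob.glob("routing_data/expert_*.jsonl"):
        expert = path.split("expert_")[-1].removesuffix(".jsonl")
        texts = [json.loads(line)["instruction"] for line in open(path)]
        centroids[expert] = embed(texts).mean(axis=0)
    return centroids

def route(prompt, centroids, embed):
    # Pick the expert whose centroid is most similar (cosine) to the prompt.
    query = embed([prompt])[0]
    def cosine(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return max(centroids, key=lambda expert: cosine(query, centroids[expert]))
```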
 
scripts/segment_dataset.py ADDED
@@ -0,0 +1,84 @@
+ import os
+ import json
+ import random
+ from collections import defaultdict
+ from smart_open import smart_open
+
+ # URL to the dataset we're using.
+ dataset_url = "https://huggingface.co/datasets/jondurbin/airoboros-2.1/resolve/main/instructions.jsonl"
+
+ # Select the subset of data for each of our experts.
+ experts = {
+     "qa": [
+         "quiz",
+         "multiple_choice",
+         "contextual",
+         "counterfactual_contextual"
+     ],
+     "creative": [
+         "card",
+         "writing",
+         "experience",
+         "song",
+         "roleplay",
+         "gtkm",
+         "rp",
+         "detailed_writing",
+         "joke"
+     ],
+     "code": [
+         "coding"
+     ],
+     "reasoning": [
+         "cot",
+         "theory_of_mind",
+         "riddle",
+         "orca"
+     ],
+     "function": [
+         "agent",
+         "plan"
+     ],
+     "general": [
+         "wordgame",
+         "trivia",
+         "general"
+     ]
+ }
+
+ # Map all of our training data into the categories per expert.
+ categories = defaultdict(list)
+ with smart_open(dataset_url, "r") as infile:
+     for line in infile.readlines():
+         item = json.loads(line)
+         if not item.get("category"):
+             continue
+         categories[item["category"]].append(item)
+
+ # Include a random sampling of each expert's data in each other expert's dataset.
+ samples = {}
+ for expert, expert_cats in experts.items():
+     samples[expert] = []
+     for category in expert_cats:
+         samples[expert] += random.sample(categories[category], int(len(categories[category]) * 0.15) or 1)
+
+ # Save the split datasets.
+ if not os.path.exists("training_data"):
+     os.mkdir("training_data")
+ if not os.path.exists("routing_data"):
+     os.mkdir("routing_data")
+ for expert, expert_cats in experts.items():
+     with open(f"training_data/expert_{expert}.jsonl", "w") as outfile:
+         # Also, be sure to include stylized responses so it adapts to system prompt well.
+         for category in expert_cats + ["stylized_response"]:
+             for item in categories[category]:
+                 outfile.write(json.dumps(item) + "\n")
+         for other in samples:
+             if other == expert:
+                 continue
+             for item in samples[other]:
+                 outfile.write(json.dumps(item) + "\n")
+     with open(f"routing_data/expert_{expert}.jsonl", "w") as outfile:
+         for category in expert_cats:
+             for item in categories[category]:
+                 outfile.write(json.dumps({"instruction": item.get("system", "A chat.") + " " + item["instruction"]}) + "\n")
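Running the script (for example, `python scripts/segment_dataset.py` from the repo root) streams the airoboros-2.1 instruction set and writes one training file and one routing file per expert. A quick illustrative way to sanity-check the output, not part of the commit:

```python
# Illustrative: count how many examples each expert's training file ended up with.
import glob

for path in sorted(glob.glob("training_data/expert_*.jsonl")):
    with open(path) as infile:
        print(path, sum(1 for _ in infile))
```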
scripts/tune.sh ADDED
@@ -0,0 +1,47 @@
+ export EXPERT=$1
+ export MODEL_SIZE=$2
+ export BATCH_SIZE=$3
+ export CUDA_VISIBLE_DEVICES=$4
+
+ export BASE_DIR=/workspace
+ export WANDB_API_KEY=[redacted]
+ export WANDB_PROJECT=airoboros-lmoe-$MODEL_SIZE-2.1-$EXPERT
+
+ pyt qlora.py \
+     --model_name_or_path $BASE_DIR/llama-2-$MODEL_SIZE-hf \
+     --output_dir $BASE_DIR/$WANDB_PROJECT \
+     --num_train_epochs 3 \
+     --logging_steps 1 \
+     --save_strategy steps \
+     --save_steps 100 \
+     --save_total_limit 1 \
+     --data_seed 11422 \
+     --evaluation_strategy no \
+     --eval_dataset_size 2 \
+     --max_new_tokens 4096 \
+     --dataloader_num_workers 3 \
+     --logging_strategy steps \
+     --remove_unused_columns False \
+     --do_train \
+     --lora_r 64 \
+     --lora_alpha 16 \
+     --lora_modules all \
+     --bf16 \
+     --bits 4 \
+     --double_quant \
+     --quant_type nf4 \
+     --warmup_ratio 0.03 \
+     --lr_scheduler_type constant \
+     --dataset airoboros-lmoe-2.1/expert_$EXPERT.jsonl \
+     --dataset_format airoboros \
+     --model_max_len 4096 \
+     --per_device_train_batch_size $BATCH_SIZE \
+     --learning_rate 0.00017 \
+     --adam_beta2 0.999 \
+     --max_grad_norm 0.3 \
+     --lora_dropout 0.05 \
+     --weight_decay 0.0 \
+     --seed 11422 \
+     --report_to wandb \
+     --gradient_accumulation_steps 16 \
+     --gradient_checkpointing
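The script takes four positional arguments: expert name, model size, per-device batch size, and the CUDA device list. For example, `bash scripts/tune.sh code 70b 4 0,1,2,3,4,5,6,7` would (hypothetically) fine-tune the code expert against llama-2-70b with a per-device batch size of 4 on eight GPUs; the concrete values here are illustrative, and `pyt` is assumed to be a local alias for the Python interpreter used to launch qlora.py.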
training_data/expert_code.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2d2ee497fe2eb7ee9d8a53d8efe6c711174eadd3c593b447bfbf73f2c964ccf
+ size 17716707
training_data/expert_creative.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c41f8aa2f90c066ba13fdc6e28458a4bfc4b6478c01f07b449c442f04093c3e0
+ size 25482996
training_data/expert_function.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
training_data/expert_general.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e658a5081fd87b3869033e0b5305c16f69a6c8d27a8ad903473db9b03b7914b1
+ size 18641341
training_data/expert_qa.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37ec3201c55bfb43326bc002032ea73179feae8b661b9678f32871b370fe7b02
+ size 12318163
training_data/expert_reasoning.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00f4c3813c9c231a54ac3279af97ffe7268424e6c175eba4b048f310839fccd9
+ size 17556799