jondurbin committed on
Commit 5907bd4
1 Parent(s): 20f5c52

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_code.jsonl filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_creative.jsonl filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_general.jsonl filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_qa.jsonl filter=lfs diff=lfs merge=lfs -text
+ training_data/expert_reasoning.jsonl filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,5 @@
  ---
- license: llama2
+ license: other
  ---
+
+ https://github.com/jondurbin/airoboros#lmoe
adapters/code/README.md ADDED
@@ -0,0 +1,32 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+ - PEFT 0.4.0
+
+ - PEFT 0.4.0
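The adapter README above only records the `bitsandbytes` settings PEFT saw during training. As a point of reference, here is a minimal sketch of the same quantization settings expressed as a `transformers` `BitsAndBytesConfig` (illustrative only; the actual training entry point is `scripts/tune.sh` further down, and nothing in this commit constructs the config this way):

```python
# Illustrative sketch, not part of this commit: the quantization settings
# listed in the adapter README, rebuilt as a transformers BitsAndBytesConfig.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load_in_4bit: True, load_in_8bit: False
    bnb_4bit_quant_type="nf4",              # bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant=True,         # bnb_4bit_use_double_quant: True
    bnb_4bit_compute_dtype=torch.bfloat16,  # bnb_4bit_compute_dtype: bfloat16
)
```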
adapters/code/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16.0,
+   "lora_dropout": 0.1,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": [
+     "v_proj",
+     "down_proj",
+     "up_proj",
+     "k_proj",
+     "o_proj",
+     "gate_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
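For readers reproducing this setup programmatically, the JSON above maps directly onto a `peft` `LoraConfig`. A minimal sketch (illustrative, not part of this commit):

```python
# Illustrative sketch: the same LoRA hyperparameters as adapter_config.json.
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,
    lora_alpha=16.0,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
```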
adapters/code/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3191b3c1cfe3af2377501ca83f683ca3c86576092039651dc7d08eafd2064eb6
+ size 1657155077
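The .bin files are Git LFS pointers to the LoRA weights themselves (roughly 1.7 GB per expert). A minimal sketch of attaching one expert adapter to the base model with PEFT (illustrative; this is not the airoboros serving code, and the paths are placeholders for local copies of the base model and this repo):

```python
# Illustrative sketch: load the base model, then attach one expert adapter.
# Paths are placeholders; point them at local copies of the base model and
# the adapters/code directory from this repo.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("llama-2-70b-hf", device_map="auto")
model = PeftModel.from_pretrained(base, "adapters/code")
```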
adapters/creative/README.md ADDED
@@ -0,0 +1,20 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+
+ - PEFT 0.4.0
adapters/creative/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16.0,
+   "lora_dropout": 0.1,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": [
+     "o_proj",
+     "up_proj",
+     "k_proj",
+     "v_proj",
+     "q_proj",
+     "down_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
adapters/creative/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7706c60de732463422ae4fd2d2eaf39def04170a5f5481f65c5e3cb2d057906a
+ size 1657155077
adapters/function/README.md ADDED
@@ -0,0 +1,47 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+ - PEFT 0.4.0
+
+ - PEFT 0.4.0
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+ - PEFT 0.4.0
+
+ - PEFT 0.4.0
adapters/function/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16.0,
+   "lora_dropout": 0.1,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": [
+     "gate_proj",
+     "down_proj",
+     "o_proj",
+     "k_proj",
+     "v_proj",
+     "q_proj",
+     "up_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
adapters/function/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2cce2878d176d9e64aaf65637e3303c8ba5d12c7a74bf86eeb7c9f68d6f1adb6
+ size 1657155077
routing_data/expert_code.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_creative.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_function.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_general.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_qa.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
routing_data/expert_reasoning.jsonl ADDED
The diff for this file is too large to render. See raw diff
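These routing files hold one flattened "system prompt + instruction" string per example (see how they are written at the end of scripts/segment_dataset.py below); they exist so a router can decide which expert adapter should handle a given prompt. The router itself is not part of this commit (it lives in the airoboros repo linked from the README), but here is a minimal sketch of one plausible embedding-similarity approach, assuming some sentence-embedding function `embed(texts) -> np.ndarray` that is not defined here:

```python
# Minimal routing sketch, NOT the airoboros implementation. Assumes an external
# embedding function `embed(list_of_str) -> np.ndarray` of shape (n, dim).
import glob
import json
import numpy as np

def load_centroids(embed):
    # One mean embedding ("centroid") per expert, built from its routing data.
    centroids = {}
    for path in glob.glob("routing_data/expert_*.jsonl"):
        expert = path.split("expert_")[-1].removesuffix(".jsonl")
        texts = [json.loads(line)["instruction"] for line in open(path)]
        centroids[expert] = embed(texts).mean(axis=0)
    return centroids

def route(prompt, centroids, embed):
    # Pick the expert whose centroid is most similar (cosine) to the prompt.
    query = embed([prompt])[0]
    def cosine(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return max(centroids, key=lambda expert: cosine(query, centroids[expert]))
```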
 
scripts/segment_dataset.py ADDED
@@ -0,0 +1,84 @@
+ import os
+ import json
+ import random
+ from collections import defaultdict
+ from smart_open import smart_open
+
+ # URL to the dataset we're using.
+ dataset_url = "https://huggingface.co/datasets/jondurbin/airoboros-2.1/resolve/main/instructions.jsonl"
+
+ # Select the subset of data for each of our experts.
+ experts = {
+     "qa": [
+         "quiz",
+         "multiple_choice",
+         "contextual",
+         "counterfactual_contextual"
+     ],
+     "creative": [
+         "card",
+         "writing",
+         "experience",
+         "song",
+         "roleplay",
+         "gtkm",
+         "rp",
+         "detailed_writing",
+         "joke"
+     ],
+     "code": [
+         "coding"
+     ],
+     "reasoning": [
+         "cot",
+         "theory_of_mind",
+         "riddle",
+         "orca"
+     ],
+     "function": [
+         "agent",
+         "plan"
+     ],
+     "general": [
+         "wordgame",
+         "trivia",
+         "general"
+     ]
+ }
+
+ # Map all of our training data into the categories per expert.
+ categories = defaultdict(list)
+ with smart_open(dataset_url, "r") as infile:
+     for line in infile.readlines():
+         item = json.loads(line)
+         if not item.get("category"):
+             continue
+         categories[item["category"]].append(item)
+
+ # Include a random sampling of each expert's data in each other expert's dataset.
+ samples = {}
+ for expert, expert_cats in experts.items():
+     samples[expert] = []
+     for category in expert_cats:
+         samples[expert] += random.sample(categories[category], int(len(categories[category]) * 0.15) or 1)
+
+ # Save the split datasets.
+ if not os.path.exists("training_data"):
+     os.mkdir("training_data")
+ if not os.path.exists("routing_data"):
+     os.mkdir("routing_data")
+ for expert, expert_cats in experts.items():
+     with open(f"training_data/expert_{expert}.jsonl", "w") as outfile:
+         # Also, be sure to include stylized responses so it adapts to system prompt well.
+         for category in expert_cats + ["stylized_response"]:
+             for item in categories[category]:
+                 outfile.write(json.dumps(item) + "\n")
+         for other in samples:
+             if other == expert:
+                 continue
+             for item in samples[other]:
+                 outfile.write(json.dumps(item) + "\n")
+     with open(f"routing_data/expert_{expert}.jsonl", "w") as outfile:
+         for category in expert_cats:
+             for item in categories[category]:
+                 outfile.write(json.dumps({"instruction": item.get("system", "A chat.") + " " + item["instruction"]}) + "\n")
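Running the script (for example, `python scripts/segment_dataset.py` from the repo root) streams the airoboros-2.1 instruction set and writes one training file and one routing file per expert. A quick illustrative way to sanity-check the output, not part of the commit:

```python
# Illustrative: count how many examples each expert's training file ended up with.
import glob

for path in sorted(glob.glob("training_data/expert_*.jsonl")):
    with open(path) as infile:
        print(path, sum(1 for _ in infile))
```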
scripts/tune.sh ADDED
@@ -0,0 +1,47 @@
+ export EXPERT=$1
+ export MODEL_SIZE=$2
+ export BATCH_SIZE=$3
+ export CUDA_VISIBLE_DEVICES=$4
+
+ export BASE_DIR=/workspace
+ export WANDB_API_KEY=[redacted]
+ export WANDB_PROJECT=airoboros-lmoe-$MODEL_SIZE-2.1-$EXPERT
+
+ pyt qlora.py \
+     --model_name_or_path $BASE_DIR/llama-2-$MODEL_SIZE-hf \
+     --output_dir $BASE_DIR/$WANDB_PROJECT \
+     --num_train_epochs 3 \
+     --logging_steps 1 \
+     --save_strategy steps \
+     --save_steps 100 \
+     --save_total_limit 1 \
+     --data_seed 11422 \
+     --evaluation_strategy no \
+     --eval_dataset_size 2 \
+     --max_new_tokens 4096 \
+     --dataloader_num_workers 3 \
+     --logging_strategy steps \
+     --remove_unused_columns False \
+     --do_train \
+     --lora_r 64 \
+     --lora_alpha 16 \
+     --lora_modules all \
+     --bf16 \
+     --bits 4 \
+     --double_quant \
+     --quant_type nf4 \
+     --warmup_ratio 0.03 \
+     --lr_scheduler_type constant \
+     --dataset airoboros-lmoe-2.1/expert_$EXPERT.jsonl \
+     --dataset_format airoboros \
+     --model_max_len 4096 \
+     --per_device_train_batch_size $BATCH_SIZE \
+     --learning_rate 0.00017 \
+     --adam_beta2 0.999 \
+     --max_grad_norm 0.3 \
+     --lora_dropout 0.05 \
+     --weight_decay 0.0 \
+     --seed 11422 \
+     --report_to wandb \
+     --gradient_accumulation_steps 16 \
+     --gradient_checkpointing
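The script takes four positional arguments: expert name, model size, per-device batch size, and the CUDA device list. For example, `bash scripts/tune.sh code 70b 4 0,1,2,3,4,5,6,7` would (hypothetically) fine-tune the code expert against llama-2-70b with a per-device batch size of 4 on eight GPUs; the concrete values here are illustrative, and `pyt` is assumed to be a local alias for the Python interpreter used to launch qlora.py.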
training_data/expert_code.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2d2ee497fe2eb7ee9d8a53d8efe6c711174eadd3c593b447bfbf73f2c964ccf
+ size 17716707
training_data/expert_creative.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c41f8aa2f90c066ba13fdc6e28458a4bfc4b6478c01f07b449c442f04093c3e0
+ size 25482996
training_data/expert_function.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
training_data/expert_general.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e658a5081fd87b3869033e0b5305c16f69a6c8d27a8ad903473db9b03b7914b1
+ size 18641341
training_data/expert_qa.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37ec3201c55bfb43326bc002032ea73179feae8b661b9678f32871b370fe7b02
+ size 12318163
training_data/expert_reasoning.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00f4c3813c9c231a54ac3279af97ffe7268424e6c175eba4b048f310839fccd9
+ size 17556799