Update README.md
README.md CHANGED
@@ -10,7 +10,7 @@ pipeline_tag: text-generation
 
 # Jamba-Hercules
 
-<img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/
+<img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/e4xnjDo6AnBeDXliwOoc6.webp" width="500" height="500">
 
 # *Name was changed from Open-Hermes to Hercules. During multiple trainings and testings with lots of different datasets, I found that Jamba has BY FAR reacted the best to this dataset. It contains Open-Hermes-2.0 examples but offers A LOT more in diversity and complexity. Thanks to @Locutusque for the amazing work!
 
@@ -111,10 +111,10 @@ print(tokenizer.batch_decode(outputs)[0])
 ```py
 
 lora_config = LoraConfig(
-    r=
-    lora_alpha=
+    r=16,
+    lora_alpha=32,
     target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
-    lora_dropout=0.05,
+    lora_dropout=0.05,
     task_type="CAUSAL_LM",
     bias="none"
 )
@@ -127,19 +127,22 @@ trainer = SFTTrainer(
     tokenizer=tokenizer,
     args=TrainingArguments(
         num_train_epochs=1,
-        lr_scheduler_type='
-        learning_rate=0.
+        lr_scheduler_type='cosine',
+        learning_rate=0.0002,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=8,
         gradient_checkpointing=True,
-        warmup_steps=10,
-        weight_decay=0.01,
+        warmup_steps=10,
+        weight_decay=0.01,
         fp16=not torch.cuda.is_bf16_supported(),
         bf16=torch.cuda.is_bf16_supported(),
-        logging_steps=1,
-        save_steps=200,
+        logging_steps=1,
+        save_steps=200,
         output_dir="outputs",
-        optim="
+        optim="adamw_bnb_8bit",
+        adam_epsilon=0.00001,
+        adam_beta2=0.95,
+        max_grad_norm=1.0,
         seed=42,
     ),
 )
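The first code hunk fills in the LoRA adapter shape: rank 16 with `lora_alpha=32` (so the adapter update is scaled by alpha / r = 2.0) and 5% dropout on the adapter path, targeting the embedding table plus Jamba's Mamba-mixer projections. As a quick way to see what that config actually trains, here is a minimal sketch that applies it with peft's `get_peft_model`; the base checkpoint name is an assumption, and the card may instead hand the config straight to the trainer.

```py
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# Assumed base checkpoint -- substitute whatever the card actually fine-tunes.
model = AutoModelForCausalLM.from_pretrained(
    "ai21labs/Jamba-v0.1", torch_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r=16,                 # rank of the low-rank update matrices
    lora_alpha=32,        # scaling; effective multiplier is lora_alpha / r = 2.0
    target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
    lora_dropout=0.05,    # dropout on the adapter branch only
    task_type="CAUSAL_LM",
    bias="none",          # keep all base-model bias terms frozen
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # confirms only the adapters are trainable
```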
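The second hunk pins down the optimizer and schedule: cosine decay from a 2e-4 peak after 10 warmup steps, bitsandbytes' 8-bit AdamW (`adamw_bnb_8bit`) with `adam_beta2` lowered to 0.95 and `adam_epsilon` loosened to 1e-5, and gradient clipping at 1.0. Assembled end-to-end, the updated training cell would look roughly like the sketch below; the checkpoint, the dataset id, and the `peft_config`/`dataset_text_field` wiring are assumptions, since the diff only shows the changed argument lines.

```py
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

# Assumed identifiers -- substitute the checkpoint and dataset actually used.
model = AutoModelForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")
dataset = load_dataset("Locutusque/hercules-v2.0", split="train")  # assumed dataset id

lora_config = LoraConfig(  # values from the diff above
    r=16,
    lora_alpha=32,
    target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,    # assumed wiring; the card defines lora_config first
    dataset_text_field="text",  # assumed column name; not shown in the diff
    tokenizer=tokenizer,
    args=TrainingArguments(
        num_train_epochs=1,
        lr_scheduler_type="cosine",     # smooth decay after warmup
        learning_rate=0.0002,           # 2e-4, a common LoRA fine-tuning rate
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,  # effective batch size of 8
        gradient_checkpointing=True,    # recompute activations to save memory
        warmup_steps=10,
        weight_decay=0.01,
        fp16=not torch.cuda.is_bf16_supported(),  # fp16 fallback on older GPUs
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        save_steps=200,
        output_dir="outputs",
        optim="adamw_bnb_8bit",         # 8-bit AdamW from bitsandbytes
        adam_epsilon=0.00001,
        adam_beta2=0.95,                # lower than the 0.999 default
        max_grad_norm=1.0,              # gradient clipping
        seed=42,
    ),
)
trainer.train()
```

Keeping the optimizer state in 8-bit, together with gradient checkpointing and the batch-size-1, 8-step accumulation setup, is presumably what lets a model of Jamba's size train on a single large GPU here.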