Update README.md
README.md CHANGED
@@ -10,7 +10,7 @@ pipeline_tag: text-generation
 
 # Jamba-Hercules
 
-<img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/
+<img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/e4xnjDo6AnBeDXliwOoc6.webp" width="500" height="500">
 
 # *Name was changed from Open-Hermes to Hercules. During multiple trainings and testings with lots of different datasets, I found that Jamba has BY FAR reacted the best to this dataset. It contains Open-Hermes-2.0 examples but offers A LOT more in diversity and complexity. Thanks to @Locutusque for the amazing work!
 
@@ -111,10 +111,10 @@ print(tokenizer.batch_decode(outputs)[0])
 ```py
 
 lora_config = LoraConfig(
-    r=
-    lora_alpha=
+    r=16,
+    lora_alpha=32,
     target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
-    lora_dropout=0.05,
+    lora_dropout=0.05,
     task_type="CAUSAL_LM",
     bias="none"
 )
@@ -127,19 +127,22 @@ trainer = SFTTrainer(
     tokenizer=tokenizer,
     args=TrainingArguments(
         num_train_epochs=1,
-        lr_scheduler_type='
-        learning_rate=0.
+        lr_scheduler_type='cosine',
+        learning_rate=0.0002,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=8,
         gradient_checkpointing=True,
-        warmup_steps=10,
-        weight_decay=0.01,
+        warmup_steps=10,
+        weight_decay=0.01,
         fp16=not torch.cuda.is_bf16_supported(),
         bf16=torch.cuda.is_bf16_supported(),
-        logging_steps=1,
-        save_steps=200,
+        logging_steps=1,
+        save_steps=200,
         output_dir="outputs",
-        optim="
+        optim="adamw_bnb_8bit",
+        adam_epsilon=0.00001,
+        adam_beta2=0.95,
+        max_grad_norm=1.0,
         seed=42,
     ),
 )
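The first code hunk fills in the LoRA adapter shape: rank 16 with `lora_alpha=32` (so the adapter update is scaled by alpha / r = 2.0) and 5% dropout on the adapter path, targeting the embedding table plus Jamba's Mamba-mixer projections. As a quick way to see what that config actually trains, here is a minimal sketch that applies it with peft's `get_peft_model`; the base checkpoint name is an assumption, and the card may instead hand the config straight to the trainer.

```py
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# Assumed base checkpoint -- substitute whatever the card actually fine-tunes.
model = AutoModelForCausalLM.from_pretrained(
    "ai21labs/Jamba-v0.1", torch_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r=16,                 # rank of the low-rank update matrices
    lora_alpha=32,        # scaling; effective multiplier is lora_alpha / r = 2.0
    target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
    lora_dropout=0.05,    # dropout on the adapter branch only
    task_type="CAUSAL_LM",
    bias="none",          # keep all base-model bias terms frozen
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # confirms only the adapters are trainable
```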
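The second hunk pins down the optimizer and schedule: cosine decay from a 2e-4 peak after 10 warmup steps, bitsandbytes' 8-bit AdamW (`adamw_bnb_8bit`) with `adam_beta2` lowered to 0.95 and `adam_epsilon` loosened to 1e-5, and gradient clipping at 1.0. Assembled end-to-end, the updated training cell would look roughly like the sketch below; the checkpoint, the dataset id, and the `peft_config`/`dataset_text_field` wiring are assumptions, since the diff only shows the changed argument lines.

```py
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

# Assumed identifiers -- substitute the checkpoint and dataset actually used.
model = AutoModelForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")
dataset = load_dataset("Locutusque/hercules-v2.0", split="train")  # assumed dataset id

lora_config = LoraConfig(  # values from the diff above
    r=16,
    lora_alpha=32,
    target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,    # assumed wiring; the card defines lora_config first
    dataset_text_field="text",  # assumed column name; not shown in the diff
    tokenizer=tokenizer,
    args=TrainingArguments(
        num_train_epochs=1,
        lr_scheduler_type="cosine",     # smooth decay after warmup
        learning_rate=0.0002,           # 2e-4, a common LoRA fine-tuning rate
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,  # effective batch size of 8
        gradient_checkpointing=True,    # recompute activations to save memory
        warmup_steps=10,
        weight_decay=0.01,
        fp16=not torch.cuda.is_bf16_supported(),  # fp16 fallback on older GPUs
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        save_steps=200,
        output_dir="outputs",
        optim="adamw_bnb_8bit",         # 8-bit AdamW from bitsandbytes
        adam_epsilon=0.00001,
        adam_beta2=0.95,                # lower than the 0.999 default
        max_grad_norm=1.0,              # gradient clipping
        seed=42,
    ),
)
trainer.train()
```

Keeping the optimizer state in 8-bit, together with gradient checkpointing and the batch-size-1, 8-step accumulation setup, is presumably what lets a model of Jamba's size train on a single large GPU here.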