Severian committed
Commit 7269076
1 parent: 8a199a7

Update README.md

Files changed (1): README.md (+14, -11)
README.md CHANGED
@@ -10,7 +10,7 @@ pipeline_tag: text-generation
 
 # Jamba-Hercules
 
-<img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/Ph6ZvxwF7a0m_B5Su_EK7.webp" width="500" height="500">
+<img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/e4xnjDo6AnBeDXliwOoc6.webp" width="500" height="500">
 
 # *The name was changed from Open-Hermes to Hercules. Across multiple training and testing runs with many different datasets, I found that Jamba responded BY FAR the best to this dataset. It contains Open-Hermes-2.0 examples but offers A LOT more diversity and complexity. Thanks to @Locutusque for the amazing work!
 
@@ -111,10 +111,10 @@ print(tokenizer.batch_decode(outputs)[0])
 ```py
 
 lora_config = LoraConfig(
-    r=8,
-    lora_alpha=16,
+    r=16,
+    lora_alpha=32,
     target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
-    lora_dropout=0.05,
+    lora_dropout=0.05,
     task_type="CAUSAL_LM",
     bias="none"
 )
@@ -127,19 +127,22 @@ trainer = SFTTrainer(
     tokenizer=tokenizer,
     args=TrainingArguments(
         num_train_epochs=1,
-        lr_scheduler_type='linear',
-        learning_rate=0.001,
+        lr_scheduler_type='cosine',
+        learning_rate=0.0002,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=8,
         gradient_checkpointing=True,
-        warmup_steps=10,
-        weight_decay=0.01,
+        warmup_steps=10,
+        weight_decay=0.01,
         fp16=not torch.cuda.is_bf16_supported(),
         bf16=torch.cuda.is_bf16_supported(),
-        logging_steps=1,
-        save_steps=200,
+        logging_steps=1,
+        save_steps=200,
         output_dir="outputs",
-        optim="adamw_8bit",
+        optim="adamw_bnb_8bit",
+        adam_epsilon=0.00001,
+        adam_beta2=0.95,
+        max_grad_norm=1.0,
         seed=42,
     ),
 )
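
For reference, the changed hunks assemble into roughly the following fine-tuning setup. This is a minimal sketch, not the exact script from the README: only the `LoraConfig` and `TrainingArguments` values are taken from the diff above, while the base-model ID, dataset ID, dtype, text-field name, and sequence length are illustrative assumptions for parts of the README that this diff does not show.

```py
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

# Assumptions (not in the diff): base model, dataset ID, dtype, text field.
model_id = "ai21labs/Jamba-v0.1"          # assumed base model
dataset_id = "Locutusque/hercules-v4.0"   # assumed Hercules dataset ID

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # simplified load; the README's quantization setup is not in this diff
    device_map="auto",
)
dataset = load_dataset(dataset_id, split="train")

# LoRA settings from the updated README: r=16, alpha=32, Jamba/Mamba projection modules.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
)

# Training hyperparameters from the updated README: cosine schedule, lr 2e-4,
# 8-bit AdamW (bitsandbytes), beta2=0.95, epsilon=1e-5, grad-norm clipping at 1.0.
training_args = TrainingArguments(
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    learning_rate=0.0002,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    warmup_steps=10,
    weight_decay=0.01,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=1,
    save_steps=200,
    output_dir="outputs",
    optim="adamw_bnb_8bit",
    adam_epsilon=0.00001,
    adam_beta2=0.95,
    max_grad_norm=1.0,
    seed=42,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_args,
    dataset_text_field="text",  # assumed column name; depends on the dataset schema
    max_seq_length=2048,        # assumed
)
trainer.train()
```

Relative to the previous values, this commit doubles the LoRA rank and alpha (8/16 to 16/32), lowers the learning rate from 1e-3 to 2e-4, swaps the linear schedule for cosine, switches the optimizer string to `adamw_bnb_8bit`, and pins `adam_epsilon`, `adam_beta2`, and `max_grad_norm` explicitly.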