LordNeel committed on
Commit
94c41e3
·
verified ·
1 Parent(s): 5b052c4

Upload train_glm47_flash.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_glm47_flash.py +210 -0
train_glm47_flash.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "torch>=2.0.0",
5
+ # "transformers>=4.45.0",
6
+ # "trl>=0.12.0",
7
+ # "peft>=0.7.0",
8
+ # "accelerate>=0.24.0",
9
+ # "datasets",
10
+ # "trackio",
11
+ # "bitsandbytes",
12
+ # ]
13
+ # ///
14
+
15
+ """
16
+ Fine-tune GLM-4.7-Flash on Unblinded Mastery dataset for QA and instruction following.
17
+ Using TRL SFTTrainer with LoRA on A100-80GB.
18
+ """
19
+
20
+ import torch
21
+ import trackio
22
+ from datasets import load_dataset
23
+ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
24
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
25
+ from trl import SFTTrainer, SFTConfig
26
+
27
+ # Configuration
28
+ MODEL_NAME = "zai-org/GLM-4.7-Flash"
29
+ DATASET_NAME = "LordNeel/unblinded-mastery-sharegpt"
30
+ OUTPUT_MODEL = "LordNeel/GLM-4.7-Flash-Unblinded-Mastery"
31
+
32
+ print("=" * 60)
33
+ print("GLM-4.7-Flash Fine-tuning for Unblinded Mastery")
34
+ print("=" * 60)
35
+
36
# Load the full training split, then carve out a small held-out eval set.
print("\nLoading dataset...")
dataset = load_dataset(DATASET_NAME, split="train")
print(f"Dataset loaded: {len(dataset)} examples")

# 5% eval split with a fixed seed so the split is reproducible across runs.
print("Creating train/eval split...")
dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]
print(f" Train: {len(train_dataset)} examples")
print(f" Eval: {len(eval_dataset)} examples")
48
+
49
# 4-bit NF4 quantization so the base model fits comfortably on a single GPU.
print("\nSetting up 4-bit quantization...")
_quant_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",              # NF4 weight representation
    "bnb_4bit_compute_dtype": torch.bfloat16,  # matmuls run in bf16
    "bnb_4bit_use_double_quant": True,         # quantize the quantization constants too
}
bnb_config = BitsAndBytesConfig(**_quant_kwargs)
57
+
58
# Load the tokenizer; GLM ships custom code, hence trust_remote_code.
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Some causal-LM tokenizers define no pad token; reuse EOS so padding works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer loaded. Vocab size: {len(tokenizer)}")
64
+
65
# Load the base model in 4-bit and make it trainable for LoRA.
print("\nLoading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,       # GLM requires custom modeling code
    quantization_config=bnb_config,
    device_map="auto",            # place shards automatically on available devices
    torch_dtype=torch.bfloat16,   # NOTE(review): newer transformers prefer `dtype=` — confirm
)
print("Model loaded!")

# Standard peft preparation step for k-bit (quantized) fine-tuning.
model = prepare_model_for_kbit_training(model)
78
+
79
# Discover LoRA target modules: every Linear layer in the network.
print("\nFinding linear layers for LoRA...")

def find_all_linear_names(model):
    """Return the leaf names of all nn.Linear submodules, minus the output head.

    Only the final dotted component of each module path is kept, which is the
    form LoraConfig.target_modules expects. NOTE(review): assumes bitsandbytes'
    quantized Linear4bit subclasses nn.Linear so quantized layers match too —
    confirm against the installed bitsandbytes version.
    """
    linear_names = {
        name.rsplit(".", 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    }
    # Never attach adapters to the output projection.
    linear_names.discard("lm_head")
    return list(linear_names)
92
+
93
# Resolve LoRA targets from the loaded model.
target_modules = find_all_linear_names(model)
print(f" Found target modules: {target_modules}")

# LoRA adapter configuration: rank 32, alpha 64 (2x rank), light dropout.
print("\nConfiguring LoRA...")
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",  # keep all bias terms frozen
)

# Wrap the quantized base model with trainable adapters and report sizes.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
110
+
111
# Format function for ShareGPT conversations
def format_sharegpt(example):
    """Render one ShareGPT record into a single chat-templated string.

    Maps ShareGPT speaker tags onto chat-template roles; unknown tags pass
    through unchanged. Returns {"text": rendered_conversation}.
    """
    role_map = {"system": "system", "human": "user", "gpt": "assistant"}
    messages = [
        {"role": role_map.get(turn["from"], turn["from"]), "content": turn["value"]}
        for turn in example["conversations"]
    ]
    # Full conversation as training text; no trailing generation prompt.
    rendered = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return {"text": rendered}
123
+
124
# Render both splits to a plain "text" column, dropping the raw ShareGPT fields.
print("\nFormatting datasets...")
train_dataset = train_dataset.map(format_sharegpt, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(format_sharegpt, remove_columns=eval_dataset.column_names)
print("Datasets formatted!")
129
+
130
# Trainer/run configuration.
# NOTE(review): the script pins `trl>=0.12.0` open-endedly; newer TRL releases
# renamed SFTConfig's `max_seq_length` to `max_length` and SFTTrainer's
# `tokenizer` to `processing_class` — confirm against the installed version.
print("\nConfiguring training...")
training_config = SFTConfig(
    # Where checkpoints land locally, and how they sync to the Hub.
    output_dir=OUTPUT_MODEL.rsplit("/", 1)[-1],
    push_to_hub=True,
    hub_model_id=OUTPUT_MODEL,
    hub_strategy="every_save",  # upload on every checkpoint save
    hub_private_repo=False,

    # Core schedule: 3 epochs, effective batch size 2 * 8 = 16.
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    max_seq_length=2048,

    # Memory: recompute activations; non-reentrant mode is required with PEFT.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # Logging and checkpoint cadence; keep only the last 3 checkpoints.
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,

    # Periodic evaluation on the held-out split, aligned with saves.
    eval_strategy="steps",
    eval_steps=100,

    # Optimizer/scheduler: paged 8-bit AdamW, cosine decay, 10% warmup.
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",

    # bf16 end to end (matches the 4-bit compute dtype above).
    bf16=True,
    fp16=False,

    # Experiment tracking via trackio.
    report_to="trackio",
    project="unblinded-mastery-finetuning",
    run_name="glm47flash-sft-lora",

    # Dataset handling: pre-formatted "text" column, no sequence packing.
    dataset_text_field="text",
    packing=False,
)
180
+
181
# Build the trainer. The model already carries its LoRA adapters, so no
# peft_config is passed here (passing one would wrap the model a second time).
# NOTE(review): newer TRL renamed `tokenizer=` to `processing_class=` — verify
# against the installed TRL release.
print("\nInitializing trainer...")
trainer = SFTTrainer(
    model=model,
    args=training_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    peft_config=None,
)
191
+
192
# Run training, then persist the final adapter weights to the Hub.
print("\n" + "=" * 60)
print("STARTING TRAINING")
print("=" * 60)
trainer.train()

# Final save + upload (in addition to the every_save Hub syncs above).
print("\nSaving model to Hub...")
trainer.save_model()
trainer.push_to_hub()

# Close out the trackio run so metrics flush.
trackio.finish()

print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print(f"Model saved to: https://huggingface.co/{OUTPUT_MODEL}")
print("View metrics at: https://huggingface.co/spaces/LordNeel/trackio")
print("=" * 60)