decula committed on
Commit 9b0e8a8 · 1 Parent(s): d8f76d5

added lll

Files changed (1): qwen3_9b_finetune.py +28 -23
qwen3_9b_finetune.py CHANGED
@@ -15,10 +15,9 @@ from transformers import (
 from datasets import load_dataset
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 
-# 1. GPU memory defragmentation config
+# 1. Environment and GPU memory config
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
-# 2. Kaggle authentication
 try:
     user_secrets = UserSecretsClient()
     hf_token = user_secrets.get_secret("HF_TOKEN")
@@ -35,7 +34,7 @@ if not dist.is_initialized():
 model_id = "Qwen/Qwen3.5-9B"
 dataset_id = "a686d380/h-corpus-2023"
 
-# 3. Aggressive quantization config
+# 2. Aggressive quantization config
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
@@ -43,9 +42,9 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.float16
 )
 
-# 4. Load the model
+# 3. Load the model (core fix)
 if local_rank == 0:
-    print(f"Loading Qwen-9B in survival mode...")
+    print(f"Loading model core: {model_id}...")
 
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
@@ -55,59 +54,62 @@ model = AutoModelForCausalLM.from_pretrained(
     token=hf_token,
     torch_dtype=torch.float16,
     low_cpu_mem_usage=True,
-    attn_implementation="sdpa"  # T4 requires sdpa
+    attn_implementation="sdpa"
 )
 
-# 5. Aggressive pruning: drop the last 12 layers (~30% of the original model's VRAM)
+# 4. Architecture pruning (drop the last 12 layers so the PEFT conversion does not crash)
 if hasattr(model, "model") and hasattr(model.model, "layers"):
     model.model.layers = model.model.layers[:-12]
     model.config.num_hidden_layers = len(model.model.layers)
     if local_rank == 0:
-        print(f"Pruning succeeded. Remaining layers: {len(model.model.layers)}")
+        print(f"Pruning succeeded. Current layer count: {len(model.model.layers)}")
 
-# 6. Force GPU memory cleanup
 gc.collect()
 torch.cuda.empty_cache()
 
-# 7. QLoRA config (trim target modules to save optimizer VRAM)
+# 5. QLoRA preparation
 model = prepare_model_for_kbit_training(model)
-
 lora_config = LoraConfig(
-    r=8,
-    lora_alpha=16,
-    # Target only the Q and V projections: the best cost/benefit, lowest VRAM footprint
-    target_modules=["q_proj", "v_proj"],
+    r=8, lora_alpha=16,
+    target_modules=["q_proj", "v_proj"],
     lora_dropout=0.05,
     task_type="CAUSAL_LM"
 )
 model = get_peft_model(model, lora_config)
 
-# 8. Data preprocessing
+# 6. Data preprocessing (streaming fast path)
 tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=hf_token)
 tokenizer.pad_token = tokenizer.eos_token
 
-dataset = load_dataset(dataset_id, split="train", token=hf_token)
+if local_rank == 0:
+    print("Connecting to the dataset in streaming mode (no download wait)...")
+
+# With streaming=True this call returns immediately
+raw_dataset = load_dataset(dataset_id, split="train", token=hf_token, streaming=True)
 
 def tokenize_fn(x):
     text_col = "text" if "text" in x else list(x.keys())[0]
-    return tokenizer(x[text_col], truncation=True, max_length=512, padding="max_length")
+    tokenized = tokenizer(x[text_col], truncation=True, max_length=512, padding="max_length")
+    return {"input_ids": tokenized["input_ids"], "attention_mask": tokenized["attention_mask"]}
 
-tokenized_ds = dataset.map(tokenize_fn, batched=True, remove_columns=dataset.column_names)
+# map() on a streaming dataset is lazy as well, so this step is nearly instant
+tokenized_ds = raw_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
 
-# 9. Training args (disable unnecessary monitoring)
+# 7. Training arguments
 training_args = TrainingArguments(
-    output_dir="./qwen_survival_out",
+    output_dir="./qwen_stream_out",
     per_device_train_batch_size=1,
     gradient_accumulation_steps=16,
     learning_rate=2e-4,
     fp16=True,
     gradient_checkpointing=True,
     logging_steps=5,
-    max_steps=100,
+    max_steps=200,  # must be set explicitly: a streaming dataset has no len() for the Trainer to query
     save_total_limit=1,
     ddp_find_unused_parameters=False,
     report_to="none",
-    # Force-disable the KV cache used at inference time
+    max_grad_norm=1.0,
     gradient_checkpointing_kwargs={"use_reentrant": False}
 )
@@ -119,8 +121,11 @@ trainer = Trainer(
 )
 
 model.config.use_cache = False
+if local_rank == 0:
+    print("Data stream ready, starting training...")
 trainer.train()
 
+# 8. Save
 if local_rank == 0:
     trainer.model.save_pretrained("./qwen_final_lora")
     print("Training finished successfully!")
 