decula committed on
Commit 9b0e8a8 · 1 Parent(s): d8f76d5

added lll

Files changed (1): qwen3_9b_finetune.py +28 -23
qwen3_9b_finetune.py CHANGED
@@ -15,10 +15,9 @@ from transformers import (
 from datasets import load_dataset
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 
-# 1. GPU memory defragmentation config
+# 1. Environment and GPU memory config
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
-# 2. Kaggle authentication
 try:
     user_secrets = UserSecretsClient()
     hf_token = user_secrets.get_secret("HF_TOKEN")
@@ -35,7 +34,7 @@ if not dist.is_initialized():
 model_id = "Qwen/Qwen3.5-9B"
 dataset_id = "a686d380/h-corpus-2023"
 
-# 3. Aggressive quantization config
+# 2. Aggressive quantization config
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
@@ -43,9 +42,9 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.float16
 )
 
-# 4. Load the model
+# 3. Load the model (core fix)
 if local_rank == 0:
-    print(f"Loading Qwen-9B in survival mode...")
+    print(f"Loading model core: {model_id}...")
 
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
@@ -55,59 +54,62 @@ model = AutoModelForCausalLM.from_pretrained(
     token=hf_token,
     torch_dtype=torch.float16,
     low_cpu_mem_usage=True,
-    attn_implementation="sdpa"  # T4 requires sdpa
+    attn_implementation="sdpa"
 )
 
-# 5. Aggressive pruning: drop the last 12 layers (~30% of the original model's VRAM)
+# 4. Architecture pruning (drop the last 12 layers so the PEFT conversion does not crash)
 if hasattr(model, "model") and hasattr(model.model, "layers"):
     model.model.layers = model.model.layers[:-12]
     model.config.num_hidden_layers = len(model.model.layers)
     if local_rank == 0:
-        print(f"Pruning succeeded. Remaining layers: {len(model.model.layers)}")
+        print(f"Pruning succeeded. Current layer count: {len(model.model.layers)}")
 
-# 6. Force GPU memory cleanup
 gc.collect()
 torch.cuda.empty_cache()
 
-# 7. QLoRA config (trim target modules to save optimizer VRAM)
+# 5. QLoRA preparation
 model = prepare_model_for_kbit_training(model)
-
 lora_config = LoraConfig(
-    r=8,
-    lora_alpha=16,
-    # Target only the Q and V projections: the best cost/benefit, lowest VRAM footprint
-    target_modules=["q_proj", "v_proj"],
+    r=8, lora_alpha=16,
+    target_modules=["q_proj", "v_proj"],
     lora_dropout=0.05,
     task_type="CAUSAL_LM"
 )
 model = get_peft_model(model, lora_config)
 
-# 8. Data preprocessing
+# 6. Data preprocessing (streaming fast path)
 tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=hf_token)
 tokenizer.pad_token = tokenizer.eos_token
 
-dataset = load_dataset(dataset_id, split="train", token=hf_token)
+if local_rank == 0:
+    print("Connecting to the dataset in streaming mode (no download wait)...")
+
+# With streaming=True this call returns immediately
+raw_dataset = load_dataset(dataset_id, split="train", token=hf_token, streaming=True)
 
 def tokenize_fn(x):
     text_col = "text" if "text" in x else list(x.keys())[0]
-    return tokenizer(x[text_col], truncation=True, max_length=512, padding="max_length")
+    tokenized = tokenizer(x[text_col], truncation=True, max_length=512, padding="max_length")
+    return {"input_ids": tokenized["input_ids"], "attention_mask": tokenized["attention_mask"]}
 
-tokenized_ds = dataset.map(tokenize_fn, batched=True, remove_columns=dataset.column_names)
+# map() on a streaming dataset is lazy as well, so this step is nearly instant
+tokenized_ds = raw_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
 
-# 9. Training args (disable unnecessary monitoring)
+# 7. Training arguments
 training_args = TrainingArguments(
-    output_dir="./qwen_survival_out",
+    output_dir="./qwen_stream_out",
     per_device_train_batch_size=1,
     gradient_accumulation_steps=16,
     learning_rate=2e-4,
     fp16=True,
     gradient_checkpointing=True,
     logging_steps=5,
-    max_steps=100,
+    max_steps=200,  # must be set explicitly: a streaming dataset has no len() for the Trainer to query
     save_total_limit=1,
     ddp_find_unused_parameters=False,
     report_to="none",
-    # Force-disable the KV cache used at inference time
+    max_grad_norm=1.0,
     gradient_checkpointing_kwargs={"use_reentrant": False}
 )
@@ -119,8 +121,11 @@ trainer = Trainer(
 )
 
 model.config.use_cache = False
+if local_rank == 0:
+    print("Data stream ready, starting training...")
 trainer.train()
 
+# 8. Save
 if local_rank == 0:
     trainer.model.save_pretrained("./qwen_final_lora")
     print("Training finished successfully!")
 