xchuan committed
Commit fa9e87f
1 Parent(s): c5419c3

Upload folder using huggingface_hub (#3)


- 0cf1359d1746adb23074cc2027b559d84c71823a194f6d10f9827ab4de94eea1 (d33c2a18f2b9f6d5a26f7ac83031998695e09a86)

Files changed (2)
  1. .DS_Store +0 -0
  2. README.md +372 -0
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
README.md ADDED
@@ -0,0 +1,372 @@
---
license: mit
tags:
- pytorch
- stable-diffusion
- text2Image
- stabilityai/stable-diffusion-2-1
---

# This LoRA was trained on stabilityai/stable-diffusion-2-1.

## Training code

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from datasets import load_dataset

dataset = load_dataset("xchuan/text2image-fupo", split="train")

from transformers import CLIPTokenizer
from huggingface_hub import login
# ========== LoRA model library ==========
from peft import LoraConfig, get_peft_model, PeftModel


login(token="<replace-with-your-own-token>", add_to_git_credential=True)

weight_dtype = torch.bfloat16
train_batch_size = 2
snr_gamma = 5  # SNR parameter: scaling coefficient for the SNR-weighted loss

# Fix the random seeds for reproducibility
seed = 1126
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Optimizer parameters
unet_learning_rate = 1e-4  # step size for the UNet parameter updates
text_encoder_learning_rate = 1e-4  # step size for the text-encoder parameter updates

# Learning-rate scheduler parameters
lr_scheduler_name = "cosine_with_restarts"  # cosine annealing with periodic restarts
lr_warmup_steps = 100  # ramp the learning rate up to its maximum over the first 100 steps
max_train_steps = 1000  # total number of training steps
num_cycles = 3  # number of cosine cycles: the schedule decays and restarts 3 times

pretrained_model_name_or_path = "stabilityai/stable-diffusion-2-1"

# LoRA configuration
lora_config = LoraConfig(
    r=32,  # LoRA rank: the dimension of the low-rank update matrices
    lora_alpha=16,  # scaling factor that controls how strongly the LoRA weights affect the model
    target_modules=[
        # "q_proj", "v_proj", "k_proj", "out_proj",  # text-encoder attention projections (disabled here)
        "to_k", "to_q", "to_v", "to_out.0"  # UNet attention projections
    ],
    lora_dropout=0  # LoRA dropout probability; 0 disables dropout
)

from torchvision import transforms
from torch.utils.data import DataLoader

resolution = 512


train_transform = transforms.Compose([
    transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),  # resize the image
    transforms.CenterCrop(resolution),  # center-crop to resolution x resolution
    transforms.RandomHorizontalFlip(),  # random horizontal flip for augmentation
    transforms.ToTensor(),  # convert the image to a tensor in [0, 1]
    transforms.Normalize([0.5], [0.5]),  # map to [-1, 1], the input range the VAE expects
])

# Load the tokenizer once up front instead of re-downloading it for every example
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")

def collate_fn(examples):
    pixel_values = []
    input_ids = []

    for example in examples:
        image_tensor = train_transform(example["image"])
        if not isinstance(image_tensor, torch.Tensor):
            print(f"Expected Tensor, got {type(image_tensor)} instead.")
            continue
        pixel_values.append(image_tensor)

        # Prepend the trigger word "fupo:" to every caption
        input_text = "fupo:" + example["text"]
        encode_text = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)
        input_ids.append(encode_text["input_ids"].squeeze(0))

    # If no valid image survived, return empty tensors
    if len(pixel_values) == 0:
        return {"pixel_values": torch.empty(0), "input_ids": torch.empty(0)}

    pixel_values = torch.stack(pixel_values, dim=0).float()
    input_ids = torch.stack(input_ids, dim=0)
    return {"pixel_values": pixel_values, "input_ids": input_ids}


train_dataloader = DataLoader(dataset, shuffle=True, collate_fn=collate_fn, batch_size=train_batch_size)

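# Optional sanity check (a suggested addition, not part of the original script):
# pull one batch and confirm the shapes before starting a long run, e.g.
# pixel_values of (train_batch_size, 3, 512, 512) and input_ids of (train_batch_size, 77).
# batch = next(iter(train_dataloader))
# print(batch["pixel_values"].shape, batch["input_ids"].shape)
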
import os
from diffusers import DDPMScheduler, AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTextModel

def prepare_lora_model(lora_config, pretrained_model_name_or_path, model_path=None, resume=False, merge_lora=False):
    """
    (1) Goal:
       - Load the full Stable Diffusion model, including the LoRA layers, and merge the LoRA
         weights if requested. This covers the tokenizer, noise scheduler, UNet, VAE and text encoder.

    (2) Parameters:
       - lora_config: LoraConfig, the LoRA configuration object
       - pretrained_model_name_or_path: str, model name or path on Hugging Face
       - model_path: str, path to previously trained weights
       - resume: bool, whether to resume from the previous training run
       - merge_lora: bool, whether to merge the LoRA weights for inference

    (3) Returns:
       - tokenizer: CLIPTokenizer
       - noise_scheduler: DDPMScheduler
       - unet: UNet2DConditionModel
       - vae: AutoencoderKL
       - text_encoder: CLIPTextModel
    """
    # Noise scheduler: controls how noise is added to and removed from the latents
    noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")

    # Tokenizer: converts text captions into token ids
    tokenizer = CLIPTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="tokenizer"
    )

    # CLIP text encoder: converts text captions into feature vectors
    text_encoder = CLIPTextModel.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=weight_dtype,
        subfolder="text_encoder"
    )

    # VAE: maps images to and from the latent space used by the diffusion model
    vae = AutoencoderKL.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="vae"
    )

    # UNet: the denoising network that drives generation and inference
    unet = UNet2DConditionModel.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=weight_dtype,
        subfolder="unet"
    )

    # If resuming, load the weights from the previous run
    if resume:
        if model_path is None or not os.path.exists(model_path):
            raise ValueError("A valid model_path must be provided when resume is True")
        # Load the LoRA models with PEFT's from_pretrained
        text_encoder = PeftModel.from_pretrained(text_encoder, os.path.join(model_path, "text_encoder"))
        unet = PeftModel.from_pretrained(unet, os.path.join(model_path, "unet"))

        # Make sure the UNet's trainable parameters have requires_grad set to True
        for param in unet.parameters():
            if param.requires_grad is False:
                param.requires_grad = True

        # Make sure the text encoder's trainable parameters have requires_grad set to True
        for param in text_encoder.parameters():
            if param.requires_grad is False:
                param.requires_grad = True

        print(f"✅ Restored model weights from {model_path}")

    else:
        # Apply the LoRA config to text_encoder and unet
        # text_encoder = get_peft_model(text_encoder, lora_config)
        unet = get_peft_model(unet, lora_config)

        # Print the number of trainable parameters
        # print("📊 Text encoder trainable parameters:")
        # text_encoder.print_trainable_parameters()
        print("📊 UNet trainable parameters:")
        unet.print_trainable_parameters()

    if merge_lora:
        # Merge the LoRA weights into the base model; only used for inference
        text_encoder = text_encoder.merge_and_unload()
        unet = unet.merge_and_unload()

        # Switch to evaluation mode
        text_encoder.eval()
        unet.eval()

    # Freeze the VAE and text-encoder parameters
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)

    # Move the models to the GPU and set the weight dtype
    unet.to(device, dtype=weight_dtype)
    vae.to(device, dtype=weight_dtype)
    text_encoder.to(device, dtype=weight_dtype)

    return tokenizer, noise_scheduler, unet, vae, text_encoder

def prepare_optimizer(unet, text_encoder, unet_learning_rate=5e-4, text_encoder_learning_rate=1e-4):
    # Collect the trainable LoRA parameters of the UNet
    unet_lora_layers = [p for p in unet.parameters() if p.requires_grad]

    # Collect the trainable LoRA parameters of the text encoder
    text_encoder_lora_layers = [p for p in text_encoder.parameters() if p.requires_grad]

    # Group the trainable parameters and give each group its own learning rate
    trainable_params = [
        {"params": unet_lora_layers, "lr": unet_learning_rate},
        {"params": text_encoder_lora_layers, "lr": text_encoder_learning_rate}
    ]

    # Use the AdamW optimizer
    optimizer = torch.optim.AdamW(trainable_params)

    return optimizer

from diffusers.optimization import get_scheduler
from diffusers.training_utils import compute_snr

project_name = "fupo"
dataset_name = "fupo"
# Root and main directories
root_dir = "./"  # current directory
main_dir = os.path.join(root_dir, "SD-2-2")  # main directory
# Project directory
project_dir = os.path.join(main_dir, project_name)
model_path = os.path.join(project_dir, "logs", "checkpoint-last")

# Prepare the models
tokenizer, noise_scheduler, unet, vae, text_encoder = prepare_lora_model(
    lora_config,
    pretrained_model_name_or_path,
    model_path,
    resume=False,
    merge_lora=False
)

# Prepare the optimizer
optimizer = prepare_optimizer(
    unet,
    text_encoder,
    unet_learning_rate=unet_learning_rate,
    text_encoder_learning_rate=text_encoder_learning_rate
)

# Set up the learning-rate scheduler
lr_scheduler = get_scheduler(
    lr_scheduler_name,
    optimizer=optimizer,
    num_warmup_steps=lr_warmup_steps,
    num_training_steps=max_train_steps,
    num_cycles=num_cycles
)

print("✅ Models and optimizer are ready. Training can start.")

import math
from huggingface_hub import HfApi
from tqdm.auto import tqdm
import torch.nn.functional as F

output_folder = os.path.join(project_dir, "logs")
# Disable tokenizer parallelism to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialization
global_step = 0
best_face_score = float("inf")  # best face-similarity score so far, initialized to +inf (not used below)

# Progress bar over the training steps
progress_bar = tqdm(
    range(max_train_steps),  # driven by max_train_steps
    desc="Training steps",
)

# Training loop
for epoch in range(math.ceil(max_train_steps / len(train_dataloader))):
    # Calling train() inside the loop matters if you add evaluation during training
    unet.train()
    text_encoder.train()

    for step, batch in enumerate(train_dataloader):
        if global_step >= max_train_steps:
            break

        # Encode the images into latent representations
        latents = vae.encode(batch["pixel_values"].to(device, dtype=weight_dtype)).latent_dist.sample()
        latents = latents * vae.config.scaling_factor  # apply the VAE's latent scaling factor

        # Add noise to the latents to create the noisy inputs
        noise = torch.randn_like(latents)  # random noise with the same shape as the latents
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (latents.shape[0],), device=device).long()
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # Get the text embeddings
        encoder_hidden_states = text_encoder(batch["input_ids"].to(device))[0]
        assert encoder_hidden_states is not None, "Encoder hidden states should not be None"

        # Compute the training target
        if noise_scheduler.config.prediction_type == "epsilon":
            target = noise  # predict the noise
        elif noise_scheduler.config.prediction_type == "v_prediction":
            target = noise_scheduler.get_velocity(latents, noise, timesteps)  # predict the velocity
        else:
            raise ValueError(f"Unknown prediction type: {noise_scheduler.config.prediction_type}")

        # UNet prediction
        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states)[0]
        assert model_pred is not None, "Model prediction should not be None"

        # Compute the loss
        if not snr_gamma:
            loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
        else:
            # Compute the signal-to-noise ratio (SNR) and weight the MSE loss by min(SNR, snr_gamma)
            snr = compute_snr(noise_scheduler, timesteps)
            mse_loss_weights = torch.stack([snr, snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0]
            if noise_scheduler.config.prediction_type == "epsilon":
                mse_loss_weights = mse_loss_weights / snr
            elif noise_scheduler.config.prediction_type == "v_prediction":
                mse_loss_weights = mse_loss_weights / (snr + 1)

            # Weighted per-sample MSE loss
            loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
            loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
            loss = loss.mean()

        # Backpropagation
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        global_step += 1

        # Log the training loss
        if global_step % 100 == 0 or global_step == max_train_steps:
            print(f"🔥 Step {global_step}, loss: {loss.item()}")

        # Save an intermediate checkpoint; for simplicity, every 500 steps
        if global_step % 500 == 0:
            save_path = os.path.join(output_folder, f"checkpoint-{global_step}")
            os.makedirs(save_path, exist_ok=True)

            # Save the PeftModel with save_pretrained
            unet.save_pretrained(os.path.join(save_path, "unet"))
            text_encoder.save_pretrained(os.path.join(save_path, "text_encoder"))
            print(f"💾 Saved intermediate checkpoint to {save_path}")

# Save the final model to checkpoint-last
save_path = os.path.join(output_folder, "checkpoint-last")
os.makedirs(save_path, exist_ok=True)
unet.save_pretrained(os.path.join(save_path, "unet"))
# text_encoder.save_lora_weights(os.path.join(save_path, "text_encoder"))
print(f"💾 Saved final model to {save_path}")

print("🎉 Fine-tuning complete!")

# Upload to Hugging Face Hub
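# A minimal upload sketch (the repo id below is a placeholder, not this repo's
# actual id; replace it with your own). upload_folder is the same
# huggingface_hub API this repository was published with.
api = HfApi()
api.create_repo(repo_id="your-username/your-lora-repo", exist_ok=True)
api.upload_folder(
    folder_path=save_path,  # the checkpoint-last directory saved above
    repo_id="your-username/your-lora-repo",
    repo_type="model",
)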
```