# NOTE(review): removed non-code residue ("Spaces:" / "Runtime error") left
# over from a web-page paste; it was not valid Python and carried no content.
# Standard library.
import argparse
import os

# Third-party.
import torch.cuda
from PIL import Image
from torch.nn.functional import interpolate

# Project-local.
from BOOXEL.util import create_BOOXEL_model, PIL2Tensor, Tensor2PIL, convert_dtype
from CKPT_PTH import LLAVA_MODEL_PATH
from llava.llava_agent import LLavaAgent
# Device placement: with two or more GPUs, run BOOXEL and LLaVA on separate
# devices so both models fit; with one GPU they share it. CPU-only execution
# is not supported. (Hoisted the repeated device_count() call.)
_num_gpus = torch.cuda.device_count()
if _num_gpus == 0:
    raise ValueError('当前仅支持 CUDA。')
BOOXEL_device = 'cuda:0'
LLaVA_device = 'cuda:1' if _num_gpus >= 2 else 'cuda:0'
# Command-line arguments. Defaults mirror the upstream BOOXEL recipe; the
# a_prompt / n_prompt defaults are intentionally kept verbatim (they are
# runtime strings fed to the model, not comments).
parser = argparse.ArgumentParser()
parser.add_argument("--img_dir", type=str)               # input image directory
parser.add_argument("--save_dir", type=str)              # output directory
parser.add_argument("--upscale", type=int, default=1)    # upscaling factor
parser.add_argument("--BOOXEL_sign", type=str, default='Q', choices=['F', 'Q'])
parser.add_argument("--seed", type=int, default=1234)
parser.add_argument("--min_size", type=int, default=1024)
parser.add_argument("--edm_steps", type=int, default=50)  # diffusion sampling steps
parser.add_argument("--s_stage1", type=int, default=-1)
parser.add_argument("--s_churn", type=int, default=5)
parser.add_argument("--s_noise", type=float, default=1.003)
parser.add_argument("--s_cfg", type=float, default=7.5)   # classifier-free guidance scale
parser.add_argument("--s_stage2", type=float, default=1.)
parser.add_argument("--num_samples", type=int, default=1)
parser.add_argument("--a_prompt", type=str,
                    default='电影级,高对比度,高度精细,使用哈苏相机拍摄,超精细照片,逼真的最大细节,32K,调色,超高清,极致的细节,皮肤毛孔细节,超清晰度,完美无变形。')
parser.add_argument("--n_prompt", type=str,
                    default='绘画,油画,插图,绘图,艺术,素描,动漫,卡通,CG 风格,3D 渲染,虚幻引擎,模糊,混色,不清晰,怪异纹理,丑陋,肮脏,凌乱,质量最差,质量低,框架,水印,签名,JPEG 伪影,变形,低分辨率,过度平滑')
parser.add_argument("--color_fix_type", type=str, default='Wavelet', choices=["None", "AdaIn", "Wavelet"])
# NOTE(review): store_true with default=True means this flag can never be
# turned off from the CLI; kept as-is for backward compatibility.
parser.add_argument("--linear_CFG", action='store_true', default=True)
parser.add_argument("--linear_s_stage2", action='store_true', default=False)
parser.add_argument("--spt_linear_CFG", type=float, default=4.0)
parser.add_argument("--spt_linear_s_stage2", type=float, default=0.)
parser.add_argument("--ae_dtype", type=str, default="bf16", choices=['fp32', 'bf16'])
parser.add_argument("--diff_dtype", type=str, default="fp16", choices=['fp32', 'fp16', 'bf16'])
parser.add_argument("--no_llava", action='store_true', default=False)
parser.add_argument("--loading_half_params", action='store_true', default=False)
parser.add_argument("--use_tile_vae", action='store_true', default=False)
parser.add_argument("--encoder_tile_size", type=int, default=512)
parser.add_argument("--decoder_tile_size", type=int, default=64)
parser.add_argument("--load_8bit_llava", action='store_true', default=False)
args = parser.parse_args()
print(args)
use_llava = not args.no_llava
use_llava = not args.no_llava
# Load the BOOXEL restoration model and move it to its device, applying the
# requested precision options first so weights are cast before transfer.
model = create_BOOXEL_model('options/BOOXEL_v0.yaml', BOOXEL_sign=args.BOOXEL_sign)
if args.loading_half_params:
    model = model.half()
if args.use_tile_vae:
    # Tiled VAE trades speed for lower VRAM use on large images.
    model.init_tile_vae(encoder_tile_size=args.encoder_tile_size, decoder_tile_size=args.decoder_tile_size)
model.ae_dtype = convert_dtype(args.ae_dtype)
model.model.dtype = convert_dtype(args.diff_dtype)
model = model.to(BOOXEL_device)
# Load the LLaVA captioning agent unless --no_llava was given.
if use_llava:
    llava_agent = LLavaAgent(LLAVA_MODEL_PATH, device=LLaVA_device, load_8bit=args.load_8bit_llava, load_4bit=False)
else:
    llava_agent = None
os.makedirs(args.save_dir, exist_ok=True)
# Process every entry in img_dir. NOTE(review): os.listdir also yields
# subdirectories and non-image files; Image.open would raise on those —
# confirm the input directory contains only images.
for img_pth in os.listdir(args.img_dir):
    img_name = os.path.splitext(img_pth)[0]
    LQ_ips = Image.open(os.path.join(args.img_dir, img_pth))
    # "upsacle" (sic) matches the upstream PIL2Tensor signature — do not rename.
    LQ_img, h0, w0 = PIL2Tensor(LQ_ips, upsacle=args.upscale, min_size=args.min_size)
    LQ_img = LQ_img.unsqueeze(0).to(BOOXEL_device)[:, :3, :, :]  # keep RGB, drop alpha
    # Step 1: denoise a 512-resized copy for LLaVA captioning.
    LQ_img_512, h1, w1 = PIL2Tensor(LQ_ips, upsacle=args.upscale, min_size=args.min_size, fix_resize=512)
    LQ_img_512 = LQ_img_512.unsqueeze(0).to(BOOXEL_device)[:, :3, :, :]
    clean_imgs = model.batchify_denoise(LQ_img_512)
    clean_PIL_img = Tensor2PIL(clean_imgs[0], h1, w1)
    # Step 2: caption the cleaned image with LLaVA (or use an empty prompt).
    if use_llava:
        captions = llava_agent.gen_image_caption([clean_PIL_img])
    else:
        captions = ['']
    print(captions)
    # Step 3: diffusion restoration on the full-resolution input.
    samples = model.batchify_sample(LQ_img, captions, num_steps=args.edm_steps, restoration_scale=args.s_stage1, s_churn=args.s_churn,
                                    s_noise=args.s_noise, cfg_scale=args.s_cfg, control_scale=args.s_stage2, seed=args.seed,
                                    num_samples=args.num_samples, p_p=args.a_prompt, n_p=args.n_prompt, color_fix_type=args.color_fix_type,
                                    use_linear_CFG=args.linear_CFG, use_linear_control_scale=args.linear_s_stage2,
                                    cfg_scale_start=args.spt_linear_CFG, control_scale_start=args.spt_linear_s_stage2)
    # Save each sample, restored to the original (h0, w0) size.
    for _i, sample in enumerate(samples):
        Tensor2PIL(sample, h0, w0).save(f'{args.save_dir}/{img_name}_{_i}.png')