|
|
import gc
from pathlib import Path

import numpy as np
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers import QwenImagePipeline, QwenImageTransformer2DModel, QwenImageInpaintPipeline
from PIL import Image
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
from transformers import Qwen2_5_VLForConditionalGeneration
|
|
|
|
|
|
|
|
# --- Generation settings ---------------------------------------------------
# Text prompt passed to both the text-to-image and the inpainting call.
prompt = "equirectangular, a woman and a man sitting at a cafe, the woman has red hair and she's wearing purple sweater with a black scarf and a white hat, the man is sitting on the other side of the table and he's wearing a white shirt with a purple scarf and red hat, both of them are sipping their coffee while in the table there's some cake slices on their respective plates, each with forks and knives at each side."
# Negative prompt passed alongside `prompt`; empty means none is supplied.
negative_prompt = ""
# File the first-pass image is saved to; the seam-fixed image is saved next
# to it with a "_seamfix" suffix.
output_filename = "qwen_bnb_nf4.png"
# Output size in pixels handed to the pipelines (2:1 aspect ratio).
width = 2048
height = 1024
# Forwarded unchanged as the pipelines' `true_cfg_scale` argument.
true_cfg_scale = 4.0
# Forwarded unchanged as the pipelines' `num_inference_steps` argument.
num_inference_steps = 25
# Seed for the torch.Generator shared by both pipeline calls.
seed = 42

# --- LoRA weights ------------------------------------------------------------
# Repository id and weight file passed to `load_lora_weights`.
lora_model_id = "ProGamerGov/qwen-360-diffusion"
lora_filename = "qwen-360-diffusion-int8-bf16-v1.safetensors"

# --- Base model ----------------------------------------------------------------
# Source passed to every `from_pretrained` call (pipeline, transformer,
# text encoder).
model_id = "diffusers/qwen-image-nf4"
# dtype passed to every `from_pretrained` call.
torch_dtype = torch.bfloat16
# Device the random generator is created on.
device = "cuda"

# --- Seam repair ---------------------------------------------------------------
# When True, a second inpainting pass is run over the horizontal wrap seam.
fix_seam = True
# Forwarded as the inpainting pipeline's `strength` argument.
inpaint_strength = 0.5
# Fraction of the image width covered by the seam mask.
seam_width = 0.10
|
|
|
|
|
|
|
|
def shift_equirect(img: "Image.Image") -> "Image.Image":
    """Return a copy of *img* rolled horizontally by half its width.

    Columns that fall off the right edge wrap around to the left, so the
    left/right borders of the input end up side by side in the centre of the
    result. For an even width, applying the function twice restores the
    original column order.

    Args:
        img: Source image; anything ``np.array`` can convert to an
            ``(H, W)`` or ``(H, W, C)`` array.

    Returns:
        A new PIL image built from the rolled pixel array.
    """
    pixels = np.array(img)
    # Roll the raw integer pixels directly: a float/255 round trip followed by
    # a truncating uint8 cast can lower a channel value by one, and a
    # channel-first permute would reject single-channel images.
    # Axis 1 is the width axis for both (H, W) and (H, W, C) layouts.
    half_width = pixels.shape[1] // 2
    return Image.fromarray(np.roll(pixels, half_width, axis=1))
|
|
|
|
|
def create_seam_mask(w: int, h: int, frac: float = 0.10) -> "Image.Image":
    """Create a vertical, centred seam mask as an 8-bit grayscale PIL image.

    Args:
        w: Mask width in pixels.
        h: Mask height in pixels.
        frac: Fraction of ``w`` the seam strip should cover.

    Returns:
        An ``"L"``-mode image of size ``(w, h)`` that is 255 inside a
        full-height strip centred on ``w // 2`` and 0 everywhere else.
    """
    # At least one column is always masked, and never more than the image has.
    seam_w = min(w, max(1, int(w * frac)))
    # Derive the end from the start so the strip is exactly `seam_w` columns
    # wide; symmetric `c +/- seam_w // 2` bounds lose a column for odd widths
    # and produce an empty mask when seam_w == 1.
    start = max(0, w // 2 - seam_w // 2)
    end = min(w, start + seam_w)

    mask = np.zeros((h, w), dtype=np.uint8)
    mask[:, start:end] = 255
    return Image.fromarray(mask, "L")
|
|
|
|
|
|
|
|
def load_pipeline(text_encoder, transformer, mode="t2i"):
    """Assemble a Qwen-Image pipeline around already-loaded model components.

    Args:
        text_encoder: Text encoder instance handed to ``from_pretrained``.
        transformer: Diffusion transformer instance handed to
            ``from_pretrained``.
        mode: ``"t2i"`` selects ``QwenImagePipeline``; any other value
            (``main`` passes ``"i2i"``) selects ``QwenImageInpaintPipeline``.

    Returns:
        The pipeline with the LoRA weights loaded, model CPU offload and VAE
        tiling enabled, and the transformer's repeated blocks compiled.
    """
    pipeline_cls = QwenImagePipeline if mode == "t2i" else QwenImageInpaintPipeline
    pipe = pipeline_cls.from_pretrained(
        model_id,
        transformer=transformer,
        text_encoder=text_encoder,
        torch_dtype=torch_dtype,
    )

    # NOTE(review): when the same transformer object is passed to a second
    # call (as main does for the seam fix), LoRA loading and block compilation
    # are requested on it again -- confirm this is intended.
    pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)

    pipe.enable_model_cpu_offload()
    pipe.enable_vae_tiling()
    pipe.transformer.compile_repeated_blocks(fullgraph=True, dynamic=True)
    return pipe
|
|
|
|
|
|
|
|
def main() -> None:
    """Generate the equirectangular image and optionally repair its wrap seam.

    Steps:
      1. Load the NF4-quantized transformer and text encoder from `model_id`.
      2. Run the text-to-image pipeline and save the result to
         `output_filename`.
      3. If `fix_seam` is set: roll the image by half its width so the
         left/right borders meet in the centre, inpaint a centred vertical
         strip, roll back, and save the result next to the first image with a
         ``_seamfix`` suffix.
    """
    # 4-bit NF4 config for the diffusion transformer; the listed module is
    # excluded from quantization.
    transformer_quant_config = DiffusersBitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        llm_int8_skip_modules=["transformer_blocks.0.img_mod"],
    )
    transformer = QwenImageTransformer2DModel.from_pretrained(
        model_id,
        subfolder="transformer",
        quantization_config=transformer_quant_config,
        torch_dtype=torch_dtype,
    ).to("cpu")

    # Same NF4 settings for the text encoder (transformers-side config class).
    text_encoder_quant_config = TransformersBitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        subfolder="text_encoder",
        quantization_config=text_encoder_quant_config,
        torch_dtype=torch_dtype,
    ).to("cpu")

    generator = torch.Generator(device=device).manual_seed(seed)

    # Arguments common to the text-to-image and the inpainting call. The same
    # generator object is reused, so the second call continues its RNG stream.
    shared_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "width": width,
        "height": height,
        "num_inference_steps": num_inference_steps,
        "true_cfg_scale": true_cfg_scale,
        "generator": generator,
    }

    pipe = load_pipeline(text_encoder, transformer, mode="t2i")
    image = pipe(**shared_kwargs).images[0]
    image.save(output_filename)

    if not fix_seam:
        return

    # Drop the first pipeline before building the second one; collect garbage
    # first so the cached CUDA memory can actually be returned.
    del pipe
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Move the wrap seam to the image centre and mask a strip around it.
    shifted = shift_equirect(image)
    mask = create_seam_mask(width, height, frac=seam_width)

    pipe = load_pipeline(text_encoder, transformer, mode="i2i")
    image_fixed = pipe(
        image=shifted,
        mask_image=mask,
        strength=inpaint_strength,
        **shared_kwargs,
    ).images[0]

    # Roll back by half the width to restore the original framing.
    image_fixed = shift_equirect(image_fixed)

    # Build the name from stem + suffix: a plain `.replace(".png", ...)` would
    # leave non-PNG names untouched and overwrite the first-pass image.
    first_pass_path = Path(output_filename)
    seamfix_path = first_pass_path.with_name(
        f"{first_pass_path.stem}_seamfix{first_pass_path.suffix}"
    )
    image_fixed.save(str(seamfix_path))
|
|
|
|
|
# Run the generation only when executed as a script, so importing this module
# (e.g. to reuse shift_equirect or create_seam_mask) has no side effects.
if __name__ == "__main__":
    main()
|
|
|