 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import fire
from diffusers import StableDiffusionPipeline
import torch
import torch.nn as nn

from .lora import (
    save_all,
    _find_modules,
    LoraInjectedConv2d,
    LoraInjectedLinear,
    inject_trainable_lora,
    inject_trainable_lora_extended,
)


def _iter_lora(model):
    """Yield every LoRA-injected submodule (linear or conv) of *model*.

    Iteration follows ``model.modules()`` registration order, so two
    architecturally identical models yield corresponding modules in
    lockstep — ``overwrite_base`` relies on this when it ``zip``s the
    base and tuned iterators.
    """
    for module in model.modules():
        # isinstance accepts a tuple of types — one check instead of two.
        if isinstance(module, (LoraInjectedConv2d, LoraInjectedLinear)):
            yield module


def _svd_clamped(residual, rank, clamp_quantile):
    """Rank-``rank`` factorization of a 2-D *residual* via truncated SVD.

    Returns ``(up, down)`` where ``up = U[:, :rank] @ diag(S[:rank])``
    and ``down = Vh[:rank, :]``.  Both factors are clamped symmetrically
    at the *clamp_quantile* quantile of their combined entries, which
    suppresses extreme singular-vector values before they are written
    back as fp16 LoRA weights.
    """
    # full_matrices=False skips the unused trailing columns/rows; the
    # leading `rank` factors are identical to the full decomposition.
    U, S, Vh = torch.linalg.svd(residual, full_matrices=False)
    U = U[:, :rank] @ torch.diag(S[:rank])
    Vh = Vh[:rank, :]

    dist = torch.cat([U.flatten(), Vh.flatten()])
    hi_val = torch.quantile(dist, clamp_quantile)
    return U.clamp(-hi_val, hi_val), Vh.clamp(-hi_val, hi_val)


def overwrite_base(base_model, tuned_model, rank, clamp_quantile):
    """Write the SVD of ``tuned - base`` weight residuals into *base_model*'s LoRA layers.

    Both models must have had LoRA injected with the same configuration
    so their injected modules pair up one-to-one.  For each pair, the
    weight residual is factorized to rank ``rank`` (see ``_svd_clamped``)
    and stored in the base module's ``lora_up`` / ``lora_down`` weights,
    converted to the base model's device and dtype.
    """
    device = base_model.device
    dtype = base_model.dtype

    for lor_base, lor_tune in zip(_iter_lora(base_model), _iter_lora(tuned_model)):
        if isinstance(lor_base, LoraInjectedLinear):
            residual = lor_tune.linear.weight.data - lor_base.linear.weight.data
            print("Distill Linear shape ", residual.shape)
            # SVD is done in fp32 for numerical stability.
            U, Vh = _svd_clamped(residual.float(), rank, clamp_quantile)
        elif isinstance(lor_base, LoraInjectedConv2d):
            residual = lor_tune.conv.weight.data - lor_base.conv.weight.data
            print("Distill Conv shape ", residual.shape)
            # Flatten (out, in, k1, k2) -> (out, in*k1*k2) for a 2-D SVD.
            U, Vh = _svd_clamped(
                residual.float().flatten(start_dim=1), rank, clamp_quantile
            )
            # lora_up is a 1x1 conv: (out_channels, rank, 1, 1).
            U = U.reshape(U.shape[0], U.shape[1], 1, 1)
            # lora_down mirrors the original kernel: (rank, in, k1, k2).
            Vh = Vh.reshape(
                Vh.shape[0],
                lor_base.conv.in_channels,
                lor_base.conv.kernel_size[0],
                lor_base.conv.kernel_size[1],
            )
        else:
            # _iter_lora yields only the two types above; keep a guard
            # so a future module type fails loudly here, not below.
            continue

        assert lor_base.lora_up.weight.shape == U.shape
        assert lor_base.lora_down.weight.shape == Vh.shape

        lor_base.lora_up.weight.data = U.to(device=device, dtype=dtype)
        lor_base.lora_down.weight.data = Vh.to(device=device, dtype=dtype)


def svd_distill(
    target_model: str,
    base_model: str,
    rank: int = 4,
    clamp_quantile: float = 0.99,
    device: str = "cuda:0",
    save_path: str = "svd_distill.safetensors",
):
    """Distill the weight difference between two SD checkpoints into a LoRA.

    Loads the base and fine-tuned pipelines in fp16, injects fresh LoRA
    layers into both, overwrites the base model's LoRA factors with the
    truncated SVD of the weight residuals (U-Net and text encoder), and
    saves the result to *save_path*.
    """

    def _load_pipeline(name):
        # Both checkpoints are loaded identically: fp16 on the target device.
        return StableDiffusionPipeline.from_pretrained(
            name, torch_dtype=torch.float16
        ).to(device)

    pipe_base = _load_pipeline(base_model)
    pipe_tuned = _load_pipeline(target_model)

    # U-Net: the extended injector covers conv layers as well as linears.
    for pipe in (pipe_base, pipe_tuned):
        _ = inject_trainable_lora_extended(pipe.unet, r=rank)

    overwrite_base(
        pipe_base.unet, pipe_tuned.unet, rank=rank, clamp_quantile=clamp_quantile
    )

    # Text encoder: standard injection restricted to CLIP attention blocks.
    for pipe in (pipe_base, pipe_tuned):
        _ = inject_trainable_lora(
            pipe.text_encoder, r=rank, target_replace_module={"CLIPAttention"}
        )

    overwrite_base(
        pipe_base.text_encoder,
        pipe_tuned.text_encoder,
        rank=rank,
        clamp_quantile=clamp_quantile,
    )

    save_all(
        unet=pipe_base.unet,
        text_encoder=pipe_base.text_encoder,
        placeholder_token_ids=None,
        placeholder_tokens=None,
        save_path=save_path,
        save_lora=True,
        save_ti=False,
    )


def main():
    """CLI entry point: expose ``svd_distill`` as a command via python-fire."""
    fire.Fire(svd_distill)