support xformers. modify configs.
- config.yaml +1 -1
- ldm/modules/encoders/modules.py +7 -57
- requirements.txt +3 -1
config.yaml
CHANGED
@@ -85,4 +85,4 @@ model:
       params:
         freeze: True
         layer: "penultimate"
-        device: "cpu" #TODO: specify
+        # device: "cpu" #TODO: specify
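The only change here is commenting out the hard-coded `device: "cpu"`; device selection now happens at runtime in the encoder (next file). A minimal sketch of the effect on the parsed config, assuming a plain YAML load (the snippet and its inline YAML are illustrative, not taken from the repo):

import yaml

# Once the key is commented out it simply disappears from the parsed params,
# so nothing downstream receives a forced "cpu" device.
cfg = yaml.safe_load("""
params:
  freeze: True
  layer: "penultimate"
  # device: "cpu"
""")
print(cfg["params"])               # {'freeze': True, 'layer': 'penultimate'}
print("device" in cfg["params"])   # False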
ldm/modules/encoders/modules.py
CHANGED
@@ -118,34 +118,6 @@ class FrozenT5Embedder(AbstractEncoder):
     def encode(self, text):
         return self(text)
 
-# class FrozenByT5Embedder(FrozenT5Embedder):
-#     """Uses the ByT5 transformer encoder for text"""
-#     def __init__(self, version="google/byt5-small", device="cuda", max_length=77, freeze=True):  # others are google/byt5-v1_1-xl and google/t5-v1_1-xxl
-#         # super(super()).__init__()
-#         nn.Module.__init__(self)
-#         print(dir(super()))
-#         # self.tokenizer = T5Tokenizer.from_pretrained(version)
-#         # self.transformer = T5EncoderModel.from_pretrained(version)
-#         self.tokenizer = AutoTokenizer.from_pretrained(version)
-#         self.model = T5ForConditionalGeneration.from_pretrained(version)
-#         self.tokenizer_new = ByT5Tokenizer.from_pretrained(version)
-#         self.transformer = T5EncoderModel.from_pretrained(version)
-#         self.device = device
-#         self.max_length = max_length  # TODO: typical value?
-#         if freeze:
-#             self.freeze()
-
-#     def forward(self, text):
-#         # code base: https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/tokenization_utils_base.py#L2414
-#         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
-#                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
-#         tokens = batch_encoding["input_ids"].to(self.device)
-#         outputs = self.transformer(input_ids=tokens)
-
-#         z = outputs.last_hidden_state
-#         return z
-
-
 class FrozenCLIPEmbedder(AbstractEncoder):
     """Uses the CLIP transformer encoder for text (from huggingface)"""
     LAYERS = [
@@ -211,8 +183,12 @@ class FrozenOpenCLIPEmbedder(AbstractEncoder):
         # aa = model.encode_image(torch.zeros((1, 3,224,224)))
         del model.visual
         self.model = model
-
-
+
+        if not torch.cuda.is_available():
+            self.device = "cpu"
+        else:
+            self.device = device
+
         self.max_length = max_length
         if freeze:
             self.freeze()
@@ -439,12 +415,6 @@ class FrozenOpenCLIPT5ByT5SepEncoder(FrozenOpenCLIPT5ByT5Encoder):
             clip_text = text[0]
             t5_text = text[1] if len(text) > 1 else text[0]
             byt5_text = text[-1]
-            # clip_z = self.clip_encoder.encode(text[0]) #B*77*1024
-            # t5_z = self.t5_encoder.encode(text[1]) #B*77*Z_1
-            # if len(text) == 2:
-            #     byt5_z = self.byt5_encoder.encode(text[1]) #B*77*Z_2
-            # else:
-            #     byt5_z = self.byt5_encoder.encode(text[2]) #B*77*Z_2
         else:
             clip_text = text
             t5_text = text
@@ -460,17 +430,10 @@ class OpenCLIPImageEmbedder(AbstractEncoder):
     """
     Uses the OpenCLIP transformer encoder for image
     """
-
-    # #"pooled",
-    # "last",
-    # "penultimate"
-    # ]
-    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", #stage = "train", # max_length=77, layer = "last",
+    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda",
                  freeze=True, set_grad_checkpointing = True):
         super().__init__()
-        # assert layer in self.LAYERS
         model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version)
-        # del model.visual
         self.image_mean = model.visual.image_mean
         self.image_std = model.visual.image_std
         del model.transformer
@@ -480,29 +443,16 @@ class OpenCLIPImageEmbedder(AbstractEncoder):
         del model.text_projection
         del model.logit_scale
         # only model.visual is left
-        # open_clip.model._build_vision_tower()
 
         self.model = model
         self.device = device
 
         if not freeze and set_grad_checkpointing:
             self.model.visual.set_grad_checkpointing(True)
-        # if freeze:
-        #     self.freeze()
-        # else:
-        #     if set_grad_checkpointing:
-        #         self.model.visual.set_grad_checkpointing(True)
         self.freeze_model = freeze
 
-    # def freeze(self):
-    #     self.model = self.model.eval()
-    #     for param in self.parameters(): #392
-    #         param.requires_grad = False
-
     def forward(self, img):
         z = self.model.encode_image(img) # 2.0.2 , normalize=False) 2.7.0
-        # tokens = open_clip.tokenize(text)
-        # z = self.encode_with_transformer(tokens.to(self.device))
         return z
 
     def encode(self, img):
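The commit title says "support xformers", but the wiring of xformers into the attention blocks is not part of this diff; the visible Python changes are the removal of dead commented-out code and a CUDA-availability fallback for the encoder device. As a sketch only, the memory-efficient attention call that the new dependency provides looks like this (shapes, dtype, and device are illustrative assumptions, not values from the repo):

import torch
import xformers.ops as xops

# (batch, sequence, heads, head_dim) half-precision tensors on the GPU
q = torch.randn(1, 77, 8, 64, device="cuda", dtype=torch.float16)
k = torch.randn(1, 77, 8, 64, device="cuda", dtype=torch.float16)
v = torch.randn(1, 77, 8, 64, device="cuda", dtype=torch.float16)

# Same output shape as q, computed with lower peak memory than a
# naive softmax(QK^T)V implementation.
out = xops.memory_efficient_attention(q, k, v)
print(out.shape)  # torch.Size([1, 77, 8, 64])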
requirements.txt
CHANGED
@@ -6,4 +6,6 @@ gradio
 einops
 pytorch-lightning==1.6.5
 transformers
-open_clip_torch
+open_clip_torch
+ninja
+git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
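Pulling xformers from the main branch is a source build, which is why ninja is added alongside it (a matching CUDA toolchain is assumed to be available on the build machine). A quick, assumed post-install sanity check:

# Verifies that the source build produced an importable package; the version
# string will vary with whichever commit of main was built.
import xformers
import xformers.ops

print(xformers.__version__)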