Commit 15ed10c
anas-awadalla committed
Parent(s): 5842ec8
added files
app.py
CHANGED
@@ -61,7 +61,7 @@ model, image_processor, tokenizer = create_model_and_transforms(
 
 checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
 model.load_state_dict(torch.load(checkpoint_path), strict=False)
-model.eval()
+model.eval()
 
 def generate(
     idx,
@@ -152,7 +152,7 @@ def generate(
 
     # with torch.cuda.amp.autocast(dtype=torch.bfloat16):
     output = model.generate(
-        vision_x=vision_x
+        vision_x=vision_x,
         lang_x=input_ids.to("cuda"),
         attention_mask=attention_mask.to("cuda"),
         max_new_tokens=30,
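The second hunk restores the trailing comma after vision_x=vision_x; without it, the lang_x keyword argument on the next line makes the call a SyntaxError. A minimal sketch of the corrected call, using only the names visible in the diff (the preparation of vision_x, input_ids, and attention_mask earlier in generate(), and the decoding step at the end, are assumed rather than part of this commit):

# Sketch of the corrected generate() call from the diff.
output = model.generate(
    vision_x=vision_x,                          # trailing comma restored by this commit
    lang_x=input_ids.to("cuda"),
    attention_mask=attention_mask.to("cuda"),
    max_new_tokens=30,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))  # assumed decoding step
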
open_flamingo/open_flamingo/src/factory.py
CHANGED
@@ -79,6 +79,7 @@ def create_model_and_transforms(
     decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
     lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
     lang_encoder.resize_token_embeddings(len(text_tokenizer))
+    lang_encoder.to(0)
 
     model = Flamingo(
         vision_encoder,
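The added lang_encoder.to(0) moves the language encoder onto CUDA device 0 before it is wrapped in the Flamingo model. In PyTorch, passing the integer 0 to .to() is shorthand for "cuda:0"; a minimal sketch with a toy module (assumes a CUDA device is available, and the toy Linear layer stands in for the real MPT-7B encoder):

import torch

layer = torch.nn.Linear(4, 4)           # stand-in for lang_encoder
layer.to(0)                             # same effect as layer.to("cuda:0"); moves parameters in place
print(next(layer.parameters()).device)  # cuda:0
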
open_flamingo/open_flamingo/src/flamingo.py
CHANGED
@@ -212,7 +212,7 @@ class Flamingo(nn.Module):
         with torch.no_grad():
             vision_x = self.vision_encoder(vision_x)[1]
         vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
-        vision_x = self.perceiver(vision_x)
+        vision_x = self.perceiver(vision_x).to(0)
 
         for layer in self.lang_encoder._get_decoder_layers():
             layer.condition_vis_x(vision_x)
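The matching change on the Flamingo side sends the perceiver output to device 0 as well, so the visual features passed to condition_vis_x live on the same GPU as the decoder layers moved in factory.py. A toy illustration of the device mismatch this avoids, using plain tensors rather than the real modules (shapes are made up, and a CUDA device is assumed):

import torch

vision_feats = torch.randn(1, 64, 1024)                    # e.g. perceiver output left on CPU
lang_hidden = torch.randn(1, 10, 1024, device="cuda:0")    # decoder activations on GPU 0

# Mixing devices in the cross-attention matmul raises
# "Expected all tensors to be on the same device"; moving the visual
# features first, as the diff does, avoids it.
vision_feats = vision_feats.to(0)
scores = lang_hidden @ vision_feats.transpose(-1, -2)      # (1, 10, 64) attention scores
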