anas-awadalla committed on
Commit
15ed10c
·
1 Parent(s): 5842ec8

added files

Browse files
app.py CHANGED
@@ -61,7 +61,7 @@ model, image_processor, tokenizer = create_model_and_transforms(
61
 
62
  checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
63
  model.load_state_dict(torch.load(checkpoint_path), strict=False)
64
- model.eval().to("cuda")
65
 
66
  def generate(
67
  idx,
@@ -152,7 +152,7 @@ def generate(
152
 
153
  # with torch.cuda.amp.autocast(dtype=torch.bfloat16):
154
  output = model.generate(
155
- vision_x=vision_x.to("cuda"),
156
  lang_x=input_ids.to("cuda"),
157
  attention_mask=attention_mask.to("cuda"),
158
  max_new_tokens=30,
 
61
 
62
  checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
63
  model.load_state_dict(torch.load(checkpoint_path), strict=False)
64
+ model.eval()
65
 
66
  def generate(
67
  idx,
 
152
 
153
  # with torch.cuda.amp.autocast(dtype=torch.bfloat16):
154
  output = model.generate(
155
+ vision_x=vision_x,
156
  lang_x=input_ids.to("cuda"),
157
  attention_mask=attention_mask.to("cuda"),
158
  max_new_tokens=30,
open_flamingo/open_flamingo/src/factory.py CHANGED
@@ -79,6 +79,7 @@ def create_model_and_transforms(
79
  decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
80
  lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
81
  lang_encoder.resize_token_embeddings(len(text_tokenizer))
 
82
 
83
  model = Flamingo(
84
  vision_encoder,
 
79
  decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
80
  lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
81
  lang_encoder.resize_token_embeddings(len(text_tokenizer))
82
+ lang_encoder.to(0)
83
 
84
  model = Flamingo(
85
  vision_encoder,
open_flamingo/open_flamingo/src/flamingo.py CHANGED
@@ -212,7 +212,7 @@ class Flamingo(nn.Module):
212
  with torch.no_grad():
213
  vision_x = self.vision_encoder(vision_x)[1]
214
  vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
215
- vision_x = self.perceiver(vision_x)
216
 
217
  for layer in self.lang_encoder._get_decoder_layers():
218
  layer.condition_vis_x(vision_x)
 
212
  with torch.no_grad():
213
  vision_x = self.vision_encoder(vision_x)[1]
214
  vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
215
+ vision_x = self.perceiver(vision_x).to(0)
216
 
217
  for layer in self.lang_encoder._get_decoder_layers():
218
  layer.condition_vis_x(vision_x)