ryaalbr committed
Commit eb1df9f (1 parent: 2d9d672)

Update app.py

Files changed (1): app.py (+4, -3)
app.py CHANGED
@@ -40,6 +40,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
+clip_model = clip_model.to(device)
 #orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
 
 
@@ -58,7 +59,7 @@ height = 256 # height for resizing images
 
 def predict(image, labels):
     with torch.no_grad():
-        inputs = clip_processor(text=[f"a photo of {c}" for c in labels], images=image, return_tensors="pt", padding=True)
+        inputs = clip_processor(text=[f"a photo of {c}" for c in labels], images=image, return_tensors="pt", padding=True).to(device)
         outputs = clip_model(**inputs)
         logits_per_image = outputs.logits_per_image # this is the image-text similarity score
         probs = logits_per_image.softmax(dim=1).cpu().numpy() # we can take the softmax to get the label probabilities
@@ -144,7 +145,7 @@ def get_caption(img,model_name):
     model = model.eval()
     model = model.to(device)
 
-    clip_model = clip_model.to(device)
+
 
     input = clip_processor(images=img, return_tensors="pt").to(device)
     with torch.no_grad():
@@ -181,7 +182,7 @@ def search(search_query):
     with torch.no_grad():
 
         # Encode and normalize the description using CLIP (HF CLIP)
-        inputs = clip_processor(text=search_query, images=None, return_tensors="pt", padding=True)
+        inputs = clip_processor(text=search_query, images=None, return_tensors="pt", padding=True).to(device)
         text_encoded = clip_model.get_text_features(**inputs)
 
         # # Encode and normalize the description using CLIP (original CLIP)