Transformers
PyTorch
English
bridgetower
Inference Endpoints
anahita-b committed on
Commit accc168
1 Parent(s): 8786058

Update examples in README.md

Files changed (1)
  1. README.md +35 -23
README.md CHANGED
@@ -28,42 +28,54 @@ You can use the raw model for image and text retrieval.
 
  ### How to use
 
- Here is how to use this model to get the features of a given text in PyTorch:
 
  ```python
- import os
- from PIL import Image
- from glob import glob
- from tqdm import tqdm
- import torch
  from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
 
- image_dir = "/datasets/COCO2017/val2017"
- search_text = "a woman holding an umbrella"
 
- processor = BridgeTowerProcessor.from_pretrained(("BridgeTower/bridgetower-base-itm-mlm"))
  model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
 
- max_score = float('-inf')
- best_match_image = None
- image_paths = glob(os.path.join(image_dir, '*.jpg'))[:1000]
 
- for image_path in tqdm(image_paths, smoothing=1):
-     image = Image.open(image_path).convert("RGB")
-     inputs = processor(image, search_text, return_tensors="pt")
-     inputs = dict((k,v.to(device)) if isinstance(v, torch.Tensor) else (k,v) for k,v in inputs.items())
-     outputs = model(**inputs)
-     score = outputs.logits[0,1].item()
-     if score > max_score:
-         max_score = score
-         best_match_image = image_path
 
- print(max_score)
- print(best_match_image)
  ```
 
  ### Limitations and bias
 
  TODO
 
 
  ### How to use
 
+ Here is how to use this model to perform image and text matching:
+
  ```python
  from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
+ import requests
+ from PIL import Image
 
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+ texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]
 
+ processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
  model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
 
+ # score each candidate text against the image
+ scores = dict()
+ for text in texts:
+     # prepare image-text pair inputs
+     encoding = processor(image, text, return_tensors="pt")
+     # forward pass
+     outputs = model(**encoding)
+     # index 1 of the ITM head's logits is the match score
+     scores[text] = outputs.logits[0, 1].item()
+ ```
 
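The ITM logits collected above are unbounded match scores, not probabilities. A minimal follow-on sketch (assuming the `scores` dict from the example above is in scope) that ranks the candidates and applies a softmax for relative weights:

```python
import torch

# Assumes `scores` from the matching example above: {text: match logit}.
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
print("best match:", ranked[0][0])

# A softmax across the candidates gives relative weights for this image;
# the raw ITM logits themselves are not probabilities.
probs = torch.softmax(torch.tensor([s for _, s in ranked]), dim=0)
for (text, _), p in zip(ranked, probs.tolist()):
    print(f"{p:.3f}  {text}")
```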
+ Here is how to use this model to perform masked language modeling:
 
+ ```python
+ from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
+ import requests
+ from PIL import Image
+
+ url = "http://images.cocodataset.org/val2017/000000360943.jpg"
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+ text = "a <mask> looking out of the window"
 
+ processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
+ model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
 
+ # prepare image-text inputs
+ encoding = processor(image, text, return_tensors="pt")
 
+ # forward pass
+ outputs = model(**encoding)
 
+ # decode the highest-scoring token at every position
+ results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())
+
+ print(results)
+ # a cat looking out of the window.
  ```
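As an alternative to decoding the argmax at every position, a minimal follow-on sketch (assuming `processor`, `encoding`, and `outputs` from the example above are in scope) that inspects only the top candidate tokens at the `<mask>` position:

```python
import torch

# Assumes `processor`, `encoding`, and `outputs` from the MLM example above.
# Locate the <mask> token in the tokenized input.
mask_pos = (encoding.input_ids[0] == processor.tokenizer.mask_token_id).nonzero(as_tuple=True)[0][0]

# Top-5 candidate fillers for the masked slot, highest logit first.
top5 = torch.topk(outputs.logits[0, mask_pos], k=5)
for token_id, score in zip(top5.indices.tolist(), top5.values.tolist()):
    print(f"{score:7.2f}  {processor.tokenizer.decode([token_id]).strip()}")
```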
+
  ### Limitations and bias
 
  TODO