Port ImageBind API
Browse files- .gitattributes +1 -0
- __pycache__/data.cpython-311.pyc +0 -0
- app.py +57 -0
- bpe/.DS_Store +0 -0
- bpe/bpe_simple_vocab_16e6.txt +0 -0
- bpe/bpe_simple_vocab_16e6.txt.gz +3 -0
- data.py +338 -0
- images/captured_image.jpg +3 -0
- models/__init__.py +0 -0
- models/__pycache__/__init__.cpython-311.pyc +0 -0
- models/__pycache__/__init__.cpython-38.pyc +0 -0
- models/__pycache__/helpers.cpython-311.pyc +0 -0
- models/__pycache__/helpers.cpython-38.pyc +0 -0
- models/__pycache__/imagebind_model.cpython-311.pyc +0 -0
- models/__pycache__/imagebind_model.cpython-38.pyc +0 -0
- models/__pycache__/multimodal_preprocessors.cpython-311.pyc +0 -0
- models/__pycache__/multimodal_preprocessors.cpython-38.pyc +0 -0
- models/__pycache__/transformer.cpython-311.pyc +0 -0
- models/__pycache__/transformer.cpython-38.pyc +0 -0
- models/helpers.py +140 -0
- models/imagebind_model.py +506 -0
- models/multimodal_preprocessors.py +685 -0
- models/transformer.py +280 -0
- output.txt +49 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
images/captured_image.jpg filter=lfs diff=lfs merge=lfs -text
|
__pycache__/data.cpython-311.pyc
ADDED
Binary file (15 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import cv2
|
3 |
+
import torch
|
4 |
+
import data
|
5 |
+
from models import imagebind_model
|
6 |
+
from models.imagebind_model import ModalityType
|
7 |
+
|
8 |
+
def read_dict_from_file(filename):
    """Parse a ``key: value`` text file into a lookup table and a key list.

    Blank lines are ignored. Every other line is split on its first colon
    only, so a value may itself contain colons.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(dictionary, text_list)`` tuple. ``dictionary`` maps each
        whitespace-stripped key to its whitespace-stripped value.
        ``text_list`` holds the same stripped keys in file order, one entry
        per non-blank line (duplicates are kept).

    Raises:
        ValueError: If a non-blank line has no ``:`` separator.
    """
    text_list = []
    dictionary = {}
    with open(filename, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                continue
            key, separator, value = line.partition(':')
            if not separator:
                raise ValueError(
                    f"{filename}: line {line_number} has no ':' separator: {line!r}"
                )
            # Use the same stripped key for both containers so every entry of
            # text_list can be looked up in the dictionary.
            key = key.strip()
            dictionary[key] = value.strip()
            text_list.append(key)
    return dictionary, text_list
|
19 |
+
|
20 |
+
# Label file parsed at import time: `database` maps label -> description and
# `text_list` is the ordered list of labels scored against each image.
text_output_path = 'output.txt'
database, text_list = read_dict_from_file(text_output_path)

# Single on-disk slot that each uploaded image is written to before inference.
image_paths = ['images/captured_image.jpg']

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Instantiate the pretrained model once, in inference mode, on the chosen device.
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)
|
30 |
+
|
31 |
+
def run_model():
    """Return the entry of ``text_list`` that best matches the saved image.

    Embeds every label in the module-level ``text_list`` and every image in
    ``image_paths`` with the module-level ``model``, scores each image against
    each label, and returns the top-scoring label for the first image.
    """
    text_tokens = data.load_and_transform_text(text_list, device)
    image_batch = data.load_and_transform_vision_data(image_paths, device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        embeddings = model(
            {
                ModalityType.TEXT: text_tokens,
                ModalityType.VISION: image_batch,
            }
        )

    vision_embeddings = embeddings[ModalityType.VISION]
    text_embeddings = embeddings[ModalityType.TEXT]

    # Rows index images, columns index labels.
    similarity = vision_embeddings @ text_embeddings.T
    label_probabilities = torch.softmax(similarity, dim=-1)
    best_label_per_image = torch.argmax(label_probabilities, dim=1)

    return text_list[best_label_per_image[0]]
|
45 |
+
|
46 |
+
def predict(image):
    """Save the uploaded image to disk and return its best-matching label.

    Args:
        image: The uploaded picture. It is converted with
            ``cv2.COLOR_RGB2BGR`` before saving, so it is presumably an RGB
            array as delivered by the Gradio image input (confirm against the
            interface definition).

    Returns:
        The label returned by ``run_model()`` for the saved image.

    Raises:
        IOError: If OpenCV reports that the image could not be written.
    """
    bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    # cv2.imwrite signals failure through its return value rather than by
    # raising; ignoring it would classify whatever stale file is already on disk.
    if not cv2.imwrite(image_paths[0], bgr_image):
        raise IOError(f"Could not write the uploaded image to {image_paths[0]!r}")
    return run_model()
|
50 |
+
|
51 |
+
# Gradio UI: one image input routed through `predict`, one label output.
# The components are taken from the top-level `gr` namespace because the
# `gr.inputs` / `gr.outputs` namespaces are deprecated and absent from newer
# Gradio releases.
demo = iface = gr.Interface(
    fn=predict,
    inputs=gr.Image(),
    outputs=gr.Label(),
)

demo.launch()
|
bpe/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
bpe/bpe_simple_vocab_16e6.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
bpe/bpe_simple_vocab_16e6.txt.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
|
3 |
+
size 1356917
|
data.py
ADDED
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
import logging
|
9 |
+
import math
|
10 |
+
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
import torchaudio
|
14 |
+
from PIL import Image
|
15 |
+
from pytorchvideo import transforms as pv_transforms
|
16 |
+
from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler
|
17 |
+
from pytorchvideo.data.encoded_video import EncodedVideo
|
18 |
+
from torchvision import transforms
|
19 |
+
from torchvision.transforms._transforms_video import NormalizeVideo
|
20 |
+
|
21 |
+
from models.multimodal_preprocessors import SimpleTokenizer
|
22 |
+
|
23 |
+
DEFAULT_AUDIO_FRAME_SHIFT_MS = 10 # in milliseconds
|
24 |
+
|
25 |
+
BPE_PATH = "bpe/bpe_simple_vocab_16e6.txt.gz"
|
26 |
+
|
27 |
+
|
28 |
+
def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
    """Convert a waveform into a fixed-length Kaldi-style mel filterbank map.

    Args:
        waveform: Audio tensor passed to ``torchaudio.compliance.kaldi.fbank``.
            It is not modified.
        sample_rate: Sampling frequency of ``waveform``.
        num_mel_bins: Number of mel bins to compute.
        target_length: Number of frames in the output; shorter inputs are
            zero-padded at the end and longer ones are truncated.

    Returns:
        A tensor of shape ``[1, num_mel_bins, target_length]``.
    """
    # Based on https://github.com/YuanGongND/ast/blob/d7d8b4b8e06cdaeb6c843cdb38794c1c7692234c/src/dataloader.py#L102
    # Remove the mean out of place: callers pass slice views of a longer
    # waveform, and an in-place subtraction would also shift the samples that
    # other (possibly overlapping) clips are later cut from.
    waveform = waveform - waveform.mean()
    fbank = torchaudio.compliance.kaldi.fbank(
        waveform,
        htk_compat=True,
        sample_frequency=sample_rate,
        use_energy=False,
        window_type="hanning",
        num_mel_bins=num_mel_bins,
        dither=0.0,
        frame_length=25,
        frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS,
    )
    # Convert to [mel_bins, num_frames] shape
    fbank = fbank.transpose(0, 1)
    # Pad to target_length
    n_frames = fbank.size(1)
    p = target_length - n_frames
    # if p is too large (say >20%), flash a warning
    if abs(p) / n_frames > 0.2:
        logging.warning(
            "Large gap between audio n_frames(%d) and "
            "target_length (%d). Is the audio_target_length "
            "setting correct?",
            n_frames,
            target_length,
        )
    # cut and pad
    if p > 0:
        fbank = torch.nn.functional.pad(fbank, (0, p), mode="constant", value=0)
    elif p < 0:
        fbank = fbank[:, 0:target_length]
    # Convert to [1, mel_bins, num_frames] shape, essentially like a 1
    # channel image
    fbank = fbank.unsqueeze(0)
    return fbank
|
65 |
+
|
66 |
+
|
67 |
+
def get_clip_timepoints(clip_sampler, duration):
    """Collect the ``(start, end)`` pair of every clip the sampler yields.

    The sampler is called repeatedly, each time with the end of the previous
    clip (``0.0`` for the first call), until it flags the last clip.

    Args:
        clip_sampler: Callable invoked as
            ``clip_sampler(last_end, duration, annotation=None)`` that returns
            a 5-tuple whose first, second and fifth items are the clip start,
            the clip end and the "is last clip" flag.
        duration: Total duration forwarded unchanged to the sampler.

    Returns:
        A list of ``(start, end)`` tuples in sampling order (never empty).
    """
    timepoints = []
    previous_end = 0.0
    while True:
        clip_start, previous_end, _, _, finished = clip_sampler(
            previous_end, duration, annotation=None
        )
        timepoints.append((clip_start, previous_end))
        if finished:
            break
    return timepoints
|
76 |
+
|
77 |
+
|
78 |
+
def load_and_transform_vision_data(image_paths, device):
    """Load image files and turn them into one normalized batch tensor.

    Each image is opened as RGB, resized to 224 with bicubic interpolation,
    center-cropped to 224, converted to a tensor, normalized with fixed
    per-channel mean/std values and moved to ``device``.

    Args:
        image_paths: Iterable of image file paths, or ``None``.
        device: Device the image tensors are moved to.

    Returns:
        The images stacked along a new leading dimension, or ``None`` when
        ``image_paths`` is ``None``.
    """
    if image_paths is None:
        return None

    # Built once up front: the pipeline is identical for every image.
    data_transform = transforms.Compose(
        [
            transforms.Resize(
                224, interpolation=transforms.InterpolationMode.BICUBIC
            ),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )

    image_outputs = []
    for image_path in image_paths:
        with open(image_path, "rb") as fopen:
            image = Image.open(fopen).convert("RGB")

        image_outputs.append(data_transform(image).to(device))
    return torch.stack(image_outputs, dim=0)
|
103 |
+
|
104 |
+
|
105 |
+
# Tokenizers already built, keyed by the BPE vocabulary path they were loaded from.
_TOKENIZER_CACHE = {}


def _get_tokenizer(bpe_path):
    """Return the tokenizer for *bpe_path*, constructing it only on first use."""
    tokenizer = _TOKENIZER_CACHE.get(bpe_path)
    if tokenizer is None:
        tokenizer = SimpleTokenizer(bpe_path=bpe_path)
        _TOKENIZER_CACHE[bpe_path] = tokenizer
    return tokenizer


def load_and_transform_text(text, device):
    """Tokenize a list of strings into one batch tensor on ``device``.

    Args:
        text: Iterable of strings to tokenize, or ``None``.
        device: Device the token tensors are moved to.

    Returns:
        The per-string token tensors concatenated along dimension 0 (one row
        per string), or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None
    # Reuse one tokenizer across calls instead of re-reading the BPE
    # vocabulary file every time this function runs.
    tokenizer = _get_tokenizer(BPE_PATH)
    tokens = [tokenizer(t).unsqueeze(0).to(device) for t in text]
    return torch.cat(tokens, dim=0)
|
112 |
+
|
113 |
+
|
114 |
+
def load_and_transform_audio_data(
    audio_paths,
    device,
    num_mel_bins=128,
    target_length=204,
    sample_rate=16000,
    clip_duration=2,
    clips_per_video=3,
    mean=-4.268,
    std=9.138,
):
    """Load audio files and convert them into normalized mel-spectrogram clips.

    Every file is resampled to ``sample_rate`` when needed, split into clips
    chosen by a ``ConstantClipsPerVideoSampler``, and each clip is converted
    with ``waveform2melspec``, normalized with ``mean``/``std`` and moved to
    ``device``.

    Args:
        audio_paths: Iterable of audio file paths, or ``None``.
        device: Device the clip tensors are moved to.
        num_mel_bins: Mel bin count forwarded to ``waveform2melspec``.
        target_length: Frame count forwarded to ``waveform2melspec``.
        sample_rate: Sampling rate every waveform is brought to.
        clip_duration: Clip duration given to the clip sampler.
        clips_per_video: Number of clips the sampler draws per file.
        mean: Mean used to normalize each mel spectrogram.
        std: Standard deviation used to normalize each mel spectrogram.

    Returns:
        One stacked tensor whose first dimension indexes files and whose
        second indexes clips, or ``None`` when ``audio_paths`` is ``None``.
    """
    if audio_paths is None:
        return None

    clip_sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )

    audio_outputs = []
    for audio_path in audio_paths:
        waveform, sr = torchaudio.load(audio_path)
        if sr != sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=sr, new_freq=sample_rate
            )

        duration_seconds = waveform.size(1) / sample_rate
        normalize = transforms.Normalize(mean=mean, std=std)

        clip_melspecs = []
        for clip_start, clip_end in get_clip_timepoints(clip_sampler, duration_seconds):
            # Convert the clip boundaries from seconds to sample indices.
            first_sample = int(clip_start * sample_rate)
            last_sample = int(clip_end * sample_rate)
            melspec = waveform2melspec(
                waveform[:, first_sample:last_sample],
                sample_rate,
                num_mel_bins,
                target_length,
            )
            clip_melspecs.append(normalize(melspec).to(device))

        audio_outputs.append(torch.stack(clip_melspecs, dim=0))

    return torch.stack(audio_outputs, dim=0)
|
162 |
+
|
163 |
+
|
164 |
+
def crop_boxes(boxes, x_offset, y_offset):
    """
    Shift bounding boxes into the coordinate frame of a crop.

    Args:
        boxes (ndarray): bounding boxes of dimension `num boxes` x 4. Columns
            0 and 2 are shifted by the x offset, columns 1 and 3 by the
            y offset. The input is left untouched.
        x_offset (int): cropping offset in the x axis.
        y_offset (int): cropping offset in the y axis.
    Returns:
        cropped_boxes (ndarray): a copy of `boxes` with the offsets
            subtracted, dimension `num boxes` x 4.
    """
    shifted = boxes.copy()
    for columns, offset in (([0, 2], x_offset), ([1, 3], y_offset)):
        shifted[:, columns] = boxes[:, columns] - offset
    return shifted
|
181 |
+
|
182 |
+
|
183 |
+
def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
    """
    Take one of three evenly placed square crops from the images (and boxes).

    Args:
        images (tensor): images to crop, of dimension
            `num frames` x `channel` x `height` x `width`. A 3-dimensional
            input is treated as a single frame and returned 3-dimensional.
        size (int): height and width of the square crop.
        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if
            width is larger than or equal to height; 0, 1, or 2 for top,
            center, and bottom crop if height is larger than width.
        boxes (ndarray or None): optional. Boxes that belong to the images,
            of dimension `num boxes` x 4.
        scale_size (int): optional. If not None, the images are resized so
            that their shorter side equals scale_size before cropping.
    Returns:
        cropped (tensor): images of dimension
            `num frames` x `channel` x `size` x `size`.
        cropped_boxes (ndarray or None): the boxes shifted by the crop
            offsets, of dimension `num boxes` x 4, or None without boxes.
    """
    assert spatial_idx in [0, 1, 2]
    single_frame = len(images.shape) == 3
    if single_frame:
        images = images.unsqueeze(0)
    height, width = images.shape[2], images.shape[3]

    if scale_size is not None:
        # Scale the shorter side to scale_size, keeping the aspect ratio.
        if width <= height:
            width, height = scale_size, int(height / width * scale_size)
        else:
            width, height = int(width / height * scale_size), scale_size
        images = torch.nn.functional.interpolate(
            images,
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )

    # Start from the centered crop, then move along the longer axis.
    y_offset = int(math.ceil((height - size) / 2))
    x_offset = int(math.ceil((width - size) / 2))

    if spatial_idx != 1:
        at_start = spatial_idx == 0
        if height > width:
            y_offset = 0 if at_start else height - size
        else:
            x_offset = 0 if at_start else width - size

    cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
    if boxes is not None:
        cropped_boxes = crop_boxes(boxes, x_offset, y_offset)
    else:
        cropped_boxes = None
    if single_frame:
        cropped = cropped.squeeze(0)
    return cropped, cropped_boxes
|
240 |
+
|
241 |
+
|
242 |
+
class SpatialCrop(nn.Module):
    """
    Split each video into spatial crops (three by default: the crops selected
    by spatial indices 0, 1 and 2 of `uniform_crop`). Meant to run after the
    temporal crops, on full frames, and returns a longer list holding every
    spatial crop of every input video.
    """

    def __init__(self, crop_size: int = 224, num_crops: int = 3):
        """
        Args:
            crop_size: side length of each square crop.
            num_crops: 3 for all three crops, 1 for the center crop only.
        Raises:
            NotImplementedError: for any other value of `num_crops`.
        """
        super().__init__()
        self.crop_size = crop_size
        if num_crops not in (1, 3):
            raise NotImplementedError("Nothing else supported yet")
        self.crops_to_ext = [0, 1, 2] if num_crops == 3 else [1]
        # No flipped crops are produced by either supported configuration.
        self.flipped_crops_to_ext = []

    def forward(self, videos):
        """
        Args:
            videos: A list of C, T, H, W videos.
        Returns:
            videos: A list with one entry per (video, crop) pair, ordered by
                video and then by crop. Each entry is the video cropped to
                C, T, H', W'.
        """
        assert isinstance(videos, list), "Must be a list of videos after temporal crops"
        assert all(video.ndim == 4 for video in videos), "Must be (C,T,H,W)"
        crops = []
        for video in videos:
            crops.extend(
                uniform_crop(video, self.crop_size, spatial_idx)[0]
                for spatial_idx in self.crops_to_ext
            )
            if self.flipped_crops_to_ext:
                flipped_video = transforms.functional.hflip(video)
                crops.extend(
                    uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]
                    for spatial_idx in self.flipped_crops_to_ext
                )
        return crops
|
283 |
+
|
284 |
+
|
285 |
+
def load_and_transform_video_data(
    video_paths,
    device,
    clip_duration=2,
    clips_per_video=5,
    sample_rate=16000,
):
    """Load video files and convert them into normalized, spatially cropped clips.

    Every video is split into clips chosen by a
    ``ConstantClipsPerVideoSampler``; each clip is temporally subsampled,
    scaled to the 0-1 range, short-side scaled to 224, normalized with fixed
    per-channel mean/std values and finally expanded into three spatial crops.

    Args:
        video_paths: Iterable of video file paths, or ``None``.
        device: Device the final stacked tensor is moved to.
        clip_duration: Clip duration given to the clip sampler. The same value
            is also used as the number of frames kept per clip.
        clips_per_video: Number of clips the sampler draws per video.
        sample_rate: Forwarded to ``EncodedVideo.from_path`` as ``sample_rate``.

    Returns:
        One stacked tensor whose first dimension indexes videos, or ``None``
        when ``video_paths`` is ``None``.

    Raises:
        ValueError: If a sampled clip cannot be read from a video.
    """
    if video_paths is None:
        return None

    video_transform = transforms.Compose(
        [
            pv_transforms.ShortSideScale(224),
            NormalizeVideo(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )
    clip_sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )
    # NOTE(review): the frame count per clip is tied to clip_duration here;
    # confirm that this coupling is intended before changing either value.
    frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=clip_duration)
    spatial_crop = SpatialCrop(224, num_crops=3)

    video_outputs = []
    for video_path in video_paths:
        video = EncodedVideo.from_path(
            video_path,
            decoder="decord",
            decode_audio=False,
            sample_rate=sample_rate,
        )

        transformed_clips = []
        for clip_start, clip_end in get_clip_timepoints(clip_sampler, video.duration):
            # Read the clip, get frames
            clip = video.get_clip(clip_start, clip_end)
            if clip is None:
                raise ValueError("No clip found")
            # since this is float, need 0-1
            frames = frame_sampler(clip["video"]) / 255.0
            transformed_clips.append(video_transform(frames))

        cropped_clips = spatial_crop(transformed_clips)
        video_outputs.append(torch.stack(cropped_clips, dim=0))

    return torch.stack(video_outputs, dim=0).to(device)
|
images/captured_image.jpg
ADDED
Git LFS Details
|
models/__init__.py
ADDED
File without changes
|
models/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (197 Bytes). View file
|
|
models/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (164 Bytes). View file
|
|
models/__pycache__/helpers.cpython-311.pyc
ADDED
Binary file (8.84 kB). View file
|
|
models/__pycache__/helpers.cpython-38.pyc
ADDED
Binary file (5.2 kB). View file
|
|
models/__pycache__/imagebind_model.cpython-311.pyc
ADDED
Binary file (14.7 kB). View file
|
|
models/__pycache__/imagebind_model.cpython-38.pyc
ADDED
Binary file (8.11 kB). View file
|
|
models/__pycache__/multimodal_preprocessors.cpython-311.pyc
ADDED
Binary file (38.2 kB). View file
|
|
models/__pycache__/multimodal_preprocessors.cpython-38.pyc
ADDED
Binary file (20 kB). View file
|
|
models/__pycache__/transformer.cpython-311.pyc
ADDED
Binary file (14.8 kB). View file
|
|
models/__pycache__/transformer.cpython-38.pyc
ADDED
Binary file (7.92 kB). View file
|
|
models/helpers.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
|
9 |
+
import einops
|
10 |
+
import numpy as np
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
|
14 |
+
|
15 |
+
class Normalize(nn.Module):
    """Module that L2-normalizes its input along a fixed dimension."""

    def __init__(self, dim: int) -> None:
        """Store the dimension along which inputs are normalized."""
        super().__init__()
        self.dim = dim

    def forward(self, x):
        """Return ``x`` scaled to unit L2 norm along ``self.dim``."""
        return nn.functional.normalize(x, p=2, dim=self.dim)
|
22 |
+
|
23 |
+
|
24 |
+
class LearnableLogitScaling(nn.Module):
    """Multiply inputs by a clipped scale that is stored in log space.

    The log of the scale is kept as a trainable parameter when ``learnable``
    is true and as a buffer otherwise.
    """

    def __init__(
        self,
        logit_scale_init: float = 1 / 0.07,
        learnable: bool = True,
        max_logit_scale: float = 100,
    ) -> None:
        """
        Args:
            logit_scale_init: Initial value of the scale (before clipping).
            learnable: Whether the log-scale is registered as a parameter.
            max_logit_scale: Upper bound applied to the scale in ``forward``.
        """
        super().__init__()
        self.max_logit_scale = max_logit_scale
        self.logit_scale_init = logit_scale_init
        self.learnable = learnable
        initial_log_scale = torch.ones([]) * np.log(self.logit_scale_init)
        if not learnable:
            self.register_buffer("log_logit_scale", initial_log_scale)
        else:
            self.log_logit_scale = nn.Parameter(initial_log_scale)

    def forward(self, x):
        """Return ``x`` multiplied by the scale, capped at ``max_logit_scale``."""
        scale = self.log_logit_scale.exp()
        return torch.clip(scale, max=self.max_logit_scale) * x

    def extra_repr(self):
        """Describe the configuration for the module's printed representation."""
        return (
            f"logit_scale_init={self.logit_scale_init},learnable={self.learnable},"
            f" max_logit_scale={self.max_logit_scale}"
        )
|
48 |
+
|
49 |
+
|
50 |
+
class EinOpsRearrange(nn.Module):
    """Module wrapper that applies a fixed ``einops.rearrange`` expression."""

    def __init__(self, rearrange_expr: str, **kwargs) -> None:
        """
        Args:
            rearrange_expr: Pattern string passed to ``einops.rearrange``.
            **kwargs: Extra keyword arguments forwarded to ``einops.rearrange``
                on every call.
        """
        super().__init__()
        self.rearrange_expr = rearrange_expr
        self.kwargs = kwargs

    def forward(self, x):
        """Rearrange the tensor ``x`` according to the stored expression."""
        assert isinstance(x, torch.Tensor)
        rearranged = einops.rearrange(x, self.rearrange_expr, **self.kwargs)
        return rearranged
|
59 |
+
|
60 |
+
|
61 |
+
class VerboseNNModule(nn.Module):
    """
    nn.Module base class whose printed representation also lists the
    parameters and buffers registered directly on the module.
    """

    @staticmethod
    def get_readable_tensor_repr(name: str, tensor: torch.Tensor) -> str:
        """Format one ``(name): tensor(shape, requires_grad=...)`` line.

        NOTE: ``extra_repr`` passes a ``(qualified_name, tensor)`` pair as
        ``tensor`` (an item of ``named_parameters()`` / ``named_buffers()``),
        and the actual tensor is read from index 1 of that pair.
        """
        entry = tensor[1]
        return f"({name}): tensor({tuple(entry.shape)}, requires_grad={entry.requires_grad})\n"

    def extra_repr(self) -> str:
        """Return one line per parameter not owned by a submodule, then one per buffer."""
        submodule_names = {module_name for module_name, _ in self.named_modules()}

        # Parameters whose leading name component is a submodule are already
        # shown by that submodule's own representation, so they are skipped.
        lines = [
            self.get_readable_tensor_repr(item[0].split(".")[0], item)
            for item in self.named_parameters()
            if item[0].split(".")[0] not in submodule_names
        ]
        lines.extend(
            self.get_readable_tensor_repr(item[0].split(".")[0], item)
            for item in self.named_buffers()
        )
        return "".join(lines)
|
97 |
+
|
98 |
+
|
99 |
+
def cast_if_src_dtype(
    tensor: torch.Tensor, src_dtype: torch.dtype, tgt_dtype: torch.dtype
):
    """Cast ``tensor`` to ``tgt_dtype`` only when its dtype is ``src_dtype``.

    Returns:
        A ``(tensor, updated)`` tuple: the possibly cast tensor and a flag
        telling whether a cast was performed.
    """
    if tensor.dtype != src_dtype:
        return tensor, False
    return tensor.to(dtype=tgt_dtype), True
|
107 |
+
|
108 |
+
|
109 |
+
class QuickGELU(nn.Module):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    # From https://github.com/openai/CLIP/blob/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1/clip/model.py#L166
    def forward(self, x: torch.Tensor):
        """Apply the activation element-wise to ``x``."""
        gate = (1.702 * x).sigmoid()
        return x * gate
|
113 |
+
|
114 |
+
|
115 |
+
class SelectElement(nn.Module):
    """Pick a fixed index along dimension 1 of the input."""

    def __init__(self, index) -> None:
        """Store the index that ``forward`` selects along dimension 1."""
        super().__init__()
        self.index = index

    def forward(self, x):
        """Return ``x[:, index, ...]``; ``x`` must have at least 3 dimensions."""
        assert x.ndim >= 3
        selected = x[:, self.index, ...]
        return selected
|
123 |
+
|
124 |
+
|
125 |
+
class SelectEOSAndProject(nn.Module):
    """
    Text Pooling used in OpenCLIP: pick one position per sequence, then project it.
    """

    def __init__(self, proj: nn.Module) -> None:
        """Store the projection module applied to the selected features."""
        super().__init__()
        self.proj = proj

    def forward(self, x, seq_len):
        """
        Args:
            x: features of shape B x L x D.
            seq_len: per-sequence position to select along the L dimension.
        Returns:
            The projection of the selected B x D features.
        """
        assert x.ndim == 3
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        batch_index = torch.arange(x.shape[0])
        pooled = x[batch_index, seq_len]
        return self.proj(pooled)
|
models/imagebind_model.py
ADDED
@@ -0,0 +1,506 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
|
9 |
+
import os
|
10 |
+
from functools import partial
|
11 |
+
from types import SimpleNamespace
|
12 |
+
|
13 |
+
import torch
|
14 |
+
import torch.nn as nn
|
15 |
+
|
16 |
+
from models.helpers import (EinOpsRearrange, LearnableLogitScaling, Normalize,
|
17 |
+
SelectElement, SelectEOSAndProject)
|
18 |
+
from models.multimodal_preprocessors import (AudioPreprocessor,
|
19 |
+
IMUPreprocessor, PadIm2Video,
|
20 |
+
PatchEmbedGeneric,
|
21 |
+
RGBDTPreprocessor,
|
22 |
+
SpatioTemporalPosEmbeddingHelper,
|
23 |
+
TextPreprocessor,
|
24 |
+
ThermalPreprocessor)
|
25 |
+
from models.transformer import MultiheadAttention, SimpleTransformer
|
26 |
+
|
27 |
+
# Names of the supported modalities; the values are used as dictionary keys
# for model inputs and outputs.
ModalityType = SimpleNamespace(
    **{
        "VISION": "vision",
        "TEXT": "text",
        "AUDIO": "audio",
        "THERMAL": "thermal",
        "DEPTH": "depth",
        "IMU": "imu",
    }
)
|
35 |
+
|
36 |
+
|
37 |
+
class ImageBindModel(nn.Module):
|
38 |
+
def __init__(
|
39 |
+
self,
|
40 |
+
video_frames=2,
|
41 |
+
kernel_size=(2, 14, 14),
|
42 |
+
audio_kernel_size=16,
|
43 |
+
audio_stride=10,
|
44 |
+
out_embed_dim=768,
|
45 |
+
vision_embed_dim=1024,
|
46 |
+
vision_num_blocks=24,
|
47 |
+
vision_num_heads=16,
|
48 |
+
audio_embed_dim=768,
|
49 |
+
audio_num_blocks=12,
|
50 |
+
audio_num_heads=12,
|
51 |
+
audio_num_mel_bins=128,
|
52 |
+
audio_target_len=204,
|
53 |
+
audio_drop_path=0.1,
|
54 |
+
text_embed_dim=768,
|
55 |
+
text_num_blocks=12,
|
56 |
+
text_num_heads=12,
|
57 |
+
depth_embed_dim=384,
|
58 |
+
depth_kernel_size=16,
|
59 |
+
depth_num_blocks=12,
|
60 |
+
depth_num_heads=8,
|
61 |
+
depth_drop_path=0.0,
|
62 |
+
thermal_embed_dim=768,
|
63 |
+
thermal_kernel_size=16,
|
64 |
+
thermal_num_blocks=12,
|
65 |
+
thermal_num_heads=12,
|
66 |
+
thermal_drop_path=0.0,
|
67 |
+
imu_embed_dim=512,
|
68 |
+
imu_kernel_size=8,
|
69 |
+
imu_num_blocks=6,
|
70 |
+
imu_num_heads=8,
|
71 |
+
imu_drop_path=0.7,
|
72 |
+
):
|
73 |
+
super().__init__()
|
74 |
+
|
75 |
+
self.modality_preprocessors = self._create_modality_preprocessors(
|
76 |
+
video_frames,
|
77 |
+
vision_embed_dim,
|
78 |
+
kernel_size,
|
79 |
+
text_embed_dim,
|
80 |
+
audio_embed_dim,
|
81 |
+
audio_kernel_size,
|
82 |
+
audio_stride,
|
83 |
+
audio_num_mel_bins,
|
84 |
+
audio_target_len,
|
85 |
+
depth_embed_dim,
|
86 |
+
depth_kernel_size,
|
87 |
+
thermal_embed_dim,
|
88 |
+
thermal_kernel_size,
|
89 |
+
imu_embed_dim,
|
90 |
+
)
|
91 |
+
|
92 |
+
self.modality_trunks = self._create_modality_trunks(
|
93 |
+
vision_embed_dim,
|
94 |
+
vision_num_blocks,
|
95 |
+
vision_num_heads,
|
96 |
+
text_embed_dim,
|
97 |
+
text_num_blocks,
|
98 |
+
text_num_heads,
|
99 |
+
audio_embed_dim,
|
100 |
+
audio_num_blocks,
|
101 |
+
audio_num_heads,
|
102 |
+
audio_drop_path,
|
103 |
+
depth_embed_dim,
|
104 |
+
depth_num_blocks,
|
105 |
+
depth_num_heads,
|
106 |
+
depth_drop_path,
|
107 |
+
thermal_embed_dim,
|
108 |
+
thermal_num_blocks,
|
109 |
+
thermal_num_heads,
|
110 |
+
thermal_drop_path,
|
111 |
+
imu_embed_dim,
|
112 |
+
imu_num_blocks,
|
113 |
+
imu_num_heads,
|
114 |
+
imu_drop_path,
|
115 |
+
)
|
116 |
+
|
117 |
+
self.modality_heads = self._create_modality_heads(
|
118 |
+
out_embed_dim,
|
119 |
+
vision_embed_dim,
|
120 |
+
text_embed_dim,
|
121 |
+
audio_embed_dim,
|
122 |
+
depth_embed_dim,
|
123 |
+
thermal_embed_dim,
|
124 |
+
imu_embed_dim,
|
125 |
+
)
|
126 |
+
|
127 |
+
self.modality_postprocessors = self._create_modality_postprocessors(
|
128 |
+
out_embed_dim
|
129 |
+
)
|
130 |
+
|
131 |
+
def _create_modality_preprocessors(
    self,
    video_frames=2,
    vision_embed_dim=1024,
    kernel_size=(2, 14, 14),
    text_embed_dim=768,
    audio_embed_dim=768,
    audio_kernel_size=16,
    audio_stride=10,
    audio_num_mel_bins=128,
    audio_target_len=204,
    depth_embed_dim=768,
    depth_kernel_size=16,
    thermal_embed_dim=768,
    thermal_kernel_size=16,
    imu_embed_dim=512,
):
    """Build the per-modality input preprocessors.

    Returns an ``nn.ModuleDict`` keyed by ``ModalityType`` holding the
    vision, text, audio, depth, thermal and IMU preprocessors. Modules are
    created in that order; keep it that way, since randomly initialised
    layers consume the global RNG as they are constructed.
    """
    # Every non-text modality uses the same learnable positional-embedding factory.
    learnable_pos_embed = partial(SpatioTemporalPosEmbeddingHelper, learnable=True)

    def single_channel_patch_stem(patch_size, width):
        # Non-overlapping (stride == kernel) patch projection of a 1-channel
        # image followed by LayerNorm; shared by depth and thermal.
        conv = nn.Conv2d(
            kernel_size=patch_size,
            in_channels=1,
            out_channels=width,
            stride=patch_size,
            bias=False,
        )
        return PatchEmbedGeneric(
            [conv], norm_layer=nn.LayerNorm(normalized_shape=width)
        )

    # --- vision: images are turned into 2-frame clips, then 3D-patchified ---
    vision_conv = nn.Conv3d(
        in_channels=3,
        kernel_size=kernel_size,
        out_channels=vision_embed_dim,
        stride=kernel_size,
        bias=False,
    )
    rgbt_stem = PatchEmbedGeneric(
        proj_stem=[PadIm2Video(pad_type="repeat", ntimes=2), vision_conv]
    )
    rgbt_preprocessor = RGBDTPreprocessor(
        img_size=[3, video_frames, 224, 224],
        num_cls_tokens=1,
        pos_embed_fn=learnable_pos_embed,
        rgbt_stem=rgbt_stem,
        depth_stem=None,
    )

    # --- text ---
    text_preprocessor = TextPreprocessor(
        context_length=77,
        vocab_size=49408,
        embed_dim=text_embed_dim,
        causal_masking=True,
    )

    # --- audio: the only 2D stem whose stride differs from its kernel ---
    audio_conv = nn.Conv2d(
        in_channels=1,
        kernel_size=audio_kernel_size,
        stride=audio_stride,
        out_channels=audio_embed_dim,
        bias=False,
    )
    audio_stem = PatchEmbedGeneric(
        proj_stem=[audio_conv],
        norm_layer=nn.LayerNorm(normalized_shape=audio_embed_dim),
    )
    audio_preprocessor = AudioPreprocessor(
        img_size=[1, audio_num_mel_bins, audio_target_len],
        num_cls_tokens=1,
        pos_embed_fn=learnable_pos_embed,
        audio_stem=audio_stem,
    )

    # --- depth ---
    depth_stem = single_channel_patch_stem(depth_kernel_size, depth_embed_dim)
    depth_preprocessor = RGBDTPreprocessor(
        img_size=[1, 224, 224],
        num_cls_tokens=1,
        pos_embed_fn=learnable_pos_embed,
        rgbt_stem=None,
        depth_stem=depth_stem,
    )

    # --- thermal ---
    thermal_stem = single_channel_patch_stem(thermal_kernel_size, thermal_embed_dim)
    thermal_preprocessor = ThermalPreprocessor(
        img_size=[1, 224, 224],
        num_cls_tokens=1,
        pos_embed_fn=learnable_pos_embed,
        thermal_stem=thermal_stem,
    )

    # --- IMU ---
    # NOTE(review): in_features=48 presumably equals 6 channels * kernel_size 8;
    # both values are hard-coded here rather than derived - confirm before changing.
    imu_projection = nn.Linear(
        in_features=48,
        out_features=imu_embed_dim,
        bias=False,
    )
    imu_stem = PatchEmbedGeneric(
        [imu_projection],
        norm_layer=nn.LayerNorm(normalized_shape=imu_embed_dim),
    )
    imu_preprocessor = IMUPreprocessor(
        img_size=[6, 2000],
        num_cls_tokens=1,
        kernel_size=8,
        embed_dim=imu_embed_dim,
        pos_embed_fn=learnable_pos_embed,
        imu_stem=imu_stem,
    )

    return nn.ModuleDict(
        {
            ModalityType.VISION: rgbt_preprocessor,
            ModalityType.TEXT: text_preprocessor,
            ModalityType.AUDIO: audio_preprocessor,
            ModalityType.DEPTH: depth_preprocessor,
            ModalityType.THERMAL: thermal_preprocessor,
            ModalityType.IMU: imu_preprocessor,
        }
    )
|
264 |
+
|
265 |
+
def _create_modality_trunks(
    self,
    vision_embed_dim=1024,
    vision_num_blocks=24,
    vision_num_heads=16,
    text_embed_dim=768,
    text_num_blocks=12,
    text_num_heads=12,
    audio_embed_dim=768,
    audio_num_blocks=12,
    audio_num_heads=12,
    audio_drop_path=0.0,
    depth_embed_dim=768,
    depth_num_blocks=12,
    depth_num_heads=12,
    depth_drop_path=0.0,
    thermal_embed_dim=768,
    thermal_num_blocks=12,
    thermal_num_heads=12,
    thermal_drop_path=0.0,
    imu_embed_dim=512,
    imu_num_blocks=6,
    imu_num_heads=8,
    imu_drop_path=0.7,
):
    """Build one ``SimpleTransformer`` trunk per modality.

    Returns an ``nn.ModuleDict`` keyed by ``ModalityType``. Trunks are
    created in the order vision, text, audio, depth, thermal, IMU.
    """

    def build_trunk(width, depth, heads, pre_ln, bias_kv, drop_path_rate):
        # Tokens enter as (b, l, d); the trunk works on (l, b, d) and the
        # result is rearranged back afterwards.
        attention = partial(
            MultiheadAttention,
            embed_dim=width,
            num_heads=heads,
            bias=True,
            add_bias_kv=bias_kv,
        )
        entry_norm = nn.LayerNorm(width, eps=1e-6) if pre_ln else nn.Identity()
        return SimpleTransformer(
            embed_dim=width,
            num_blocks=depth,
            ffn_dropout_rate=0.0,
            drop_path_rate=drop_path_rate,
            attn_target=attention,
            pre_transformer_layer=nn.Sequential(
                entry_norm,
                EinOpsRearrange("b l d -> l b d"),
            ),
            post_transformer_layer=EinOpsRearrange("l b d -> b l d"),
        )

    # (modality, width, blocks, heads, pre-LN?, add_bias_kv?, drop path)
    trunk_specs = (
        (ModalityType.VISION, vision_embed_dim, vision_num_blocks,
         vision_num_heads, True, False, 0.0),
        (ModalityType.TEXT, text_embed_dim, text_num_blocks,
         text_num_heads, False, False, 0.0),
        (ModalityType.AUDIO, audio_embed_dim, audio_num_blocks,
         audio_num_heads, False, True, audio_drop_path),
        (ModalityType.DEPTH, depth_embed_dim, depth_num_blocks,
         depth_num_heads, False, True, depth_drop_path),
        (ModalityType.THERMAL, thermal_embed_dim, thermal_num_blocks,
         thermal_num_heads, False, True, thermal_drop_path),
        (ModalityType.IMU, imu_embed_dim, imu_num_blocks,
         imu_num_heads, False, True, imu_drop_path),
    )

    modality_trunks = {}
    for modality, width, depth, heads, pre_ln, bias_kv, drop_path_rate in trunk_specs:
        modality_trunks[modality] = build_trunk(
            width, depth, heads, pre_ln, bias_kv, drop_path_rate
        )

    return nn.ModuleDict(modality_trunks)
|
365 |
+
|
366 |
+
def _create_modality_heads(
    self,
    out_embed_dim,
    vision_embed_dim,
    text_embed_dim,
    audio_embed_dim,
    depth_embed_dim,
    thermal_embed_dim,
    imu_embed_dim,
):
    """Build the projection head of every modality.

    Each head maps its trunk width to ``out_embed_dim`` with a bias-free
    linear layer preceded by LayerNorm. Returns an ``nn.ModuleDict`` keyed
    by ``ModalityType``.
    """

    def first_element_head(in_dim, dropout_p=None):
        # LayerNorm -> SelectElement(index=0) -> [Dropout] -> Linear.
        # Layer positions inside the Sequential are part of the state-dict
        # key names, so the ordering must not change.
        layers = [
            nn.LayerNorm(normalized_shape=in_dim, eps=1e-6),
            SelectElement(index=0),
        ]
        if dropout_p is not None:
            layers.append(nn.Dropout(p=dropout_p))
        layers.append(nn.Linear(in_dim, out_embed_dim, bias=False))
        return nn.Sequential(*layers)

    modality_heads = {
        ModalityType.VISION: first_element_head(vision_embed_dim),
        # Text selects its token through SelectEOSAndProject instead of index 0.
        ModalityType.TEXT: SelectEOSAndProject(
            proj=nn.Sequential(
                nn.LayerNorm(normalized_shape=text_embed_dim, eps=1e-6),
                nn.Linear(text_embed_dim, out_embed_dim, bias=False),
            )
        ),
        ModalityType.AUDIO: first_element_head(audio_embed_dim),
        ModalityType.DEPTH: first_element_head(depth_embed_dim),
        ModalityType.THERMAL: first_element_head(thermal_embed_dim),
        # IMU is the only head with dropout before the projection.
        ModalityType.IMU: first_element_head(imu_embed_dim, dropout_p=0.5),
    }

    return nn.ModuleDict(modality_heads)
|
417 |
+
|
418 |
+
def _create_modality_postprocessors(self, out_embed_dim):
    """Build the embedding postprocessor of every modality.

    Vision embeddings are only normalised along the last dimension; every
    other modality is normalised and then passed through
    ``LearnableLogitScaling``. ``out_embed_dim`` is accepted but not used.
    Returns an ``nn.ModuleDict`` keyed by ``ModalityType``.
    """

    def normalize_then_scale(**scaling_kwargs):
        return nn.Sequential(
            Normalize(dim=-1),
            LearnableLogitScaling(**scaling_kwargs),
        )

    return nn.ModuleDict(
        {
            ModalityType.VISION: Normalize(dim=-1),
            ModalityType.TEXT: normalize_then_scale(learnable=True),
            ModalityType.AUDIO: normalize_then_scale(
                logit_scale_init=20.0, learnable=False
            ),
            ModalityType.DEPTH: normalize_then_scale(
                logit_scale_init=5.0, learnable=False
            ),
            ModalityType.THERMAL: normalize_then_scale(
                logit_scale_init=10.0, learnable=False
            ),
            ModalityType.IMU: normalize_then_scale(
                logit_scale_init=5.0, learnable=False
            ),
        }
    )
|
443 |
+
|
444 |
+
def forward(self, inputs):
    """Embed every modality present in ``inputs``.

    Args:
        inputs: mapping from modality key to an input tensor (or ``None``).
            Tensors with 5 or more dimensions are treated as
            ``(batch, clips, ...)``: the clips are folded into the batch for
            encoding and their embeddings are averaged afterwards.

    Returns:
        dict mapping each modality key to its embedding. A ``None`` input
        yields a ``None`` entry.
    """
    outputs = {}
    for modality_key, modality_value in inputs.items():
        # The None check must come before any tensor attribute access;
        # previously `.ndim` was read first, so a None entry crashed.
        if modality_value is None:
            outputs[modality_key] = None
            continue

        # Audio and Video inputs consist of multiple clips
        reduce_list = modality_value.ndim >= 5
        if reduce_list:
            B, S = modality_value.shape[:2]
            modality_value = modality_value.reshape(
                B * S, *modality_value.shape[2:]
            )

        # preprocessor -> trunk -> head -> postprocessor
        modality_value = self.modality_preprocessors[modality_key](
            **{modality_key: modality_value}
        )
        trunk_inputs = modality_value["trunk"]
        head_inputs = modality_value["head"]
        modality_value = self.modality_trunks[modality_key](**trunk_inputs)
        modality_value = self.modality_heads[modality_key](
            modality_value, **head_inputs
        )
        modality_value = self.modality_postprocessors[modality_key](
            modality_value
        )

        if reduce_list:
            # Unfold the clips again and average their embeddings.
            modality_value = modality_value.reshape(B, S, -1)
            modality_value = modality_value.mean(dim=1)

        outputs[modality_key] = modality_value

    return outputs
|
477 |
+
|
478 |
+
|
479 |
+
def imagebind_huge(pretrained=False):
    """Construct the "huge" ImageBind configuration.

    Args:
        pretrained: when true, load the released weights from
            ``.checkpoints/imagebind_huge.pth`` (relative to the current
            working directory), downloading the file first if it is missing.

    Returns:
        The constructed ``ImageBindModel``.
    """
    model = ImageBindModel(
        vision_embed_dim=1280,
        vision_num_blocks=32,
        vision_num_heads=16,
        text_embed_dim=1024,
        text_num_blocks=24,
        text_num_heads=16,
        out_embed_dim=1024,
        audio_drop_path=0.1,
        imu_drop_path=0.7,
    )

    if pretrained:
        checkpoint_path = ".checkpoints/imagebind_huge.pth"
        if not os.path.exists(checkpoint_path):
            print(
                "Downloading imagebind weights to .checkpoints/imagebind_huge.pth ..."
            )
            os.makedirs(".checkpoints", exist_ok=True)
            torch.hub.download_url_to_file(
                "https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth",
                checkpoint_path,
                progress=True,
            )

        # Deserialize onto the CPU so loading also works on hosts without
        # the device the checkpoint was saved from; load_state_dict copies
        # the values into the model's own parameters.
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        model.load_state_dict(state_dict)

    return model
|
models/multimodal_preprocessors.py
ADDED
@@ -0,0 +1,685 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
import gzip
|
9 |
+
import html
|
10 |
+
import io
|
11 |
+
import math
|
12 |
+
from functools import lru_cache
|
13 |
+
from typing import Callable, List, Optional, Tuple
|
14 |
+
|
15 |
+
import ftfy
|
16 |
+
import numpy as np
|
17 |
+
import regex as re
|
18 |
+
import torch
|
19 |
+
import torch.nn as nn
|
20 |
+
from iopath.common.file_io import g_pathmgr
|
21 |
+
from timm.models.layers import trunc_normal_
|
22 |
+
|
23 |
+
from models.helpers import VerboseNNModule, cast_if_src_dtype
|
24 |
+
|
25 |
+
|
26 |
+
def get_sinusoid_encoding_table(n_position, d_hid):
    """Return a fixed sinusoidal position table of shape (1, n_position, d_hid).

    Even feature indices hold the sine and odd indices the cosine of
    ``position / 10000 ** (2 * (j // 2) / d_hid)``. The result is a float32
    tensor.
    """
    # TODO: make it with torch instead of numpy
    # The divisor depends only on the feature index, so compute it once.
    divisors = [np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

    angles = np.array(
        [[pos_i / divisor for divisor in divisors] for pos_i in range(n_position)]
    )
    angles[:, 0::2] = np.sin(angles[:, 0::2])  # dim 2i
    angles[:, 1::2] = np.cos(angles[:, 1::2])  # dim 2i+1

    table = torch.FloatTensor(angles)
    return table.unsqueeze(0)
|
43 |
+
|
44 |
+
|
45 |
+
def interpolate_pos_encoding_2d(target_spatial_size, pos_embed):
    """Bicubically resize a square grid of position embeddings.

    ``pos_embed`` is indexed as (1, N, dim). When N already equals
    ``target_spatial_size`` the input is returned untouched; otherwise the
    N embeddings are laid out as a sqrt(N) x sqrt(N) grid, rescaled by
    ``sqrt(target_spatial_size / N)`` per side and flattened back to
    (1, -1, dim).
    """
    num_positions = pos_embed.shape[1]
    if num_positions == target_spatial_size:
        return pos_embed

    dim = pos_embed.shape[-1]
    # nn.functional.interpolate doesn't work with bfloat16 so we cast to float32
    pos_embed, was_cast = cast_if_src_dtype(pos_embed, torch.bfloat16, torch.float32)

    side = int(math.sqrt(num_positions))
    # (1, side, side, dim) -> (1, dim, side, side), the layout interpolate expects
    grid = pos_embed.reshape(1, side, side, dim).permute(0, 3, 1, 2)
    grid = nn.functional.interpolate(
        grid,
        scale_factor=math.sqrt(target_spatial_size / num_positions),
        mode="bicubic",
    )
    if was_cast:
        # restore the dtype the caller handed in
        grid, _ = cast_if_src_dtype(grid, torch.float32, torch.bfloat16)

    return grid.permute(0, 2, 3, 1).view(1, -1, dim)
|
63 |
+
|
64 |
+
|
65 |
+
def interpolate_pos_encoding(
    npatch_per_img,
    pos_embed,
    patches_layout,
    input_shape=None,
    first_patch_idx=1,
):
    """Adapt ``pos_embed`` to an input that yields ``npatch_per_img`` patches.

    The first ``first_patch_idx`` (0 or 1) embeddings belong to the CLS
    token and are carried over unchanged; the remaining ones are spatially
    interpolated. If the patch count already matches, ``pos_embed`` is
    returned as is. Only square spatial layouts are supported, and for
    layouts with a temporal component only the zeroth frame's embeddings
    are interpolated.
    """
    assert first_patch_idx in (0, 1), "there is 1 CLS token or none"
    num_patch_embeds = pos_embed.shape[1] - first_patch_idx  # since it's 1 if cls_token exists
    if npatch_per_img == num_patch_embeds:
        return pos_embed

    assert (
        patches_layout[-1] == patches_layout[-2]
    ), "Interpolation of pos embed not supported for non-square layouts"

    class_emb = pos_embed[:, :first_patch_idx]
    patch_emb = pos_embed[:, first_patch_idx:]

    num_frames = patches_layout[0]
    if input_shape is None or num_frames == 1:
        # simple 2D pos embedding, no temporal component
        patch_emb = interpolate_pos_encoding_2d(npatch_per_img, patch_emb)
    elif num_frames > 1:
        # pos embed has a temporal component
        assert len(input_shape) == 4, "temporal interpolation not supported"
        # we only support 2D interpolation in this case
        num_spatial_tokens = patches_layout[1] * patches_layout[2]
        per_frame = patch_emb.view(1, num_frames, num_spatial_tokens, -1)
        # interpolate embedding for zeroth frame
        zeroth_frame = per_frame[0, 0, ...].unsqueeze(0)
        patch_emb = interpolate_pos_encoding_2d(npatch_per_img, zeroth_frame)
    else:
        raise ValueError("This type of interpolation isn't implemented")

    return torch.cat((class_emb, patch_emb), dim=1)
|
102 |
+
|
103 |
+
|
104 |
+
def _get_pos_embedding(
    npatch_per_img,
    pos_embed,
    patches_layout,
    input_shape,
    first_patch_idx=1,
):
    """Return ``pos_embed`` interpolated for ``npatch_per_img`` patches.

    Thin wrapper that forwards every argument to
    ``interpolate_pos_encoding``.
    """
    return interpolate_pos_encoding(
        npatch_per_img,
        pos_embed,
        patches_layout,
        input_shape=input_shape,
        first_patch_idx=first_patch_idx,
    )
|
119 |
+
|
120 |
+
|
121 |
+
class PatchEmbedGeneric(nn.Module):
    """
    PatchEmbed from Hydra

    Projects an input with ``proj_stem`` and flattens everything after the
    channel dimension into a token sequence, optionally followed by
    ``norm_layer``.
    """

    def __init__(self, proj_stem, norm_layer: Optional[nn.Module] = None):
        """
        Args:
            proj_stem: sequence of modules; more than one is chained with
                ``nn.Sequential``.
            norm_layer: optional module applied to the flattened tokens.
        """
        super().__init__()

        if len(proj_stem) > 1:
            self.proj = nn.Sequential(*proj_stem)
        else:
            # Special case to be able to load pre-trained models that were
            # trained with a standard stem
            self.proj = proj_stem[0]
        self.norm_layer = norm_layer

    def get_patch_layout(self, img_size):
        """Probe the stem with a zero input to discover its output layout.

        Args:
            img_size: input size without the batch dimension; any sequence
                (list or tuple) of ints.

        Returns:
            ``(patches_layout, num_patches, embed_dim)`` where
            ``patches_layout`` is the stem output shape after the channel
            dimension, ``num_patches`` its product and ``embed_dim`` the
            channel count of the stem output.
        """
        with torch.no_grad():
            # Unpack instead of `[1] + img_size` so tuples work as well as lists.
            dummy_img = torch.zeros([1, *img_size])
            dummy_out = self.proj(dummy_img)
        embed_dim = dummy_out.shape[1]
        patches_layout = tuple(dummy_out.shape[2:])
        num_patches = np.prod(patches_layout)
        return patches_layout, num_patches, embed_dim

    def forward(self, x):
        """Project ``x`` and return tokens with the channel dimension last."""
        x = self.proj(x)
        # B C (T) H W -> B (T)HW C
        x = x.flatten(2).transpose(1, 2)
        if self.norm_layer is not None:
            x = self.norm_layer(x)
        return x
|
158 |
+
|
159 |
+
|
160 |
+
class SpatioTemporalPosEmbeddingHelper(VerboseNNModule):
    """Owns the positional embedding table for CLS plus patch tokens.

    The table ``pos_embed`` has shape (1, num_cls_tokens + num_patches,
    embed_dim). It is a trainable parameter when ``learnable`` is true and
    a sinusoidal buffer otherwise.
    """

    def __init__(
        self,
        patches_layout: List,
        num_patches: int,
        num_cls_tokens: int,
        embed_dim: int,
        learnable: bool,
    ) -> None:
        super().__init__()
        self.num_cls_tokens = num_cls_tokens
        self.patches_layout = patches_layout
        self.num_patches = num_patches
        self.num_tokens = num_cls_tokens + num_patches
        self.learnable = learnable

        if not self.learnable:
            # Fixed table: a buffer follows the module across devices
            # without being trained.
            fixed_table = get_sinusoid_encoding_table(self.num_tokens, embed_dim)
            self.register_buffer("pos_embed", fixed_table)
            return

        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, embed_dim))
        trunc_normal_(self.pos_embed, std=0.02)

    def get_pos_embedding(self, vision_input, all_vision_tokens):
        """Return the table, interpolated to the patch count of ``all_vision_tokens``."""
        num_patch_tokens = all_vision_tokens.size(1) - self.num_cls_tokens
        return _get_pos_embedding(
            num_patch_tokens,
            pos_embed=self.pos_embed,
            patches_layout=self.patches_layout,
            input_shape=vision_input.shape,
            first_patch_idx=self.num_cls_tokens,
        )
|
193 |
+
|
194 |
+
|
195 |
+
class RGBDTPreprocessor(VerboseNNModule):
    """Turn image-like inputs into token sequences for a transformer trunk.

    Each input is patch-embedded by its stem, optionally prefixed with CLS
    tokens, and optionally offset by positional and type embeddings. When
    both ``vision`` and ``depth`` are given, their token sequences are
    summed.
    """

    def __init__(
        self,
        rgbt_stem: PatchEmbedGeneric,
        depth_stem: Optional[PatchEmbedGeneric],
        img_size: Tuple = (3, 224, 224),
        num_cls_tokens: int = 1,
        pos_embed_fn: Optional[Callable] = None,
        use_type_embed: bool = False,
        init_param_style: str = "openclip",
    ) -> None:
        """
        Args:
            rgbt_stem: stem used for the ``vision`` input (may be ``None``
                when only ``depth_stem`` is given).
            depth_stem: stem used for the ``depth`` input.
            img_size: input size without batch dimension, used to probe the
                stem's patch layout.
            num_cls_tokens: number of CLS tokens prepended to the patches.
            pos_embed_fn: factory called with ``patches_layout``,
                ``num_cls_tokens``, ``num_patches`` and ``embed_dim``;
                ``None`` disables positional embeddings.
            use_type_embed: add a learned per-module type embedding.
            init_param_style: ``"openclip"`` or ``"vit"``.
        """
        super().__init__()
        stem = rgbt_stem if rgbt_stem is not None else depth_stem
        (
            self.patches_layout,
            self.num_patches,
            self.embed_dim,
        ) = stem.get_patch_layout(img_size)
        self.rgbt_stem = rgbt_stem
        self.depth_stem = depth_stem
        self.use_pos_embed = pos_embed_fn is not None
        self.use_type_embed = use_type_embed
        self.num_cls_tokens = num_cls_tokens

        if self.use_pos_embed:
            self.pos_embedding_helper = pos_embed_fn(
                patches_layout=self.patches_layout,
                num_cls_tokens=num_cls_tokens,
                num_patches=self.num_patches,
                embed_dim=self.embed_dim,
            )
        if self.num_cls_tokens > 0:
            self.cls_token = nn.Parameter(
                torch.zeros(1, self.num_cls_tokens, self.embed_dim)
            )
        if self.use_type_embed:
            self.type_embed = nn.Parameter(torch.zeros(1, 1, self.embed_dim))

        self.init_parameters(init_param_style)

    @torch.no_grad()
    def init_parameters(self, init_param_style):
        """Initialise CLS, positional and type embeddings.

        Raises:
            ValueError: for an unknown ``init_param_style``.
        """
        if init_param_style == "openclip":
            # OpenCLIP style initialization
            scale = self.embed_dim**-0.5
            if self.use_pos_embed:
                nn.init.normal_(self.pos_embedding_helper.pos_embed)
                self.pos_embedding_helper.pos_embed *= scale

            if self.num_cls_tokens > 0:
                nn.init.normal_(self.cls_token)
                self.cls_token *= scale
        elif init_param_style == "vit":
            # cls_token only exists when CLS tokens were requested.
            if self.num_cls_tokens > 0:
                self.cls_token.data.fill_(0)
        else:
            raise ValueError(f"Unknown init {init_param_style}")

        if self.use_type_embed:
            nn.init.normal_(self.type_embed)

    def tokenize_input_and_cls_pos(self, input, stem, mask):
        """Patch-embed ``input`` with ``stem`` and add CLS/pos/type embeddings.

        ``mask`` is accepted for interface symmetry but is not used.
        """
        # tokens is of shape B x L x D
        tokens = stem(input)
        assert tokens.ndim == 3
        assert tokens.shape[2] == self.embed_dim
        B = tokens.shape[0]
        if self.num_cls_tokens > 0:
            class_tokens = self.cls_token.expand(
                B, -1, -1
            )  # stole class_tokens impl from Phil Wang, thanks
            tokens = torch.cat((class_tokens, tokens), dim=1)
        if self.use_pos_embed:
            pos_embed = self.pos_embedding_helper.get_pos_embedding(input, tokens)
            tokens = tokens + pos_embed
        if self.use_type_embed:
            tokens = tokens + self.type_embed.expand(B, -1, -1)
        return tokens

    def forward(self, vision=None, depth=None, patch_mask=None):
        """Tokenize ``vision`` and/or ``depth``.

        Returns:
            ``{"trunk": {"tokens": ...}, "head": {}}``.

        Raises:
            NotImplementedError: if ``patch_mask`` is given.
            ValueError: if neither ``vision`` nor ``depth`` is given.
        """
        if patch_mask is not None:
            raise NotImplementedError()

        if vision is None and depth is None:
            # Fail with a clear message instead of an unbound-local error below.
            raise ValueError("At least one of `vision` or `depth` must be provided")

        if vision is not None:
            vision_tokens = self.tokenize_input_and_cls_pos(
                vision, self.rgbt_stem, patch_mask
            )

        if depth is not None:
            depth_tokens = self.tokenize_input_and_cls_pos(
                depth, self.depth_stem, patch_mask
            )

        # aggregate tokens
        if vision is not None and depth is not None:
            final_tokens = vision_tokens + depth_tokens
        else:
            final_tokens = vision_tokens if vision is not None else depth_tokens
        return_dict = {
            "trunk": {
                "tokens": final_tokens,
            },
            "head": {},
        }
        return return_dict
|
299 |
+
|
300 |
+
|
301 |
+
class AudioPreprocessor(RGBDTPreprocessor):
    """``RGBDTPreprocessor`` whose single input is passed as ``audio``."""

    def __init__(self, audio_stem: PatchEmbedGeneric, **kwargs) -> None:
        # The audio stem occupies the parent's rgbt slot; no depth stem is used.
        super().__init__(rgbt_stem=audio_stem, depth_stem=None, **kwargs)

    def forward(self, audio=None):
        """Tokenize ``audio`` through the parent's ``vision`` path."""
        return super().forward(vision=audio)
|
307 |
+
|
308 |
+
|
309 |
+
class ThermalPreprocessor(RGBDTPreprocessor):
    """``RGBDTPreprocessor`` whose single input is passed as ``thermal``."""

    def __init__(self, thermal_stem: PatchEmbedGeneric, **kwargs) -> None:
        # The thermal stem occupies the parent's rgbt slot; no depth stem is used.
        super().__init__(rgbt_stem=thermal_stem, depth_stem=None, **kwargs)

    def forward(self, thermal=None):
        """Tokenize ``thermal`` through the parent's ``vision`` path."""
        return super().forward(vision=thermal)
|
315 |
+
|
316 |
+
|
317 |
+
def build_causal_attention_mask(context_length):
    """Return an additive causal mask of shape (context_length, context_length).

    Entries strictly above the diagonal are ``-inf``; the diagonal and
    everything below it are zero.
    """
    # pytorch uses additive attention mask; fill with -inf
    size = (context_length, context_length)
    causal_mask = torch.full(size, float("-inf"))
    # keep only the strict upper triangle, zeroing the diagonal and below
    return causal_mask.triu_(1)
|
324 |
+
|
325 |
+
|
326 |
+
class TextPreprocessor(VerboseNNModule):
    """Embed token ids and add positional embeddings for the text trunk."""

    def __init__(
        self,
        vocab_size: int,
        context_length: int,
        embed_dim: int,
        causal_masking: bool,
        supply_seq_len_to_head: bool = True,
        num_cls_tokens: int = 0,
        init_param_style: str = "openclip",
    ) -> None:
        """
        Args:
            vocab_size: size of the token embedding table.
            context_length: number of token positions.
            embed_dim: embedding width.
            causal_masking: attach a causal attention mask to the trunk inputs.
            supply_seq_len_to_head: pass ``seq_len`` to the head.
            num_cls_tokens: CLS tokens to prepend (unsupported together
                with causal masking).
            init_param_style: ``"openclip"`` or ``"vit"``.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.context_length = context_length
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Parameter(
            torch.empty(1, self.context_length + num_cls_tokens, embed_dim)
        )
        self.causal_masking = causal_masking
        if self.causal_masking:
            mask = build_causal_attention_mask(self.context_length)
            # register the mask as a buffer so it can be moved to the right device
            self.register_buffer("mask", mask)

        self.supply_seq_len_to_head = supply_seq_len_to_head
        self.num_cls_tokens = num_cls_tokens
        self.embed_dim = embed_dim
        if num_cls_tokens > 0:
            assert self.causal_masking is False, "Masking + CLS token isn't implemented"
            self.cls_token = nn.Parameter(
                torch.zeros(1, self.num_cls_tokens, embed_dim)
            )

        self.init_parameters(init_param_style)

    @torch.no_grad()
    def init_parameters(self, init_param_style="openclip"):
        """Initialise token, positional and (if present) CLS embeddings.

        Raises:
            ValueError: for an unknown ``init_param_style``.
        """
        # OpenCLIP style initialization
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.pos_embed, std=0.01)

        if init_param_style == "openclip":
            # OpenCLIP style initialization
            scale = self.embed_dim**-0.5
            if self.num_cls_tokens > 0:
                nn.init.normal_(self.cls_token)
                self.cls_token *= scale
        elif init_param_style == "vit":
            # cls_token only exists when CLS tokens were requested
            # (the default is num_cls_tokens=0).
            if self.num_cls_tokens > 0:
                self.cls_token.data.fill_(0)
        else:
            raise ValueError(f"Unknown init {init_param_style}")

    def forward(self, text):
        """Embed ``text`` token ids.

        Returns:
            ``{"trunk": {"tokens": ..., ["attn_mask": ...]},
            "head": {["seq_len": ...]}}``.
        """
        # text tokens are of shape B x L x D
        text_tokens = self.token_embedding(text)
        # concat CLS tokens if any
        if self.num_cls_tokens > 0:
            B = text_tokens.shape[0]
            class_tokens = self.cls_token.expand(
                B, -1, -1
            )  # stole class_tokens impl from Phil Wang, thanks
            text_tokens = torch.cat((class_tokens, text_tokens), dim=1)
        text_tokens = text_tokens + self.pos_embed
        return_dict = {
            "trunk": {
                "tokens": text_tokens,
            },
            "head": {},
        }
        if self.supply_seq_len_to_head:
            # "seq_len" is the position of the largest token id in each row.
            # NOTE(review): presumably the end-of-text token has the highest
            # id, and CLS tokens are not added to this index - confirm.
            text_lengths = text.argmax(dim=-1)
            return_dict["head"] = {
                "seq_len": text_lengths,
            }
        if self.causal_masking:
            return_dict["trunk"].update({"attn_mask": self.mask})
        return return_dict
|
404 |
+
|
405 |
+
|
406 |
+
class Im2Video(nn.Module):
    """Convert an image into a trivial video."""

    def __init__(self, time_dim=2):
        super().__init__()
        self.time_dim = time_dim

    def forward(self, x):
        """Insert a singleton time axis into 4-D input; pass 5-D input through.

        Raises:
            ValueError: if ``x`` is neither 4-D nor 5-D.
        """
        if x.ndim == 4:
            # B, C, H, W -> B, C, T, H, W
            return x.unsqueeze(self.time_dim)
        elif x.ndim == 5:
            return x
        else:
            raise ValueError(f"Dimension incorrect {x.shape}")


class PadIm2Video(Im2Video):
    """Like ``Im2Video``, but extends a single frame to ``ntimes`` frames.

    ``pad_type="repeat"`` copies the frame, ``pad_type="zero"`` appends
    all-zero frames after it. Inputs that already have more than one frame
    are returned unchanged.
    """

    def __init__(self, ntimes, pad_type, time_dim=2):
        super().__init__(time_dim=time_dim)
        assert ntimes > 0
        assert pad_type in ["zero", "repeat"]
        self.ntimes = ntimes
        self.pad_type = pad_type

    def forward(self, x):
        x = super().forward(x)
        if x.shape[self.time_dim] == 1:
            if self.pad_type == "repeat":
                new_shape = [1] * len(x.shape)
                new_shape[self.time_dim] = self.ntimes
                x = x.repeat(new_shape)
            elif self.pad_type == "zero":
                padarg = [0, 0] * len(x.shape)
                # F.pad lists (left, right) pairs starting from the LAST
                # dimension, so dimension d lives at pair ndim - 1 - d. The
                # former index 2 * time_dim + 1 was only right for
                # time_dim == 2 on 5-D input.
                pair_index = x.ndim - 1 - self.time_dim
                padarg[2 * pair_index + 1] = self.ntimes - x.shape[self.time_dim]
                x = nn.functional.pad(x, padarg)
        return x
|
443 |
+
|
444 |
+
|
445 |
+
# Modified from github.com/openai/CLIP
|
446 |
+
@lru_cache()
def bytes_to_unicode():
    """
    Return a dict mapping every byte value (0-255) to a unicode character.

    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    The table also avoids mapping to whitespace/control characters the bpe code barfs on.

    Bytes in the three ranges listed below map to themselves; every other
    byte is assigned the next unused code point starting at 256. The result
    is cached, so all callers share one dict object.
    """
    self_mapped = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    already_mapped = set(self_mapped)
    remapped = [byte for byte in range(2**8) if byte not in already_mapped]

    byte_values = self_mapped + remapped
    code_points = self_mapped + [2**8 + offset for offset in range(len(remapped))]
    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
|
471 |
+
|
472 |
+
|
473 |
+
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Word is represented as a sequence of symbols (symbols being
    variable-length strings), e.g. ``("a", "b", "c")`` yields
    ``{("a", "b"), ("b", "c")}``. An empty or single-symbol word has no
    pairs and yields an empty set (an empty word no longer raises
    ``IndexError``).
    """
    # Pair every symbol with its successor; zip stops at the shorter input,
    # so words of length 0 or 1 naturally produce nothing.
    return set(zip(word, word[1:]))
|
483 |
+
|
484 |
+
|
485 |
+
def basic_clean(text):
    """Repair the text with ftfy, decode HTML entities, and trim both ends."""
    repaired = ftfy.fix_text(text)
    # Unescape twice so doubly-escaped entities (e.g. "&amp;amp;") are fully decoded.
    once = html.unescape(repaired)
    twice = html.unescape(once)
    return twice.strip()
|
489 |
+
|
490 |
+
|
491 |
+
def whitespace_clean(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    return re.sub(r"\s+", " ", text).strip()
|
495 |
+
|
496 |
+
|
497 |
+
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer that turns text into fixed-length token-id tensors.

    The vocabulary is built from the byte table returned by
    ``bytes_to_unicode`` (each symbol with and without the ``</w>``
    end-of-word marker), followed by one entry per merge rule, followed by
    the two special tokens ``<|startoftext|>`` and ``<|endoftext|>`` (which
    therefore receive the two highest ids).
    """

    def __init__(self, bpe_path: str, context_length=77):
        """Load the merge table and build encoder/decoder lookups.

        Args:
            bpe_path: path of a gzip-compressed, UTF-8 merges file; opened
                through ``g_pathmgr``.
            context_length: default row length of the tensor returned by
                ``__call__``.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        with g_pathmgr.open(bpe_path, "rb") as fh:
            bpe_bytes = io.BytesIO(fh.read())
        # Use a context manager so the gzip reader is closed deterministically.
        with gzip.open(bpe_bytes) as gz:
            merge_lines = gz.read().decode("utf-8").split("\n")
        # Drop the first line and keep a fixed number of merge rules.
        # NOTE(review): the first line is presumably a header - confirm
        # against the vocabulary file.
        merge_lines = merge_lines[1 : 49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merge_lines]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + "</w>" for v in vocab]
        for merge in merges:
            vocab.append("".join(merge))
        vocab.extend(["<|startoftext|>", "<|endoftext|>"])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        # Lower rank == higher merge priority.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # Special tokens are never split, so seed the cache with them.
        self.cache = {
            "<|startoftext|>": "<|startoftext|>",
            "<|endoftext|>": "<|endoftext|>",
        }
        # NOTE(review): "\p{L}" / "\p{N}" are not supported by the stdlib
        # ``re`` module; this presumably relies on ``re`` being the
        # third-party ``regex`` package - confirm against the file's imports.
        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )
        self.context_length = context_length

    def bpe(self, token):
        """Apply the merge rules to one token.

        Returns the resulting symbols joined by single spaces, with ``</w>``
        appended to the final symbol. Results are memoised in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # Pick the adjacent pair with the best (lowest) merge rank.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the tail and stop.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case ``text`` and return its list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Re-express the token's UTF-8 bytes in the byte-level alphabet.
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(
                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
            )
        return bpe_tokens

    def decode(self, tokens):
        """Turn a sequence of token ids back into text.

        Undecodable byte sequences are replaced rather than raising, and
        each ``</w>`` marker becomes a space.
        """
        text = "".join([self.decoder[token] for token in tokens])
        text = (
            bytearray([self.byte_decoder[c] for c in text])
            .decode("utf-8", errors="replace")
            .replace("</w>", " ")
        )
        return text

    def __call__(self, texts, context_length=None):
        """Tokenize one string or a list of strings into a ``torch.long`` tensor.

        Each row is ``<|startoftext|>``, the encoded text, ``<|endoftext|>``,
        then zero padding up to ``context_length``. Rows that would be longer
        are truncated, and the last kept position is overwritten with
        ``<|endoftext|>`` so every row still ends with that token.

        Args:
            texts: a string or a list of strings.
            context_length: row length; falls back to ``self.context_length``
                when falsy.

        Returns:
            A tensor of shape ``(len(texts), context_length)``, or a 1-D
            tensor of length ``context_length`` when there is exactly one text.
        """
        if not context_length:
            context_length = self.context_length

        if isinstance(texts, str):
            texts = [texts]

        sot_token = self.encoder["<|startoftext|>"]
        eot_token = self.encoder["<|endoftext|>"]
        all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                tokens = tokens[:context_length]
                # Plain truncation would cut off the end-of-text token, which
                # has the highest id and is what an argmax over the row uses
                # to locate the end of the sequence.
                tokens[-1] = eot_token
            result[i, : len(tokens)] = torch.tensor(tokens)

        if len(result) == 1:
            return result[0]
        return result
|
604 |
+
|
605 |
+
|
606 |
+
class IMUPreprocessor(VerboseNNModule):
    """Turn an IMU signal into a token sequence for the transformer trunk.

    The last axis of the input is cut into non-overlapping windows of
    ``kernel_size`` samples, each window is projected by ``imu_stem``,
    optional class tokens are prepended and (when ``pos_embed_fn`` is given)
    a learned positional embedding is added.

    Args:
        kernel_size: window length (and stride) used to patchify the signal.
        imu_stem: stem providing ``proj`` and ``norm_layer`` callables.
        embed_dim: token dimension produced by the stem.
        img_size: only ``img_size[1]`` is read, to size the positional
            embedding as ``img_size[1] // kernel_size + num_cls_tokens``.
            NOTE(review): presumably (channels, timesteps) - confirm.
        num_cls_tokens: number of class tokens prepended to the sequence.
        pos_embed_fn: only checked against ``None`` to decide whether the
            positional embedding is added in the forward pass.
        init_param_style: ``"openclip"`` or ``"vit"``.
    """

    def __init__(
        self,
        kernel_size: int,
        imu_stem: PatchEmbedGeneric,
        embed_dim: int,
        img_size: Tuple = (6, 2000),
        num_cls_tokens: int = 1,
        pos_embed_fn: Optional[Callable] = None,
        init_param_style: str = "openclip",
    ) -> None:
        super().__init__()
        self.imu_stem = imu_stem
        self.embed_dim = embed_dim
        self.use_pos_embed = pos_embed_fn is not None
        self.num_cls_tokens = num_cls_tokens
        self.kernel_size = kernel_size
        # Allocated unconditionally (even when unused) and filled in
        # init_parameters.
        self.pos_embed = nn.Parameter(
            torch.empty(1, (img_size[1] // kernel_size) + num_cls_tokens, embed_dim)
        )

        if self.num_cls_tokens > 0:
            self.cls_token = nn.Parameter(
                torch.zeros(1, self.num_cls_tokens, self.embed_dim)
            )

        self.init_parameters(init_param_style)

    @torch.no_grad()
    def init_parameters(self, init_param_style):
        """Initialise ``pos_embed`` and, if present, ``cls_token``.

        Raises:
            ValueError: if ``init_param_style`` is not "openclip" or "vit".
        """
        nn.init.normal_(self.pos_embed, std=0.01)

        if init_param_style == "openclip":
            # OpenCLIP style initialization
            scale = self.embed_dim**-0.5

            if self.num_cls_tokens > 0:
                nn.init.normal_(self.cls_token)
                self.cls_token *= scale
        elif init_param_style == "vit":
            # cls_token only exists when class tokens were requested; without
            # this guard "vit" init crashed for num_cls_tokens == 0.
            if self.num_cls_tokens > 0:
                self.cls_token.data.fill_(0)
        else:
            raise ValueError(f"Unknown init {init_param_style}")

    def tokenize_input_and_cls_pos(self, input, stem):
        """Project ``input`` with ``stem``, prepend class tokens, add positions."""
        # tokens is of shape B x L x D
        tokens = stem.norm_layer(stem.proj(input))
        assert tokens.ndim == 3
        assert tokens.shape[2] == self.embed_dim
        B = tokens.shape[0]
        if self.num_cls_tokens > 0:
            class_tokens = self.cls_token.expand(
                B, -1, -1
            )  # stole class_tokens impl from Phil Wang, thanks
            tokens = torch.cat((class_tokens, tokens), dim=1)
        if self.use_pos_embed:
            tokens = tokens + self.pos_embed
        return tokens

    def forward(self, imu):
        """Patchify ``imu`` and return ``{"trunk": {"tokens": ...}, "head": {}}``."""
        # Patchify: window the last axis, move the window index in front of
        # the original second axis, then flatten each window to one vector.
        imu = imu.unfold(
            -1,
            self.kernel_size,
            self.kernel_size,
        ).permute(0, 2, 1, 3)
        imu = imu.reshape(imu.size(0), imu.size(1), -1)

        imu_tokens = self.tokenize_input_and_cls_pos(
            imu,
            self.imu_stem,
        )

        return_dict = {
            "trunk": {
                "tokens": imu_tokens,
            },
            "head": {},
        }
        return return_dict
|
models/transformer.py
ADDED
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
# Code modified from
|
9 |
+
# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py ;
|
10 |
+
# https://github.com/facebookresearch/deit/blob/main/models.py
|
11 |
+
# and https://github.com/facebookresearch/vissl/blob/main/vissl/models/trunks/vision_transformer.py
|
12 |
+
|
13 |
+
|
14 |
+
from functools import partial
|
15 |
+
from typing import Callable, List, Optional
|
16 |
+
|
17 |
+
import torch
|
18 |
+
import torch.nn as nn
|
19 |
+
import torch.utils.checkpoint as checkpoint
|
20 |
+
from timm.models.layers import DropPath, trunc_normal_
|
21 |
+
|
22 |
+
|
23 |
+
class Attention(nn.Module):
    """Multi-head self-attention over a rank-3 input with a fused QKV projection.

    Args:
        dim: channel dimension of the input and output.
        num_heads: number of attention heads; each head gets ``dim // num_heads`` channels.
        qkv_bias: whether the fused QKV projection has a bias term.
        qk_scale: score scale; when falsy, ``(dim // num_heads) ** -0.5`` is used.
        attn_drop: dropout probability applied to the attention weights.
        proj_drop: dropout probability applied after the output projection.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.num_heads = num_heads
        per_head = dim // num_heads
        # NOTE scale factor was wrong in the original version of this code,
        # so it can be set manually for compatibility with older weights.
        self.scale = qk_scale or per_head**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Apply self-attention to ``x`` and return a tensor of the same shape."""
        batch, length, channels = x.shape
        packed = self.qkv(x).reshape(
            batch, length, 3, self.num_heads, channels // self.num_heads
        )
        # Bring the q/k/v axis to the front, then split it into three tensors.
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(scores.softmax(dim=-1))

        # Merge the heads back into the channel axis.
        mixed = torch.matmul(weights, v).transpose(1, 2).reshape(batch, length, channels)
        return self.proj_drop(self.proj(mixed))
|
66 |
+
|
67 |
+
|
68 |
+
class Mlp(nn.Module):
    """Two-layer feed-forward network: linear, activation, dropout, linear, dropout.

    Args:
        in_features: input width.
        hidden_features: width of the hidden layer; falls back to ``in_features`` when falsy.
        out_features: output width; falls back to ``in_features`` when falsy.
        act_layer: zero-argument factory for the activation module.
        drop: dropout probability; the same dropout module is used after both layers.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        width_out = out_features if out_features else in_features
        width_hidden = hidden_features if hidden_features else in_features
        self.fc1 = nn.Linear(in_features, width_hidden)
        self.act = act_layer()
        self.fc2 = nn.Linear(width_hidden, width_out)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Run ``x`` through both layers and return the result."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
|
92 |
+
|
93 |
+
|
94 |
+
class MultiheadAttention(nn.MultiheadAttention):
    """``nn.MultiheadAttention`` restricted to self-attention with a (tokens, mask) call."""

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
        """Attend ``x`` to itself under ``attn_mask`` and return only the output tensor."""
        # The parent returns (output, weights); weights are not requested.
        attended, _ = super().forward(
            x, x, x, need_weights=False, attn_mask=attn_mask
        )
        return attended
|
97 |
+
|
98 |
+
|
99 |
+
class ViTAttention(Attention):
    """``Attention`` with the (tokens, mask) call signature; masks are not supported."""

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
        """Run unmasked self-attention on ``x``; ``attn_mask`` must be ``None``."""
        # The parent implementation has no mask argument, so refuse any mask
        # instead of silently ignoring it.
        assert attn_mask is None
        output = super().forward(x)
        return output
|
103 |
+
|
104 |
+
|
105 |
+
class BlockWithMasking(nn.Module):
    """Pre-norm transformer block: masked attention and an MLP, each with a residual.

    Args:
        dim: token dimension.
        attn_target: zero-argument factory that builds this block's attention module.
        mlp_ratio: hidden width of the MLP as a multiple of ``dim``.
        act_layer: activation factory passed to the MLP.
        norm_layer: factory called with ``dim`` to build each normalisation layer.
        ffn_dropout_rate: dropout probability inside the MLP.
        drop_path: drop-path probability applied to both residual branches.
        layer_scale_type: ``None``, ``"per_channel"`` or ``"scalar"``.
        layer_scale_init_value: initial value of the layer-scale parameters.
    """

    def __init__(
        self,
        dim: int,
        attn_target: Callable,
        mlp_ratio: int = 4,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        ffn_dropout_rate: float = 0.0,
        drop_path: float = 0.0,
        layer_scale_type: Optional[str] = None,
        layer_scale_init_value: float = 1e-4,
    ):
        super().__init__()

        assert not isinstance(
            attn_target, nn.Module
        ), "attn_target should be a Callable. Otherwise attn_target is shared across blocks!"
        self.attn = attn_target()
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm_1 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(mlp_ratio * dim),
            act_layer=act_layer,
            drop=ffn_dropout_rate,
        )
        self.norm_2 = norm_layer(dim)
        self.layer_scale_type = layer_scale_type
        if self.layer_scale_type is None:
            return

        assert self.layer_scale_type in [
            "per_channel",
            "scalar",
        ], f"Found Layer scale type {self.layer_scale_type}"
        gamma_shapes = {
            # one gamma value per channel
            "per_channel": [1, 1, dim],
            # single gamma value for all channels
            "scalar": [1, 1, 1],
        }
        gamma_shape = gamma_shapes[self.layer_scale_type]
        # two gammas: one for each residual branch of the forward pass
        self.layer_scale_gamma1 = nn.Parameter(
            torch.ones(size=gamma_shape) * layer_scale_init_value,
            requires_grad=True,
        )
        self.layer_scale_gamma2 = nn.Parameter(
            torch.ones(size=gamma_shape) * layer_scale_init_value,
            requires_grad=True,
        )

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
        """Apply the attention and MLP residual branches to ``x``."""
        scaled = self.layer_scale_type is not None

        attn_branch = self.drop_path(self.attn(self.norm_1(x), attn_mask))
        if scaled:
            attn_branch = attn_branch * self.layer_scale_gamma1
        x = x + attn_branch

        mlp_branch = self.drop_path(self.mlp(self.norm_2(x)))
        if scaled:
            mlp_branch = mlp_branch * self.layer_scale_gamma2
        return x + mlp_branch
|
171 |
+
|
172 |
+
|
173 |
+
# LayerNorm factory with eps=1e-6 pre-bound; it is the default `norm_layer`
# of SimpleTransformer.
_LAYER_NORM = partial(nn.LayerNorm, eps=1e-6)
|
174 |
+
|
175 |
+
|
176 |
+
class SimpleTransformer(nn.Module):
    """
    Simple Transformer with the following features
    1. Supports masked attention
    2. Supports DropPath
    3. Supports LayerScale
    4. Supports Dropout in Attention and FFN
    5. Makes few assumptions about the input except that it is a Tensor

    Args:
        attn_target: zero-argument factory for each block's attention module.
        embed_dim: token dimension.
        num_blocks: number of stacked blocks.
        block: block factory, called once per layer with keyword arguments.
        pre_transformer_layer: optional callable applied before the blocks.
        post_transformer_layer: optional callable applied after the blocks.
        drop_path_rate: maximum (or uniform) drop-path rate.
        drop_path_type: "progressive" (linearly from 0 to ``drop_path_rate``
            across the blocks) or "uniform".
        norm_layer: normalisation factory handed to every block.
        mlp_ratio: MLP hidden-width multiplier handed to every block.
        ffn_dropout_rate: MLP dropout handed to every block.
        layer_scale_type: from cait; None, "per_channel" or "scalar".
        layer_scale_init_value: from cait; float.
        weight_init_style: "jax" or "pytorch".
    """

    def __init__(
        self,
        attn_target: Callable,
        embed_dim: int,
        num_blocks: int,
        block: Callable = BlockWithMasking,
        pre_transformer_layer: Optional[Callable] = None,
        post_transformer_layer: Optional[Callable] = None,
        drop_path_rate: float = 0.0,
        drop_path_type: str = "progressive",
        norm_layer: Callable = _LAYER_NORM,
        mlp_ratio: int = 4,
        ffn_dropout_rate: float = 0.0,
        layer_scale_type: Optional[str] = None,  # from cait; possible values are None, "per_channel", "scalar"
        layer_scale_init_value: float = 1e-4,  # from cait; float
        weight_init_style: str = "jax",  # possible values jax or pytorch
    ):
        super().__init__()
        self.pre_transformer_layer = pre_transformer_layer

        if drop_path_type == "progressive":
            drop_rates = torch.linspace(0, drop_path_rate, num_blocks).tolist()
        elif drop_path_type == "uniform":
            drop_rates = [drop_path_rate] * num_blocks
        else:
            raise ValueError(f"Unknown drop_path_type: {drop_path_type}")

        layers = []
        for rate in drop_rates:
            layers.append(
                block(
                    dim=embed_dim,
                    attn_target=attn_target,
                    mlp_ratio=mlp_ratio,
                    ffn_dropout_rate=ffn_dropout_rate,
                    drop_path=rate,
                    norm_layer=norm_layer,
                    layer_scale_type=layer_scale_type,
                    layer_scale_init_value=layer_scale_init_value,
                )
            )
        self.blocks = nn.Sequential(*layers)
        self.post_transformer_layer = post_transformer_layer
        self.weight_init_style = weight_init_style
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule; only Linear and LayerNorm modules are touched."""
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if not isinstance(m, nn.Linear):
            return

        style = self.weight_init_style
        if style == "jax":
            # Based on MAE and official Jax ViT implementation
            torch.nn.init.xavier_uniform_(m.weight)
        elif style == "pytorch":
            # PyTorch ViT uses trunc_normal_
            trunc_normal_(m.weight, std=0.02)

        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(
        self,
        tokens: torch.Tensor,
        attn_mask: torch.Tensor = None,
        use_checkpoint: bool = False,
        checkpoint_every_n: int = 1,
        checkpoint_blk_ids: Optional[List[int]] = None,
    ):
        """
        Inputs
        - tokens: data of shape N x L x D (or L x N x D depending on the attention implementation)
        - attn: mask of shape L x L
        - use_checkpoint: run the selected blocks under activation checkpointing
        - checkpoint_every_n: when no ids are given, every n-th block (starting at 0) is selected
        - checkpoint_blk_ids: explicit indices of the blocks to checkpoint

        Output
        - x: data of shape N x L x D (or L x N x D depending on the attention implementation)
        """
        if self.pre_transformer_layer:
            tokens = self.pre_transformer_layer(tokens)

        if use_checkpoint and checkpoint_blk_ids is None:
            checkpoint_blk_ids = [
                index
                for index in range(len(self.blocks))
                if index % checkpoint_every_n == 0
            ]
        # A set gives constant-time membership tests inside the loop.
        selected = set(checkpoint_blk_ids) if checkpoint_blk_ids else checkpoint_blk_ids

        for index, layer in enumerate(self.blocks):
            if use_checkpoint and index in selected:
                tokens = checkpoint.checkpoint(
                    layer, tokens, attn_mask, use_reentrant=False
                )
            else:
                tokens = layer(tokens, attn_mask=attn_mask)

        if self.post_transformer_layer:
            tokens = self.post_transformer_layer(tokens)
        return tokens
|
output.txt
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Pepsi Soda Cola 12 Fl Oz:brand is Pepsi | descriptionShort is Pepsi Soda Cola 12 Fl Oz | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
2 |
+
Pepsi Soda Cola 12 Fl Oz 12 Ct:brand is Pepsi | descriptionShort is Pepsi Soda Cola 12 Fl Oz 12 Ct | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
3 |
+
PepsiSodaCola12FlOz24Ct:brand is Pepsi | descriptionShort is PepsiSodaCola12FlOz24Ct | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Caramel Color, Sugar, Phosphoric Acid, Caffeine, Citric Acid, Natural Flavors.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
4 |
+
Diet Pepsi Soda Cola 12 Fl Oz:brand is Pepsi | descriptionShort is Diet Pepsi Soda Cola 12 Fl Oz | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, CARAMEL COLOR, PHOSPHORIC ACID, POTASSIUM BENZOATE (PRESERVES FRESHNESS), SUCRALOSE, ACESULFAME POTASSIUM, CAFFEINE, NATURAL FLAVOR, CITRIC ACID.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
5 |
+
Diet Pepsi Soda Cola 12Flz24Ct:brand is Pepsi | descriptionShort is Diet Pepsi Soda Cola 12Flz24Ct | marketingMessage is Light. Crisp. Refreshing, 0 calories, and now aspartame free. Nothing refreshes like a Diet Pepsi | functionalName is Soda | ingredients is ['CARBONATED WATER, CARAMEL COLOR, PHOSPHORIC ACID, POTASSIUM BENZOATE (PRESERVES FRESHNESS), SUCRALOSE, ACESULFAME POTASSIUM, CAFFEINE, NATURAL FLAVOR, CITRIC ACID. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
6 |
+
Mtn Dew Cd Red Dw Chry 12FlOz:brand is Mtn Dew | descriptionShort is Mtn Dew Cd Red Dw Chry 12FlOz | marketingMessage is Mtn Dew Code Red with a rush of Cherry flavor. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CONCENTRATED ORANGE JUICE, CITRIC ACID, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), SODIUM BENZOATE (PRESERVES FRESHNESS), NATURAL FLAVOR, CAFFEINE, SODIUM CITRATE, GUM ARABIC, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), RED 40, YELLOW 5, BLUE 1. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
7 |
+
H2Oh! SWB Original 20 Fl Oz:brand is H2Oh! | descriptionShort is H2Oh! SWB Original 20 Fl Oz | marketingMessage is H20H! is a delicious and refreshing, 0 calorie sparkling water beverage infused with natural flavors. | functionalName is Sparkling Water Beverage | ingredients is ['CARBONATED WATER.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Sparkling Water Beverage
|
8 |
+
H2Oh! SpklgWtBvg Berry 20 FlOz:brand is H2Oh! | descriptionShort is H2Oh! SpklgWtBvg Berry 20 FlOz | marketingMessage is H20H! is a delicious and refreshing, 0 calorie sparkling water beverage infused with natural flavors. | functionalName is Sparkling Water Beverage | ingredients is ['CARBONATED WATER, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Sparkling Water Beverage
|
9 |
+
Pepsi Caffeine Free 20 Fl Oz Pl Bt:brand is Pepsi | descriptionShort is Pepsi Caffeine Free 20 Fl Oz Pl Bt | marketingMessage is Caffeine Free Pepsi - great Pepsi taste without the caffeine | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
10 |
+
Mtn Dew Soda 12FlOz 6Pk:brand is Mtn Dew | descriptionShort is Mtn Dew Soda 12FlOz 6Pk | marketingMessage is Mtn Dew exhilarates and quenches with its one of a kind, bold taste. | functionalName is Soda | ingredients is ['Ingredients: Carbonated Water,\xa0High Fructose Corn Syrup, Concentrated Orange Juice,\xa0Citric Acid,\xa0Natural Flavor,\xa0Sodium Benzoate (Preserves Freshness),\xa0Caffeine,\xa0Sodium Citrate,\xa0Erythorbic Acid (Preserves Freshness),\xa0Gum Arabic,\xa0Calcium Disodium EDTA (To Protect Flavor),\xa0Brominated Vegetable Oil,\xa0Yellow 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
11 |
+
Mtn Dew Soda Citrus 12 Fl Oz:brand is Mtn Dew | descriptionShort is Mtn Dew Soda Citrus 12 Fl Oz | marketingMessage is Mtn Dew exhilarates and quenches with its one of a kind, bold taste. | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Concentrated Orange Juice, Citric Acid, Natural Flavor, Sodium Benzoate (Preserves Freshness), Caffeine, Sodium Citrate, Erythorbic Acid (Preserves Freshness), Gum Arabic, Calcium Disodium EDTA (To Protect Flavor), Yellow 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
12 |
+
Mtn Dew Sda Citrus 12 FlOz 24Ct:brand is Mtn Dew | descriptionShort is Mtn Dew Sda Citrus 12 FlOz 24Ct | marketingMessage is The original, the one that started it all. MTN DEW exhilarates and quenches with its one of a kind taste. | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Concentrated Orange Juice, Citric Acid, Natural Flavor, Sodium Benzoate (Preserves Freshness), Caffeine, Sodium Citrate, Gum Arabic, Erythorbic Acid (Preserves Freshness), Calcium Disodium EDTA (To Protect Flavor), Yellow 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
13 |
+
Dole 100% Apple Juice 10 Fl Oz:brand is Dole | descriptionShort is Dole 100% Apple Juice 10 Fl Oz | marketingMessage is Now you can enjoy the sweet taste of our sun-ripened fruits in a tall, refreshing glass. Dole juices are ready to drink right out of your refrigerator ' 100% juice and no added sugar. | functionalName is Juice Beverage | ingredients is ['APPLE JUICE CONCENTRATE (FILTERED WATER AND CONCENTRATED APPLE JUICE), ASCORBIC ACID (VITAMIN C), NATURAL FLAVORS, VITAMIN E ACETATE (VITAMIN E) AND NIACINAMIDE (VITAMIN B3). '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Juice Beverage
|
14 |
+
Pepsi Soda Cola 2 L:brand is Pepsi | descriptionShort is Pepsi Soda Cola 2 L | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
15 |
+
Pepsi Soda Cola 20 Fl Oz:brand is Pepsi | descriptionShort is Pepsi Soda Cola 20 Fl Oz | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
16 |
+
Mtn Dew 20 Fl Oz:brand is Mtn Dew | descriptionShort is Mtn Dew 20 Fl Oz | marketingMessage is The original, the one that started it all. MTN DEW exhilarates and quenches with its one of a kind taste. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, SODIUM BENZOATE (PRESERVES FRESHNESS), CAFFEINE, SODIUM CITRATE, GUM ARABIC, ERYTHORBIC ACID (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
17 |
+
Mtn Dew Diet Sda Citrus 20FlOz:brand is Mtn Dew | descriptionShort is Mtn Dew Diet Sda Citrus 20FlOz | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
18 |
+
H2Oh! SWB Lemon-Lime 20 Fl Oz:brand is H2Oh! | descriptionShort is H2Oh! SWB Lemon-Lime 20 Fl Oz | marketingMessage is H20H! is a delicious and refreshing, 0 calorie sparkling water beverage infused with natural flavors. | functionalName is Sparkling Water Beverage | ingredients is ['CARBONATED WATER, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Sparkling Water Beverage
|
19 |
+
Aquafina Purified Dnkg Wtr 33.8FlOz:brand is Aquafina | descriptionShort is Aquafina Purified Dnkg Wtr 33.8FlOz | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Purified Drinking Water | ingredients is ['PURIFIED WATER. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Purified Drinking Water
|
20 |
+
Aquafina Water 20 Fl Oz:brand is Aquafina | descriptionShort is Aquafina Water 20 Fl Oz | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Purified Drinking Water | ingredients is ['PURIFIED WATER. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Purified Drinking Water
|
21 |
+
Aquafina Water 12FlOz 6Pk:brand is Aquafina | descriptionShort is Aquafina Water 12FlOz 6Pk | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Packaged Water | ingredients is ['PURIFIED WATER.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Packaged Water
|
22 |
+
Mtn Dew Diet Soda Citrus 12 Fl Oz:brand is Mtn Dew | descriptionShort is Mtn Dew Diet Soda Citrus 12 Fl Oz | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
23 |
+
Diet Mtn Dew LwCl DWCtrs 12FlOz12Ct:brand is Mtn Dew | descriptionShort is Diet Mtn Dew LwCl DWCtrs 12FlOz12Ct | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
24 |
+
Mtn Dew Diet Ctrs 12 FlOz 24Ct:brand is Mtn Dew | descriptionShort is Mtn Dew Diet Ctrs 12 FlOz 24Ct | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
25 |
+
Starbucks Espresso & Crm 6.5FlOz:brand is Starbucks | descriptionShort is Starbucks Espresso & Crm 6.5FlOz | marketingMessage is A Premium Ready-to-Drink Coffee Beverage. Rich, bold Starbucks espresso, just the right amount of cream and a double dose of “done and done.” | functionalName is Premium Espresso Beverage | ingredients is ['Brewed Espresso Coffee (Water, Coffee), Reduced-Fat Milk, Sugar, Cream, Skim Milk.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Premium Espresso Beverage
|
26 |
+
Starbucks Frap Coffee 13.7FlOz:brand is Starbucks | descriptionShort is Starbucks Frap Coffee 13.7FlOz | marketingMessage is Starbucks coffee drinks offer the bold, delicious taste of coffee with the rich flavors you know and love. This indulgence is proof that you can enjoy a little Starbucks wherever you may be. | functionalName is Chilled Coffee Drink | ingredients is ['Brewed Starbucks Coffee (Water, Coffee), Reduced-Fat Milk, Sugar, Maltodextrin, Pectin.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Glass'}}] | sourceType is Chilled Coffee Drink
|
27 |
+
Dole Apple Juice 15.2 Fl Oz:brand is Dole | descriptionShort is Dole Apple Juice 15.2 Fl Oz | marketingMessage is | functionalName is Juice 100% | ingredients is | packagingInformation is [{'packagingTypeCode': 'Bottle'}] | sourceType is Juice 100%
|
28 |
+
Brisk Iced Tea 1 Liter Plastic Bot:brand is Brisk | descriptionShort is Brisk Iced Tea 1 Liter Plastic Bot | marketingMessage is OK, we admit it. We scoured the South for the boldest Sweet Tea Recipe we could find. Then we stole it. Once you taste our Brisk Sweet Tea, we think you'll agree - the buckshot in our fenders was worth it. | functionalName is Iced Tea | ingredients is ['WATER, HIGH FRUCTOSE CORN SYRUP, PHOSPHORIC ACID, TEA POWDER, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), NATURAL FLAVOR, CARAMEL COLOR, POTASSIUM SORBATE (PRESERVES FRESHNESS), POTASSIUM BENZOATE (PRESERVES FRESHNESS), ACESULFAME POTASSIUM, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), SUCRALOSE, RED 40.'] | packagingInformation is [{'packagingTypeCode': 'Bottle'}] | sourceType is Iced Tea
|
29 |
+
Aquafina Water 50.7 Fl Oz:brand is Aquafina | descriptionShort is Aquafina Water 50.7 Fl Oz | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Purified Drinking Water | ingredients is ['PURIFIED WATER.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Purified Drinking Water
|
30 |
+
Brisk Green Tea Apple 12FZ:brand is Brisk | descriptionShort is Brisk Green Tea Apple 12FZ | marketingMessage is Brisk has become the badge of creative hustlers everywhere. By bringing in-your-face flavors and vibrant colors to the juice drink and iced tea game we help you make your mark on the world. | functionalName is Iced Tea | ingredients is ['WATER, HIGH FRUCTOSE CORN SYRUP, CITRIC ACID, APPLE JUICE CONCENTRATE, SODIUM HEXAMETAPHOSPHATE (TO PROTECT FLAVOR), NATURAL FLAVOR, INSTANT TEA, GUM ARABIC, POTASSIUM SORBATE (PRESERVES FRESHNESS), CARAMEL COLOR, GLYCEROL ESTER OF ROSIN, CITRUS PECTIN, ACESULFAME POTASSIUM, SUCRALOSE, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), RED 40.'] | packagingInformation is [{'packagingTypeCode': 'Can'}] | sourceType is Iced Tea
|
31 |
+
Brisk JuiceDrink Lemnde 12FlOz:brand is Brisk | descriptionShort is Brisk JuiceDrink Lemnde 12FlOz | marketingMessage is Blast your Thirst with the bold, fruit flavored taste of Brisk Juice Drinks. Available in a wide variety of great tasting fruit flavors. | functionalName is Juice Drink | ingredients is ['WATER, HIGH FRUCTOSE CORN SYRUP, CITRIC ACID, LEMON JUICE CONCENTRATE, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), GUM ARABIC, POTASSIUM BENZOATE (PRESERVE FRESHNESS), POTASSIUM SORBATE (PRESERVE FRESHNESS), SALT, GLYCEROL ESTER OF ROSIN, NATURAL FLAVOR, ACESULFAME POTASSIUM, SUCRALOSE, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Juice Drink
|
32 |
+
Mtn Dew Code Rd Chry 20 Fl Oz:brand is Mtn Dew | descriptionShort is Mtn Dew Code Rd Chry 20 Fl Oz | marketingMessage is Mtn Dew Code Red with a rush of Cherry flavor. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CONCENTRATED ORANGE JUICE, CITRIC ACID, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), SODIUM BENZOATE (PRESERVES FRESHNESS), NATURAL FLAVOR, CAFFEINE, SODIUM CITRATE, GUM ARABIC, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), RED 40, YELLOW 5, BLUE 1. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
33 |
+
Mtn Dew Soda 21FlOz12Ct:brand is Mtn Dew | descriptionShort is Mtn Dew Soda 21FlOz12Ct | marketingMessage is Mtn Dew exhilarates and quenches with its one of a kind, bold taste. The original, the one that started it all'Mtn Dew. | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Concentrated Orange Juice, Citric Acid, Natural Flavor, Sodium Benzoate (Preserves Freshness), Caffeine, Sodium Citrate, Gum Arabic, Erythorbic Acid (Preserves Freshness), Calcium Disodium Edta (To Protect Flavor), Yellow 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
34 |
+
Mtn Dew Soda Citrus 2L:brand is Mtn Dew | descriptionShort is Mtn Dew Soda Citrus 2L | marketingMessage is The original, the one that started it all. MTN DEW exhilarates and quenches with its one of a kind taste. | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Concentrated Orange Juice, Citric Acid, Natural Flavor, Sodium Benzoate (Preserves Freshness), Caffeine, Sodium Citrate, Gum Arabic, Erythorbic Acid (Preserves Freshness), Calcium Disodium EDTA (to Protect Flavor), Yellow 5.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Soda
|
35 |
+
Pepsi Soda Cola 8 Fl Oz:brand is Pepsi | descriptionShort is Pepsi Soda Cola 8 Fl Oz | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
36 |
+
Caleb's Kola 10 Fluid Ounce:brand is Caleb's Kola | descriptionShort is Caleb's Kola 10 Fluid Ounce | marketingMessage is A unique kola crafted by a passionate group of kola lovers using ingredients from around the world. | functionalName is Soda | ingredients is ['SPARKLING WATER, CANE SUGAR, CARAMEL COLOR, PHOSPHORIC ACID, NATURAL FLAVOR, SODIUM CITRATE, CAFFEINE, GUM ARABIC, CITRIC ACID, KOLA NUT EXTRACT.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Glass'}}] | sourceType is Soda
|
37 |
+
Starbucks Frapp Chcfdr Crml 9.5FlOz:brand is Starbucks | descriptionShort is Starbucks Frapp Chcfdr Crml 9.5FlOz | marketingMessage is Pop the cap. Savor the sip. Go. Caramel Frappuccino® chilled coffee drink is a harmonious blend of Starbucks® coffee and creamy milk swirling with carmelly flavor. For a sweet burst of delicious. | functionalName is Chilled Coffee Drink | ingredients is ['Brewed Starbucks Coffee (Water, Coffee), Reduced-Fat Milk, Sugar, Maltodextrin, Natural Flavors, Pectin.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Glass'}}] | sourceType is Chilled Coffee Drink
|
38 |
+
Brisk JuiceDrink Lemnde 20FlOz:brand is Brisk | descriptionShort is Brisk JuiceDrink Lemnde 20FlOz | marketingMessage is Brisk has become the badge of creative hustlers everywhere. By bringing in-your-face flavors and vibrant colors to the juice drink and iced tea game we help you make your mark on the world. | functionalName is Juice Drink | ingredients is ['WATER, HIGH FRUCTOSE CORN SYRUP, CITRIC ACID, LEMON JUICE CONCENTRATE, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), GUM ARABIC, POTASSIUM BENZOATE (PRESERVE FRESHNESS), POTASSIUM SORBATE (PRESERVE FRESHNESS), SALT, GLYCEROL ESTER OF ROSIN, NATURAL FLAVOR, ACESULFAME POTASSIUM, SUCRALOSE, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Juice Drink
|
39 |
+
Brisk Iced Tea Lemon 24FO 6Pk:brand is Brisk | descriptionShort is Brisk Iced Tea Lemon 24FO 6Pk | marketingMessage is The original iced tea with tons of attitude. The one with the bold lemon flavor that kicked iced tea off the back porch and gave it some street cred. Now that's Brisk, baby! | functionalName is Iced Tea | ingredients is ['INGREDIENTS: WATER, HIGH FRUCTOSE CORN SYRUP, CITRIC ACID, INSTANT TEA, SODIUM HEXAMETAPHOSPHATE (TO PROTECT FLAVOR), NATURAL FLAVOR, PHOSPHORIC ACID, SODIUM BENZOATE (PRESERVES FRESHNESS), POTASSIUM SORBATE (PRESERVES FRESHNESS), CARAMEL COLOR, ACESULFAME POTASSIUM, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), SUCRALOSE, RED 40.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Iced Tea
|
40 |
+
H2Oh! SWB Orange 20FlOz:brand is H2Oh! | descriptionShort is H2Oh! SWB Orange 20FlOz | marketingMessage is H20H! is a delicious and refreshing, 0 calorie sparkling water beverage infused with natural flavors. | functionalName is Sparkling Water Beverage | ingredients is ['CARBONATED WATER, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Sparkling Water Beverage
|
41 |
+
PEPSI 12OZPLSGL:brand is Pepsi | descriptionShort is PEPSI 12OZPLSGL | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle'}] | sourceType is Soda
|
42 |
+
Pepsi Soda Cola 1 L Bottle:brand is Pepsi | descriptionShort is Pepsi Soda Cola 1 L Bottle | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Soda
|
43 |
+
Mtn Dew Soda Citrus 1L:brand is Mtn Dew | descriptionShort is Mtn Dew Soda Citrus 1L | marketingMessage is Mtn Dew exhilarates and quenches with its one of a kind, bold taste. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, SODIUM BENZOATE (PRESERVES FRESHNESS), CAFFEINE, SODIUM CITRATE, GUM ARABIC, ERYTHORBIC ACID (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
44 |
+
Pepsi Soda Wild Cherry 12 FlOz:brand is Pepsi | descriptionShort is Pepsi Soda Wild Cherry 12 FlOz | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, NATURAL FLAVOR, CAFFEINE, CITRIC ACID. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
|
45 |
+
Pepsi Wild Cherry 2 L:brand is Pepsi | descriptionShort is Pepsi Wild Cherry 2 L | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, NATURAL FLAVOR, CAFFEINE, CITRIC ACID. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
|
46 |
+
Brisk Diet IcdTea Lemn 12FlOz:brand is Brisk | descriptionShort is Brisk Diet IcdTea Lemn 12FlOz | marketingMessage is Brisk has become the badge of creative hustlers everywhere. By bringing in-your-face flavors and vibrant colors to the juice drink and iced tea game we help you make your mark on the world. | functionalName is Iced Tea | ingredients is ['WATER, CITRIC ACID, TEA POWDER, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), NATURAL FLAVOR, CARAMEL COLOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), POTASSIUM SORBATE (PRESERVES FRESHNESS), ASPARTAME, ACESULFAME POTASSIUM, CITRUS PECTIN, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), RED 40. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Iced Tea
|
47 |
+
Aquafina Water 16.9 Fl Oz 6 pack:brand is Aquafina | descriptionShort is Aquafina Water 16.9 Fl Oz 6 pack | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Purified Drinking Water | ingredients is ['PURIFIED WATER.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Purified Drinking Water
|
48 |
+
Mtn Dew Diet Soda Citrus 1 L:brand is Mtn Dew | descriptionShort is Mtn Dew Diet Soda Citrus 1 L | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Soda
|
49 |
+
Starbucks Frpp ClcofDr MhLt 9.5FlOz:brand is Starbucks | descriptionShort is Starbucks Frpp ClcofDr MhLt 9.5FlOz | marketingMessage is Pop the cap. Savor the sip. Go. Mocha Light Frappuccino® chilled coffee drink is a harmonious blend of Starbucks® coffee and milk with chocolaty mocha'and 100 calories per bottle. For a chill that only tastes decadent. | functionalName is Chilled Coffee Drink | ingredients is ['Ingredients: Brewed Starbucks Coffee (Water, Coffee), Reduced-Fat Milk, Sugar, Cocoa, Pectin, Sucralose, Acesulfame Potassium.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Glass'}}] | sourceType is Chilled Coffee Drink
|