nimnim commited on
Commit
b66ad81
1 Parent(s): 8a086bb

Port ImageBind API

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/captured_image.jpg filter=lfs diff=lfs merge=lfs -text
__pycache__/data.cpython-311.pyc ADDED
Binary file (15 kB). View file
 
app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import cv2
3
+ import torch
4
+ import data
5
+ from models import imagebind_model
6
+ from models.imagebind_model import ModalityType
7
+
8
+ def read_dict_from_file(filename):
9
+ text_list = []
10
+ dictionary = {}
11
+ with open(filename, 'r') as file:
12
+ for line in file:
13
+ line = line.strip()
14
+ if line:
15
+ key, value = line.split(':', 1)
16
+ dictionary[key.strip()] = value.strip()
17
+ text_list.append(key)
18
+ return dictionary, text_list
19
+
20
+ text_output_path = 'output.txt'
21
+ database, text_list = read_dict_from_file(text_output_path)
22
+ image_paths = ['images/captured_image.jpg']
23
+
24
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
25
+
26
+ # Instantiate model
27
+ model = imagebind_model.imagebind_huge(pretrained=True)
28
+ model.eval()
29
+ model.to(device)
30
+
31
+ def run_model():
32
+ inputs = {
33
+ ModalityType.TEXT: data.load_and_transform_text(text_list, device),
34
+ ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
35
+ }
36
+
37
+ with torch.no_grad():
38
+ embeddings = model(inputs)
39
+
40
+ embeddings_matrix = torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1)
41
+
42
+ model_image_to_text = torch.argmax(embeddings_matrix, dim=1)
43
+
44
+ return text_list[model_image_to_text[0]]
45
+
46
+ def predict(image):
47
+ # Save the image to the desired file path
48
+ cv2.imwrite(image_paths[0], cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
49
+ return run_model()
50
+
51
+ demo = iface = gr.Interface(
52
+ fn=predict,
53
+ inputs=gr.inputs.Image(),
54
+ outputs=gr.outputs.Label()
55
+ )
56
+
57
+ demo.launch()
bpe/.DS_Store ADDED
Binary file (6.15 kB). View file
 
bpe/bpe_simple_vocab_16e6.txt ADDED
The diff for this file is too large to render. See raw diff
 
bpe/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
data.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import logging
9
+ import math
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torchaudio
14
+ from PIL import Image
15
+ from pytorchvideo import transforms as pv_transforms
16
+ from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler
17
+ from pytorchvideo.data.encoded_video import EncodedVideo
18
+ from torchvision import transforms
19
+ from torchvision.transforms._transforms_video import NormalizeVideo
20
+
21
+ from models.multimodal_preprocessors import SimpleTokenizer
22
+
23
+ DEFAULT_AUDIO_FRAME_SHIFT_MS = 10 # in milliseconds
24
+
25
+ BPE_PATH = "bpe/bpe_simple_vocab_16e6.txt.gz"
26
+
27
+
28
+ def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
29
+ # Based on https://github.com/YuanGongND/ast/blob/d7d8b4b8e06cdaeb6c843cdb38794c1c7692234c/src/dataloader.py#L102
30
+ waveform -= waveform.mean()
31
+ fbank = torchaudio.compliance.kaldi.fbank(
32
+ waveform,
33
+ htk_compat=True,
34
+ sample_frequency=sample_rate,
35
+ use_energy=False,
36
+ window_type="hanning",
37
+ num_mel_bins=num_mel_bins,
38
+ dither=0.0,
39
+ frame_length=25,
40
+ frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS,
41
+ )
42
+ # Convert to [mel_bins, num_frames] shape
43
+ fbank = fbank.transpose(0, 1)
44
+ # Pad to target_length
45
+ n_frames = fbank.size(1)
46
+ p = target_length - n_frames
47
+ # if p is too large (say >20%), flash a warning
48
+ if abs(p) / n_frames > 0.2:
49
+ logging.warning(
50
+ "Large gap between audio n_frames(%d) and "
51
+ "target_length (%d). Is the audio_target_length "
52
+ "setting correct?",
53
+ n_frames,
54
+ target_length,
55
+ )
56
+ # cut and pad
57
+ if p > 0:
58
+ fbank = torch.nn.functional.pad(fbank, (0, p), mode="constant", value=0)
59
+ elif p < 0:
60
+ fbank = fbank[:, 0:target_length]
61
+ # Convert to [1, mel_bins, num_frames] shape, essentially like a 1
62
+ # channel image
63
+ fbank = fbank.unsqueeze(0)
64
+ return fbank
65
+
66
+
67
+ def get_clip_timepoints(clip_sampler, duration):
68
+ # Read out all clips in this video
69
+ all_clips_timepoints = []
70
+ is_last_clip = False
71
+ end = 0.0
72
+ while not is_last_clip:
73
+ start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None)
74
+ all_clips_timepoints.append((start, end))
75
+ return all_clips_timepoints
76
+
77
+
78
+ def load_and_transform_vision_data(image_paths, device):
79
+ if image_paths is None:
80
+ return None
81
+
82
+ image_outputs = []
83
+ for image_path in image_paths:
84
+ data_transform = transforms.Compose(
85
+ [
86
+ transforms.Resize(
87
+ 224, interpolation=transforms.InterpolationMode.BICUBIC
88
+ ),
89
+ transforms.CenterCrop(224),
90
+ transforms.ToTensor(),
91
+ transforms.Normalize(
92
+ mean=(0.48145466, 0.4578275, 0.40821073),
93
+ std=(0.26862954, 0.26130258, 0.27577711),
94
+ ),
95
+ ]
96
+ )
97
+ with open(image_path, "rb") as fopen:
98
+ image = Image.open(fopen).convert("RGB")
99
+
100
+ image = data_transform(image).to(device)
101
+ image_outputs.append(image)
102
+ return torch.stack(image_outputs, dim=0)
103
+
104
+
105
+ def load_and_transform_text(text, device):
106
+ if text is None:
107
+ return None
108
+ tokenizer = SimpleTokenizer(bpe_path=BPE_PATH)
109
+ tokens = [tokenizer(t).unsqueeze(0).to(device) for t in text]
110
+ tokens = torch.cat(tokens, dim=0)
111
+ return tokens
112
+
113
+
114
+ def load_and_transform_audio_data(
115
+ audio_paths,
116
+ device,
117
+ num_mel_bins=128,
118
+ target_length=204,
119
+ sample_rate=16000,
120
+ clip_duration=2,
121
+ clips_per_video=3,
122
+ mean=-4.268,
123
+ std=9.138,
124
+ ):
125
+ if audio_paths is None:
126
+ return None
127
+
128
+ audio_outputs = []
129
+ clip_sampler = ConstantClipsPerVideoSampler(
130
+ clip_duration=clip_duration, clips_per_video=clips_per_video
131
+ )
132
+
133
+ for audio_path in audio_paths:
134
+ waveform, sr = torchaudio.load(audio_path)
135
+ if sample_rate != sr:
136
+ waveform = torchaudio.functional.resample(
137
+ waveform, orig_freq=sr, new_freq=sample_rate
138
+ )
139
+ all_clips_timepoints = get_clip_timepoints(
140
+ clip_sampler, waveform.size(1) / sample_rate
141
+ )
142
+ all_clips = []
143
+ for clip_timepoints in all_clips_timepoints:
144
+ waveform_clip = waveform[
145
+ :,
146
+ int(clip_timepoints[0] * sample_rate) : int(
147
+ clip_timepoints[1] * sample_rate
148
+ ),
149
+ ]
150
+ waveform_melspec = waveform2melspec(
151
+ waveform_clip, sample_rate, num_mel_bins, target_length
152
+ )
153
+ all_clips.append(waveform_melspec)
154
+
155
+ normalize = transforms.Normalize(mean=mean, std=std)
156
+ all_clips = [normalize(ac).to(device) for ac in all_clips]
157
+
158
+ all_clips = torch.stack(all_clips, dim=0)
159
+ audio_outputs.append(all_clips)
160
+
161
+ return torch.stack(audio_outputs, dim=0)
162
+
163
+
164
+ def crop_boxes(boxes, x_offset, y_offset):
165
+ """
166
+ Perform crop on the bounding boxes given the offsets.
167
+ Args:
168
+ boxes (ndarray or None): bounding boxes to perform crop. The dimension
169
+ is `num boxes` x 4.
170
+ x_offset (int): cropping offset in the x axis.
171
+ y_offset (int): cropping offset in the y axis.
172
+ Returns:
173
+ cropped_boxes (ndarray or None): the cropped boxes with dimension of
174
+ `num boxes` x 4.
175
+ """
176
+ cropped_boxes = boxes.copy()
177
+ cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
178
+ cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset
179
+
180
+ return cropped_boxes
181
+
182
+
183
+ def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
184
+ """
185
+ Perform uniform spatial sampling on the images and corresponding boxes.
186
+ Args:
187
+ images (tensor): images to perform uniform crop. The dimension is
188
+ `num frames` x `channel` x `height` x `width`.
189
+ size (int): size of height and weight to crop the images.
190
+ spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
191
+ is larger than height. Or 0, 1, or 2 for top, center, and bottom
192
+ crop if height is larger than width.
193
+ boxes (ndarray or None): optional. Corresponding boxes to images.
194
+ Dimension is `num boxes` x 4.
195
+ scale_size (int): optinal. If not None, resize the images to scale_size before
196
+ performing any crop.
197
+ Returns:
198
+ cropped (tensor): images with dimension of
199
+ `num frames` x `channel` x `size` x `size`.
200
+ cropped_boxes (ndarray or None): the cropped boxes with dimension of
201
+ `num boxes` x 4.
202
+ """
203
+ assert spatial_idx in [0, 1, 2]
204
+ ndim = len(images.shape)
205
+ if ndim == 3:
206
+ images = images.unsqueeze(0)
207
+ height = images.shape[2]
208
+ width = images.shape[3]
209
+
210
+ if scale_size is not None:
211
+ if width <= height:
212
+ width, height = scale_size, int(height / width * scale_size)
213
+ else:
214
+ width, height = int(width / height * scale_size), scale_size
215
+ images = torch.nn.functional.interpolate(
216
+ images,
217
+ size=(height, width),
218
+ mode="bilinear",
219
+ align_corners=False,
220
+ )
221
+
222
+ y_offset = int(math.ceil((height - size) / 2))
223
+ x_offset = int(math.ceil((width - size) / 2))
224
+
225
+ if height > width:
226
+ if spatial_idx == 0:
227
+ y_offset = 0
228
+ elif spatial_idx == 2:
229
+ y_offset = height - size
230
+ else:
231
+ if spatial_idx == 0:
232
+ x_offset = 0
233
+ elif spatial_idx == 2:
234
+ x_offset = width - size
235
+ cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
236
+ cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
237
+ if ndim == 3:
238
+ cropped = cropped.squeeze(0)
239
+ return cropped, cropped_boxes
240
+
241
+
242
+ class SpatialCrop(nn.Module):
243
+ """
244
+ Convert the video into 3 smaller clips spatially. Must be used after the
245
+ temporal crops to get spatial crops, and should be used with
246
+ -2 in the spatial crop at the slowfast augmentation stage (so full
247
+ frames are passed in here). Will return a larger list with the
248
+ 3x spatial crops as well.
249
+ """
250
+
251
+ def __init__(self, crop_size: int = 224, num_crops: int = 3):
252
+ super().__init__()
253
+ self.crop_size = crop_size
254
+ if num_crops == 3:
255
+ self.crops_to_ext = [0, 1, 2]
256
+ self.flipped_crops_to_ext = []
257
+ elif num_crops == 1:
258
+ self.crops_to_ext = [1]
259
+ self.flipped_crops_to_ext = []
260
+ else:
261
+ raise NotImplementedError("Nothing else supported yet")
262
+
263
+ def forward(self, videos):
264
+ """
265
+ Args:
266
+ videos: A list of C, T, H, W videos.
267
+ Returns:
268
+ videos: A list with 3x the number of elements. Each video converted
269
+ to C, T, H', W' by spatial cropping.
270
+ """
271
+ assert isinstance(videos, list), "Must be a list of videos after temporal crops"
272
+ assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)"
273
+ res = []
274
+ for video in videos:
275
+ for spatial_idx in self.crops_to_ext:
276
+ res.append(uniform_crop(video, self.crop_size, spatial_idx)[0])
277
+ if not self.flipped_crops_to_ext:
278
+ continue
279
+ flipped_video = transforms.functional.hflip(video)
280
+ for spatial_idx in self.flipped_crops_to_ext:
281
+ res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0])
282
+ return res
283
+
284
+
285
+ def load_and_transform_video_data(
286
+ video_paths,
287
+ device,
288
+ clip_duration=2,
289
+ clips_per_video=5,
290
+ sample_rate=16000,
291
+ ):
292
+ if video_paths is None:
293
+ return None
294
+
295
+ video_outputs = []
296
+ video_transform = transforms.Compose(
297
+ [
298
+ pv_transforms.ShortSideScale(224),
299
+ NormalizeVideo(
300
+ mean=(0.48145466, 0.4578275, 0.40821073),
301
+ std=(0.26862954, 0.26130258, 0.27577711),
302
+ ),
303
+ ]
304
+ )
305
+
306
+ clip_sampler = ConstantClipsPerVideoSampler(
307
+ clip_duration=clip_duration, clips_per_video=clips_per_video
308
+ )
309
+ frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=clip_duration)
310
+
311
+ for video_path in video_paths:
312
+ video = EncodedVideo.from_path(
313
+ video_path,
314
+ decoder="decord",
315
+ decode_audio=False,
316
+ **{"sample_rate": sample_rate},
317
+ )
318
+
319
+ all_clips_timepoints = get_clip_timepoints(clip_sampler, video.duration)
320
+
321
+ all_video = []
322
+ for clip_timepoints in all_clips_timepoints:
323
+ # Read the clip, get frames
324
+ clip = video.get_clip(clip_timepoints[0], clip_timepoints[1])
325
+ if clip is None:
326
+ raise ValueError("No clip found")
327
+ video_clip = frame_sampler(clip["video"])
328
+ video_clip = video_clip / 255.0 # since this is float, need 0-1
329
+
330
+ all_video.append(video_clip)
331
+
332
+ all_video = [video_transform(clip) for clip in all_video]
333
+ all_video = SpatialCrop(224, num_crops=3)(all_video)
334
+
335
+ all_video = torch.stack(all_video, dim=0)
336
+ video_outputs.append(all_video)
337
+
338
+ return torch.stack(video_outputs, dim=0).to(device)
images/captured_image.jpg ADDED

Git LFS Details

  • SHA256: e913749e6317ce83d5c9a3c9024114e7465cb5c65900b225482a6d31e3a03fdd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.22 MB
models/__init__.py ADDED
File without changes
models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (197 Bytes). View file
 
models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (164 Bytes). View file
 
models/__pycache__/helpers.cpython-311.pyc ADDED
Binary file (8.84 kB). View file
 
models/__pycache__/helpers.cpython-38.pyc ADDED
Binary file (5.2 kB). View file
 
models/__pycache__/imagebind_model.cpython-311.pyc ADDED
Binary file (14.7 kB). View file
 
models/__pycache__/imagebind_model.cpython-38.pyc ADDED
Binary file (8.11 kB). View file
 
models/__pycache__/multimodal_preprocessors.cpython-311.pyc ADDED
Binary file (38.2 kB). View file
 
models/__pycache__/multimodal_preprocessors.cpython-38.pyc ADDED
Binary file (20 kB). View file
 
models/__pycache__/transformer.cpython-311.pyc ADDED
Binary file (14.8 kB). View file
 
models/__pycache__/transformer.cpython-38.pyc ADDED
Binary file (7.92 kB). View file
 
models/helpers.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ import einops
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+
15
+ class Normalize(nn.Module):
16
+ def __init__(self, dim: int) -> None:
17
+ super().__init__()
18
+ self.dim = dim
19
+
20
+ def forward(self, x):
21
+ return torch.nn.functional.normalize(x, dim=self.dim, p=2)
22
+
23
+
24
+ class LearnableLogitScaling(nn.Module):
25
+ def __init__(
26
+ self,
27
+ logit_scale_init: float = 1 / 0.07,
28
+ learnable: bool = True,
29
+ max_logit_scale: float = 100,
30
+ ) -> None:
31
+ super().__init__()
32
+ self.max_logit_scale = max_logit_scale
33
+ self.logit_scale_init = logit_scale_init
34
+ self.learnable = learnable
35
+ log_logit_scale = torch.ones([]) * np.log(self.logit_scale_init)
36
+ if learnable:
37
+ self.log_logit_scale = nn.Parameter(log_logit_scale)
38
+ else:
39
+ self.register_buffer("log_logit_scale", log_logit_scale)
40
+
41
+ def forward(self, x):
42
+ return torch.clip(self.log_logit_scale.exp(), max=self.max_logit_scale) * x
43
+
44
+ def extra_repr(self):
45
+ st = f"logit_scale_init={self.logit_scale_init},learnable={self.learnable}," \
46
+ f" max_logit_scale={self.max_logit_scale}"
47
+ return st
48
+
49
+
50
+ class EinOpsRearrange(nn.Module):
51
+ def __init__(self, rearrange_expr: str, **kwargs) -> None:
52
+ super().__init__()
53
+ self.rearrange_expr = rearrange_expr
54
+ self.kwargs = kwargs
55
+
56
+ def forward(self, x):
57
+ assert isinstance(x, torch.Tensor)
58
+ return einops.rearrange(x, self.rearrange_expr, **self.kwargs)
59
+
60
+
61
+ class VerboseNNModule(nn.Module):
62
+ """
63
+ Wrapper around nn.Module that prints registered buffers and parameter names.
64
+ """
65
+
66
+ @staticmethod
67
+ def get_readable_tensor_repr(name: str, tensor: torch.Tensor) -> str:
68
+ st = (
69
+ "("
70
+ + name
71
+ + "): "
72
+ + "tensor("
73
+ + str(tuple(tensor[1].shape))
74
+ + ", requires_grad="
75
+ + str(tensor[1].requires_grad)
76
+ + ")\n"
77
+ )
78
+ return st
79
+
80
+ def extra_repr(self) -> str:
81
+ named_modules = set()
82
+ for p in self.named_modules():
83
+ named_modules.update([p[0]])
84
+ named_modules = list(named_modules)
85
+
86
+ string_repr = ""
87
+ for p in self.named_parameters():
88
+ name = p[0].split(".")[0]
89
+ if name not in named_modules:
90
+ string_repr += self.get_readable_tensor_repr(name, p)
91
+
92
+ for p in self.named_buffers():
93
+ name = p[0].split(".")[0]
94
+ string_repr += self.get_readable_tensor_repr(name, p)
95
+
96
+ return string_repr
97
+
98
+
99
+ def cast_if_src_dtype(
100
+ tensor: torch.Tensor, src_dtype: torch.dtype, tgt_dtype: torch.dtype
101
+ ):
102
+ updated = False
103
+ if tensor.dtype == src_dtype:
104
+ tensor = tensor.to(dtype=tgt_dtype)
105
+ updated = True
106
+ return tensor, updated
107
+
108
+
109
+ class QuickGELU(nn.Module):
110
+ # From https://github.com/openai/CLIP/blob/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1/clip/model.py#L166
111
+ def forward(self, x: torch.Tensor):
112
+ return x * torch.sigmoid(1.702 * x)
113
+
114
+
115
+ class SelectElement(nn.Module):
116
+ def __init__(self, index) -> None:
117
+ super().__init__()
118
+ self.index = index
119
+
120
+ def forward(self, x):
121
+ assert x.ndim >= 3
122
+ return x[:, self.index, ...]
123
+
124
+
125
+ class SelectEOSAndProject(nn.Module):
126
+ """
127
+ Text Pooling used in OpenCLIP
128
+ """
129
+
130
+ def __init__(self, proj: nn.Module) -> None:
131
+ super().__init__()
132
+ self.proj = proj
133
+
134
+ def forward(self, x, seq_len):
135
+ assert x.ndim == 3
136
+ # x is of shape B x L x D
137
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
138
+ x = x[torch.arange(x.shape[0]), seq_len]
139
+ x = self.proj(x)
140
+ return x
models/imagebind_model.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ import os
10
+ from functools import partial
11
+ from types import SimpleNamespace
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+
16
+ from models.helpers import (EinOpsRearrange, LearnableLogitScaling, Normalize,
17
+ SelectElement, SelectEOSAndProject)
18
+ from models.multimodal_preprocessors import (AudioPreprocessor,
19
+ IMUPreprocessor, PadIm2Video,
20
+ PatchEmbedGeneric,
21
+ RGBDTPreprocessor,
22
+ SpatioTemporalPosEmbeddingHelper,
23
+ TextPreprocessor,
24
+ ThermalPreprocessor)
25
+ from models.transformer import MultiheadAttention, SimpleTransformer
26
+
27
+ ModalityType = SimpleNamespace(
28
+ VISION="vision",
29
+ TEXT="text",
30
+ AUDIO="audio",
31
+ THERMAL="thermal",
32
+ DEPTH="depth",
33
+ IMU="imu",
34
+ )
35
+
36
+
37
+ class ImageBindModel(nn.Module):
38
+ def __init__(
39
+ self,
40
+ video_frames=2,
41
+ kernel_size=(2, 14, 14),
42
+ audio_kernel_size=16,
43
+ audio_stride=10,
44
+ out_embed_dim=768,
45
+ vision_embed_dim=1024,
46
+ vision_num_blocks=24,
47
+ vision_num_heads=16,
48
+ audio_embed_dim=768,
49
+ audio_num_blocks=12,
50
+ audio_num_heads=12,
51
+ audio_num_mel_bins=128,
52
+ audio_target_len=204,
53
+ audio_drop_path=0.1,
54
+ text_embed_dim=768,
55
+ text_num_blocks=12,
56
+ text_num_heads=12,
57
+ depth_embed_dim=384,
58
+ depth_kernel_size=16,
59
+ depth_num_blocks=12,
60
+ depth_num_heads=8,
61
+ depth_drop_path=0.0,
62
+ thermal_embed_dim=768,
63
+ thermal_kernel_size=16,
64
+ thermal_num_blocks=12,
65
+ thermal_num_heads=12,
66
+ thermal_drop_path=0.0,
67
+ imu_embed_dim=512,
68
+ imu_kernel_size=8,
69
+ imu_num_blocks=6,
70
+ imu_num_heads=8,
71
+ imu_drop_path=0.7,
72
+ ):
73
+ super().__init__()
74
+
75
+ self.modality_preprocessors = self._create_modality_preprocessors(
76
+ video_frames,
77
+ vision_embed_dim,
78
+ kernel_size,
79
+ text_embed_dim,
80
+ audio_embed_dim,
81
+ audio_kernel_size,
82
+ audio_stride,
83
+ audio_num_mel_bins,
84
+ audio_target_len,
85
+ depth_embed_dim,
86
+ depth_kernel_size,
87
+ thermal_embed_dim,
88
+ thermal_kernel_size,
89
+ imu_embed_dim,
90
+ )
91
+
92
+ self.modality_trunks = self._create_modality_trunks(
93
+ vision_embed_dim,
94
+ vision_num_blocks,
95
+ vision_num_heads,
96
+ text_embed_dim,
97
+ text_num_blocks,
98
+ text_num_heads,
99
+ audio_embed_dim,
100
+ audio_num_blocks,
101
+ audio_num_heads,
102
+ audio_drop_path,
103
+ depth_embed_dim,
104
+ depth_num_blocks,
105
+ depth_num_heads,
106
+ depth_drop_path,
107
+ thermal_embed_dim,
108
+ thermal_num_blocks,
109
+ thermal_num_heads,
110
+ thermal_drop_path,
111
+ imu_embed_dim,
112
+ imu_num_blocks,
113
+ imu_num_heads,
114
+ imu_drop_path,
115
+ )
116
+
117
+ self.modality_heads = self._create_modality_heads(
118
+ out_embed_dim,
119
+ vision_embed_dim,
120
+ text_embed_dim,
121
+ audio_embed_dim,
122
+ depth_embed_dim,
123
+ thermal_embed_dim,
124
+ imu_embed_dim,
125
+ )
126
+
127
+ self.modality_postprocessors = self._create_modality_postprocessors(
128
+ out_embed_dim
129
+ )
130
+
131
+ def _create_modality_preprocessors(
132
+ self,
133
+ video_frames=2,
134
+ vision_embed_dim=1024,
135
+ kernel_size=(2, 14, 14),
136
+ text_embed_dim=768,
137
+ audio_embed_dim=768,
138
+ audio_kernel_size=16,
139
+ audio_stride=10,
140
+ audio_num_mel_bins=128,
141
+ audio_target_len=204,
142
+ depth_embed_dim=768,
143
+ depth_kernel_size=16,
144
+ thermal_embed_dim=768,
145
+ thermal_kernel_size=16,
146
+ imu_embed_dim=512,
147
+ ):
148
+ rgbt_stem = PatchEmbedGeneric(
149
+ proj_stem=[
150
+ PadIm2Video(pad_type="repeat", ntimes=2),
151
+ nn.Conv3d(
152
+ in_channels=3,
153
+ kernel_size=kernel_size,
154
+ out_channels=vision_embed_dim,
155
+ stride=kernel_size,
156
+ bias=False,
157
+ ),
158
+ ]
159
+ )
160
+ rgbt_preprocessor = RGBDTPreprocessor(
161
+ img_size=[3, video_frames, 224, 224],
162
+ num_cls_tokens=1,
163
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
164
+ rgbt_stem=rgbt_stem,
165
+ depth_stem=None,
166
+ )
167
+
168
+ text_preprocessor = TextPreprocessor(
169
+ context_length=77,
170
+ vocab_size=49408,
171
+ embed_dim=text_embed_dim,
172
+ causal_masking=True,
173
+ )
174
+
175
+ audio_stem = PatchEmbedGeneric(
176
+ proj_stem=[
177
+ nn.Conv2d(
178
+ in_channels=1,
179
+ kernel_size=audio_kernel_size,
180
+ stride=audio_stride,
181
+ out_channels=audio_embed_dim,
182
+ bias=False,
183
+ ),
184
+ ],
185
+ norm_layer=nn.LayerNorm(normalized_shape=audio_embed_dim),
186
+ )
187
+ audio_preprocessor = AudioPreprocessor(
188
+ img_size=[1, audio_num_mel_bins, audio_target_len],
189
+ num_cls_tokens=1,
190
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
191
+ audio_stem=audio_stem,
192
+ )
193
+
194
+ depth_stem = PatchEmbedGeneric(
195
+ [
196
+ nn.Conv2d(
197
+ kernel_size=depth_kernel_size,
198
+ in_channels=1,
199
+ out_channels=depth_embed_dim,
200
+ stride=depth_kernel_size,
201
+ bias=False,
202
+ ),
203
+ ],
204
+ norm_layer=nn.LayerNorm(normalized_shape=depth_embed_dim),
205
+ )
206
+
207
+ depth_preprocessor = RGBDTPreprocessor(
208
+ img_size=[1, 224, 224],
209
+ num_cls_tokens=1,
210
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
211
+ rgbt_stem=None,
212
+ depth_stem=depth_stem,
213
+ )
214
+
215
+ thermal_stem = PatchEmbedGeneric(
216
+ [
217
+ nn.Conv2d(
218
+ kernel_size=thermal_kernel_size,
219
+ in_channels=1,
220
+ out_channels=thermal_embed_dim,
221
+ stride=thermal_kernel_size,
222
+ bias=False,
223
+ ),
224
+ ],
225
+ norm_layer=nn.LayerNorm(normalized_shape=thermal_embed_dim),
226
+ )
227
+ thermal_preprocessor = ThermalPreprocessor(
228
+ img_size=[1, 224, 224],
229
+ num_cls_tokens=1,
230
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
231
+ thermal_stem=thermal_stem,
232
+ )
233
+
234
+ imu_stem = PatchEmbedGeneric(
235
+ [
236
+ nn.Linear(
237
+ in_features=48,
238
+ out_features=imu_embed_dim,
239
+ bias=False,
240
+ ),
241
+ ],
242
+ norm_layer=nn.LayerNorm(normalized_shape=imu_embed_dim),
243
+ )
244
+
245
+ imu_preprocessor = IMUPreprocessor(
246
+ img_size=[6, 2000],
247
+ num_cls_tokens=1,
248
+ kernel_size=8,
249
+ embed_dim=imu_embed_dim,
250
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
251
+ imu_stem=imu_stem,
252
+ )
253
+
254
+ modality_preprocessors = {
255
+ ModalityType.VISION: rgbt_preprocessor,
256
+ ModalityType.TEXT: text_preprocessor,
257
+ ModalityType.AUDIO: audio_preprocessor,
258
+ ModalityType.DEPTH: depth_preprocessor,
259
+ ModalityType.THERMAL: thermal_preprocessor,
260
+ ModalityType.IMU: imu_preprocessor,
261
+ }
262
+
263
+ return nn.ModuleDict(modality_preprocessors)
264
+
265
+ def _create_modality_trunks(
266
+ self,
267
+ vision_embed_dim=1024,
268
+ vision_num_blocks=24,
269
+ vision_num_heads=16,
270
+ text_embed_dim=768,
271
+ text_num_blocks=12,
272
+ text_num_heads=12,
273
+ audio_embed_dim=768,
274
+ audio_num_blocks=12,
275
+ audio_num_heads=12,
276
+ audio_drop_path=0.0,
277
+ depth_embed_dim=768,
278
+ depth_num_blocks=12,
279
+ depth_num_heads=12,
280
+ depth_drop_path=0.0,
281
+ thermal_embed_dim=768,
282
+ thermal_num_blocks=12,
283
+ thermal_num_heads=12,
284
+ thermal_drop_path=0.0,
285
+ imu_embed_dim=512,
286
+ imu_num_blocks=6,
287
+ imu_num_heads=8,
288
+ imu_drop_path=0.7,
289
+ ):
290
+ def instantiate_trunk(
291
+ embed_dim, num_blocks, num_heads, pre_transformer_ln, add_bias_kv, drop_path
292
+ ):
293
+ return SimpleTransformer(
294
+ embed_dim=embed_dim,
295
+ num_blocks=num_blocks,
296
+ ffn_dropout_rate=0.0,
297
+ drop_path_rate=drop_path,
298
+ attn_target=partial(
299
+ MultiheadAttention,
300
+ embed_dim=embed_dim,
301
+ num_heads=num_heads,
302
+ bias=True,
303
+ add_bias_kv=add_bias_kv,
304
+ ),
305
+ pre_transformer_layer=nn.Sequential(
306
+ nn.LayerNorm(embed_dim, eps=1e-6)
307
+ if pre_transformer_ln
308
+ else nn.Identity(),
309
+ EinOpsRearrange("b l d -> l b d"),
310
+ ),
311
+ post_transformer_layer=EinOpsRearrange("l b d -> b l d"),
312
+ )
313
+
314
+ modality_trunks = {}
315
+ modality_trunks[ModalityType.VISION] = instantiate_trunk(
316
+ vision_embed_dim,
317
+ vision_num_blocks,
318
+ vision_num_heads,
319
+ pre_transformer_ln=True,
320
+ add_bias_kv=False,
321
+ drop_path=0.0,
322
+ )
323
+ modality_trunks[ModalityType.TEXT] = instantiate_trunk(
324
+ text_embed_dim,
325
+ text_num_blocks,
326
+ text_num_heads,
327
+ pre_transformer_ln=False,
328
+ add_bias_kv=False,
329
+ drop_path=0.0,
330
+ )
331
+ modality_trunks[ModalityType.AUDIO] = instantiate_trunk(
332
+ audio_embed_dim,
333
+ audio_num_blocks,
334
+ audio_num_heads,
335
+ pre_transformer_ln=False,
336
+ add_bias_kv=True,
337
+ drop_path=audio_drop_path,
338
+ )
339
+ modality_trunks[ModalityType.DEPTH] = instantiate_trunk(
340
+ depth_embed_dim,
341
+ depth_num_blocks,
342
+ depth_num_heads,
343
+ pre_transformer_ln=False,
344
+ add_bias_kv=True,
345
+ drop_path=depth_drop_path,
346
+ )
347
+ modality_trunks[ModalityType.THERMAL] = instantiate_trunk(
348
+ thermal_embed_dim,
349
+ thermal_num_blocks,
350
+ thermal_num_heads,
351
+ pre_transformer_ln=False,
352
+ add_bias_kv=True,
353
+ drop_path=thermal_drop_path,
354
+ )
355
+ modality_trunks[ModalityType.IMU] = instantiate_trunk(
356
+ imu_embed_dim,
357
+ imu_num_blocks,
358
+ imu_num_heads,
359
+ pre_transformer_ln=False,
360
+ add_bias_kv=True,
361
+ drop_path=imu_drop_path,
362
+ )
363
+
364
+ return nn.ModuleDict(modality_trunks)
365
+
366
+ def _create_modality_heads(
367
+ self,
368
+ out_embed_dim,
369
+ vision_embed_dim,
370
+ text_embed_dim,
371
+ audio_embed_dim,
372
+ depth_embed_dim,
373
+ thermal_embed_dim,
374
+ imu_embed_dim,
375
+ ):
376
+ modality_heads = {}
377
+
378
+ modality_heads[ModalityType.VISION] = nn.Sequential(
379
+ nn.LayerNorm(normalized_shape=vision_embed_dim, eps=1e-6),
380
+ SelectElement(index=0),
381
+ nn.Linear(vision_embed_dim, out_embed_dim, bias=False),
382
+ )
383
+
384
+ modality_heads[ModalityType.TEXT] = SelectEOSAndProject(
385
+ proj=nn.Sequential(
386
+ nn.LayerNorm(normalized_shape=text_embed_dim, eps=1e-6),
387
+ nn.Linear(text_embed_dim, out_embed_dim, bias=False),
388
+ )
389
+ )
390
+
391
+ modality_heads[ModalityType.AUDIO] = nn.Sequential(
392
+ nn.LayerNorm(normalized_shape=audio_embed_dim, eps=1e-6),
393
+ SelectElement(index=0),
394
+ nn.Linear(audio_embed_dim, out_embed_dim, bias=False),
395
+ )
396
+
397
+ modality_heads[ModalityType.DEPTH] = nn.Sequential(
398
+ nn.LayerNorm(normalized_shape=depth_embed_dim, eps=1e-6),
399
+ SelectElement(index=0),
400
+ nn.Linear(depth_embed_dim, out_embed_dim, bias=False),
401
+ )
402
+
403
+ modality_heads[ModalityType.THERMAL] = nn.Sequential(
404
+ nn.LayerNorm(normalized_shape=thermal_embed_dim, eps=1e-6),
405
+ SelectElement(index=0),
406
+ nn.Linear(thermal_embed_dim, out_embed_dim, bias=False),
407
+ )
408
+
409
+ modality_heads[ModalityType.IMU] = nn.Sequential(
410
+ nn.LayerNorm(normalized_shape=imu_embed_dim, eps=1e-6),
411
+ SelectElement(index=0),
412
+ nn.Dropout(p=0.5),
413
+ nn.Linear(imu_embed_dim, out_embed_dim, bias=False),
414
+ )
415
+
416
+ return nn.ModuleDict(modality_heads)
417
+
418
+ def _create_modality_postprocessors(self, out_embed_dim):
419
+ modality_postprocessors = {}
420
+
421
+ modality_postprocessors[ModalityType.VISION] = Normalize(dim=-1)
422
+ modality_postprocessors[ModalityType.TEXT] = nn.Sequential(
423
+ Normalize(dim=-1), LearnableLogitScaling(learnable=True)
424
+ )
425
+ modality_postprocessors[ModalityType.AUDIO] = nn.Sequential(
426
+ Normalize(dim=-1),
427
+ LearnableLogitScaling(logit_scale_init=20.0, learnable=False),
428
+ )
429
+ modality_postprocessors[ModalityType.DEPTH] = nn.Sequential(
430
+ Normalize(dim=-1),
431
+ LearnableLogitScaling(logit_scale_init=5.0, learnable=False),
432
+ )
433
+ modality_postprocessors[ModalityType.THERMAL] = nn.Sequential(
434
+ Normalize(dim=-1),
435
+ LearnableLogitScaling(logit_scale_init=10.0, learnable=False),
436
+ )
437
+ modality_postprocessors[ModalityType.IMU] = nn.Sequential(
438
+ Normalize(dim=-1),
439
+ LearnableLogitScaling(logit_scale_init=5.0, learnable=False),
440
+ )
441
+
442
+ return nn.ModuleDict(modality_postprocessors)
443
+
444
+ def forward(self, inputs):
445
+ outputs = {}
446
+ for modality_key, modality_value in inputs.items():
447
+ reduce_list = (
448
+ modality_value.ndim >= 5
449
+ ) # Audio and Video inputs consist of multiple clips
450
+ if reduce_list:
451
+ B, S = modality_value.shape[:2]
452
+ modality_value = modality_value.reshape(
453
+ B * S, *modality_value.shape[2:]
454
+ )
455
+
456
+ if modality_value is not None:
457
+ modality_value = self.modality_preprocessors[modality_key](
458
+ **{modality_key: modality_value}
459
+ )
460
+ trunk_inputs = modality_value["trunk"]
461
+ head_inputs = modality_value["head"]
462
+ modality_value = self.modality_trunks[modality_key](**trunk_inputs)
463
+ modality_value = self.modality_heads[modality_key](
464
+ modality_value, **head_inputs
465
+ )
466
+ modality_value = self.modality_postprocessors[modality_key](
467
+ modality_value
468
+ )
469
+
470
+ if reduce_list:
471
+ modality_value = modality_value.reshape(B, S, -1)
472
+ modality_value = modality_value.mean(dim=1)
473
+
474
+ outputs[modality_key] = modality_value
475
+
476
+ return outputs
477
+
478
+
479
+ def imagebind_huge(pretrained=False):
480
+ model = ImageBindModel(
481
+ vision_embed_dim=1280,
482
+ vision_num_blocks=32,
483
+ vision_num_heads=16,
484
+ text_embed_dim=1024,
485
+ text_num_blocks=24,
486
+ text_num_heads=16,
487
+ out_embed_dim=1024,
488
+ audio_drop_path=0.1,
489
+ imu_drop_path=0.7,
490
+ )
491
+
492
+ if pretrained:
493
+ if not os.path.exists(".checkpoints/imagebind_huge.pth"):
494
+ print(
495
+ "Downloading imagebind weights to .checkpoints/imagebind_huge.pth ..."
496
+ )
497
+ os.makedirs(".checkpoints", exist_ok=True)
498
+ torch.hub.download_url_to_file(
499
+ "https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth",
500
+ ".checkpoints/imagebind_huge.pth",
501
+ progress=True,
502
+ )
503
+
504
+ model.load_state_dict(torch.load(".checkpoints/imagebind_huge.pth"))
505
+
506
+ return model
models/multimodal_preprocessors.py ADDED
@@ -0,0 +1,685 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import gzip
9
+ import html
10
+ import io
11
+ import math
12
+ from functools import lru_cache
13
+ from typing import Callable, List, Optional, Tuple
14
+
15
+ import ftfy
16
+ import numpy as np
17
+ import regex as re
18
+ import torch
19
+ import torch.nn as nn
20
+ from iopath.common.file_io import g_pathmgr
21
+ from timm.models.layers import trunc_normal_
22
+
23
+ from models.helpers import VerboseNNModule, cast_if_src_dtype
24
+
25
+
26
+ def get_sinusoid_encoding_table(n_position, d_hid):
27
+ """Sinusoid position encoding table"""
28
+
29
+ # TODO: make it with torch instead of numpy
30
+ def get_position_angle_vec(position):
31
+ return [
32
+ position / np.power(10000, 2 * (hid_j // 2) / d_hid)
33
+ for hid_j in range(d_hid)
34
+ ]
35
+
36
+ sinusoid_table = np.array(
37
+ [get_position_angle_vec(pos_i) for pos_i in range(n_position)]
38
+ )
39
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
40
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
41
+
42
+ return torch.FloatTensor(sinusoid_table).unsqueeze(0)
43
+
44
+
45
+ def interpolate_pos_encoding_2d(target_spatial_size, pos_embed):
46
+ N = pos_embed.shape[1]
47
+ if N == target_spatial_size:
48
+ return pos_embed
49
+ dim = pos_embed.shape[-1]
50
+ # nn.functional.interpolate doesn't work with bfloat16 so we cast to float32
51
+ pos_embed, updated = cast_if_src_dtype(pos_embed, torch.bfloat16, torch.float32)
52
+ pos_embed = nn.functional.interpolate(
53
+ pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(
54
+ 0, 3, 1, 2
55
+ ),
56
+ scale_factor=math.sqrt(target_spatial_size / N),
57
+ mode="bicubic",
58
+ )
59
+ if updated:
60
+ pos_embed, _ = cast_if_src_dtype(pos_embed, torch.float32, torch.bfloat16)
61
+ pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
62
+ return pos_embed
63
+
64
+
65
+ def interpolate_pos_encoding(
66
+ npatch_per_img,
67
+ pos_embed,
68
+ patches_layout,
69
+ input_shape=None,
70
+ first_patch_idx=1,
71
+ ):
72
+ assert first_patch_idx == 0 or first_patch_idx == 1, "there is 1 CLS token or none"
73
+ N = pos_embed.shape[1] - first_patch_idx # since it's 1 if cls_token exists
74
+ if npatch_per_img == N:
75
+ return pos_embed
76
+
77
+ assert (
78
+ patches_layout[-1] == patches_layout[-2]
79
+ ), "Interpolation of pos embed not supported for non-square layouts"
80
+
81
+ class_emb = pos_embed[:, :first_patch_idx]
82
+ pos_embed = pos_embed[:, first_patch_idx:]
83
+
84
+ if input_shape is None or patches_layout[0] == 1:
85
+ # simple 2D pos embedding, no temporal component
86
+ pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed)
87
+ elif patches_layout[0] > 1:
88
+ # pos embed has a temporal component
89
+ assert len(input_shape) == 4, "temporal interpolation not supported"
90
+ # we only support 2D interpolation in this case
91
+ num_frames = patches_layout[0]
92
+ num_spatial_tokens = patches_layout[1] * patches_layout[2]
93
+ pos_embed = pos_embed.view(1, num_frames, num_spatial_tokens, -1)
94
+ # interpolate embedding for zeroth frame
95
+ pos_embed = interpolate_pos_encoding_2d(
96
+ npatch_per_img, pos_embed[0, 0, ...].unsqueeze(0)
97
+ )
98
+ else:
99
+ raise ValueError("This type of interpolation isn't implemented")
100
+
101
+ return torch.cat((class_emb, pos_embed), dim=1)
102
+
103
+
104
+ def _get_pos_embedding(
105
+ npatch_per_img,
106
+ pos_embed,
107
+ patches_layout,
108
+ input_shape,
109
+ first_patch_idx=1,
110
+ ):
111
+ pos_embed = interpolate_pos_encoding(
112
+ npatch_per_img,
113
+ pos_embed,
114
+ patches_layout,
115
+ input_shape=input_shape,
116
+ first_patch_idx=first_patch_idx,
117
+ )
118
+ return pos_embed
119
+
120
+
121
+ class PatchEmbedGeneric(nn.Module):
122
+ """
123
+ PatchEmbed from Hydra
124
+ """
125
+
126
+ def __init__(self, proj_stem, norm_layer: Optional[nn.Module] = None):
127
+ super().__init__()
128
+
129
+ if len(proj_stem) > 1:
130
+ self.proj = nn.Sequential(*proj_stem)
131
+ else:
132
+ # Special case to be able to load pre-trained models that were
133
+ # trained with a standard stem
134
+ self.proj = proj_stem[0]
135
+ self.norm_layer = norm_layer
136
+
137
+ def get_patch_layout(self, img_size):
138
+ with torch.no_grad():
139
+ dummy_img = torch.zeros(
140
+ [
141
+ 1,
142
+ ]
143
+ + img_size
144
+ )
145
+ dummy_out = self.proj(dummy_img)
146
+ embed_dim = dummy_out.shape[1]
147
+ patches_layout = tuple(dummy_out.shape[2:])
148
+ num_patches = np.prod(patches_layout)
149
+ return patches_layout, num_patches, embed_dim
150
+
151
+ def forward(self, x):
152
+ x = self.proj(x)
153
+ # B C (T) H W -> B (T)HW C
154
+ x = x.flatten(2).transpose(1, 2)
155
+ if self.norm_layer is not None:
156
+ x = self.norm_layer(x)
157
+ return x
158
+
159
+
160
+ class SpatioTemporalPosEmbeddingHelper(VerboseNNModule):
161
+ def __init__(
162
+ self,
163
+ patches_layout: List,
164
+ num_patches: int,
165
+ num_cls_tokens: int,
166
+ embed_dim: int,
167
+ learnable: bool,
168
+ ) -> None:
169
+ super().__init__()
170
+ self.num_cls_tokens = num_cls_tokens
171
+ self.patches_layout = patches_layout
172
+ self.num_patches = num_patches
173
+ self.num_tokens = num_cls_tokens + num_patches
174
+ self.learnable = learnable
175
+ if self.learnable:
176
+ self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, embed_dim))
177
+ trunc_normal_(self.pos_embed, std=0.02)
178
+ else:
179
+ self.register_buffer(
180
+ "pos_embed", get_sinusoid_encoding_table(self.num_tokens, embed_dim)
181
+ )
182
+
183
+ def get_pos_embedding(self, vision_input, all_vision_tokens):
184
+ input_shape = vision_input.shape
185
+ pos_embed = _get_pos_embedding(
186
+ all_vision_tokens.size(1) - self.num_cls_tokens,
187
+ pos_embed=self.pos_embed,
188
+ patches_layout=self.patches_layout,
189
+ input_shape=input_shape,
190
+ first_patch_idx=self.num_cls_tokens,
191
+ )
192
+ return pos_embed
193
+
194
+
195
+ class RGBDTPreprocessor(VerboseNNModule):
196
+ def __init__(
197
+ self,
198
+ rgbt_stem: PatchEmbedGeneric,
199
+ depth_stem: Optional[PatchEmbedGeneric],
200
+ img_size: Tuple = (3, 224, 224),
201
+ num_cls_tokens: int = 1,
202
+ pos_embed_fn: Optional[Callable] = None,
203
+ use_type_embed: bool = False,
204
+ init_param_style: str = "openclip",
205
+ ) -> None:
206
+ super().__init__()
207
+ stem = rgbt_stem if rgbt_stem is not None else depth_stem
208
+ (
209
+ self.patches_layout,
210
+ self.num_patches,
211
+ self.embed_dim,
212
+ ) = stem.get_patch_layout(img_size)
213
+ self.rgbt_stem = rgbt_stem
214
+ self.depth_stem = depth_stem
215
+ self.use_pos_embed = pos_embed_fn is not None
216
+ self.use_type_embed = use_type_embed
217
+ self.num_cls_tokens = num_cls_tokens
218
+
219
+ if self.use_pos_embed:
220
+ self.pos_embedding_helper = pos_embed_fn(
221
+ patches_layout=self.patches_layout,
222
+ num_cls_tokens=num_cls_tokens,
223
+ num_patches=self.num_patches,
224
+ embed_dim=self.embed_dim,
225
+ )
226
+ if self.num_cls_tokens > 0:
227
+ self.cls_token = nn.Parameter(
228
+ torch.zeros(1, self.num_cls_tokens, self.embed_dim)
229
+ )
230
+ if self.use_type_embed:
231
+ self.type_embed = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
232
+
233
+ self.init_parameters(init_param_style)
234
+
235
+ @torch.no_grad()
236
+ def init_parameters(self, init_param_style):
237
+ if init_param_style == "openclip":
238
+ # OpenCLIP style initialization
239
+ scale = self.embed_dim**-0.5
240
+ if self.use_pos_embed:
241
+ nn.init.normal_(self.pos_embedding_helper.pos_embed)
242
+ self.pos_embedding_helper.pos_embed *= scale
243
+
244
+ if self.num_cls_tokens > 0:
245
+ nn.init.normal_(self.cls_token)
246
+ self.cls_token *= scale
247
+ elif init_param_style == "vit":
248
+ self.cls_token.data.fill_(0)
249
+ else:
250
+ raise ValueError(f"Unknown init {init_param_style}")
251
+
252
+ if self.use_type_embed:
253
+ nn.init.normal_(self.type_embed)
254
+
255
+ def tokenize_input_and_cls_pos(self, input, stem, mask):
256
+ # tokens is of shape B x L x D
257
+ tokens = stem(input)
258
+ assert tokens.ndim == 3
259
+ assert tokens.shape[2] == self.embed_dim
260
+ B = tokens.shape[0]
261
+ if self.num_cls_tokens > 0:
262
+ class_tokens = self.cls_token.expand(
263
+ B, -1, -1
264
+ ) # stole class_tokens impl from Phil Wang, thanks
265
+ tokens = torch.cat((class_tokens, tokens), dim=1)
266
+ if self.use_pos_embed:
267
+ pos_embed = self.pos_embedding_helper.get_pos_embedding(input, tokens)
268
+ tokens = tokens + pos_embed
269
+ if self.use_type_embed:
270
+ tokens = tokens + self.type_embed.expand(B, -1, -1)
271
+ return tokens
272
+
273
+ def forward(self, vision=None, depth=None, patch_mask=None):
274
+ if patch_mask is not None:
275
+ raise NotImplementedError()
276
+
277
+ if vision is not None:
278
+ vision_tokens = self.tokenize_input_and_cls_pos(
279
+ vision, self.rgbt_stem, patch_mask
280
+ )
281
+
282
+ if depth is not None:
283
+ depth_tokens = self.tokenize_input_and_cls_pos(
284
+ depth, self.depth_stem, patch_mask
285
+ )
286
+
287
+ # aggregate tokens
288
+ if vision is not None and depth is not None:
289
+ final_tokens = vision_tokens + depth_tokens
290
+ else:
291
+ final_tokens = vision_tokens if vision is not None else depth_tokens
292
+ return_dict = {
293
+ "trunk": {
294
+ "tokens": final_tokens,
295
+ },
296
+ "head": {},
297
+ }
298
+ return return_dict
299
+
300
+
301
+ class AudioPreprocessor(RGBDTPreprocessor):
302
+ def __init__(self, audio_stem: PatchEmbedGeneric, **kwargs) -> None:
303
+ super().__init__(rgbt_stem=audio_stem, depth_stem=None, **kwargs)
304
+
305
+ def forward(self, audio=None):
306
+ return super().forward(vision=audio)
307
+
308
+
309
+ class ThermalPreprocessor(RGBDTPreprocessor):
310
+ def __init__(self, thermal_stem: PatchEmbedGeneric, **kwargs) -> None:
311
+ super().__init__(rgbt_stem=thermal_stem, depth_stem=None, **kwargs)
312
+
313
+ def forward(self, thermal=None):
314
+ return super().forward(vision=thermal)
315
+
316
+
317
+ def build_causal_attention_mask(context_length):
318
+ # lazily create causal attention mask, with full attention between the vision tokens
319
+ # pytorch uses additive attention mask; fill with -inf
320
+ mask = torch.empty(context_length, context_length, requires_grad=False)
321
+ mask.fill_(float("-inf"))
322
+ mask.triu_(1) # zero out the lower diagonal
323
+ return mask
324
+
325
+
326
+ class TextPreprocessor(VerboseNNModule):
327
+ def __init__(
328
+ self,
329
+ vocab_size: int,
330
+ context_length: int,
331
+ embed_dim: int,
332
+ causal_masking: bool,
333
+ supply_seq_len_to_head: bool = True,
334
+ num_cls_tokens: int = 0,
335
+ init_param_style: str = "openclip",
336
+ ) -> None:
337
+ super().__init__()
338
+ self.vocab_size = vocab_size
339
+ self.context_length = context_length
340
+ self.token_embedding = nn.Embedding(vocab_size, embed_dim)
341
+ self.pos_embed = nn.Parameter(
342
+ torch.empty(1, self.context_length + num_cls_tokens, embed_dim)
343
+ )
344
+ self.causal_masking = causal_masking
345
+ if self.causal_masking:
346
+ mask = build_causal_attention_mask(self.context_length)
347
+ # register the mask as a buffer so it can be moved to the right device
348
+ self.register_buffer("mask", mask)
349
+
350
+ self.supply_seq_len_to_head = supply_seq_len_to_head
351
+ self.num_cls_tokens = num_cls_tokens
352
+ self.embed_dim = embed_dim
353
+ if num_cls_tokens > 0:
354
+ assert self.causal_masking is False, "Masking + CLS token isn't implemented"
355
+ self.cls_token = nn.Parameter(
356
+ torch.zeros(1, self.num_cls_tokens, embed_dim)
357
+ )
358
+
359
+ self.init_parameters(init_param_style)
360
+
361
+ @torch.no_grad()
362
+ def init_parameters(self, init_param_style="openclip"):
363
+ # OpenCLIP style initialization
364
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
365
+ nn.init.normal_(self.pos_embed, std=0.01)
366
+
367
+ if init_param_style == "openclip":
368
+ # OpenCLIP style initialization
369
+ scale = self.embed_dim**-0.5
370
+ if self.num_cls_tokens > 0:
371
+ nn.init.normal_(self.cls_token)
372
+ self.cls_token *= scale
373
+ elif init_param_style == "vit":
374
+ self.cls_token.data.fill_(0)
375
+ else:
376
+ raise ValueError(f"Unknown init {init_param_style}")
377
+
378
+ def forward(self, text):
379
+ # text tokens are of shape B x L x D
380
+ text_tokens = self.token_embedding(text)
381
+ # concat CLS tokens if any
382
+ if self.num_cls_tokens > 0:
383
+ B = text_tokens.shape[0]
384
+ class_tokens = self.cls_token.expand(
385
+ B, -1, -1
386
+ ) # stole class_tokens impl from Phil Wang, thanks
387
+ text_tokens = torch.cat((class_tokens, text_tokens), dim=1)
388
+ text_tokens = text_tokens + self.pos_embed
389
+ return_dict = {
390
+ "trunk": {
391
+ "tokens": text_tokens,
392
+ },
393
+ "head": {},
394
+ }
395
+ # Compute sequence length after adding CLS tokens
396
+ if self.supply_seq_len_to_head:
397
+ text_lengths = text.argmax(dim=-1)
398
+ return_dict["head"] = {
399
+ "seq_len": text_lengths,
400
+ }
401
+ if self.causal_masking:
402
+ return_dict["trunk"].update({"attn_mask": self.mask})
403
+ return return_dict
404
+
405
+
406
+ class Im2Video(nn.Module):
407
+ """Convert an image into a trivial video."""
408
+
409
+ def __init__(self, time_dim=2):
410
+ super().__init__()
411
+ self.time_dim = time_dim
412
+
413
+ def forward(self, x):
414
+ if x.ndim == 4:
415
+ # B, C, H, W -> B, C, T, H, W
416
+ return x.unsqueeze(self.time_dim)
417
+ elif x.ndim == 5:
418
+ return x
419
+ else:
420
+ raise ValueError(f"Dimension incorrect {x.shape}")
421
+
422
+
423
+ class PadIm2Video(Im2Video):
424
+ def __init__(self, ntimes, pad_type, time_dim=2):
425
+ super().__init__(time_dim=time_dim)
426
+ assert ntimes > 0
427
+ assert pad_type in ["zero", "repeat"]
428
+ self.ntimes = ntimes
429
+ self.pad_type = pad_type
430
+
431
+ def forward(self, x):
432
+ x = super().forward(x)
433
+ if x.shape[self.time_dim] == 1:
434
+ if self.pad_type == "repeat":
435
+ new_shape = [1] * len(x.shape)
436
+ new_shape[self.time_dim] = self.ntimes
437
+ x = x.repeat(new_shape)
438
+ elif self.pad_type == "zero":
439
+ padarg = [0, 0] * len(x.shape)
440
+ padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[self.time_dim]
441
+ x = nn.functional.pad(x, padarg)
442
+ return x
443
+
444
+
445
+ # Modified from github.com/openai/CLIP
446
+ @lru_cache()
447
+ def bytes_to_unicode():
448
+ """
449
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
450
+ The reversible bpe codes work on unicode strings.
451
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
452
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
453
+ This is a signficant percentage of your normal, say, 32K bpe vocab.
454
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
455
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
456
+ """
457
+ bs = (
458
+ list(range(ord("!"), ord("~") + 1))
459
+ + list(range(ord("¡"), ord("¬") + 1))
460
+ + list(range(ord("®"), ord("ÿ") + 1))
461
+ )
462
+ cs = bs[:]
463
+ n = 0
464
+ for b in range(2**8):
465
+ if b not in bs:
466
+ bs.append(b)
467
+ cs.append(2**8 + n)
468
+ n += 1
469
+ cs = [chr(n) for n in cs]
470
+ return dict(zip(bs, cs))
471
+
472
+
473
+ def get_pairs(word):
474
+ """Return set of symbol pairs in a word.
475
+ Word is represented as tuple of symbols (symbols being variable-length strings).
476
+ """
477
+ pairs = set()
478
+ prev_char = word[0]
479
+ for char in word[1:]:
480
+ pairs.add((prev_char, char))
481
+ prev_char = char
482
+ return pairs
483
+
484
+
485
+ def basic_clean(text):
486
+ text = ftfy.fix_text(text)
487
+ text = html.unescape(html.unescape(text))
488
+ return text.strip()
489
+
490
+
491
+ def whitespace_clean(text):
492
+ text = re.sub(r"\s+", " ", text)
493
+ text = text.strip()
494
+ return text
495
+
496
+
497
+ class SimpleTokenizer(object):
498
+ def __init__(self, bpe_path: str, context_length=77):
499
+ self.byte_encoder = bytes_to_unicode()
500
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
501
+
502
+ with g_pathmgr.open(bpe_path, "rb") as fh:
503
+ bpe_bytes = io.BytesIO(fh.read())
504
+ merges: List[str] = gzip.open(bpe_bytes).read().decode("utf-8").split("\n")
505
+ merges = merges[1 : 49152 - 256 - 2 + 1]
506
+ merges: List[Tuple[str, ...]] = [tuple(merge.split()) for merge in merges]
507
+ vocab = list(bytes_to_unicode().values())
508
+ vocab = vocab + [v + "</w>" for v in vocab]
509
+ for merge in merges:
510
+ vocab.append("".join(merge))
511
+ vocab.extend(["<|startoftext|>", "<|endoftext|>"])
512
+ self.encoder = dict(zip(vocab, range(len(vocab))))
513
+ self.decoder = {v: k for k, v in self.encoder.items()}
514
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
515
+ self.cache = {
516
+ "<|startoftext|>": "<|startoftext|>",
517
+ "<|endoftext|>": "<|endoftext|>",
518
+ }
519
+ self.pat = re.compile(
520
+ r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
521
+ re.IGNORECASE,
522
+ )
523
+ self.context_length = context_length
524
+
525
+ def bpe(self, token):
526
+ if token in self.cache:
527
+ return self.cache[token]
528
+ word = tuple(token[:-1]) + (token[-1] + "</w>",)
529
+ pairs = get_pairs(word)
530
+
531
+ if not pairs:
532
+ return token + "</w>"
533
+
534
+ while True:
535
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
536
+ if bigram not in self.bpe_ranks:
537
+ break
538
+ first, second = bigram
539
+ new_word = []
540
+ i = 0
541
+ while i < len(word):
542
+ try:
543
+ j = word.index(first, i)
544
+ new_word.extend(word[i:j])
545
+ i = j
546
+ except:
547
+ new_word.extend(word[i:])
548
+ break
549
+
550
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
551
+ new_word.append(first + second)
552
+ i += 2
553
+ else:
554
+ new_word.append(word[i])
555
+ i += 1
556
+ new_word = tuple(new_word)
557
+ word = new_word
558
+ if len(word) == 1:
559
+ break
560
+ else:
561
+ pairs = get_pairs(word)
562
+ word = " ".join(word)
563
+ self.cache[token] = word
564
+ return word
565
+
566
+ def encode(self, text):
567
+ bpe_tokens = []
568
+ text = whitespace_clean(basic_clean(text)).lower()
569
+ for token in re.findall(self.pat, text):
570
+ token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
571
+ bpe_tokens.extend(
572
+ self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
573
+ )
574
+ return bpe_tokens
575
+
576
+ def decode(self, tokens):
577
+ text = "".join([self.decoder[token] for token in tokens])
578
+ text = (
579
+ bytearray([self.byte_decoder[c] for c in text])
580
+ .decode("utf-8", errors="replace")
581
+ .replace("</w>", " ")
582
+ )
583
+ return text
584
+
585
+ def __call__(self, texts, context_length=None):
586
+ if not context_length:
587
+ context_length = self.context_length
588
+
589
+ if isinstance(texts, str):
590
+ texts = [texts]
591
+
592
+ sot_token = self.encoder["<|startoftext|>"]
593
+ eot_token = self.encoder["<|endoftext|>"]
594
+ all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
595
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
596
+
597
+ for i, tokens in enumerate(all_tokens):
598
+ tokens = tokens[:context_length]
599
+ result[i, : len(tokens)] = torch.tensor(tokens)
600
+
601
+ if len(result) == 1:
602
+ return result[0]
603
+ return result
604
+
605
+
606
+ class IMUPreprocessor(VerboseNNModule):
607
+ def __init__(
608
+ self,
609
+ kernel_size: int,
610
+ imu_stem: PatchEmbedGeneric,
611
+ embed_dim: int,
612
+ img_size: Tuple = (6, 2000),
613
+ num_cls_tokens: int = 1,
614
+ pos_embed_fn: Optional[Callable] = None,
615
+ init_param_style: str = "openclip",
616
+ ) -> None:
617
+ super().__init__()
618
+ self.imu_stem = imu_stem
619
+ self.embed_dim = embed_dim
620
+ self.use_pos_embed = pos_embed_fn is not None
621
+ self.num_cls_tokens = num_cls_tokens
622
+ self.kernel_size = kernel_size
623
+ self.pos_embed = nn.Parameter(
624
+ torch.empty(1, (img_size[1] // kernel_size) + num_cls_tokens, embed_dim)
625
+ )
626
+
627
+ if self.num_cls_tokens > 0:
628
+ self.cls_token = nn.Parameter(
629
+ torch.zeros(1, self.num_cls_tokens, self.embed_dim)
630
+ )
631
+
632
+ self.init_parameters(init_param_style)
633
+
634
+ @torch.no_grad()
635
+ def init_parameters(self, init_param_style):
636
+ nn.init.normal_(self.pos_embed, std=0.01)
637
+
638
+ if init_param_style == "openclip":
639
+ # OpenCLIP style initialization
640
+ scale = self.embed_dim**-0.5
641
+
642
+ if self.num_cls_tokens > 0:
643
+ nn.init.normal_(self.cls_token)
644
+ self.cls_token *= scale
645
+ elif init_param_style == "vit":
646
+ self.cls_token.data.fill_(0)
647
+ else:
648
+ raise ValueError(f"Unknown init {init_param_style}")
649
+
650
+ def tokenize_input_and_cls_pos(self, input, stem):
651
+ # tokens is of shape B x L x D
652
+ tokens = stem.norm_layer(stem.proj(input))
653
+ assert tokens.ndim == 3
654
+ assert tokens.shape[2] == self.embed_dim
655
+ B = tokens.shape[0]
656
+ if self.num_cls_tokens > 0:
657
+ class_tokens = self.cls_token.expand(
658
+ B, -1, -1
659
+ ) # stole class_tokens impl from Phil Wang, thanks
660
+ tokens = torch.cat((class_tokens, tokens), dim=1)
661
+ if self.use_pos_embed:
662
+ tokens = tokens + self.pos_embed
663
+ return tokens
664
+
665
+ def forward(self, imu):
666
+ # Patchify
667
+ imu = imu.unfold(
668
+ -1,
669
+ self.kernel_size,
670
+ self.kernel_size,
671
+ ).permute(0, 2, 1, 3)
672
+ imu = imu.reshape(imu.size(0), imu.size(1), -1)
673
+
674
+ imu_tokens = self.tokenize_input_and_cls_pos(
675
+ imu,
676
+ self.imu_stem,
677
+ )
678
+
679
+ return_dict = {
680
+ "trunk": {
681
+ "tokens": imu_tokens,
682
+ },
683
+ "head": {},
684
+ }
685
+ return return_dict
models/transformer.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # Code modified from
9
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py ;
10
+ # https://github.com/facebookresearch/deit/blob/main/models.py
11
+ # and https://github.com/facebookresearch/vissl/blob/main/vissl/models/trunks/vision_transformer.py
12
+
13
+
14
+ from functools import partial
15
+ from typing import Callable, List, Optional
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.checkpoint as checkpoint
20
+ from timm.models.layers import DropPath, trunc_normal_
21
+
22
+
23
+ class Attention(nn.Module):
24
+ def __init__(
25
+ self,
26
+ dim,
27
+ num_heads=8,
28
+ qkv_bias=False,
29
+ qk_scale=None,
30
+ attn_drop=0.0,
31
+ proj_drop=0.0,
32
+ ):
33
+ super().__init__()
34
+ self.num_heads = num_heads
35
+ head_dim = dim // num_heads
36
+ # NOTE scale factor was wrong in my original version,
37
+ # can set manually to be compat with prev weights
38
+ self.scale = qk_scale or head_dim**-0.5
39
+
40
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
41
+ self.attn_drop = nn.Dropout(attn_drop)
42
+ self.proj = nn.Linear(dim, dim)
43
+ self.proj_drop = nn.Dropout(proj_drop)
44
+
45
+ def forward(self, x):
46
+ B, N, C = x.shape
47
+ qkv = (
48
+ self.qkv(x)
49
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
50
+ .permute(2, 0, 3, 1, 4)
51
+ )
52
+ q, k, v = (
53
+ qkv[0],
54
+ qkv[1],
55
+ qkv[2],
56
+ ) # make torchscript happy (cannot use tensor as tuple)
57
+
58
+ attn = (q @ k.transpose(-2, -1)) * self.scale
59
+ attn = attn.softmax(dim=-1)
60
+ attn = self.attn_drop(attn)
61
+
62
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
63
+ x = self.proj(x)
64
+ x = self.proj_drop(x)
65
+ return x
66
+
67
+
68
+ class Mlp(nn.Module):
69
+ def __init__(
70
+ self,
71
+ in_features,
72
+ hidden_features=None,
73
+ out_features=None,
74
+ act_layer=nn.GELU,
75
+ drop=0.0,
76
+ ):
77
+ super().__init__()
78
+ out_features = out_features or in_features
79
+ hidden_features = hidden_features or in_features
80
+ self.fc1 = nn.Linear(in_features, hidden_features)
81
+ self.act = act_layer()
82
+ self.fc2 = nn.Linear(hidden_features, out_features)
83
+ self.drop = nn.Dropout(drop)
84
+
85
+ def forward(self, x):
86
+ x = self.fc1(x)
87
+ x = self.act(x)
88
+ x = self.drop(x)
89
+ x = self.fc2(x)
90
+ x = self.drop(x)
91
+ return x
92
+
93
+
94
+ class MultiheadAttention(nn.MultiheadAttention):
95
+ def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
96
+ return super().forward(x, x, x, need_weights=False, attn_mask=attn_mask)[0]
97
+
98
+
99
+ class ViTAttention(Attention):
100
+ def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
101
+ assert attn_mask is None
102
+ return super().forward(x)
103
+
104
+
105
+ class BlockWithMasking(nn.Module):
106
+ def __init__(
107
+ self,
108
+ dim: int,
109
+ attn_target: Callable,
110
+ mlp_ratio: int = 4,
111
+ act_layer: Callable = nn.GELU,
112
+ norm_layer: Callable = nn.LayerNorm,
113
+ ffn_dropout_rate: float = 0.0,
114
+ drop_path: float = 0.0,
115
+ layer_scale_type: Optional[str] = None,
116
+ layer_scale_init_value: float = 1e-4,
117
+ ):
118
+ super().__init__()
119
+
120
+ assert not isinstance(
121
+ attn_target, nn.Module
122
+ ), "attn_target should be a Callable. Otherwise attn_target is shared across blocks!"
123
+ self.attn = attn_target()
124
+ if drop_path > 0.0:
125
+ self.drop_path = DropPath(drop_path)
126
+ else:
127
+ self.drop_path = nn.Identity()
128
+ self.norm_1 = norm_layer(dim)
129
+ mlp_hidden_dim = int(mlp_ratio * dim)
130
+ self.mlp = Mlp(
131
+ in_features=dim,
132
+ hidden_features=mlp_hidden_dim,
133
+ act_layer=act_layer,
134
+ drop=ffn_dropout_rate,
135
+ )
136
+ self.norm_2 = norm_layer(dim)
137
+ self.layer_scale_type = layer_scale_type
138
+ if self.layer_scale_type is not None:
139
+ assert self.layer_scale_type in [
140
+ "per_channel",
141
+ "scalar",
142
+ ], f"Found Layer scale type {self.layer_scale_type}"
143
+ if self.layer_scale_type == "per_channel":
144
+ # one gamma value per channel
145
+ gamma_shape = [1, 1, dim]
146
+ elif self.layer_scale_type == "scalar":
147
+ # single gamma value for all channels
148
+ gamma_shape = [1, 1, 1]
149
+ # two gammas: for each part of the fwd in the encoder
150
+ self.layer_scale_gamma1 = nn.Parameter(
151
+ torch.ones(size=gamma_shape) * layer_scale_init_value,
152
+ requires_grad=True,
153
+ )
154
+ self.layer_scale_gamma2 = nn.Parameter(
155
+ torch.ones(size=gamma_shape) * layer_scale_init_value,
156
+ requires_grad=True,
157
+ )
158
+
159
+ def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
160
+ if self.layer_scale_type is None:
161
+ x = x + self.drop_path(self.attn(self.norm_1(x), attn_mask))
162
+ x = x + self.drop_path(self.mlp(self.norm_2(x)))
163
+ else:
164
+ x = (
165
+ x
166
+ + self.drop_path(self.attn(self.norm_1(x), attn_mask))
167
+ * self.layer_scale_gamma1
168
+ )
169
+ x = x + self.drop_path(self.mlp(self.norm_2(x))) * self.layer_scale_gamma2
170
+ return x
171
+
172
+
173
+ _LAYER_NORM = partial(nn.LayerNorm, eps=1e-6)
174
+
175
+
176
+ class SimpleTransformer(nn.Module):
177
+ def __init__(
178
+ self,
179
+ attn_target: Callable,
180
+ embed_dim: int,
181
+ num_blocks: int,
182
+ block: Callable = BlockWithMasking,
183
+ pre_transformer_layer: Optional[Callable] = None,
184
+ post_transformer_layer: Optional[Callable] = None,
185
+ drop_path_rate: float = 0.0,
186
+ drop_path_type: str = "progressive",
187
+ norm_layer: Callable = _LAYER_NORM,
188
+ mlp_ratio: int = 4,
189
+ ffn_dropout_rate: float = 0.0,
190
+ layer_scale_type: Optional[str] = None, # from cait; possible values are None, "per_channel", "scalar"
191
+ layer_scale_init_value: float = 1e-4, # from cait; float
192
+ weight_init_style: str = "jax", # possible values jax or pytorch
193
+ ):
194
+ """
195
+ Simple Transformer with the following features
196
+ 1. Supports masked attention
197
+ 2. Supports DropPath
198
+ 3. Supports LayerScale
199
+ 4. Supports Dropout in Attention and FFN
200
+ 5. Makes few assumptions about the input except that it is a Tensor
201
+ """
202
+ super().__init__()
203
+ self.pre_transformer_layer = pre_transformer_layer
204
+ if drop_path_type == "progressive":
205
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, num_blocks)]
206
+ elif drop_path_type == "uniform":
207
+ dpr = [drop_path_rate for i in range(num_blocks)]
208
+ else:
209
+ raise ValueError(f"Unknown drop_path_type: {drop_path_type}")
210
+
211
+ self.blocks = nn.Sequential(
212
+ *[
213
+ block(
214
+ dim=embed_dim,
215
+ attn_target=attn_target,
216
+ mlp_ratio=mlp_ratio,
217
+ ffn_dropout_rate=ffn_dropout_rate,
218
+ drop_path=dpr[i],
219
+ norm_layer=norm_layer,
220
+ layer_scale_type=layer_scale_type,
221
+ layer_scale_init_value=layer_scale_init_value,
222
+ )
223
+ for i in range(num_blocks)
224
+ ]
225
+ )
226
+ self.post_transformer_layer = post_transformer_layer
227
+ self.weight_init_style = weight_init_style
228
+ self.apply(self._init_weights)
229
+
230
+ def _init_weights(self, m):
231
+ if isinstance(m, nn.Linear):
232
+ if self.weight_init_style == "jax":
233
+ # Based on MAE and official Jax ViT implementation
234
+ torch.nn.init.xavier_uniform_(m.weight)
235
+ elif self.weight_init_style == "pytorch":
236
+ # PyTorch ViT uses trunc_normal_
237
+ trunc_normal_(m.weight, std=0.02)
238
+
239
+ if m.bias is not None:
240
+ nn.init.constant_(m.bias, 0)
241
+ elif isinstance(m, (nn.LayerNorm)):
242
+ nn.init.constant_(m.bias, 0)
243
+ nn.init.constant_(m.weight, 1.0)
244
+
245
+ def forward(
246
+ self,
247
+ tokens: torch.Tensor,
248
+ attn_mask: torch.Tensor = None,
249
+ use_checkpoint: bool = False,
250
+ checkpoint_every_n: int = 1,
251
+ checkpoint_blk_ids: Optional[List[int]] = None,
252
+ ):
253
+ """
254
+ Inputs
255
+ - tokens: data of shape N x L x D (or L x N x D depending on the attention implementation)
256
+ - attn: mask of shape L x L
257
+
258
+ Output
259
+ - x: data of shape N x L x D (or L x N x D depending on the attention implementation)
260
+ """
261
+ if self.pre_transformer_layer:
262
+ tokens = self.pre_transformer_layer(tokens)
263
+ if use_checkpoint and checkpoint_blk_ids is None:
264
+ checkpoint_blk_ids = [
265
+ blk_id
266
+ for blk_id in range(len(self.blocks))
267
+ if blk_id % checkpoint_every_n == 0
268
+ ]
269
+ if checkpoint_blk_ids:
270
+ checkpoint_blk_ids = set(checkpoint_blk_ids)
271
+ for blk_id, blk in enumerate(self.blocks):
272
+ if use_checkpoint and blk_id in checkpoint_blk_ids:
273
+ tokens = checkpoint.checkpoint(
274
+ blk, tokens, attn_mask, use_reentrant=False
275
+ )
276
+ else:
277
+ tokens = blk(tokens, attn_mask=attn_mask)
278
+ if self.post_transformer_layer:
279
+ tokens = self.post_transformer_layer(tokens)
280
+ return tokens
output.txt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pepsi Soda Cola 12 Fl Oz:brand is Pepsi | descriptionShort is Pepsi Soda Cola 12 Fl Oz | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
2
+ Pepsi Soda Cola 12 Fl Oz 12 Ct:brand is Pepsi | descriptionShort is Pepsi Soda Cola 12 Fl Oz 12 Ct | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
3
+ PepsiSodaCola12FlOz24Ct:brand is Pepsi | descriptionShort is PepsiSodaCola12FlOz24Ct | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Caramel Color, Sugar, Phosphoric Acid, Caffeine, Citric Acid, Natural Flavors.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
4
+ Diet Pepsi Soda Cola 12 Fl Oz:brand is Pepsi | descriptionShort is Diet Pepsi Soda Cola 12 Fl Oz | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, CARAMEL COLOR, PHOSPHORIC ACID, POTASSIUM BENZOATE (PRESERVES FRESHNESS), SUCRALOSE, ACESULFAME POTASSIUM, CAFFEINE, NATURAL FLAVOR, CITRIC ACID.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
5
+ Diet Pepsi Soda Cola 12Flz24Ct:brand is Pepsi | descriptionShort is Diet Pepsi Soda Cola 12Flz24Ct | marketingMessage is Light. Crisp. Refreshing, 0 calories, and now aspartame free. Nothing refreshes like a Diet Pepsi | functionalName is Soda | ingredients is ['CARBONATED WATER, CARAMEL COLOR, PHOSPHORIC ACID, POTASSIUM BENZOATE (PRESERVES FRESHNESS), SUCRALOSE, ACESULFAME POTASSIUM, CAFFEINE, NATURAL FLAVOR, CITRIC ACID. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
6
+ Mtn Dew Cd Red Dw Chry 12FlOz:brand is Mtn Dew | descriptionShort is Mtn Dew Cd Red Dw Chry 12FlOz | marketingMessage is Mtn Dew Code Red with a rush of Cherry flavor. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CONCENTRATED ORANGE JUICE, CITRIC ACID, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), SODIUM BENZOATE (PRESERVES FRESHNESS), NATURAL FLAVOR, CAFFEINE, SODIUM CITRATE, GUM ARABIC, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), RED 40, YELLOW 5, BLUE 1. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
7
+ H2Oh! SWB Original 20 Fl Oz:brand is H2Oh! | descriptionShort is H2Oh! SWB Original 20 Fl Oz | marketingMessage is H20H! is a delicious and refreshing, 0 calorie sparkling water beverage infused with natural flavors. | functionalName is Sparkling Water Beverage | ingredients is ['CARBONATED WATER.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Sparkling Water Beverage
8
+ H2Oh! SpklgWtBvg Berry 20 FlOz:brand is H2Oh! | descriptionShort is H2Oh! SpklgWtBvg Berry 20 FlOz | marketingMessage is H20H! is a delicious and refreshing, 0 calorie sparkling water beverage infused with natural flavors. | functionalName is Sparkling Water Beverage | ingredients is ['CARBONATED WATER, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Sparkling Water Beverage
9
+ Pepsi Caffeine Free 20 Fl Oz Pl Bt:brand is Pepsi | descriptionShort is Pepsi Caffeine Free 20 Fl Oz Pl Bt | marketingMessage is Caffeine Free Pepsi - great Pepsi taste without the caffeine | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
10
+ Mtn Dew Soda 12FlOz 6Pk:brand is Mtn Dew | descriptionShort is Mtn Dew Soda 12FlOz 6Pk | marketingMessage is Mtn Dew exhilarates and quenches with its one of a kind, bold taste. | functionalName is Soda | ingredients is ['Ingredients: Carbonated Water,\xa0High Fructose Corn Syrup, Concentrated Orange Juice,\xa0Citric Acid,\xa0Natural Flavor,\xa0Sodium Benzoate (Preserves Freshness),\xa0Caffeine,\xa0Sodium Citrate,\xa0Erythorbic Acid (Preserves Freshness),\xa0Gum Arabic,\xa0Calcium Disodium EDTA (To Protect Flavor),\xa0Brominated Vegetable Oil,\xa0Yellow 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
11
+ Mtn Dew Soda Citrus 12 Fl Oz:brand is Mtn Dew | descriptionShort is Mtn Dew Soda Citrus 12 Fl Oz | marketingMessage is Mtn Dew exhilarates and quenches with its one of a kind, bold taste. | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Concentrated Orange Juice, Citric Acid, Natural Flavor, Sodium Benzoate (Preserves Freshness), Caffeine, Sodium Citrate, Erythorbic Acid (Preserves Freshness), Gum Arabic, Calcium Disodium EDTA (To Protect Flavor), Yellow 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
12
+ Mtn Dew Sda Citrus 12 FlOz 24Ct:brand is Mtn Dew | descriptionShort is Mtn Dew Sda Citrus 12 FlOz 24Ct | marketingMessage is The original, the one that started it all. MTN DEW exhilarates and quenches with its one of a kind taste. | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Concentrated Orange Juice, Citric Acid, Natural Flavor, Sodium Benzoate (Preserves Freshness), Caffeine, Sodium Citrate, Gum Arabic, Erythorbic Acid (Preserves Freshness), Calcium Disodium EDTA (To Protect Flavor), Yellow 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
13
+ Dole 100% Apple Juice 10 Fl Oz:brand is Dole | descriptionShort is Dole 100% Apple Juice 10 Fl Oz | marketingMessage is Now you can enjoy the sweet taste of our sun-ripened fruits in a tall, refreshing glass. Dole juices are ready to drink right out of your refrigerator ' 100% juice and no added sugar. | functionalName is Juice Beverage | ingredients is ['APPLE JUICE CONCENTRATE (FILTERED WATER AND CONCENTRATED APPLE JUICE), ASCORBIC ACID (VITAMIN C), NATURAL FLAVORS, VITAMIN E ACETATE (VITAMIN E) AND NIACINAMIDE (VITAMIN B3). '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Juice Beverage
14
+ Pepsi Soda Cola 2 L:brand is Pepsi | descriptionShort is Pepsi Soda Cola 2 L | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
15
+ Pepsi Soda Cola 20 Fl Oz:brand is Pepsi | descriptionShort is Pepsi Soda Cola 20 Fl Oz | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
16
+ Mtn Dew 20 Fl Oz:brand is Mtn Dew | descriptionShort is Mtn Dew 20 Fl Oz | marketingMessage is The original, the one that started it all. MTN DEW exhilarates and quenches with its one of a kind taste. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, SODIUM BENZOATE (PRESERVES FRESHNESS), CAFFEINE, SODIUM CITRATE, GUM ARABIC, ERYTHORBIC ACID (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
17
+ Mtn Dew Diet Sda Citrus 20FlOz:brand is Mtn Dew | descriptionShort is Mtn Dew Diet Sda Citrus 20FlOz | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
18
+ H2Oh! SWB Lemon-Lime 20 Fl Oz:brand is H2Oh! | descriptionShort is H2Oh! SWB Lemon-Lime 20 Fl Oz | marketingMessage is H20H! is a delicious and refreshing, 0 calorie sparkling water beverage infused with natural flavors. | functionalName is Sparkling Water Beverage | ingredients is ['CARBONATED WATER, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Sparkling Water Beverage
19
+ Aquafina Purified Dnkg Wtr 33.8FlOz:brand is Aquafina | descriptionShort is Aquafina Purified Dnkg Wtr 33.8FlOz | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Purified Drinking Water | ingredients is ['PURIFIED WATER. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Purified Drinking Water
20
+ Aquafina Water 20 Fl Oz:brand is Aquafina | descriptionShort is Aquafina Water 20 Fl Oz | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Purified Drinking Water | ingredients is ['PURIFIED WATER. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Purified Drinking Water
21
+ Aquafina Water 12FlOz 6Pk:brand is Aquafina | descriptionShort is Aquafina Water 12FlOz 6Pk | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Packaged Water | ingredients is ['PURIFIED WATER.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Packaged Water
22
+ Mtn Dew Diet Soda Citrus 12 Fl Oz:brand is Mtn Dew | descriptionShort is Mtn Dew Diet Soda Citrus 12 Fl Oz | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
23
+ Diet Mtn Dew LwCl DWCtrs 12FlOz12Ct:brand is Mtn Dew | descriptionShort is Diet Mtn Dew LwCl DWCtrs 12FlOz12Ct | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
24
+ Mtn Dew Diet Ctrs 12 FlOz 24Ct:brand is Mtn Dew | descriptionShort is Mtn Dew Diet Ctrs 12 FlOz 24Ct | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
25
+ Starbucks Espresso & Crm 6.5FlOz:brand is Starbucks | descriptionShort is Starbucks Espresso & Crm 6.5FlOz | marketingMessage is A Premium Ready-to-Drink Coffee Beverage. Rich, bold Starbucks espresso, just the right amount of cream and a double dose of “done and done.” | functionalName is Premium Espresso Beverage | ingredients is ['Brewed Espresso Coffee (Water, Coffee), Reduced-Fat Milk, Sugar, Cream, Skim Milk.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Premium Espresso Beverage
26
+ Starbucks Frap Coffee 13.7FlOz:brand is Starbucks | descriptionShort is Starbucks Frap Coffee 13.7FlOz | marketingMessage is Starbucks coffee drinks offer the bold, delicious taste of coffee with the rich flavors you know and love. This indulgence is proof that you can enjoy a little Starbucks wherever you may be. | functionalName is Chilled Coffee Drink | ingredients is ['Brewed Starbucks Coffee (Water, Coffee), Reduced-Fat Milk, Sugar, Maltodextrin, Pectin.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Glass'}}] | sourceType is Chilled Coffee Drink
27
+ Dole Apple Juice 15.2 Fl Oz:brand is Dole | descriptionShort is Dole Apple Juice 15.2 Fl Oz | marketingMessage is | functionalName is Juice 100% | ingredients is | packagingInformation is [{'packagingTypeCode': 'Bottle'}] | sourceType is Juice 100%
28
+ Brisk Iced Tea 1 Liter Plastic Bot:brand is Brisk | descriptionShort is Brisk Iced Tea 1 Liter Plastic Bot | marketingMessage is OK, we admit it. We scoured the South for the boldest Sweet Tea Recipe we could find. Then we stole it. Once you taste our Brisk Sweet Tea, we think you'll agree - the buckshot in our fenders was worth it. | functionalName is Iced Tea | ingredients is ['WATER, HIGH FRUCTOSE CORN SYRUP, PHOSPHORIC ACID, TEA POWDER, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), NATURAL FLAVOR, CARAMEL COLOR, POTASSIUM SORBATE (PRESERVES FRESHNESS), POTASSIUM BENZOATE (PRESERVES FRESHNESS), ACESULFAME POTASSIUM, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), SUCRALOSE, RED 40.'] | packagingInformation is [{'packagingTypeCode': 'Bottle'}] | sourceType is Iced Tea
29
+ Aquafina Water 50.7 Fl Oz:brand is Aquafina | descriptionShort is Aquafina Water 50.7 Fl Oz | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Purified Drinking Water | ingredients is ['PURIFIED WATER.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Purified Drinking Water
30
+ Brisk Green Tea Apple 12FZ:brand is Brisk | descriptionShort is Brisk Green Tea Apple 12FZ | marketingMessage is Brisk has become the badge of creative hustlers everywhere. By bringing in-your-face flavors and vibrant colors to the juice drink and iced tea game we help you make your mark on the world. | functionalName is Iced Tea | ingredients is ['WATER, HIGH FRUCTOSE CORN SYRUP, CITRIC ACID, APPLE JUICE CONCENTRATE, SODIUM HEXAMETAPHOSPHATE (TO PROTECT FLAVOR), NATURAL FLAVOR, INSTANT TEA, GUM ARABIC, POTASSIUM SORBATE (PRESERVES FRESHNESS), CARAMEL COLOR, GLYCEROL ESTER OF ROSIN, CITRUS PECTIN, ACESULFAME POTASSIUM, SUCRALOSE, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), RED 40.'] | packagingInformation is [{'packagingTypeCode': 'Can'}] | sourceType is Iced Tea
31
+ Brisk JuiceDrink Lemnde 12FlOz:brand is Brisk | descriptionShort is Brisk JuiceDrink Lemnde 12FlOz | marketingMessage is Blast your Thirst with the bold, fruit flavored taste of Brisk Juice Drinks. Available in a wide variety of great tasting fruit flavors. | functionalName is Juice Drink | ingredients is ['WATER, HIGH FRUCTOSE CORN SYRUP, CITRIC ACID, LEMON JUICE CONCENTRATE, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), GUM ARABIC, POTASSIUM BENZOATE (PRESERVE FRESHNESS), POTASSIUM SORBATE (PRESERVE FRESHNESS), SALT, GLYCEROL ESTER OF ROSIN, NATURAL FLAVOR, ACESULFAME POTASSIUM, SUCRALOSE, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Juice Drink
32
+ Mtn Dew Code Rd Chry 20 Fl Oz:brand is Mtn Dew | descriptionShort is Mtn Dew Code Rd Chry 20 Fl Oz | marketingMessage is Mtn Dew Code Red with a rush of Cherry flavor. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CONCENTRATED ORANGE JUICE, CITRIC ACID, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), SODIUM BENZOATE (PRESERVES FRESHNESS), NATURAL FLAVOR, CAFFEINE, SODIUM CITRATE, GUM ARABIC, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), RED 40, YELLOW 5, BLUE 1. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
33
+ Mtn Dew Soda 21FlOz12Ct:brand is Mtn Dew | descriptionShort is Mtn Dew Soda 21FlOz12Ct | marketingMessage is Mtn Dew exhilarates and quenches with its one of a kind, bold taste. The original, the one that started it all'Mtn Dew. | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Concentrated Orange Juice, Citric Acid, Natural Flavor, Sodium Benzoate (Preserves Freshness), Caffeine, Sodium Citrate, Gum Arabic, Erythorbic Acid (Preserves Freshness), Calcium Disodium Edta (To Protect Flavor), Yellow 5.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
34
+ Mtn Dew Soda Citrus 2L:brand is Mtn Dew | descriptionShort is Mtn Dew Soda Citrus 2L | marketingMessage is The original, the one that started it all. MTN DEW exhilarates and quenches with its one of a kind taste. | functionalName is Soda | ingredients is ['Carbonated Water, High Fructose Corn Syrup, Concentrated Orange Juice, Citric Acid, Natural Flavor, Sodium Benzoate (Preserves Freshness), Caffeine, Sodium Citrate, Gum Arabic, Erythorbic Acid (Preserves Freshness), Calcium Disodium EDTA (to Protect Flavor), Yellow 5.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Soda
35
+ Pepsi Soda Cola 8 Fl Oz:brand is Pepsi | descriptionShort is Pepsi Soda Cola 8 Fl Oz | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
36
+ Caleb's Kola 10 Fluid Ounce:brand is Caleb's Kola | descriptionShort is Caleb's Kola 10 Fluid Ounce | marketingMessage is A unique kola crafted by a passionate group of kola lovers using ingredients from around the world. | functionalName is Soda | ingredients is ['SPARKLING WATER, CANE SUGAR, CARAMEL COLOR, PHOSPHORIC ACID, NATURAL FLAVOR, SODIUM CITRATE, CAFFEINE, GUM ARABIC, CITRIC ACID, KOLA NUT EXTRACT.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Glass'}}] | sourceType is Soda
37
+ Starbucks Frapp Chcfdr Crml 9.5FlOz:brand is Starbucks | descriptionShort is Starbucks Frapp Chcfdr Crml 9.5FlOz | marketingMessage is Pop the cap. Savor the sip. Go. Caramel Frappuccino® chilled coffee drink is a harmonious blend of Starbucks® coffee and creamy milk swirling with carmelly flavor. For a sweet burst of delicious. | functionalName is Chilled Coffee Drink | ingredients is ['Brewed Starbucks Coffee (Water, Coffee), Reduced-Fat Milk, Sugar, Maltodextrin, Natural Flavors, Pectin.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Glass'}}] | sourceType is Chilled Coffee Drink
38
+ Brisk JuiceDrink Lemnde 20FlOz:brand is Brisk | descriptionShort is Brisk JuiceDrink Lemnde 20FlOz | marketingMessage is Brisk has become the badge of creative hustlers everywhere. By bringing in-your-face flavors and vibrant colors to the juice drink and iced tea game we help you make your mark on the world. | functionalName is Juice Drink | ingredients is ['WATER, HIGH FRUCTOSE CORN SYRUP, CITRIC ACID, LEMON JUICE CONCENTRATE, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), GUM ARABIC, POTASSIUM BENZOATE (PRESERVE FRESHNESS), POTASSIUM SORBATE (PRESERVE FRESHNESS), SALT, GLYCEROL ESTER OF ROSIN, NATURAL FLAVOR, ACESULFAME POTASSIUM, SUCRALOSE, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Juice Drink
39
+ Brisk Iced Tea Lemon 24FO 6Pk:brand is Brisk | descriptionShort is Brisk Iced Tea Lemon 24FO 6Pk | marketingMessage is The original iced tea with tons of attitude. The one with the bold lemon flavor that kicked iced tea off the back porch and gave it some street cred. Now that's Brisk, baby! | functionalName is Iced Tea | ingredients is ['INGREDIENTS: WATER, HIGH FRUCTOSE CORN SYRUP, CITRIC ACID, INSTANT TEA, SODIUM HEXAMETAPHOSPHATE (TO PROTECT FLAVOR), NATURAL FLAVOR, PHOSPHORIC ACID, SODIUM BENZOATE (PRESERVES FRESHNESS), POTASSIUM SORBATE (PRESERVES FRESHNESS), CARAMEL COLOR, ACESULFAME POTASSIUM, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), SUCRALOSE, RED 40.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Iced Tea
40
+ H2Oh! SWB Orange 20FlOz:brand is H2Oh! | descriptionShort is H2Oh! SWB Orange 20FlOz | marketingMessage is H20H! is a delicious and refreshing, 0 calorie sparkling water beverage infused with natural flavors. | functionalName is Sparkling Water Beverage | ingredients is ['CARBONATED WATER, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Sparkling Water Beverage
41
+ PEPSI 12OZPLSGL:brand is Pepsi | descriptionShort is PEPSI 12OZPLSGL | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle'}] | sourceType is Soda
42
+ Pepsi Soda Cola 1 L Bottle:brand is Pepsi | descriptionShort is Pepsi Soda Cola 1 L Bottle | marketingMessage is Pepsi - the bold, refreshing, robust cola | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, CAFFEINE, CITRIC ACID, NATURAL FLAVOR.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Soda
43
+ Mtn Dew Soda Citrus 1L:brand is Mtn Dew | descriptionShort is Mtn Dew Soda Citrus 1L | marketingMessage is Mtn Dew exhilarates and quenches with its one of a kind, bold taste. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, SODIUM BENZOATE (PRESERVES FRESHNESS), CAFFEINE, SODIUM CITRATE, GUM ARABIC, ERYTHORBIC ACID (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
44
+ Pepsi Soda Wild Cherry 12 FlOz:brand is Pepsi | descriptionShort is Pepsi Soda Wild Cherry 12 FlOz | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, NATURAL FLAVOR, CAFFEINE, CITRIC ACID. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Soda
45
+ Pepsi Wild Cherry 2 L:brand is Pepsi | descriptionShort is Pepsi Wild Cherry 2 L | marketingMessage is Pepsi - the bold, refreshing, robust cola. Live For Now. | functionalName is Soda | ingredients is ['CARBONATED WATER, HIGH FRUCTOSE CORN SYRUP, CARAMEL COLOR, SUGAR, PHOSPHORIC ACID, NATURAL FLAVOR, CAFFEINE, CITRIC ACID. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Soda
46
+ Brisk Diet IcdTea Lemn 12FlOz:brand is Brisk | descriptionShort is Brisk Diet IcdTea Lemn 12FlOz | marketingMessage is Brisk has become the badge of creative hustlers everywhere. By bringing in-your-face flavors and vibrant colors to the juice drink and iced tea game we help you make your mark on the world. | functionalName is Iced Tea | ingredients is ['WATER, CITRIC ACID, TEA POWDER, SODIUM POLYPHOSPHATES (TO PROTECT FLAVOR), NATURAL FLAVOR, CARAMEL COLOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), POTASSIUM SORBATE (PRESERVES FRESHNESS), ASPARTAME, ACESULFAME POTASSIUM, CITRUS PECTIN, CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), RED 40. '] | packagingInformation is [{'packagingTypeCode': 'Can', 'packagingMaterial': {'packagingMaterialTypeCode': 'Aluminum'}}] | sourceType is Iced Tea
47
+ Aquafina Water 16.9 Fl Oz 6 pack:brand is Aquafina | descriptionShort is Aquafina Water 16.9 Fl Oz 6 pack | marketingMessage is Fresh and pure, Aquafina is the perfect companion for happy bodies everywhere. | functionalName is Purified Drinking Water | ingredients is ['PURIFIED WATER.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Plastic Other'}}] | sourceType is Purified Drinking Water
48
+ Mtn Dew Diet Soda Citrus 1 L:brand is Mtn Dew | descriptionShort is Mtn Dew Diet Soda Citrus 1 L | marketingMessage is All the great, exhilarating taste of Mtn Dew, without the calories. The Only Diet With Dew In It. | functionalName is Soda | ingredients is ['CARBONATED WATER, CONCENTRATED ORANGE JUICE, CITRIC ACID, NATURAL FLAVOR, POTASSIUM BENZOATE (PRESERVES FRESHNESS), CITRUS PECTIN, ASPARTAME, POTASSIUM CITRATE, CAFFEINE, SODIUM CITRATE, ACESULFAME POTASSIUM, SUCRALOSE, GUM ARABIC, SODIUM BENZOATE (PRESERVES FRESHNESS), CALCIUM DISODIUM EDTA (TO PROTECT FLAVOR), YELLOW 5. '] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Polyethylene Terephthalate (PET)'}}] | sourceType is Soda
49
+ Starbucks Frpp ClcofDr MhLt 9.5FlOz:brand is Starbucks | descriptionShort is Starbucks Frpp ClcofDr MhLt 9.5FlOz | marketingMessage is Pop the cap. Savor the sip. Go. Mocha Light Frappuccino® chilled coffee drink is a harmonious blend of Starbucks® coffee and milk with chocolaty mocha'and 100 calories per bottle. For a chill that only tastes decadent. | functionalName is Chilled Coffee Drink | ingredients is ['Ingredients: Brewed Starbucks Coffee (Water, Coffee), Reduced-Fat Milk, Sugar, Cocoa, Pectin, Sucralose, Acesulfame Potassium.'] | packagingInformation is [{'packagingTypeCode': 'Bottle', 'packagingMaterial': {'packagingMaterialTypeCode': 'Glass'}}] | sourceType is Chilled Coffee Drink