Spaces:
Runtime error
Runtime error
update
Browse files
- imagebind/models/image_bind.py +141 -141
imagebind/models/image_bind.py
CHANGED
@@ -269,12 +269,12 @@ class ImageBindModel(nn.Module):
|
|
269 |
depth_stem=None,
|
270 |
)
|
271 |
|
272 |
-
text_preprocessor = TextPreprocessor(
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
)
|
278 |
|
279 |
audio_stem = PatchEmbedGeneric(
|
280 |
proj_stem=[
|
@@ -295,73 +295,73 @@ class ImageBindModel(nn.Module):
|
|
295 |
audio_stem=audio_stem,
|
296 |
)
|
297 |
|
298 |
-
depth_stem = PatchEmbedGeneric(
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
)
|
310 |
-
|
311 |
-
depth_preprocessor = RGBDTPreprocessor(
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
)
|
318 |
-
|
319 |
-
thermal_stem = PatchEmbedGeneric(
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
)
|
331 |
-
thermal_preprocessor = ThermalPreprocessor(
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
)
|
337 |
-
|
338 |
-
imu_stem = PatchEmbedGeneric(
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
)
|
348 |
-
|
349 |
-
imu_preprocessor = IMUPreprocessor(
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
)
|
357 |
|
358 |
modality_preprocessors = {
|
359 |
ModalityType.VISION: rgbt_preprocessor,
|
360 |
-
ModalityType.TEXT: text_preprocessor,
|
361 |
ModalityType.AUDIO: audio_preprocessor,
|
362 |
-
ModalityType.DEPTH: depth_preprocessor,
|
363 |
-
ModalityType.THERMAL: thermal_preprocessor,
|
364 |
-
ModalityType.IMU: imu_preprocessor,
|
365 |
}
|
366 |
|
367 |
return nn.ModuleDict(modality_preprocessors)
|
@@ -424,14 +424,14 @@ class ImageBindModel(nn.Module):
|
|
424 |
add_bias_kv=False,
|
425 |
drop_path=0.0,
|
426 |
)
|
427 |
-
modality_trunks[ModalityType.TEXT] = instantiate_trunk(
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
)
|
435 |
modality_trunks[ModalityType.AUDIO] = instantiate_trunk(
|
436 |
audio_embed_dim,
|
437 |
audio_num_blocks,
|
@@ -440,30 +440,30 @@ class ImageBindModel(nn.Module):
|
|
440 |
add_bias_kv=True,
|
441 |
drop_path=audio_drop_path,
|
442 |
)
|
443 |
-
modality_trunks[ModalityType.DEPTH] = instantiate_trunk(
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
)
|
451 |
-
modality_trunks[ModalityType.THERMAL] = instantiate_trunk(
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
)
|
459 |
-
modality_trunks[ModalityType.IMU] = instantiate_trunk(
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
)
|
467 |
|
468 |
return nn.ModuleDict(modality_trunks)
|
469 |
|
@@ -486,12 +486,12 @@ class ImageBindModel(nn.Module):
|
|
486 |
nn.Linear(vision_embed_dim, out_embed_dim, bias=False),
|
487 |
)
|
488 |
|
489 |
-
modality_heads[ModalityType.TEXT] = SelectEOSAndProject(
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
)
|
495 |
|
496 |
modality_heads[ModalityType.AUDIO] = nn.Sequential(
|
497 |
nn.LayerNorm(normalized_shape=audio_embed_dim, eps=1e-6),
|
@@ -499,24 +499,24 @@ class ImageBindModel(nn.Module):
|
|
499 |
nn.Linear(audio_embed_dim, out_embed_dim, bias=False),
|
500 |
)
|
501 |
|
502 |
-
modality_heads[ModalityType.DEPTH] = nn.Sequential(
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
)
|
507 |
-
|
508 |
-
modality_heads[ModalityType.THERMAL] = nn.Sequential(
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
)
|
513 |
-
|
514 |
-
modality_heads[ModalityType.IMU] = nn.Sequential(
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
)
|
520 |
|
521 |
return nn.ModuleDict(modality_heads)
|
522 |
|
@@ -524,25 +524,25 @@ class ImageBindModel(nn.Module):
|
|
524 |
modality_postprocessors = {}
|
525 |
|
526 |
modality_postprocessors[ModalityType.VISION] = Normalize(dim=-1)
|
527 |
-
modality_postprocessors[ModalityType.TEXT] = nn.Sequential(
|
528 |
-
|
529 |
-
)
|
530 |
modality_postprocessors[ModalityType.AUDIO] = nn.Sequential(
|
531 |
Normalize(dim=-1),
|
532 |
LearnableLogitScaling(logit_scale_init=20.0, learnable=False),
|
533 |
)
|
534 |
-
modality_postprocessors[ModalityType.DEPTH] = nn.Sequential(
|
535 |
-
|
536 |
-
|
537 |
-
)
|
538 |
-
modality_postprocessors[ModalityType.THERMAL] = nn.Sequential(
|
539 |
-
|
540 |
-
|
541 |
-
)
|
542 |
-
modality_postprocessors[ModalityType.IMU] = nn.Sequential(
|
543 |
-
|
544 |
-
|
545 |
-
)
|
546 |
|
547 |
return nn.ModuleDict(modality_postprocessors)
|
548 |
|
@@ -612,7 +612,7 @@ def imagebind_huge(pretrained=False, freeze_imagebind=False, with_head=True, use
|
|
612 |
progress=True,
|
613 |
)
|
614 |
|
615 |
-
model.load_state_dict(torch.load(".checkpoints/imagebind_huge.pth"))
|
616 |
|
617 |
if use_blip_vision:
|
618 |
from bubogpt.models.eva_vit import create_eva_vit_g
|
|
|
269 |
depth_stem=None,
|
270 |
)
|
271 |
|
272 |
+
# text_preprocessor = TextPreprocessor(
|
273 |
+
# context_length=77,
|
274 |
+
# vocab_size=49408,
|
275 |
+
# embed_dim=text_embed_dim,
|
276 |
+
# causal_masking=True,
|
277 |
+
# )
|
278 |
|
279 |
audio_stem = PatchEmbedGeneric(
|
280 |
proj_stem=[
|
|
|
295 |
audio_stem=audio_stem,
|
296 |
)
|
297 |
|
298 |
+
# depth_stem = PatchEmbedGeneric(
|
299 |
+
# [
|
300 |
+
# nn.Conv2d(
|
301 |
+
# kernel_size=depth_kernel_size,
|
302 |
+
# in_channels=1,
|
303 |
+
# out_channels=depth_embed_dim,
|
304 |
+
# stride=depth_kernel_size,
|
305 |
+
# bias=False,
|
306 |
+
# ),
|
307 |
+
# ],
|
308 |
+
# norm_layer=nn.LayerNorm(normalized_shape=depth_embed_dim),
|
309 |
+
# )
|
310 |
+
#
|
311 |
+
# depth_preprocessor = RGBDTPreprocessor(
|
312 |
+
# img_size=[1, 224, 224],
|
313 |
+
# num_cls_tokens=1,
|
314 |
+
# pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
|
315 |
+
# rgbt_stem=None,
|
316 |
+
# depth_stem=depth_stem,
|
317 |
+
# )
|
318 |
+
#
|
319 |
+
# thermal_stem = PatchEmbedGeneric(
|
320 |
+
# [
|
321 |
+
# nn.Conv2d(
|
322 |
+
# kernel_size=thermal_kernel_size,
|
323 |
+
# in_channels=1,
|
324 |
+
# out_channels=thermal_embed_dim,
|
325 |
+
# stride=thermal_kernel_size,
|
326 |
+
# bias=False,
|
327 |
+
# ),
|
328 |
+
# ],
|
329 |
+
# norm_layer=nn.LayerNorm(normalized_shape=thermal_embed_dim),
|
330 |
+
# )
|
331 |
+
# thermal_preprocessor = ThermalPreprocessor(
|
332 |
+
# img_size=[1, 224, 224],
|
333 |
+
# num_cls_tokens=1,
|
334 |
+
# pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
|
335 |
+
# thermal_stem=thermal_stem,
|
336 |
+
# )
|
337 |
+
#
|
338 |
+
# imu_stem = PatchEmbedGeneric(
|
339 |
+
# [
|
340 |
+
# nn.Linear(
|
341 |
+
# in_features=48,
|
342 |
+
# out_features=imu_embed_dim,
|
343 |
+
# bias=False,
|
344 |
+
# ),
|
345 |
+
# ],
|
346 |
+
# norm_layer=nn.LayerNorm(normalized_shape=imu_embed_dim),
|
347 |
+
# )
|
348 |
+
#
|
349 |
+
# imu_preprocessor = IMUPreprocessor(
|
350 |
+
# img_size=[6, 2000],
|
351 |
+
# num_cls_tokens=1,
|
352 |
+
# kernel_size=8,
|
353 |
+
# embed_dim=imu_embed_dim,
|
354 |
+
# pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
|
355 |
+
# imu_stem=imu_stem,
|
356 |
+
# )
|
357 |
|
358 |
modality_preprocessors = {
|
359 |
ModalityType.VISION: rgbt_preprocessor,
|
360 |
+
# ModalityType.TEXT: text_preprocessor,
|
361 |
ModalityType.AUDIO: audio_preprocessor,
|
362 |
+
# ModalityType.DEPTH: depth_preprocessor,
|
363 |
+
# ModalityType.THERMAL: thermal_preprocessor,
|
364 |
+
# ModalityType.IMU: imu_preprocessor,
|
365 |
}
|
366 |
|
367 |
return nn.ModuleDict(modality_preprocessors)
|
|
|
424 |
add_bias_kv=False,
|
425 |
drop_path=0.0,
|
426 |
)
|
427 |
+
# modality_trunks[ModalityType.TEXT] = instantiate_trunk(
|
428 |
+
# text_embed_dim,
|
429 |
+
# text_num_blocks,
|
430 |
+
# text_num_heads,
|
431 |
+
# pre_transformer_ln=False,
|
432 |
+
# add_bias_kv=False,
|
433 |
+
# drop_path=0.0,
|
434 |
+
# )
|
435 |
modality_trunks[ModalityType.AUDIO] = instantiate_trunk(
|
436 |
audio_embed_dim,
|
437 |
audio_num_blocks,
|
|
|
440 |
add_bias_kv=True,
|
441 |
drop_path=audio_drop_path,
|
442 |
)
|
443 |
+
# modality_trunks[ModalityType.DEPTH] = instantiate_trunk(
|
444 |
+
# depth_embed_dim,
|
445 |
+
# depth_num_blocks,
|
446 |
+
# depth_num_heads,
|
447 |
+
# pre_transformer_ln=False,
|
448 |
+
# add_bias_kv=True,
|
449 |
+
# drop_path=depth_drop_path,
|
450 |
+
# )
|
451 |
+
# modality_trunks[ModalityType.THERMAL] = instantiate_trunk(
|
452 |
+
# thermal_embed_dim,
|
453 |
+
# thermal_num_blocks,
|
454 |
+
# thermal_num_heads,
|
455 |
+
# pre_transformer_ln=False,
|
456 |
+
# add_bias_kv=True,
|
457 |
+
# drop_path=thermal_drop_path,
|
458 |
+
# )
|
459 |
+
# modality_trunks[ModalityType.IMU] = instantiate_trunk(
|
460 |
+
# imu_embed_dim,
|
461 |
+
# imu_num_blocks,
|
462 |
+
# imu_num_heads,
|
463 |
+
# pre_transformer_ln=False,
|
464 |
+
# add_bias_kv=True,
|
465 |
+
# drop_path=imu_drop_path,
|
466 |
+
# )
|
467 |
|
468 |
return nn.ModuleDict(modality_trunks)
|
469 |
|
|
|
486 |
nn.Linear(vision_embed_dim, out_embed_dim, bias=False),
|
487 |
)
|
488 |
|
489 |
+
# modality_heads[ModalityType.TEXT] = SelectEOSAndProject(
|
490 |
+
# proj=nn.Sequential(
|
491 |
+
# nn.LayerNorm(normalized_shape=text_embed_dim, eps=1e-6),
|
492 |
+
# nn.Linear(text_embed_dim, out_embed_dim, bias=False),
|
493 |
+
# )
|
494 |
+
# )
|
495 |
|
496 |
modality_heads[ModalityType.AUDIO] = nn.Sequential(
|
497 |
nn.LayerNorm(normalized_shape=audio_embed_dim, eps=1e-6),
|
|
|
499 |
nn.Linear(audio_embed_dim, out_embed_dim, bias=False),
|
500 |
)
|
501 |
|
502 |
+
# modality_heads[ModalityType.DEPTH] = nn.Sequential(
|
503 |
+
# nn.LayerNorm(normalized_shape=depth_embed_dim, eps=1e-6),
|
504 |
+
# SelectElement(index=0) if use_selection else nn.Identity(),
|
505 |
+
# nn.Linear(depth_embed_dim, out_embed_dim, bias=False),
|
506 |
+
# )
|
507 |
+
#
|
508 |
+
# modality_heads[ModalityType.THERMAL] = nn.Sequential(
|
509 |
+
# nn.LayerNorm(normalized_shape=thermal_embed_dim, eps=1e-6),
|
510 |
+
# SelectElement(index=0) if use_selection else nn.Identity(),
|
511 |
+
# nn.Linear(thermal_embed_dim, out_embed_dim, bias=False),
|
512 |
+
# )
|
513 |
+
#
|
514 |
+
# modality_heads[ModalityType.IMU] = nn.Sequential(
|
515 |
+
# nn.LayerNorm(normalized_shape=imu_embed_dim, eps=1e-6),
|
516 |
+
# SelectElement(index=0) if use_selection else nn.Identity(),
|
517 |
+
# nn.Dropout(p=0.5),
|
518 |
+
# nn.Linear(imu_embed_dim, out_embed_dim, bias=False),
|
519 |
+
# )
|
520 |
|
521 |
return nn.ModuleDict(modality_heads)
|
522 |
|
|
|
524 |
modality_postprocessors = {}
|
525 |
|
526 |
modality_postprocessors[ModalityType.VISION] = Normalize(dim=-1)
|
527 |
+
# modality_postprocessors[ModalityType.TEXT] = nn.Sequential(
|
528 |
+
# Normalize(dim=-1), LearnableLogitScaling(learnable=True)
|
529 |
+
# )
|
530 |
modality_postprocessors[ModalityType.AUDIO] = nn.Sequential(
|
531 |
Normalize(dim=-1),
|
532 |
LearnableLogitScaling(logit_scale_init=20.0, learnable=False),
|
533 |
)
|
534 |
+
# modality_postprocessors[ModalityType.DEPTH] = nn.Sequential(
|
535 |
+
# Normalize(dim=-1),
|
536 |
+
# LearnableLogitScaling(logit_scale_init=5.0, learnable=False),
|
537 |
+
# )
|
538 |
+
# modality_postprocessors[ModalityType.THERMAL] = nn.Sequential(
|
539 |
+
# Normalize(dim=-1),
|
540 |
+
# LearnableLogitScaling(logit_scale_init=10.0, learnable=False),
|
541 |
+
# )
|
542 |
+
# modality_postprocessors[ModalityType.IMU] = nn.Sequential(
|
543 |
+
# Normalize(dim=-1),
|
544 |
+
# LearnableLogitScaling(logit_scale_init=5.0, learnable=False),
|
545 |
+
# )
|
546 |
|
547 |
return nn.ModuleDict(modality_postprocessors)
|
548 |
|
|
|
612 |
progress=True,
|
613 |
)
|
614 |
|
615 |
+
model.load_state_dict(torch.load(".checkpoints/imagebind_huge.pth"), strict=False)
|
616 |
|
617 |
if use_blip_vision:
|
618 |
from bubogpt.models.eva_vit import create_eva_vit_g
|