More debug printing inside transformer encoder
models/GroundingDINO/transformer.py
```diff
@@ -560,7 +560,7 @@ class TransformerEncoder(nn.Module):
         """
 
         output = src
-
+        print("inside transformer encoder")
         # preparation and reshape
         if self.num_layers > 0:
             reference_points = self.get_reference_points(
@@ -591,8 +591,10 @@
             # if output.isnan().any() or memory_text.isnan().any():
             #     if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
             #         import ipdb; ipdb.set_trace()
+            print("layer_id: " + str(layer_id))
             if self.fusion_layers:
                 if self.use_checkpoint:
+                    print("using checkpoint")
                     output, memory_text = checkpoint.checkpoint(
                         self.fusion_layers[layer_id],
                         output,
@@ -600,24 +602,30 @@
                         key_padding_mask,
                         text_attention_mask,
                     )
+                    print("got checkpoint output")
                 else:
+                    print("not using checkpoint")
                     output, memory_text = self.fusion_layers[layer_id](
                         v=output,
                         l=memory_text,
                         attention_mask_v=key_padding_mask,
                         attention_mask_l=text_attention_mask,
                     )
+                    print("got fusion output")
 
             if self.text_layers:
+                print("getting text layers")
                 memory_text = self.text_layers[layer_id](
                     src=memory_text.transpose(0, 1),
                     src_mask=~text_self_attention_masks,  # note we use ~ for mask here
                     src_key_padding_mask=text_attention_mask,
                     pos=(pos_text.transpose(0, 1) if pos_text is not None else None),
                 ).transpose(0, 1)
+                print("got text output")
 
             # main process
             if self.use_transformer_ckpt:
+                print("use transformer ckpt")
                 output = checkpoint.checkpoint(
                     layer,
                     output,
@@ -627,7 +635,9 @@
                     level_start_index,
                     key_padding_mask,
                 )
+                print("got output")
             else:
+                print("not use transformer ckpt")
                 output = layer(
                     src=output,
                     pos=pos,
@@ -636,6 +646,7 @@
                     level_start_index=level_start_index,
                     key_padding_mask=key_padding_mask,
                 )
+                print("got output")
 
         return output, memory_text
 
```
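These prints fire once per encoder layer on every forward pass, so they will flood stdout during training. A minimal sketch of gating them behind an environment variable, following the `IPDB_SHILONG_DEBUG` pattern already commented out in this file; the `DEBUG_ENCODER` variable name and the `debug()` helper are illustrative, not part of GroundingDINO:

```python
# Hypothetical helper: gate the encoder debug prints behind an env var,
# mirroring the IPDB_SHILONG_DEBUG check commented out in the loop above.
# DEBUG_ENCODER and debug() are illustrative names, not GroundingDINO API.
import os

_DEBUG_ENCODER = os.environ.get("DEBUG_ENCODER", None) == "INFO"

def debug(msg: str) -> None:
    """Print only when the environment sets DEBUG_ENCODER=INFO."""
    if _DEBUG_ENCODER:
        print(msg)

# Each bare print in the loop would then become, e.g.:
#     debug("layer_id: " + str(layer_id))
```

Leaving the prints unguarded is harmless for a one-off trace, but note that `torch.utils.checkpoint.checkpoint` re-runs the wrapped module during the backward pass, so a print placed inside a checkpointed function (rather than around the call, as this commit does) would appear twice per training step.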