nielsr (HF staff) committed
Commit 0dc1d68 (1 parent: 8f61efb)

Add print statements

Files changed (1): modeling_cogvlm.py (+7 -2)
modeling_cogvlm.py CHANGED
@@ -117,7 +117,8 @@ def attention_fn(
         attention_mask: "torch.tensor(B, H, L, HD)",
         *,
         scaling_attention_score: bool = True,
-        attention_dropout: nn.Module = None
+        attention_dropout: nn.Module = None,
+        print_values: bool = False,
 ):
     attention_mask_bool = (attention_mask == 0)
     is_low_triangle = (attention_mask_bool == torch.ones_like(attention_mask_bool, dtype=torch.float).tril()).all()
@@ -126,6 +127,10 @@ def attention_fn(
         warnings.warn("It's recommended to use torch2.0 or higher.")
     if int(torch.__version__.split('.')[0]) >= 2 and scaling_attention_score and (is_full or is_low_triangle):
         dropout_p = 0. if attention_dropout is None or not attention_dropout.training else attention_dropout.p
+
+        if print_values:
+            print("Is_causal:", not is_full)
+
         return torch.nn.functional.scaled_dot_product_attention(
             query_layer, key_layer, value_layer,
             attn_mask=None,
@@ -302,7 +307,7 @@ class VisionExpertAttention(nn.Module):
 
         context_layer = attention_fn(
             query_layer=query_states, key_layer=key_states, value_layer=value_states, attention_mask=attention_mask,
-            scaling_attention_score=True, attention_dropout=None)
+            scaling_attention_score=True, attention_dropout=None, print_values=print_values)
 
         if print_values:
             print("Shape of context_layer:", context_layer.shape)