BlackSamorez committed
Commit: bd1c718
Parent(s): 34d341b

small fixes and tokenizer config

configuration_yalm.py CHANGED
@@ -106,7 +106,7 @@ class YalmConfig(PretrainedConfig):
         self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
         self.activation_type = activation_type
         self.max_position_embeddings = max_position_embeddings
-        self.apply_residual_connection_post_layernorm = False
+        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
         self.initializer_range = initializer_range
         self.layernorm_epsilon = layernorm_epsilon
         self.attention_dropout = attention_dropout
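
With this change, YalmConfig keeps the apply_residual_connection_post_layernorm constructor argument instead of hardcoding the attribute to False. A minimal sketch of the effect, assuming the repo's configuration_yalm.py is importable locally:

from configuration_yalm import YalmConfig  # assumes the custom config module is on the path

# Previously this flag was silently overwritten with False regardless of the argument;
# after this commit the value passed to the constructor is preserved.
config = YalmConfig(apply_residual_connection_post_layernorm=True)
print(config.apply_residual_connection_post_layernorm)  # True
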
modeling_yalm.py → modelling_yalm.py RENAMED
@@ -327,7 +327,7 @@ class YalmSelfAttention(nn.Module):
         attention_scores += attention_mask
         attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
 
-        # attention_probs = self.attention_dropout(attention_probs)  # TODO: why the fuck no scale???
+        attention_probs = self.attention_dropout(attention_probs)  # TODO: why the fuck no scale???
 
         # =========================
         # Context layer. [sq, b, hp]
@@ -498,9 +498,9 @@ class YalmTransformerLayer(nn.Module):
         else:
             residual = hidden_states
 
-        # attention_output = torch.nn.functional.dropout(
-        #     attention_output, p=self.hidden_dropout, training=self.training  # TODO: why the fuck no scale???
-        # )
+        attention_output = torch.nn.functional.dropout(
+            attention_output, p=self.hidden_dropout, training=self.training  # TODO: why the fuck no scale???
+        )
         layernorm_input = attention_output + residual
 
         # Layer norm post the self attention.
@@ -510,9 +510,9 @@ class YalmTransformerLayer(nn.Module):
         mlp_output = self.mlp(layernorm_output)
         residual = layernorm_input
 
-        # mlp_output = torch.nn.functional.dropout(
-        #     mlp_output, p=self.hidden_dropout, training=self.training  # TODO: why the fuck no scale???
-        # )
+        mlp_output = torch.nn.functional.dropout(
+            mlp_output, p=self.hidden_dropout, training=self.training  # TODO: why the fuck no scale???
+        )
         output = mlp_output + residual
 
         if use_cache:
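
The re-enabled dropout calls rely on PyTorch's inverted-dropout behaviour: torch.nn.functional.dropout (and nn.Dropout) scales the surviving activations by 1 / (1 - p) at training time, so no extra rescaling is needed at inference, which is presumably what the TODO about scaling refers to. A standalone sketch of that behaviour, independent of the model code:

import torch
import torch.nn.functional as F

x = torch.ones(8)
# Inverted dropout: kept elements are scaled by 1 / (1 - p) during training ...
print(F.dropout(x, p=0.5, training=True))   # mixture of 0.0 and 2.0
# ... and the tensor passes through unchanged in eval mode.
print(F.dropout(x, p=0.5, training=False))  # all ones
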
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+{
+  "auto_map": {
+    "AutoTokenizer": ["tokenization_yalm.YalmTokenizer", null]
+  },
+  "tokenizer_class": "YalmTokenizer",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "unk_token": "<unk>"
+}
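
The auto_map entry lets AutoTokenizer resolve the custom YalmTokenizer class from tokenization_yalm.py when remote code is trusted. A rough usage sketch; the repository id below is a placeholder, not something stated in this commit:

from transformers import AutoTokenizer

# "BlackSamorez/yalm-100b" is a placeholder repo id; substitute the actual checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "BlackSamorez/yalm-100b",
    trust_remote_code=True,  # needed so auto_map can load tokenization_yalm.YalmTokenizer
)
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)  # <s> </s> <unk>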