florian-hoenicke committed
Commit ab27e50
1 Parent(s): 82e3595

feat: push custom model

README.md CHANGED
@@ -11,15 +11,15 @@ tags:
 - feature-extraction
 - sentence-similarity
 - mteb
-- Science
-- Research
-- Academic
-- Papers
-- Arxiv
+- Ubuntu
+- Technical
+- Support
+- Linux
+- Community
 ---
-This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-en**](https://huggingface.co/jinaai/jina-embeddings-v2-base-en) designed for the following use case:
+This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-code**](https://huggingface.co/jinaai/jina-embeddings-v2-base-code) designed for the following use case:
 
-academic research papers search engine
+technical support for Ubuntu
 
 ## How to Use
 This model can be easily integrated into your NLP pipeline for tasks such as text classification, sentiment analysis, entity recognition, and more. Here's a simple example to get you started:
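The hunk ends before the README's actual snippet. A minimal sketch of the kind of usage it refers to, assuming the model is loaded through `transformers` with `trust_remote_code=True` (the repository id below is a placeholder, and the `encode` helper comes from the upstream Jina remote code):

```python
from transformers import AutoModel

# Placeholder repository id -- substitute the actual fine-tuned checkpoint.
model = AutoModel.from_pretrained(
    "fine-tuned/jina-embeddings-v2-base-code-ubuntu-support",
    trust_remote_code=True,  # needed for the custom JinaBert classes in auto_map
)

# The upstream Jina remote code exposes an `encode` helper that mean-pools
# token embeddings (emb_pooler="mean" in config.json).
embeddings = model.encode([
    "How do I restart the networking service on Ubuntu 22.04?",
    "sudo systemctl restart NetworkManager",
])
print(embeddings.shape)
```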
config.json CHANGED
@@ -8,15 +8,15 @@
   "auto_map": {
     "AutoConfig": "configuration_bert.JinaBertConfig",
     "AutoModel": "modeling_bert.JinaBertModel",
-    "AutoModelForMaskedLM": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForMaskedLM",
-    "AutoModelForSequenceClassification": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForSequenceClassification"
+    "AutoModelForMaskedLM": "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertForMaskedLM",
+    "AutoModelForSequenceClassification": "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertForSequenceClassification"
   },
   "classifier_dropout": null,
   "emb_pooler": "mean",
   "feed_forward_type": "geglu",
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
+  "hidden_dropout_prob": 0.0,
   "hidden_size": 768,
   "initializer_range": 0.02,
   "intermediate_size": 3072,
@@ -32,5 +32,5 @@
   "transformers_version": "4.40.2",
   "type_vocab_size": 2,
   "use_cache": true,
-  "vocab_size": 30528
+  "vocab_size": 61056
 }
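Two of these config changes matter when loading the checkpoint: `auto_map` now points at remote JinaBert code, so `trust_remote_code=True` is required, and `vocab_size` grows from 30528 to 61056 to match the code-oriented tokenizer. A quick sanity-check sketch (placeholder repository id):

```python
from transformers import AutoConfig

# Placeholder repository id; the custom JinaBertConfig referenced in auto_map
# is only resolved when remote code is trusted.
config = AutoConfig.from_pretrained(
    "fine-tuned/jina-embeddings-v2-base-code-ubuntu-support",
    trust_remote_code=True,
)
print(config.vocab_size)           # 61056 after this commit (was 30528)
print(config.hidden_dropout_prob)  # 0.0 after this commit (was 0.1)
print(config.feed_forward_type)    # "geglu"
```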
configuration_bert.py CHANGED
@@ -17,11 +17,18 @@
 """ BERT model configuration"""
 from collections import OrderedDict
 from typing import Mapping
+import warnings
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.onnx import OnnxConfig
 from transformers.utils import logging
 
+try:
+    from optimum.exporters.onnx.model_configs import BertOnnxConfig
+    OPTIMUM_INSTALLED = True
+except ImportError:
+    warnings.warn("optimum is not installed. To use OnnxConfig and BertOnnxConfig, make sure that `optimum` package is installed")
+    OPTIMUM_INSTALLED = False
+
 
 logger = logging.get_logger(__name__)
 
@@ -128,7 +135,7 @@ class JinaBertConfig(PretrainedConfig):
         classifier_dropout=None,
         feed_forward_type="original",
         emb_pooler=None,
-        attn_implementation='torch',
+        attn_implementation=None,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -152,17 +159,19 @@
         self.emb_pooler = emb_pooler
         self.attn_implementation = attn_implementation
 
-class JinaBertOnnxConfig(OnnxConfig):
-    @property
-    def inputs(self) -> Mapping[str, Mapping[int, str]]:
-        if self.task == "multiple-choice":
-            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
-        else:
-            dynamic_axis = {0: "batch", 1: "sequence"}
-        return OrderedDict(
-            [
-                ("input_ids", dynamic_axis),
-                ("attention_mask", dynamic_axis),
-                ("token_type_ids", dynamic_axis),
-            ]
-        )
+if OPTIMUM_INSTALLED:
+
+    class JinaBertOnnxConfig(BertOnnxConfig):
+
+        @property
+        def inputs(self) -> Mapping[str, Mapping[int, str]]:
+            if self.task == "multiple-choice":
+                dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+            else:
+                dynamic_axis = {0: "batch", 1: "sequence"}
+            return OrderedDict(
+                [
+                    ("input_ids", dynamic_axis),
+                    ("attention_mask", dynamic_axis),
+                ]
+            )
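With `JinaBertOnnxConfig` now derived from optimum's `BertOnnxConfig` (and `token_type_ids` dropped from the inputs), ONNX export would normally go through the optimum exporter. A hedged sketch, assuming `optimum[exporters]` is installed and using a placeholder repository id; exact argument names may differ across optimum versions:

```python
from optimum.exporters.onnx import main_export

# Placeholder repository id; trust_remote_code lets the exporter pick up the
# custom JinaBertConfig / JinaBertOnnxConfig shipped with this repository.
main_export(
    model_name_or_path="fine-tuned/jina-embeddings-v2-base-code-ubuntu-support",
    output="onnx_export",
    task="feature-extraction",
    trust_remote_code=True,
)
```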
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c73260733e2c4707518a9a744b629a3446f67a44c6fbd13cc31e2320f8fbedf5
-size 549493968
+oid sha256:6e7b416602507efbaac3f0feab5a806ea22c94e250774d95ca3bef51fb6b197b
+size 643505600
modeling_bert.py CHANGED
@@ -280,9 +280,10 @@ class JinaBertSelfAttention(nn.Module):
         self.query = nn.Linear(config.hidden_size, self.all_head_size)
         self.key = nn.Linear(config.hidden_size, self.all_head_size)
         self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.layer_norm_q = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layer_norm_k = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
-        self.dropout_p = config.attention_probs_dropout_prob
-        self.dropout = nn.Dropout(self.dropout_p)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -316,7 +317,7 @@
         output_attentions: Optional[bool] = False,
         bias: Optional[torch.FloatTensor] = None,
     ) -> Tuple[torch.Tensor]:
-        mixed_query_layer = self.query(hidden_states)
+        mixed_query_layer = self.layer_norm_q(self.query(hidden_states))
 
         # If this is instantiated as a cross-attention module, the keys
         # and values come from an encoder; the attention mask needs to be
@@ -329,16 +330,16 @@
             value_layer = past_key_value[1]
             attention_mask = encoder_attention_mask
         elif is_cross_attention:
-            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(encoder_hidden_states)))
             value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
             attention_mask = encoder_attention_mask
         elif past_key_value is not None:
-            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(hidden_states)))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
             key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
             value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
         else:
-            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(hidden_states)))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
 
         query_layer = self.transpose_for_scores(mixed_query_layer)
@@ -357,8 +358,7 @@
         if self.attn_implementation == 'torch' and scaled_dot_product_attention is not None:
             b, _, s, _ = query_layer.shape
             new_bias = attention_mask + bias
-            dropout_p = self.dropout_p if self.training else 0.0
-            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias, dropout_p=dropout_p)
+            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias)
             attn = attn.permute(0, 2, 1, 3).contiguous()
             return (attn.view(b, s, self.all_head_size),)
 
@@ -431,7 +431,7 @@
         context_layer = context_layer.view(new_context_layer_shape)
 
         outputs = (
-            (context_layer, attention_probs) if output_attentions else (context_layer,)
+            (context_layer, attention_scores) if output_attentions else (context_layer,)
         )
 
         if self.is_decoder:
@@ -515,44 +515,29 @@ class JinaBertAttention(nn.Module):
         return outputs
 
 
-class JinaBertIntermediate(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
-        if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = ACT2FN[config.hidden_act]
-        else:
-            self.intermediate_act_fn = config.hidden_act
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
-        return hidden_states
-
-
-class JinaBertOutput(nn.Module):
+class JinaBertMLP(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
-        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+        self.act = ACT2FN[config.hidden_act]
+        self.up_layer = nn.Linear(
+            config.hidden_size, config.intermediate_size, bias=False
+        )
+        self.down_layer = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
-    def forward(
-        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
-    ) -> torch.Tensor:
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
-        return hidden_states
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Up
+        hidden_mlp_states = self.act(self.up_layer(hidden_states))
+        hidden_mlp_states = self.dropout(hidden_mlp_states)
+        # Down
+        return self.down_layer(hidden_mlp_states)
 
 
 class JinaBertGLUMLP(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
         self.config = config
-        self.gated_layers = nn.Linear(
-            config.hidden_size, config.intermediate_size * 2, bias=False
-        )
         if config.feed_forward_type == 'reglu':
             self.act = nn.ReLU()
         elif config.feed_forward_type == 'geglu':
@@ -561,23 +546,21 @@ class JinaBertGLUMLP(nn.Module):
             raise ValueError(
                 f"feed_forward_type {config.feed_forward_type} not supported"
             )
-        self.wo = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.up_gated_layer = nn.Linear(
+            config.hidden_size, config.intermediate_size * 2, bias=False
+        )
+        self.down_layer = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        residual_connection = hidden_states
-        # compute the activation
-        hidden_states = self.gated_layers(hidden_states)
-        gated = hidden_states[:, :, : self.config.intermediate_size]
-        non_gated = hidden_states[:, :, self.config.intermediate_size :]
-        hidden_states = self.act(gated) * non_gated
-        hidden_states = self.dropout(hidden_states)
-        # multiply by the second matrix
-        hidden_states = self.wo(hidden_states)
-        # add the residual connection and post-LN
-        hidden_states = self.layernorm(hidden_states + residual_connection)
-        return hidden_states
+        # Up with gate
+        hidden_mlp_states = self.up_gated_layer(hidden_states)
+        up = hidden_mlp_states[:, :, :self.config.intermediate_size]
+        gated = hidden_mlp_states[:, :, self.config.intermediate_size:]
+        hidden_mlp_states = up * self.act(gated)
+        hidden_mlp_states = self.dropout(hidden_mlp_states)
+        # Down
+        return self.down_layer(hidden_mlp_states)
 
 
 class JinaBertLayer(nn.Module):
@@ -589,6 +572,8 @@ class JinaBertLayer(nn.Module):
         self.is_decoder = config.is_decoder
         self.add_cross_attention = config.add_cross_attention
         self.feed_forward_type = config.feed_forward_type
+        self.layer_norm_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layer_norm_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if self.add_cross_attention:
             if not self.is_decoder:
                 raise ValueError(
@@ -600,8 +585,7 @@
         if self.feed_forward_type.endswith('glu'):
            self.mlp = JinaBertGLUMLP(config)
         else:
-            self.intermediate = JinaBertIntermediate(config)
-            self.output = JinaBertOutput(config)
+            self.mlp = JinaBertMLP(config)
 
     def forward(
         self,
@@ -614,6 +598,9 @@
         past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         output_attentions: Optional[bool] = False,
     ) -> Tuple[torch.Tensor]:
+        # Pre-Norm
+        residual = hidden_states
+
         # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         self_attn_past_key_value = (
             past_key_value[:2] if past_key_value is not None else None
@@ -667,15 +654,9 @@
             cross_attn_present_key_value = cross_attention_outputs[-1]
             present_key_value = present_key_value + cross_attn_present_key_value
 
-        if self.feed_forward_type.endswith('glu'):
-            layer_output = self.mlp(attention_output)
-        else:
-            layer_output = apply_chunking_to_forward(
-                self.feed_forward_chunk,
-                self.chunk_size_feed_forward,
-                self.seq_len_dim,
-                attention_output,
-            )
+        residual = self.layer_norm_1(residual + attention_output)
+        mlp_output = self.mlp(residual)
+        layer_output = self.layer_norm_2(residual + mlp_output)
         outputs = (layer_output,) + outputs
 
         # if decoder, return the attn key/values as the last output
@@ -684,11 +665,6 @@
 
         return outputs
 
-    def feed_forward_chunk(self, attention_output):
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        return layer_output
-
 
 class JinaBertEncoder(nn.Module):
     def __init__(self, config: JinaBertConfig):
@@ -699,11 +675,6 @@
         )
         self.gradient_checkpointing = False
         self.num_attention_heads = config.num_attention_heads
-        self.register_buffer(
-            "alibi",
-            self.rebuild_alibi_tensor(size=config.max_position_embeddings),
-            persistent=False,
-        )
 
     def rebuild_alibi_tensor(
         self, size: int, device: Optional[Union[torch.device, str]] = None
@@ -771,23 +742,7 @@
 
         # Add alibi matrix to extended_attention_mask
         _, seqlen, _ = hidden_states.size()
-        if self._current_alibi_size < seqlen:
-            # Rebuild the alibi tensor when needed
-            warnings.warn(
-                f'Increasing alibi size from {self._current_alibi_size} to {seqlen}.'
-            )
-            self.register_buffer(
-                "alibi",
-                self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device).to(
-                    hidden_states.dtype
-                ),
-                persistent=False,
-            )
-        elif self.alibi.device != hidden_states.device:
-            # Device catch-up
-            self.alibi = self.alibi.to(hidden_states.device)
-
-        alibi_bias = self.alibi[:, :, :seqlen, :seqlen]
+        alibi_bias = self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device).to(hidden_states.dtype)
         if self.gradient_checkpointing and self.training:
             if use_cache:
                 logger.warning_once(
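Taken together, the modeling changes switch the layer to the jina-bert-v2-qk-post-norm variant: LayerNorm is applied to the query and key projections, the separate Intermediate/Output modules are replaced by explicit MLP classes, residuals are normalized after each sub-block, and the ALiBi bias is rebuilt per forward pass instead of being cached in a buffer. The following is a stripped-down, self-contained sketch of that block pattern only (it ignores ALiBi, caching, cross-attention, and the attention output projection module, and uses the hidden sizes from config.json):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class QKPostNormBlock(nn.Module):
    """Illustrative only: LayerNorm on the query/key projections, then a
    post-norm residual layout (attention -> add & norm -> GEGLU MLP -> add & norm)."""

    def __init__(self, hidden_size=768, num_heads=12, intermediate_size=3072):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.layer_norm_q = nn.LayerNorm(hidden_size)
        self.layer_norm_k = nn.LayerNorm(hidden_size)
        self.out = nn.Linear(hidden_size, hidden_size)
        self.layer_norm_1 = nn.LayerNorm(hidden_size)
        self.layer_norm_2 = nn.LayerNorm(hidden_size)
        # GEGLU MLP: one up-projection producing value and gate halves
        self.up_gated = nn.Linear(hidden_size, intermediate_size * 2, bias=False)
        self.down = nn.Linear(intermediate_size, hidden_size)

    def forward(self, x):
        b, s, h = x.shape

        def split(t):  # (b, s, h) -> (b, heads, s, head_dim)
            return t.view(b, s, self.num_heads, self.head_dim).transpose(1, 2)

        q = split(self.layer_norm_q(self.query(x)))  # QK post-norm on the projections
        k = split(self.layer_norm_k(self.key(x)))
        v = split(self.value(x))
        attn = F.scaled_dot_product_attention(q, k, v)
        attn = attn.transpose(1, 2).reshape(b, s, h)
        x = self.layer_norm_1(x + self.out(attn))    # post-norm residual 1
        up, gate = self.up_gated(x).chunk(2, dim=-1)
        mlp = self.down(up * F.gelu(gate))
        return self.layer_norm_2(x + mlp)            # post-norm residual 2


x = torch.randn(2, 16, 768)
print(QKPostNormBlock()(x).shape)  # torch.Size([2, 16, 768])
```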
special_tokens_map.json CHANGED
@@ -1,34 +1,48 @@
 {
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "cls_token": {
-    "content": "[CLS]",
+    "content": "<s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  "mask_token": {
-    "content": "[MASK]",
+  "eos_token": {
+    "content": "</s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "pad_token": {
-    "content": "[PAD]",
+    "content": "<pad>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "[SEP]",
+    "content": "</s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
-    "content": "[UNK]",
+    "content": "<unk>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,57 +1,57 @@
 {
+  "add_prefix_space": false,
   "added_tokens_decoder": {
     "0": {
-      "content": "[PAD]",
+      "content": "<s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "100": {
-      "content": "[UNK]",
+    "1": {
+      "content": "<pad>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "101": {
-      "content": "[CLS]",
+    "2": {
+      "content": "</s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "102": {
-      "content": "[SEP]",
+    "3": {
+      "content": "<unk>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "103": {
-      "content": "[MASK]",
-      "lstrip": false,
+    "4": {
+      "content": "<mask>",
+      "lstrip": true,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
+  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "mask_token": "[MASK]",
-  "model_max_length": 2147483648,
-  "never_split": null,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
 }
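These tokenizer files replace the original BERT WordPiece setup ([CLS]/[SEP], effectively unlimited model_max_length) with the RoBERTa-style BPE tokenizer of jina-embeddings-v2-base-code (<s>/</s>, 8192-token context). A quick check, again with a placeholder repository id:

```python
from transformers import AutoTokenizer

# Placeholder repository id for the fine-tuned checkpoint.
tok = AutoTokenizer.from_pretrained("fine-tuned/jina-embeddings-v2-base-code-ubuntu-support")

print(type(tok).__name__)                            # RobertaTokenizerFast
print(tok.cls_token, tok.sep_token, tok.mask_token)  # <s> </s> <mask>
print(tok.model_max_length)                          # 8192
print(len(tok))                                      # roughly the new vocab_size (61056)
```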
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7bd5f0450c5acc1958d753d72574454ececd36a4850f8548341a67895160430a
+oid sha256:17e60880f40bee6bb3d18341065d81562805a6720c21f5acd096dcdab5103a33
 size 4719