florian-hoenicke committed
Commit ab27e50
1 Parent(s): 82e3595

feat: push custom model

README.md CHANGED
@@ -11,15 +11,15 @@ tags:
 - feature-extraction
 - sentence-similarity
 - mteb
-- Science
-- Research
-- Academic
-- Papers
-- Arxiv
+- Ubuntu
+- Technical
+- Support
+- Linux
+- Community
 ---
-This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-en**](https://huggingface.co/jinaai/jina-embeddings-v2-base-en) designed for the following use case:
+This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-code**](https://huggingface.co/jinaai/jina-embeddings-v2-base-code) designed for the following use case:
 
-academic research papers search engine
+technical support for Ubuntu
 
 ## How to Use
 This model can be easily integrated into your NLP pipeline for tasks such as text classification, sentiment analysis, entity recognition, and more. Here's a simple example to get you started:
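The hunk ends before the README's actual snippet. A minimal sketch of the kind of usage it refers to, assuming the model is loaded through `transformers` with `trust_remote_code=True` (the repository id below is a placeholder, and the `encode` helper comes from the upstream Jina remote code):

```python
from transformers import AutoModel

# Placeholder repository id -- substitute the actual fine-tuned checkpoint.
model = AutoModel.from_pretrained(
    "fine-tuned/jina-embeddings-v2-base-code-ubuntu-support",
    trust_remote_code=True,  # needed for the custom JinaBert classes in auto_map
)

# The upstream Jina remote code exposes an `encode` helper that mean-pools
# token embeddings (emb_pooler="mean" in config.json).
embeddings = model.encode([
    "How do I restart the networking service on Ubuntu 22.04?",
    "sudo systemctl restart NetworkManager",
])
print(embeddings.shape)
```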
config.json CHANGED
@@ -8,15 +8,15 @@
   "auto_map": {
     "AutoConfig": "configuration_bert.JinaBertConfig",
     "AutoModel": "modeling_bert.JinaBertModel",
-    "AutoModelForMaskedLM": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForMaskedLM",
-    "AutoModelForSequenceClassification": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForSequenceClassification"
+    "AutoModelForMaskedLM": "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertForMaskedLM",
+    "AutoModelForSequenceClassification": "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertForSequenceClassification"
   },
   "classifier_dropout": null,
   "emb_pooler": "mean",
   "feed_forward_type": "geglu",
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
+  "hidden_dropout_prob": 0.0,
   "hidden_size": 768,
   "initializer_range": 0.02,
   "intermediate_size": 3072,
@@ -32,5 +32,5 @@
   "transformers_version": "4.40.2",
   "type_vocab_size": 2,
   "use_cache": true,
-  "vocab_size": 30528
+  "vocab_size": 61056
 }
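Two of these config changes matter when loading the checkpoint: `auto_map` now points at remote JinaBert code, so `trust_remote_code=True` is required, and `vocab_size` grows from 30528 to 61056 to match the code-oriented tokenizer. A quick sanity-check sketch (placeholder repository id):

```python
from transformers import AutoConfig

# Placeholder repository id; the custom JinaBertConfig referenced in auto_map
# is only resolved when remote code is trusted.
config = AutoConfig.from_pretrained(
    "fine-tuned/jina-embeddings-v2-base-code-ubuntu-support",
    trust_remote_code=True,
)
print(config.vocab_size)           # 61056 after this commit (was 30528)
print(config.hidden_dropout_prob)  # 0.0 after this commit (was 0.1)
print(config.feed_forward_type)    # "geglu"
```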
configuration_bert.py CHANGED
@@ -17,11 +17,18 @@
 """ BERT model configuration"""
 from collections import OrderedDict
 from typing import Mapping
+import warnings
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.onnx import OnnxConfig
 from transformers.utils import logging
 
+try:
+    from optimum.exporters.onnx.model_configs import BertOnnxConfig
+    OPTIMUM_INSTALLED = True
+except ImportError:
+    warnings.warn("optimum is not installed. To use OnnxConfig and BertOnnxConfig, make sure that `optimum` package is installed")
+    OPTIMUM_INSTALLED = False
+
 
 logger = logging.get_logger(__name__)
 
@@ -128,7 +135,7 @@ class JinaBertConfig(PretrainedConfig):
         classifier_dropout=None,
         feed_forward_type="original",
         emb_pooler=None,
-        attn_implementation='torch',
+        attn_implementation=None,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -152,17 +159,19 @@
         self.emb_pooler = emb_pooler
         self.attn_implementation = attn_implementation
 
-class JinaBertOnnxConfig(OnnxConfig):
-    @property
-    def inputs(self) -> Mapping[str, Mapping[int, str]]:
-        if self.task == "multiple-choice":
-            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
-        else:
-            dynamic_axis = {0: "batch", 1: "sequence"}
-        return OrderedDict(
-            [
-                ("input_ids", dynamic_axis),
-                ("attention_mask", dynamic_axis),
-                ("token_type_ids", dynamic_axis),
-            ]
-        )
+if OPTIMUM_INSTALLED:
+
+    class JinaBertOnnxConfig(BertOnnxConfig):
+
+        @property
+        def inputs(self) -> Mapping[str, Mapping[int, str]]:
+            if self.task == "multiple-choice":
+                dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+            else:
+                dynamic_axis = {0: "batch", 1: "sequence"}
+            return OrderedDict(
+                [
+                    ("input_ids", dynamic_axis),
+                    ("attention_mask", dynamic_axis),
+                ]
+            )
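With `JinaBertOnnxConfig` now derived from optimum's `BertOnnxConfig` (and `token_type_ids` dropped from the inputs), ONNX export would normally go through the optimum exporter. A hedged sketch, assuming `optimum[exporters]` is installed and using a placeholder repository id; exact argument names may differ across optimum versions:

```python
from optimum.exporters.onnx import main_export

# Placeholder repository id; trust_remote_code lets the exporter pick up the
# custom JinaBertConfig / JinaBertOnnxConfig shipped with this repository.
main_export(
    model_name_or_path="fine-tuned/jina-embeddings-v2-base-code-ubuntu-support",
    output="onnx_export",
    task="feature-extraction",
    trust_remote_code=True,
)
```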
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c73260733e2c4707518a9a744b629a3446f67a44c6fbd13cc31e2320f8fbedf5
-size 549493968
+oid sha256:6e7b416602507efbaac3f0feab5a806ea22c94e250774d95ca3bef51fb6b197b
+size 643505600
modeling_bert.py CHANGED
@@ -280,9 +280,10 @@ class JinaBertSelfAttention(nn.Module):
         self.query = nn.Linear(config.hidden_size, self.all_head_size)
         self.key = nn.Linear(config.hidden_size, self.all_head_size)
         self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.layer_norm_q = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layer_norm_k = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
-        self.dropout_p = config.attention_probs_dropout_prob
-        self.dropout = nn.Dropout(self.dropout_p)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -316,7 +317,7 @@
         output_attentions: Optional[bool] = False,
         bias: Optional[torch.FloatTensor] = None,
     ) -> Tuple[torch.Tensor]:
-        mixed_query_layer = self.query(hidden_states)
+        mixed_query_layer = self.layer_norm_q(self.query(hidden_states))
 
         # If this is instantiated as a cross-attention module, the keys
         # and values come from an encoder; the attention mask needs to be
@@ -329,16 +330,16 @@
             value_layer = past_key_value[1]
             attention_mask = encoder_attention_mask
         elif is_cross_attention:
-            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(encoder_hidden_states)))
             value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
             attention_mask = encoder_attention_mask
         elif past_key_value is not None:
-            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(hidden_states)))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
             key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
             value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
         else:
-            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(hidden_states)))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
 
         query_layer = self.transpose_for_scores(mixed_query_layer)
@@ -357,8 +358,7 @@
         if self.attn_implementation == 'torch' and scaled_dot_product_attention is not None:
             b, _, s, _ = query_layer.shape
             new_bias = attention_mask + bias
-            dropout_p = self.dropout_p if self.training else 0.0
-            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias, dropout_p=dropout_p)
+            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias)
             attn = attn.permute(0, 2, 1, 3).contiguous()
             return (attn.view(b, s, self.all_head_size),)
 
@@ -431,7 +431,7 @@
         context_layer = context_layer.view(new_context_layer_shape)
 
         outputs = (
-            (context_layer, attention_probs) if output_attentions else (context_layer,)
+            (context_layer, attention_scores) if output_attentions else (context_layer,)
         )
 
         if self.is_decoder:
@@ -515,44 +515,29 @@ class JinaBertAttention(nn.Module):
         return outputs
 
 
-class JinaBertIntermediate(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
-        if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = ACT2FN[config.hidden_act]
-        else:
-            self.intermediate_act_fn = config.hidden_act
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
-        return hidden_states
-
-
-class JinaBertOutput(nn.Module):
+class JinaBertMLP(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
-        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+        self.act = ACT2FN[config.hidden_act]
+        self.up_layer = nn.Linear(
+            config.hidden_size, config.intermediate_size, bias=False
+        )
+        self.down_layer = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
-    def forward(
-        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
-    ) -> torch.Tensor:
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
-        return hidden_states
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Up
+        hidden_mlp_states = self.act(self.up_layer(hidden_states))
+        hidden_mlp_states = self.dropout(hidden_mlp_states)
+        # Down
+        return self.down_layer(hidden_mlp_states)
 
 
 class JinaBertGLUMLP(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
         self.config = config
-        self.gated_layers = nn.Linear(
-            config.hidden_size, config.intermediate_size * 2, bias=False
-        )
         if config.feed_forward_type == 'reglu':
             self.act = nn.ReLU()
         elif config.feed_forward_type == 'geglu':
@@ -561,23 +546,21 @@ class JinaBertGLUMLP(nn.Module):
             raise ValueError(
                 f"feed_forward_type {config.feed_forward_type} not supported"
             )
-        self.wo = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.up_gated_layer = nn.Linear(
+            config.hidden_size, config.intermediate_size * 2, bias=False
+        )
+        self.down_layer = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        residual_connection = hidden_states
-        # compute the activation
-        hidden_states = self.gated_layers(hidden_states)
-        gated = hidden_states[:, :, : self.config.intermediate_size]
-        non_gated = hidden_states[:, :, self.config.intermediate_size :]
-        hidden_states = self.act(gated) * non_gated
-        hidden_states = self.dropout(hidden_states)
-        # multiply by the second matrix
-        hidden_states = self.wo(hidden_states)
-        # add the residual connection and post-LN
-        hidden_states = self.layernorm(hidden_states + residual_connection)
-        return hidden_states
+        # Up with gate
+        hidden_mlp_states = self.up_gated_layer(hidden_states)
+        up = hidden_mlp_states[:, :, :self.config.intermediate_size]
+        gated = hidden_mlp_states[:, :, self.config.intermediate_size:]
+        hidden_mlp_states = up * self.act(gated)
+        hidden_mlp_states = self.dropout(hidden_mlp_states)
+        # Down
+        return self.down_layer(hidden_mlp_states)
 
 
 class JinaBertLayer(nn.Module):
@@ -589,6 +572,8 @@ class JinaBertLayer(nn.Module):
         self.is_decoder = config.is_decoder
         self.add_cross_attention = config.add_cross_attention
         self.feed_forward_type = config.feed_forward_type
+        self.layer_norm_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layer_norm_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if self.add_cross_attention:
             if not self.is_decoder:
                 raise ValueError(
@@ -600,8 +585,7 @@
         if self.feed_forward_type.endswith('glu'):
            self.mlp = JinaBertGLUMLP(config)
         else:
-            self.intermediate = JinaBertIntermediate(config)
-            self.output = JinaBertOutput(config)
+            self.mlp = JinaBertMLP(config)
 
     def forward(
         self,
@@ -614,6 +598,9 @@
         past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         output_attentions: Optional[bool] = False,
     ) -> Tuple[torch.Tensor]:
+        # Pre-Norm
+        residual = hidden_states
+
         # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         self_attn_past_key_value = (
             past_key_value[:2] if past_key_value is not None else None
@@ -667,15 +654,9 @@
             cross_attn_present_key_value = cross_attention_outputs[-1]
             present_key_value = present_key_value + cross_attn_present_key_value
 
-        if self.feed_forward_type.endswith('glu'):
-            layer_output = self.mlp(attention_output)
-        else:
-            layer_output = apply_chunking_to_forward(
-                self.feed_forward_chunk,
-                self.chunk_size_feed_forward,
-                self.seq_len_dim,
-                attention_output,
-            )
+        residual = self.layer_norm_1(residual + attention_output)
+        mlp_output = self.mlp(residual)
+        layer_output = self.layer_norm_2(residual + mlp_output)
         outputs = (layer_output,) + outputs
 
         # if decoder, return the attn key/values as the last output
@@ -684,11 +665,6 @@
 
         return outputs
 
-    def feed_forward_chunk(self, attention_output):
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        return layer_output
-
 
 class JinaBertEncoder(nn.Module):
     def __init__(self, config: JinaBertConfig):
@@ -699,11 +675,6 @@
         )
         self.gradient_checkpointing = False
         self.num_attention_heads = config.num_attention_heads
-        self.register_buffer(
-            "alibi",
-            self.rebuild_alibi_tensor(size=config.max_position_embeddings),
-            persistent=False,
-        )
 
     def rebuild_alibi_tensor(
         self, size: int, device: Optional[Union[torch.device, str]] = None
@@ -771,23 +742,7 @@
 
         # Add alibi matrix to extended_attention_mask
         _, seqlen, _ = hidden_states.size()
-        if self._current_alibi_size < seqlen:
-            # Rebuild the alibi tensor when needed
-            warnings.warn(
-                f'Increasing alibi size from {self._current_alibi_size} to {seqlen}.'
-            )
-            self.register_buffer(
-                "alibi",
-                self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device).to(
-                    hidden_states.dtype
-                ),
-                persistent=False,
-            )
-        elif self.alibi.device != hidden_states.device:
-            # Device catch-up
-            self.alibi = self.alibi.to(hidden_states.device)
-
-        alibi_bias = self.alibi[:, :, :seqlen, :seqlen]
+        alibi_bias = self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device).to(hidden_states.dtype)
         if self.gradient_checkpointing and self.training:
             if use_cache:
                 logger.warning_once(
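Taken together, the modeling changes switch the layer to the jina-bert-v2-qk-post-norm variant: LayerNorm is applied to the query and key projections, the separate Intermediate/Output modules are replaced by explicit MLP classes, residuals are normalized after each sub-block, and the ALiBi bias is rebuilt per forward pass instead of being cached in a buffer. The following is a stripped-down, self-contained sketch of that block pattern only (it ignores ALiBi, caching, cross-attention, and the attention output projection module, and uses the hidden sizes from config.json):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class QKPostNormBlock(nn.Module):
    """Illustrative only: LayerNorm on the query/key projections, then a
    post-norm residual layout (attention -> add & norm -> GEGLU MLP -> add & norm)."""

    def __init__(self, hidden_size=768, num_heads=12, intermediate_size=3072):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.layer_norm_q = nn.LayerNorm(hidden_size)
        self.layer_norm_k = nn.LayerNorm(hidden_size)
        self.out = nn.Linear(hidden_size, hidden_size)
        self.layer_norm_1 = nn.LayerNorm(hidden_size)
        self.layer_norm_2 = nn.LayerNorm(hidden_size)
        # GEGLU MLP: one up-projection producing value and gate halves
        self.up_gated = nn.Linear(hidden_size, intermediate_size * 2, bias=False)
        self.down = nn.Linear(intermediate_size, hidden_size)

    def forward(self, x):
        b, s, h = x.shape

        def split(t):  # (b, s, h) -> (b, heads, s, head_dim)
            return t.view(b, s, self.num_heads, self.head_dim).transpose(1, 2)

        q = split(self.layer_norm_q(self.query(x)))  # QK post-norm on the projections
        k = split(self.layer_norm_k(self.key(x)))
        v = split(self.value(x))
        attn = F.scaled_dot_product_attention(q, k, v)
        attn = attn.transpose(1, 2).reshape(b, s, h)
        x = self.layer_norm_1(x + self.out(attn))    # post-norm residual 1
        up, gate = self.up_gated(x).chunk(2, dim=-1)
        mlp = self.down(up * F.gelu(gate))
        return self.layer_norm_2(x + mlp)            # post-norm residual 2


x = torch.randn(2, 16, 768)
print(QKPostNormBlock()(x).shape)  # torch.Size([2, 16, 768])
```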
special_tokens_map.json CHANGED
@@ -1,34 +1,48 @@
 {
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "cls_token": {
-    "content": "[CLS]",
+    "content": "<s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  "mask_token": {
-    "content": "[MASK]",
+  "eos_token": {
+    "content": "</s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "pad_token": {
-    "content": "[PAD]",
+    "content": "<pad>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "[SEP]",
+    "content": "</s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
-    "content": "[UNK]",
+    "content": "<unk>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,57 +1,57 @@
 {
+  "add_prefix_space": false,
   "added_tokens_decoder": {
     "0": {
-      "content": "[PAD]",
+      "content": "<s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "100": {
-      "content": "[UNK]",
+    "1": {
+      "content": "<pad>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "101": {
-      "content": "[CLS]",
+    "2": {
+      "content": "</s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "102": {
-      "content": "[SEP]",
+    "3": {
+      "content": "<unk>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "103": {
-      "content": "[MASK]",
-      "lstrip": false,
+    "4": {
+      "content": "<mask>",
+      "lstrip": true,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
+  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "mask_token": "[MASK]",
-  "model_max_length": 2147483648,
-  "never_split": null,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
 }
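These tokenizer files replace the original BERT WordPiece setup ([CLS]/[SEP], effectively unlimited model_max_length) with the RoBERTa-style BPE tokenizer of jina-embeddings-v2-base-code (<s>/</s>, 8192-token context). A quick check, again with a placeholder repository id:

```python
from transformers import AutoTokenizer

# Placeholder repository id for the fine-tuned checkpoint.
tok = AutoTokenizer.from_pretrained("fine-tuned/jina-embeddings-v2-base-code-ubuntu-support")

print(type(tok).__name__)                            # RobertaTokenizerFast
print(tok.cls_token, tok.sep_token, tok.mask_token)  # <s> </s> <mask>
print(tok.model_max_length)                          # 8192
print(len(tok))                                      # roughly the new vocab_size (61056)
```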
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7bd5f0450c5acc1958d753d72574454ececd36a4850f8548341a67895160430a
+oid sha256:17e60880f40bee6bb3d18341065d81562805a6720c21f5acd096dcdab5103a33
 size 4719