florian-hoenicke committed

feat: push custom model

Files changed:
- README.md (+8 -8)
- config.json (+5 -6)
- configuration_bert.py (+25 -16)
- merges.txt (+0 -0)
- model.safetensors (+2 -2)
- modeling_bert.py (+44 -89)
- sentence_bert_config.json (+1 -1)
- tokenizer.json (+0 -0)
- tokenizer_config.json (+1 -1)
- training_args.bin (+1 -1)
- vocab.json (+0 -0)
README.md
CHANGED
```diff
@@ -4,22 +4,22 @@ datasets:
 - fine-tuned/test
 - allenai/c4
 language:
-…
+- en
 pipeline_tag: feature-extraction
 tags:
 - sentence-transformers
 - feature-extraction
 - sentence-similarity
 - mteb
-…
-…
-…
-…
-…
+- Ubuntu
+- Technical Support
+- Queries
+- Community
+- Tech
 ---
-This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-…
+This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-code**](https://huggingface.co/jinaai/jina-embeddings-v2-base-code) designed for the following use case:
 
-…
+technical support forum
 
 ## How to Use
 This model can be easily integrated into your NLP pipeline for tasks such as text classification, sentiment analysis, entity recognition, and more. Here's a simple example to get you started:
```
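The usage snippet promised at the end of the README is not shown in this rendered diff. As a rough sketch of how such a fine-tuned embedding model is typically loaded (the repository id below is a placeholder, and `trust_remote_code=True` is assumed to be needed because the repo ships its own modeling code):

```python
# Illustrative only: load the fine-tuned embedding model with sentence-transformers.
# "fine-tuned/<model-id>" is a placeholder for the actual repository name.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("fine-tuned/<model-id>", trust_remote_code=True)
embeddings = model.encode([
    "How do I fix broken packages after an apt upgrade on Ubuntu?",
    "GRUB rescue prompt appears after installing a second distro",
])
print(embeddings.shape)  # e.g. (2, 768), since config.json sets hidden_size=768
```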
config.json
CHANGED
```diff
@@ -3,27 +3,26 @@
   "architectures": [
     "JinaBertModel"
   ],
-  "attention_probs_dropout_prob": 0.
+  "attention_probs_dropout_prob": 0.0,
   "attn_implementation": null,
   "auto_map": {
     "AutoConfig": "configuration_bert.JinaBertConfig",
     "AutoModel": "modeling_bert.JinaBertModel",
-    "AutoModelForMaskedLM": "jinaai/jina-bert-
-    "
-    "AutoModelForSequenceClassification": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForSequenceClassification",
-    "AutoModelForTokenClassification": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForTokenClassification"
+    "AutoModelForMaskedLM": "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertForMaskedLM",
+    "AutoModelForSequenceClassification": "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertForSequenceClassification"
   },
   "classifier_dropout": null,
   "emb_pooler": "mean",
   "feed_forward_type": "geglu",
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.
+  "hidden_dropout_prob": 0.0,
   "hidden_size": 768,
   "initializer_range": 0.02,
   "intermediate_size": 3072,
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 8192,
+  "model_max_length": 8192,
   "model_type": "bert",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
```
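Because `auto_map` points at the custom `configuration_bert.py` / `modeling_bert.py` shipped in this repository, loading through plain `transformers` needs `trust_remote_code=True`. A hedged sketch (the repository id is a placeholder):

```python
# Illustrative only: load the custom JinaBertModel classes referenced in auto_map.
from transformers import AutoConfig, AutoModel, AutoTokenizer

repo = "fine-tuned/<model-id>"  # placeholder for the actual repository name
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(config.max_position_embeddings, config.feed_forward_type)  # 8192 geglu

tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)  # JinaBertModel
```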
configuration_bert.py
CHANGED
```diff
@@ -17,11 +17,18 @@
 """ BERT model configuration"""
 from collections import OrderedDict
 from typing import Mapping
+import warnings
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.onnx import OnnxConfig
 from transformers.utils import logging
 
+try:
+    from optimum.exporters.onnx.model_configs import BertOnnxConfig
+    OPTIMUM_INSTALLED = True
+except ImportError:
+    warnings.warn("optimum is not installed. To use OnnxConfig and BertOnnxConfig, make sure that `optimum` package is installed")
+    OPTIMUM_INSTALLED = False
+
 
 logger = logging.get_logger(__name__)
 
@@ -128,7 +135,7 @@ class JinaBertConfig(PretrainedConfig):
         classifier_dropout=None,
         feed_forward_type="original",
         emb_pooler=None,
-        attn_implementation=
+        attn_implementation=None,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -152,17 +159,19 @@
         self.emb_pooler = emb_pooler
         self.attn_implementation = attn_implementation
 
-… (14 lines removed; content not rendered)
+if OPTIMUM_INSTALLED:
+
+    class JinaBertOnnxConfig(BertOnnxConfig):
+
+        @property
+        def inputs(self) -> Mapping[str, Mapping[int, str]]:
+            if self.task == "multiple-choice":
+                dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+            else:
+                dynamic_axis = {0: "batch", 1: "sequence"}
+            return OrderedDict(
+                [
+                    ("input_ids", dynamic_axis),
+                    ("attention_mask", dynamic_axis),
+                ]
+            )
```
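The change above swaps the removed `transformers.onnx.OnnxConfig` import for an optional `optimum` dependency and defines `JinaBertOnnxConfig` only when that package is available. A small check of the new `inputs` mapping, assuming `optimum` is installed and `configuration_bert.py` from this repo is importable locally:

```python
# Assumes optimum is installed (so JinaBertOnnxConfig is defined) and that
# configuration_bert.py is on the Python path; otherwise the import fails.
from configuration_bert import JinaBertConfig, JinaBertOnnxConfig

onnx_config = JinaBertOnnxConfig(JinaBertConfig(), task="feature-extraction")
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])
```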
merges.txt
CHANGED
The diff for this file is too large to render. See the raw diff.
model.safetensors
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:b13d70f780891db6b8de2daac7e4f22272ceffdf0d60e9e31399f432c6de798b
+size 643505600
```
modeling_bert.py
CHANGED
```diff
@@ -280,9 +280,10 @@ class JinaBertSelfAttention(nn.Module):
         self.query = nn.Linear(config.hidden_size, self.all_head_size)
         self.key = nn.Linear(config.hidden_size, self.all_head_size)
         self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.layer_norm_q = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layer_norm_k = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
-        self.
-        self.dropout = nn.Dropout(self.dropout_p)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -316,7 +317,7 @@
         output_attentions: Optional[bool] = False,
         bias: Optional[torch.FloatTensor] = None,
     ) -> Tuple[torch.Tensor]:
-        mixed_query_layer = self.query(hidden_states)
+        mixed_query_layer = self.layer_norm_q(self.query(hidden_states))
 
         # If this is instantiated as a cross-attention module, the keys
         # and values come from an encoder; the attention mask needs to be
@@ -329,16 +330,16 @@
             value_layer = past_key_value[1]
             attention_mask = encoder_attention_mask
         elif is_cross_attention:
-            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(encoder_hidden_states)))
             value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
             attention_mask = encoder_attention_mask
         elif past_key_value is not None:
-            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(hidden_states)))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
             key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
             value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
         else:
-            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            key_layer = self.transpose_for_scores(self.layer_norm_k(self.key(hidden_states)))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
 
         query_layer = self.transpose_for_scores(mixed_query_layer)
@@ -357,8 +358,7 @@
         if self.attn_implementation == 'torch' and scaled_dot_product_attention is not None:
             b, _, s, _ = query_layer.shape
             new_bias = attention_mask + bias
-…
-            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias, dropout_p=dropout_p)
+            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias)
             attn = attn.permute(0, 2, 1, 3).contiguous()
             return (attn.view(b, s, self.all_head_size),)
 
@@ -431,7 +431,7 @@
         context_layer = context_layer.view(new_context_layer_shape)
 
         outputs = (
-            (context_layer,
+            (context_layer, attention_scores) if output_attentions else (context_layer,)
         )
 
         if self.is_decoder:
@@ -515,44 +515,29 @@ class JinaBertAttention(nn.Module):
         return outputs
 
 
-class
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
-        if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = ACT2FN[config.hidden_act]
-        else:
-            self.intermediate_act_fn = config.hidden_act
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
-        return hidden_states
-
-
-class JinaBertOutput(nn.Module):
+class JinaBertMLP(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
-        self.
-        self.
+        self.config = config
+        self.act = ACT2FN[config.hidden_act]
+        self.up_layer = nn.Linear(
+            config.hidden_size, config.intermediate_size, bias=False
+        )
+        self.down_layer = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
-    def forward(
-…
-…
-…
-…
-…
-        return hidden_states
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Up
+        hidden_mlp_states = self.act(self.up_layer(hidden_states))
+        hidden_mlp_states = self.dropout(hidden_mlp_states)
+        # Down
+        return self.down_layer(hidden_mlp_states)
 
 
 class JinaBertGLUMLP(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
         self.config = config
-        self.gated_layers = nn.Linear(
-            config.hidden_size, config.intermediate_size * 2, bias=False
-        )
         if config.feed_forward_type == 'reglu':
             self.act = nn.ReLU()
         elif config.feed_forward_type == 'geglu':
@@ -561,23 +546,21 @@ class JinaBertGLUMLP(nn.Module):
             raise ValueError(
                 f"feed_forward_type {config.feed_forward_type} not supported"
             )
-        self.
+        self.up_gated_layer = nn.Linear(
+            config.hidden_size, config.intermediate_size * 2, bias=False
+        )
+        self.down_layer = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-…
-…
-…
-        gated =
-…
-…
-…
-…
-        hidden_states = self.wo(hidden_states)
-        # add the residual connection and post-LN
-        hidden_states = self.layernorm(hidden_states + residual_connection)
-        return hidden_states
+        # Up with gate
+        hidden_mlp_states = self.up_gated_layer(hidden_states)
+        up = hidden_mlp_states[:, :, :self.config.intermediate_size]
+        gated = hidden_mlp_states[:, :, self.config.intermediate_size:]
+        hidden_mlp_states = up * self.act(gated)
+        hidden_mlp_states = self.dropout(hidden_mlp_states)
+        # Down
+        return self.down_layer(hidden_mlp_states)
 
 
 class JinaBertLayer(nn.Module):
@@ -589,6 +572,8 @@ class JinaBertLayer(nn.Module):
         self.is_decoder = config.is_decoder
         self.add_cross_attention = config.add_cross_attention
         self.feed_forward_type = config.feed_forward_type
+        self.layer_norm_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layer_norm_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if self.add_cross_attention:
             if not self.is_decoder:
                 raise ValueError(
@@ -600,8 +585,7 @@
         if self.feed_forward_type.endswith('glu'):
             self.mlp = JinaBertGLUMLP(config)
         else:
-            self.
-            self.output = JinaBertOutput(config)
+            self.mlp = JinaBertMLP(config)
 
     def forward(
         self,
@@ -614,6 +598,9 @@
         past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         output_attentions: Optional[bool] = False,
     ) -> Tuple[torch.Tensor]:
+        # Pre-Norm
+        residual = hidden_states
+
         # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         self_attn_past_key_value = (
             past_key_value[:2] if past_key_value is not None else None
@@ -667,15 +654,9 @@
             cross_attn_present_key_value = cross_attention_outputs[-1]
             present_key_value = present_key_value + cross_attn_present_key_value
 
-…
-…
-…
-        layer_output = apply_chunking_to_forward(
-            self.feed_forward_chunk,
-            self.chunk_size_feed_forward,
-            self.seq_len_dim,
-            attention_output,
-        )
+        residual = self.layer_norm_1(residual + attention_output)
+        mlp_output = self.mlp(residual)
+        layer_output = self.layer_norm_2(residual + mlp_output)
         outputs = (layer_output,) + outputs
 
         # if decoder, return the attn key/values as the last output
@@ -684,11 +665,6 @@
 
         return outputs
 
-    def feed_forward_chunk(self, attention_output):
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        return layer_output
-
 
 class JinaBertEncoder(nn.Module):
     def __init__(self, config: JinaBertConfig):
@@ -699,11 +675,6 @@ class JinaBertEncoder(nn.Module):
         )
         self.gradient_checkpointing = False
         self.num_attention_heads = config.num_attention_heads
-        self.register_buffer(
-            "alibi",
-            self.rebuild_alibi_tensor(size=config.max_position_embeddings),
-            persistent=False,
-        )
 
     def rebuild_alibi_tensor(
         self, size: int, device: Optional[Union[torch.device, str]] = None
@@ -771,23 +742,7 @@
 
         # Add alibi matrix to extended_attention_mask
         _, seqlen, _ = hidden_states.size()
-…
-            # Rebuild the alibi tensor when needed
-            warnings.warn(
-                f'Increasing alibi size from {self._current_alibi_size} to {seqlen}.'
-            )
-            self.register_buffer(
-                "alibi",
-                self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device).to(
-                    hidden_states.dtype
-                ),
-                persistent=False,
-            )
-        elif self.alibi.device != hidden_states.device:
-            # Device catch-up
-            self.alibi = self.alibi.to(hidden_states.device)
-
-        alibi_bias = self.alibi[:, :, :seqlen, :seqlen]
+        alibi_bias = self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device).to(hidden_states.dtype)
         if self.gradient_checkpointing and self.training:
             if use_cache:
                 logger.warning_once(
```
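The rewritten `JinaBertGLUMLP` above fuses the up and gate projections into a single linear layer and applies the activation to the gate half only. A minimal standalone sketch of that GEGLU pattern (illustrative, not code from the repository):

```python
# GEGLU feed-forward sketch: one fused projection to 2x the intermediate size,
# split into "up" and "gate" halves, GELU on the gate, element-wise product,
# then projection back down to the hidden size.
import torch
import torch.nn as nn

hidden_size, intermediate_size = 768, 3072
up_gated_layer = nn.Linear(hidden_size, intermediate_size * 2, bias=False)
down_layer = nn.Linear(intermediate_size, hidden_size)
act = nn.GELU()

x = torch.randn(2, 16, hidden_size)           # (batch, seq_len, hidden)
projected = up_gated_layer(x)                 # (batch, seq_len, 2 * intermediate)
up = projected[..., :intermediate_size]
gate = projected[..., intermediate_size:]
out = down_layer(up * act(gate))              # back to (batch, seq_len, hidden)
print(out.shape)                              # torch.Size([2, 16, 768])
```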
sentence_bert_config.json
CHANGED
```diff
@@ -1,4 +1,4 @@
 {
-    "max_seq_length":
+    "max_seq_length": 8192,
     "do_lower_case": false
 }
```
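The `max_seq_length` of 8192 matches `max_position_embeddings` in config.json and `model_max_length` in tokenizer_config.json. If needed, it can be inspected or lowered after loading; a small sketch (repository id is a placeholder):

```python
# sentence-transformers reads max_seq_length from sentence_bert_config.json.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("fine-tuned/<model-id>", trust_remote_code=True)
print(model.max_seq_length)   # 8192
model.max_seq_length = 2048   # optionally trade context length for speed/memory
```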
tokenizer.json
CHANGED
The diff for this file is too large to render. See the raw diff.
tokenizer_config.json
CHANGED
```diff
@@ -48,7 +48,7 @@
   "eos_token": "</s>",
   "errors": "replace",
   "mask_token": "<mask>",
-  "model_max_length":
+  "model_max_length": 8192,
   "pad_token": "<pad>",
   "sep_token": "</s>",
   "tokenizer_class": "RobertaTokenizer",
```
training_args.bin
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:319178590e1a387af3f5fd9616331719193c198e3e30efe9fd546a9cd494ae1a
 size 4719
```
vocab.json
CHANGED
The diff for this file is too large to render. See the raw diff.