Saving and Loading the fine-tuned model

#24
by Maiia - opened

I fine-tuned the model on my data with the SentenceTransformers library, but plain model.save() does not seem to work: it saves without errors, but when I reload the model in the next session I get the warning
"Some weights of BertModel were not initialized from the model checkpoint at ... and are newly initialized".
Can you please help: how can I save and reload the model (ideally with the SentenceTransformers library)?

OK, I think I found a workaround:

!git clone https://huggingface.co/jinaai/jina-bert-implementation
!mv jina-bert-implementation jina_bert_implementation
!touch jina_bert_implementation/__init__.py

import torch
from jina_bert_implementation.modeling_bert import JinaBertModel

# pick a device for inference
device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint = "my_checkpoint"
model = JinaBertModel.from_pretrained(checkpoint)
model.to(device)

from transformers import AutoTokenizer
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')

# Tokenize sentences ("sentences" is a list of input strings)
sentences = ["example sentence one", "example sentence two"]  # placeholder inputs
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
encoded_input = {
    key: val.to(device) for key, val in encoded_input.items()
}
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentences_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
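
To compare the pooled embeddings, you can L2-normalize them and take dot products (cosine similarity); a minimal sketch:

import torch.nn.functional as F

# normalize so that dot products equal cosine similarities
sentences_embeddings = F.normalize(sentences_embeddings, p=2, dim=1)
cosine_similarities = sentences_embeddings @ sentences_embeddings.T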

I would still appreciate help, because I want to load it as a SentenceTransformer for ease of use.

Jina AI org

hi @Maiia, can you manually edit the SentenceTransformer class and add trust_remote_code=True where sbert does the AutoModel.from_pretrained(...) call?

I think the SBERT main branch supports it, but not the latest PyPI release.
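
Once that support is in the installed version, loading should look roughly like this (a sketch, assuming a sentence-transformers release whose constructor accepts trust_remote_code):

from sentence_transformers import SentenceTransformer

# trust_remote_code lets the custom JinaBert modeling code be loaded from the Hub
model = SentenceTransformer("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)
embeddings = model.encode(["example sentence one", "example sentence two"])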

I was not able to find where to change it, but I adapted the function and created a class similar to SentenceTransformer (at least it does the encoding efficiently).
Maybe someone else will find it useful:

from tqdm.notebook import trange
import numpy as np
import torch

from transformers import AutoModel, AutoTokenizer

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')

class JinaSentEmbedder:
    def __init__(self, path):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # load the fine-tuned checkpoint with the custom JinaBert modeling code
        self.model = AutoModel.from_pretrained(
            path,
            trust_remote_code=True
        )
        self.model = self.model.to(self.device)
        self.tokenize = AutoTokenizer.from_pretrained(
            "jinaai/jina-embeddings-v2-base-en"
        )
        
    def _text_length(self, text):
        if isinstance(text, dict):              #{key: value} case
            return len(next(iter(text.values())))
        elif not hasattr(text, '__len__'):      #Object has no len() method
            return 1
        elif len(text) == 0 or isinstance(text[0], int):    #Empty string or list of ints
            return len(text)
        else:
            return sum([len(t) for t in text])
    def encode(self, sentences,
               batch_size = 32,
               show_progress_bar = None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False):

        self.model.eval()

        if convert_to_tensor:
            convert_to_numpy = False

        if output_value != 'sentence_embedding':
            convert_to_tensor = False
            convert_to_numpy = False

        input_was_string = False
        if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1
            sentences = [sentences]
            input_was_string = True

        all_embeddings = []
        length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences])
        sentences_sorted = [sentences[idx] for idx in length_sorted_idx]

        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
            sentences_batch = sentences_sorted[start_index:start_index+batch_size]
            encoded_input = self.tokenize(sentences_batch, padding=True, truncation=True, return_tensors='pt')
            encoded_input = {key: val.to(self.device) for key, val in encoded_input.items()}

            with torch.no_grad():
                model_output = self.model(**encoded_input)
                sentences_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
            all_embeddings.extend(sentences_embeddings)

        all_embeddings = [all_embeddings[idx].cpu() for idx in np.argsort(length_sorted_idx)]

        if convert_to_tensor:
            all_embeddings = torch.stack(all_embeddings)
        elif convert_to_numpy:
            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])

        if input_was_string:
            all_embeddings = all_embeddings[0]

        return all_embeddings
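
Usage then mirrors SentenceTransformer.encode (the checkpoint path below is a placeholder):

embedder = JinaSentEmbedder("my_checkpoint")
embeddings = embedder.encode(
    ["example sentence one", "example sentence two"],
    batch_size=32,
    convert_to_numpy=True,
)
print(embeddings.shape)  # (2, embedding_dim)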
Maiia changed discussion status to closed

Hi @Maiia, could you please share the code you used to fine-tune this model?
Thank you in advance!

@metalwhale
Hello, it's just normal SentenceTransformers fine-tuning. I marked up pairs of phrases with labels (so phrase1, phrase2, label), where "label" can be either "pos" or "neg":
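
For context, the training code below assumes roughly the following imports and setup (a sketch; dedup_negatives and hard_positives are the marked-up pairs, and how the base model is loaded may differ):

import polars as pl
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# the model being fine-tuned (assumed to be the Jina base checkpoint)
model = SentenceTransformer("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)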

title_df = pl.DataFrame({
    "title 1": [el[0] for el in dedup_negatives] + [el[0] for el in hard_positives],
    'title 2': [el[1] for el in dedup_negatives] + [el[1] for el in hard_positives],
    'label': ['neg'] * len(dedup_negatives) + ['pos'] * len(hard_positives)
})

for _ in range(5):
    title_df = title_df.sample(fraction=1, shuffle=True)
train_df, val_df = train_test_split(title_df, random_state=42,
                                    test_size=0.1,
                                    stratify=title_df['label'].to_list())
train_examples = []
for row in train_df.iter_rows(named=True):
    train_examples.append(
        InputExample(texts=[row['title 1'], row['title 2']],
                     label=torch.tensor(1 if row['label'] == 'pos' else 0).to(torch.float32))
    )
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

sentences1 = val_df['title 1'].to_list()
sentences2 = val_df['title 2'].to_list()
scores = [torch.tensor(1 if el == 'pos' else 0).to(torch.float32) for el in val_df['label'].to_list()]

evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1,
          warmup_steps=len(train_dataloader)//10,
          evaluator=evaluator, evaluation_steps=len(train_dataloader)//10)
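
After training, the fine-tuned model can be saved and reloaded; a minimal sketch, assuming a sentence-transformers version that accepts trust_remote_code (older releases need the workaround above):

model.save("jina-finetuned")

# in a new session
from sentence_transformers import SentenceTransformer
reloaded = SentenceTransformer("jina-finetuned", trust_remote_code=True)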

@metalwhale I hope this helps; here is the documentation: https://www.sbert.net/docs/training/overview.html

@Maiia thank you so much for your kind help. I really appreciate it!
