Saving and Loading the fine-tuned model
I fine-tuned the model on my data with the SentenceTransformers library, but simply calling model.save() does not work: it saves without errors, but when I reload the model in the next session I get a "Some weights of BertModel were not initialized from the model checkpoint at ... and are newly initialized" warning.
Can you please help: how can I save and reload the model (ideally with the SentenceTransformers library)?
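For reference, the save/reload sequence looks roughly like this (the checkpoint path is a placeholder for my setup):

from sentence_transformers import SentenceTransformer

# trust_remote_code is needed for the custom Jina modeling code
# (supported as a direct argument in newer sentence-transformers versions)
model = SentenceTransformer("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)
# ... fine-tuning ...
model.save("my_checkpoint")

# In the next session, reloading triggers the warning about newly initialized weights:
model = SentenceTransformer("my_checkpoint")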
OK, I think I found a workaround:
# Fetch the custom Jina BERT modeling code and make it importable as a local package
!git clone https://huggingface.co/jinaai/jina-bert-implementation
!mv jina-bert-implementation jina_bert_implementation
!touch jina_bert_implementation/__init__.py

import torch
from transformers import AutoTokenizer
from jina_bert_implementation.modeling_bert import JinaBertModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the fine-tuned weights into the custom model class
checkpoint = "my_checkpoint"
model = JinaBertModel.from_pretrained(checkpoint)
model.to(device)
# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
# Load the tokenizer from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')

# Tokenize sentences and move the batch to the model's device
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
encoded_input = {key: val.to(device) for key, val in encoded_input.items()}
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling - in this case, mean pooling
sentences_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
I would still appreciate help, because I want to load it as a SentenceTransformer for ease of use.
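One direction that might work (I have not fully verified it with the custom Jina code) is assembling a SentenceTransformer from a Transformer module plus a Pooling module, which should reproduce the mean pooling above; the checkpoint path is a placeholder, and forwarding model_args to the underlying model requires a recent sentence-transformers version:

from sentence_transformers import SentenceTransformer, models

# Wrap the fine-tuned checkpoint as a Transformer module;
# model_args is forwarded to AutoModel.from_pretrained in recent versions
word_embedding_model = models.Transformer(
    "my_checkpoint",
    model_args={"trust_remote_code": True},
)
# Mean pooling over token embeddings, matching the mean_pooling() function above
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode="mean",
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
embeddings = model.encode(["example sentence"])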
I was not able to find where to change it, but I adapted the function and created a class similar to SentenceTransformer (at least it does the encoding efficiently). Maybe someone else finds it useful:
from tqdm.notebook import trange
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer


def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
class JinaSentEmbedder:
    """Minimal stand-in for SentenceTransformer.encode() around a fine-tuned checkpoint."""

    def __init__(self, path):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = AutoModel.from_pretrained(path, trust_remote_code=True)
        self.model = self.model.to(self.device)
        # Tokenizer of the base model (the checkpoint was fine-tuned from it)
        self.tokenize = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")

    def _text_length(self, text):
        if isinstance(text, dict):  # {key: value} case
            return len(next(iter(text.values())))
        elif not hasattr(text, '__len__'):  # object has no len() method
            return 1
        elif len(text) == 0 or isinstance(text[0], int):  # empty string or list of ints
            return len(text)
        else:
            return sum([len(t) for t in text])

    def encode(self, sentences,
               batch_size=32,
               show_progress_bar=None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,  # kept for signature compatibility; self.device is used
               normalize_embeddings: bool = False):
        self.model.eval()
        if convert_to_tensor:
            convert_to_numpy = False
        if output_value != 'sentence_embedding':
            convert_to_tensor = False
            convert_to_numpy = False
        input_was_string = False
        if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
            # Cast an individual sentence to a list with length 1
            sentences = [sentences]
            input_was_string = True
        all_embeddings = []
        # Sort by length so each batch contains similarly sized inputs (less padding)
        length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences])
        sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
            sentences_batch = sentences_sorted[start_index:start_index + batch_size]
            encoded_input = self.tokenize(sentences_batch, padding=True, truncation=True, return_tensors='pt')
            encoded_input = {key: val.to(self.device) for key, val in encoded_input.items()}
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            sentences_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
            if normalize_embeddings:
                sentences_embeddings = torch.nn.functional.normalize(sentences_embeddings, p=2, dim=1)
            all_embeddings.extend(sentences_embeddings)
        # Restore the original input order
        all_embeddings = [all_embeddings[idx].cpu() for idx in np.argsort(length_sorted_idx)]
        if convert_to_tensor:
            all_embeddings = torch.stack(all_embeddings)
        elif convert_to_numpy:
            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
        if input_was_string:
            all_embeddings = all_embeddings[0]
        return all_embeddings
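Usage is then just (checkpoint path as above):

embedder = JinaSentEmbedder("my_checkpoint")
embeddings = embedder.encode(["first sentence", "second sentence"], convert_to_numpy=True)
print(embeddings.shape)  # (2, embedding_dim)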
Hi @Maiia, could you please share the code you utilized for fine-tuning this model? Thank you in advance!
@metalwhale Hello, it's just normal SentenceTransformers fine-tuning. I marked up pairs of phrases with labels (so phrase1, phrase2, label), where the label can be either "pos" or "neg":
import torch
import polars as pl
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers import InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# dedup_negatives and hard_positives are my lists of (phrase1, phrase2) pairs
title_df = pl.DataFrame({
    "title 1": [el[0] for el in dedup_negatives] + [el[0] for el in hard_positives],
    "title 2": [el[1] for el in dedup_negatives] + [el[1] for el in hard_positives],
    "label": ["neg"] * len(dedup_negatives) + ["pos"] * len(hard_positives)
})

# Shuffle a few times before the stratified split
for _ in range(5):
    title_df = title_df.sample(fraction=1, shuffle=True)
train_df, val_df = train_test_split(title_df, random_state=42,
                                    test_size=0.1,
                                    stratify=title_df['label'].to_list())

train_examples = []
for row in train_df.iter_rows(named=True):
    train_examples.append(
        InputExample(texts=[row['title 1'], row['title 2']],
                     label=torch.tensor(1 if row['label'] == 'pos' else 0).to(torch.float32))
    )
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
# model is the SentenceTransformer instance being fine-tuned
train_loss = losses.CosineSimilarityLoss(model)

# Validation pairs and scores for the evaluator
sentences1 = val_df['title 1'].to_list()
sentences2 = val_df['title 2'].to_list()
scores = [torch.tensor(1 if el == 'pos' else 0).to(torch.float32) for el in val_df['label'].to_list()]
evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1,
          warmup_steps=len(train_dataloader)//10,
          evaluator=evaluator, evaluation_steps=len(train_dataloader)//10)
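As an aside, for the saving question at the top of this thread: model.fit() also accepts output_path and save_best_model arguments, which is the standard SentenceTransformers way to persist the fine-tuned model (the path here is a placeholder):

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1,
          warmup_steps=len(train_dataloader)//10,
          evaluator=evaluator, evaluation_steps=len(train_dataloader)//10,
          output_path="my_checkpoint",  # directory where the model is saved
          save_best_model=True)  # keep the checkpoint with the best evaluator score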
@metalwhale I hope it helps; here is the documentation: https://www.sbert.net/docs/training/overview.html