# CoreML Conversion of the mxbai-embed-large-v1 sentence embedding model

After extensive testing (and a lot of debugging with ChatGPT), I was able to convert the mxbai-embed-large-v1 model to CoreML and run it mostly on the GPU.

```python
import coremltools as ct
import numpy as np  # needed for the np.float32 input dtypes passed to ct.convert
import torch
from transformers import AutoModel, AutoTokenizer


class ModelWrapper(torch.nn.Module):
    """Wrap a Hugging Face model so that forward() returns a single tensor.

    The wrapped model returns a structured output object; this wrapper
    exposes only its ``last_hidden_state`` so the traced graph has one
    plain tensor output.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its ``last_hidden_state``."""
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return output.last_hidden_state  # or use 'pooler_output' if needed


# Load the Hugging Face model and tokenizer
model_name = "mixedbread-ai/mxbai-embed-large-v1"  # Replace with your model
model = AutoModel.from_pretrained(model_name)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model to return only the tensor output
wrapped_model = ModelWrapper(model)
wrapped_model.eval()

# Sample input to export the model
dummy_input = tokenizer("This is a sample input", return_tensors="pt")

# Trace the model using tensor inputs (input_ids, attention_mask)
traced_model = torch.jit.trace(
    wrapped_model,
    (dummy_input["input_ids"], dummy_input["attention_mask"]),
)

# Convert the traced PyTorch model to CoreML using the ML Program format
model_from_torch = ct.convert(
    traced_model,
    inputs=[
        ct.TensorType(
            name="input_ids",
            shape=(1, ct.RangeDim(1, 512)),
            dtype=np.float32,
        ),
        ct.TensorType(
            name="attention_mask",
            shape=(1, ct.RangeDim(1, 512)),
            dtype=np.float32,
        ),
    ],
    # Name the output explicitly so the inference script below can look it up
    # as "hidden_states" instead of relying on an auto-generated output name.
    outputs=[ct.TensorType(name="hidden_states")],
    minimum_deployment_target=ct.target.iOS17,
    convert_to="mlprogram",
    compute_precision=ct.precision.FLOAT16,
)

# Save the CoreML model as an mlpackage
model_from_torch.save("mxbai-embed-large-v1.mlpackage")
```

It can be run like this:

```python
import coremltools as ct
import numpy as np
from transformers import AutoTokenizer

# Load the CoreML model
model = ct.models.MLModel("mxbai-embed-large-v1.mlpackage")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")

# Prepare some input text
input_text = "This is a test sentence for the CoreML model"
inputs = tokenizer(
    input_text,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512,
)

# Extract input tensors; the model inputs were declared as float32 at conversion
input_ids = inputs["input_ids"].astype(np.float32)
attention_mask = inputs["attention_mask"].astype(np.float32)

# Prepare inputs for the CoreML model
coreml_input = {"input_ids": input_ids, "attention_mask": attention_mask}
predictions = model.predict(coreml_input)

# "hidden_states" is the output name assigned in the conversion script
hidden_states = predictions["hidden_states"]
cls_embedding = hidden_states[0, 0, :]  # first sequence, first (CLS) token

# Print the full vector instead of NumPy's truncated summary
np.set_printoptions(threshold=np.inf)

# Print the CLS token embedding, which is a 1024-dimensional vector
print("CLS Token Embedding:", cls_embedding, len(cls_embedding))
```

I verified the output with ollama:

```bash
curl http://localhost:11434/api/embeddings -d '{
  "model": "mxbai-embed-large",
  "prompt": "This is a test sentence for the CoreML model"
}'
```

Environment:

- Python 3.11
- coremltools 8.0
- sentence-transformers 3.1.0
- transformers 4.44.2