Memory leak(memory increasing slowly in each inference in cpu)

#19

by scancet - opened Nov 3, 2023

Nov 3, 2023

Hello,
I used transformers library of intfloat/multilingual-e5-large this model. I used the same code that is shared in its model card. I dockerized it and started to use. It increases the memory in each inference and then it exceeding my memory limit after a while.

Here is my code,

import torch.nn.functional as F
from torch import Tensor, no_grad, cuda, device
from transformers import AutoTokenizer, AutoModel
import gc

class Model():

def __init__(self, path='resources/intfloat_multilingual-e5-large'):
    self.tokenizer = AutoTokenizer.from_pretrained(path)
    self.model = AutoModel.from_pretrained(path)
    dvc = device('cpu')
    self.model.to(dvc)
    self.model.eval()

def average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def inference(self, texts): 
    with no_grad():
        batch_dict = self.tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
        #print(batch_dict)
        #print(self.model.config)
        outputs = self.model(**batch_dict)
        embeddings = self.average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        del outputs
        embeddings = F.normalize(embeddings, p=2, dim=1)
        embeddings = embeddings.numpy().tolist()
        gc.collect()
        cuda.empty_cache()
    return embeddings

model = Model()

Here is my docker stats. Its initially uses around 2.6gb ram in memory. But in each iteration it increases slowly.
Please let me know if I can clear the cache of the memory or in any way I can stop this memory leak.

Thanks

intfloat

Owner Nov 4, 2023

This looks strange, python and pytorch should do GC automatically. Is it possible that you store too many embedding vectors that cause the OOM issue?

skapadia-zalando

Oct 24, 2024

I can confirm this issue - my input_texts are only about 100MB in memory but getting embeddings for them takes reaches 200GB (which is where my server crashes)

canrager

Feb 11

I have this issue too

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large', cache_dir=CACHE_DIR)
model = AutoModel.from_pretrained(
    'intfloat/multilingual-e5-large', 
    cache_dir=CACHE_DIR, 
    device_map=DEVICE,
    torch_dtype=torch.bfloat16
)

model.eval()

# Tokenize the input texts

batch_size = 128
hidden_size = model.config.hidden_size

all_embeddings = torch.zeros(len(input_texts), hidden_size, device="cpu")

for batch_start in trange(0, len(input_texts), batch_size):
    print_gpu_memory()
    batch_end = batch_start + batch_size
    batch_dict = tokenizer(input_texts[batch_start:batch_end], max_length=512, padding=True, truncation=True, return_tensors='pt')
    batch_dict['input_ids'] = batch_dict['input_ids'].to(DEVICE)
    batch_dict['attention_mask'] = batch_dict['attention_mask'].to(DEVICE)

    outputs = model(**batch_dict)

    # outputs.last_hidden_state.shape (batch_size, sequence_length, hidden_size) hidden activations of the last layer
    batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) # average over the sequence length

    # normalize embeddings
    batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
    batch_embeddings = batch_embeddings.to('cpu')
    
    all_embeddings[batch_start:batch_end] = batch_embeddings
    all_embeddings = all_embeddings.to("cpu")
    del outputs.last_hidden_state
    del outputs
    del batch_embeddings, batch_dict
    torch.cuda.empty_cache()
    gc.collect()

canrager

Feb 11

actually, using the with torch.no_grad() context did the trick and resolved OOM issues.
@intfloat why is the no_grad context necessary?

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large', cache_dir=CACHE_DIR)
model = AutoModel.from_pretrained(
    'intfloat/multilingual-e5-large', 
    cache_dir=CACHE_DIR, 
    device_map=DEVICE,
    torch_dtype=torch.bfloat16
)
#%%
# Tokenize the input texts

batch_size = 128
hidden_size = model.config.hidden_size

all_embeddings = torch.zeros(len(input_texts), hidden_size, device="cpu")

with torch.no_grad():
    for batch_start in trange(0, len(input_texts), batch_size):
        print_gpu_memory()
        batch_end = batch_start + batch_size
    batch_dict = tokenizer(input_texts[batch_start:batch_end], max_length=512, padding=True, truncation=True, return_tensors='pt')
    batch_dict['input_ids'] = batch_dict['input_ids'].to(DEVICE)
    batch_dict['attention_mask'] = batch_dict['attention_mask'].to(DEVICE)

    outputs = model(**batch_dict)

    # outputs.last_hidden_state.shape (batch_size, sequence_length, hidden_size) hidden activations of the last layer
    batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) # average over the sequence length

    # normalize embeddings
    batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
    batch_embeddings = batch_embeddings.to('cpu')
    
    all_embeddings[batch_start:batch_end] = batch_embeddings
    all_embeddings = all_embeddings.to("cpu")
    del outputs.last_hidden_state
    del outputs
    del batch_embeddings, batch_dict
    torch.cuda.empty_cache()
    gc.collect()

intfloat

Owner Feb 13

@canrager Thanks for letting us know! I think the default behavior of pytorch is to save intermediate activations for backward calculation later, so the memory keeps increasing without torch.no_grad().

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

· Sign up or log in to comment