# Proposal of the request

Fix the `ValueError` raised when saving a model with `.save_pretrained`, caused by non-contiguous tensors in the Midm model.

## Err log

- The `ValueError` happens both when calling `save_pretrained` directly and when using the `Trainer` of the transformers library.

ValueError: You are trying to save a non contiguous tensor: `transformer.h.0.attn.c_attn.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

## Full Code & Traceback

In [1]: from transformers import AutoModelForCausalLM

In [2]: import torch

In [3]: model = AutoModelForCausalLM.from_pretrained(
...: 'KT-AI/midm-bitext-S-7B-inst-v1', device_map={'':1})
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.44s/it]

In [4]: model
(transformer): MidmModel(
(wte): Embedding(72192, 4096)
(rotary_pos_emb): RotaryEmbedding()
(drop): Dropout(p=0.0, inplace=False)
(h): ModuleList(
(0-31): 32 x MidmBlock(
(ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
(attn): MidmAttention(
(c_attn): Linear(in_features=4096, out_features=12288, bias=False)
(c_proj): Linear(in_features=4096, out_features=4096, bias=False)
(attn_dropout): Dropout(p=0.0, inplace=False)
(resid_dropout): Dropout(p=0.0, inplace=False)
(ln_2): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
(mlp): MidmMLP(
(c_fc): Linear(in_features=4096, out_features=21760, bias=False)
(c_proj): Linear(in_features=10880, out_features=4096, bias=False)
(dropout): Dropout(p=0.0, inplace=False)
(ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
(lm_head): Linear(in_features=4096, out_features=72192, bias=False)

In [5]: model.save_pretrained('test')
ValueError Traceback (most recent call last)
Cell In[5], line 1
----> 1 model.save_pretrained('test')

File ~/anaconda3/envs/career-chatbot-trainer-clm/lib/python3.10/site-packages/transformers/modeling_utils.py:2187, in PreTrainedModel.save_pretrained(self, save_directory, is_main_process, state_dict, save_function, push_to_hub, max_shard_size, safe_serialization, variant, token, save_peft_format, **kwargs)
2183 for shard_file, shard in shards.items():
2184 if safe_serialization:
2185 # At some point we will need to deal better with save_function (used for TPU and other distributed
2186 # joyfulness), but for now this enough.
-> 2187 safe_save_file(shard, os.path.join(save_directory, shard_file), metadata={"format": "pt"})
2188 else:
2189 save_function(shard, os.path.join(save_directory, shard_file))

File ~/anaconda3/envs/career-chatbot-trainer-clm/lib/python3.10/site-packages/safetensors/torch.py:281, in save_file(tensors, filename, metadata)
250 def save_file(
251 tensors: Dict[str, torch.Tensor],
252 filename: Union[str, os.PathLike],
253 metadata: Optional[Dict[str, str]] = None,
254 ):
255 """
256 Saves a dictionary of tensors into raw bytes in safetensors format.
279 ```
280 """
--> 281 serialize_file(_flatten(tensors), filename, metadata=metadata)

File ~/anaconda3/envs/career-chatbot-trainer-clm/lib/python3.10/site-packages/safetensors/torch.py:475, in _flatten(tensors)
466 if failing:
467 raise RuntimeError(
468 f"""
469 Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: {failing}.
472 """
473 )
--> 475 return {
476 k: {
477 "dtype": str(v.dtype).split(".")[-1],
478 "shape": v.shape,
479 "data": _tobytes(v, k),
480 }
481 for k, v in tensors.items()
482 }

File ~/anaconda3/envs/career-chatbot-trainer-clm/lib/python3.10/site-packages/safetensors/torch.py:479, in <dictcomp>(.0)
466 if failing:
467 raise RuntimeError(
468 f"""
469 Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: {failing}.
472 """
473 )
475 return {
476 k: {
477 "dtype": str(v.dtype).split(".")[-1],
478 "shape": v.shape,
--> 479 "data": _tobytes(v, k),
480 }
481 for k, v in tensors.items()
482 }

File ~/anaconda3/envs/career-chatbot-trainer-clm/lib/python3.10/site-packages/safetensors/torch.py:396, in _tobytes(tensor, name)
389 raise ValueError(
390 f"You are trying to save a sparse tensor: `{name}` which this library does not support."
391 " You can make it a dense tensor before saving with `.to_dense()` but be aware this might"
392 " make a much larger file than needed."
393 )
395 if not tensor.is_contiguous():
--> 396 raise ValueError(
397 f"You are trying to save a non contiguous tensor: `{name}` which is not allowed. It either means you"
398 " are trying to save tensors which are reference of each other in which case it's recommended to save"
399 " only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to"
400 " pack it before saving."
401 )
402 if tensor.device.type != "cpu":
403 # Moving tensor to cpu before saving
404 tensor = tensor.to("cpu")

ValueError: You are trying to save a non contiguous tensor: `transformer.h.0.attn.c_attn.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

# Solution

Override the `.save_pretrained` method on `MidmPreTrainedModel` so that the model's tensors are made contiguous before saving.

```python
class MidmPreTrainedModel(PreTrainedModel):
    # ... [other methods and properties of the class]

    def make_tensors_contiguous(self):
        for name, param in self.named_parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()

    def save_pretrained(self, save_directory, **kwargs):
        # Make tensors contiguous
        self.make_tensors_contiguous()

        # Call the original save_pretrained method
        super().save_pretrained(save_directory, **kwargs)

# Other class definitions remain unchanged
```

# Result

The `save_pretrained` method now completes without error.

In [1]: from modeling_midm import MidmLMHeadModel

In [2]: model = MidmLMHeadModel.from_pretrained('KT-AI/midm-bitext-S-7B-inst-v1', device_map={'':1})
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:16<00:00, 8.30s/it]

In [3]: model.save_pretrained('test')

In [4]: exit

@@ -460,6 +460,18 @@ class MidmPreTrainedModel(PreTrainedModel):
  if isinstance(module, MidmModel):
  module.gradient_checkpointing = value
  class MidmDoubleHeadsModelOutput(ModelOutput):
+ def make_tensors_contiguous(self):
+ for name, param in self.named_parameters():
+ if not param.is_contiguous():
+ param.data = param.data.contiguous()
+ def save_pretrained(self, save_directory, **kwargs):
+ # Make tensors contiguous
+ self.make_tensors_contiguous()
+ # Call the original save_pretrained method
+ super().save_pretrained(save_directory, **kwargs)
  class MidmDoubleHeadsModelOutput(ModelOutput):