Inference fails on CPU: `ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)`
When one runs the code below, taken exactly from the README except for the addition of `device='cpu'`, they get the error `ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)`:
import torch
from transformers import pipeline
from pprint import pprint
pipe = pipeline(
"fill-mask",
model="answerdotai/ModernBERT-base",
torch_dtype=torch.bfloat16,
device='cpu',
)
input_text = "He walked to the [MASK]."
results = pipe(input_text)
pprint(results)
Here is the full traceback of the error:
ValueError Traceback (most recent call last)
Cell In[1], line 13
5 pipe = pipeline(
6 "fill-mask",
7 model="answerdotai/ModernBERT-base",
8 torch_dtype=torch.bfloat16,
9 device='cpu',
10 )
12 input_text = "He walked to the [MASK]."
---> 13 results = pipe(input_text)
14 pprint(results)
File ~/dev/.venv/lib/python3.12/site-packages/transformers/pipelines/fill_mask.py:270, in FillMaskPipeline.__call__(self, inputs, **kwargs)
248 def __call__(self, inputs, **kwargs):
249 """
250 Fill the masked token in the text(s) given as inputs.
251
(...)
268 - **token_str** (str) -- The predicted token (to replace the masked one).
269 """
--> 270 outputs = super().__call__(inputs, **kwargs)
271 if isinstance(inputs, list) and len(inputs) == 1:
272 return outputs[0]
File ~/dev/.venv/lib/python3.12/site-packages/transformers/pipelines/base.py:1301, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1293 return next(
1294 iter(
1295 self.get_iterator(
(...)
1298 )
1299 )
1300 else:
-> 1301 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
File ~/dev/.venv/lib/python3.12/site-packages/transformers/pipelines/base.py:1308, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1306 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1307 model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1308 model_outputs = self.forward(model_inputs, **forward_params)
1309 outputs = self.postprocess(model_outputs, **postprocess_params)
1310 return outputs
File ~/dev/.venv/lib/python3.12/site-packages/transformers/pipelines/base.py:1208, in Pipeline.forward(self, model_inputs, **forward_params)
1206 with inference_context():
1207 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1208 model_outputs = self._forward(model_inputs, **forward_params)
1209 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
1210 else:
File ~/dev/.venv/lib/python3.12/site-packages/transformers/pipelines/fill_mask.py:127, in FillMaskPipeline._forward(self, model_inputs)
126 def _forward(self, model_inputs):
--> 127 model_outputs = self.model(**model_inputs)
128 model_outputs["input_ids"] = model_inputs["input_ids"]
129 return model_outputs
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File ~/dev/.venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:1059, in ModernBertForMaskedLM.forward(self, input_ids, attention_mask, sliding_window_mask, position_ids, labels, indices, cu_seqlens, max_seqlen, batch_size, seq_len, output_attentions, output_hidden_states, return_dict, **kwargs)
1054 with torch.no_grad():
1055 input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
1056 inputs=input_ids, attention_mask=attention_mask, position_ids=position_ids, labels=labels
1057 )
-> 1059 outputs = self.model(
1060 input_ids,
1061 attention_mask=attention_mask,
1062 sliding_window_mask=sliding_window_mask,
1063 position_ids=position_ids,
1064 indices=indices,
1065 cu_seqlens=cu_seqlens,
1066 max_seqlen=max_seqlen,
1067 batch_size=batch_size,
1068 seq_len=seq_len,
1069 output_attentions=output_attentions,
1070 output_hidden_states=output_hidden_states,
1071 return_dict=return_dict,
1072 )
1073 last_hidden_state = outputs[0]
1075 if self.sparse_prediction and labels is not None:
1076 # flatten labels and output first
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File ~/dev/.venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:913, in ModernBertModel.forward(self, input_ids, attention_mask, sliding_window_mask, position_ids, indices, cu_seqlens, max_seqlen, batch_size, seq_len, output_attentions, output_hidden_states, return_dict)
902 layer_outputs = self._gradient_checkpointing_func(
903 encoder_layer.__call__,
904 hidden_states,
(...)
910 output_attentions,
911 )
912 else:
--> 913 layer_outputs = encoder_layer(
914 hidden_states,
915 attention_mask=attention_mask,
916 sliding_window_mask=sliding_window_mask,
917 position_ids=position_ids,
918 cu_seqlens=cu_seqlens,
919 max_seqlen=max_seqlen,
920 output_attentions=output_attentions,
921 )
922 hidden_states = layer_outputs[0]
923 if output_attentions and len(layer_outputs) > 1:
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File ~/dev/.venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:529, in ModernBertEncoderLayer.forward(self, hidden_states, attention_mask, sliding_window_mask, position_ids, cu_seqlens, max_seqlen, output_attentions)
519 def forward(
520 self,
521 hidden_states: torch.Tensor,
(...)
527 output_attentions: Optional[bool] = False,
528 ) -> torch.Tensor:
--> 529 attn_outputs = self.attn(
530 self.attn_norm(hidden_states),
531 attention_mask=attention_mask,
532 sliding_window_mask=sliding_window_mask,
533 position_ids=position_ids,
534 cu_seqlens=cu_seqlens,
535 max_seqlen=max_seqlen,
536 output_attentions=output_attentions,
537 )
538 hidden_states = hidden_states + attn_outputs[0]
539 mlp_output = (
540 self.compiled_mlp(hidden_states)
541 if self.config.reference_compile
542 else self.mlp(self.mlp_norm(hidden_states))
543 )
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File ~/dev/.venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:487, in ModernBertAttention.forward(self, hidden_states, output_attentions, **kwargs)
484 else:
485 qkv = qkv.view(bs, -1, 3, self.num_heads, self.head_dim)
--> 487 attn_outputs = MODERNBERT_ATTENTION_FUNCTION[self.config._attn_implementation](
488 self,
489 qkv=qkv,
490 rotary_emb=self.rotary_emb,
491 local_attention=self.local_attention,
492 bs=bs,
493 dim=self.all_head_size,
494 output_attentions=output_attentions,
495 **kwargs,
496 )
497 hidden_states = attn_outputs[0]
498 hidden_states = self.out_drop(self.Wo(hidden_states))
File ~/dev/.venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:349, in flash_attention_forward(module, qkv, rotary_emb, cu_seqlens, max_seqlen, local_attention, bs, dim, target_dtype, **_kwargs)
336 def flash_attention_forward(
337 module: "ModernBertAttention",
338 qkv: torch.Tensor,
(...)
347 ) -> Tuple[torch.Tensor]:
348 # (total_seqlen, 3, nheads, headdim)
--> 349 qkv = rotary_emb(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
351 convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
352 if convert_dtype:
353 # FA2 implementation only supports fp16 and bf16. If FA2 is supported,
354 # bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File ~/dev/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File ~/dev/.venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:178, in ModernBertUnpaddedRotaryEmbedding.forward(self, qkv, cu_seqlens, max_seqlen)
175 if max_seqlen is not None:
176 self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
--> 178 qkv = apply_rotary_unpadded(
179 qkv,
180 self._cos_cached,
181 self._sin_cached,
182 cu_seqlens=cu_seqlens,
183 max_seqlen=max_seqlen,
184 )
186 return qkv
File ~/dev/.venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:136, in apply_rotary_unpadded(qkv, cos, sin, cu_seqlens, max_seqlen)
113 def apply_rotary_unpadded(
114 qkv,
115 cos,
(...)
118 max_seqlen: Optional[int] = None,
119 ):
120 """
121 Arguments:
122 qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
(...)
134 Apply rotary embedding to the first rotary_dim of x.
135 """
--> 136 return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)
File ~/dev/.venv/lib/python3.12/site-packages/torch/autograd/function.py:575, in Function.apply(cls, *args, **kwargs)
572 if not torch._C._are_functorch_transforms_active():
573 # See NOTE: [functorch vjp and autograd interaction]
574 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 575 return super().apply(*args, **kwargs) # type: ignore[misc]
577 if not is_setup_ctx_defined:
578 raise RuntimeError(
579 "In order to use an autograd.Function with functorch transforms "
580 "(vmap, grad, jvp, jacrev, ...), it must override the setup_context "
581 "staticmethod. For more details, please see "
582 "https://pytorch.org/docs/main/notes/extending.func.html"
583 )
File ~/dev/.venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:75, in ApplyRotaryEmbUnpad.forward(ctx, qkv, cos, sin, cu_seqlens, max_seqlen)
71 # We need qkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
72 # we get the same tensor
73 # qk = rearrange(qkv[:, :2], "b_s t h d -> b_s (t h) d")
74 qk = qkv[:, :2].view(total_nnz, -1, headdim)
---> 75 apply_rotary(
76 qk,
77 cos,
78 sin,
79 seqlen_offsets=0,
80 cu_seqlens=cu_seqlens,
81 max_seqlen=max_seqlen,
82 interleaved=False,
83 inplace=True,
84 )
86 ctx.save_for_backward(cos, sin, cu_seqlens)
87 ctx.max_seqlen = max_seqlen
File ~/dev/.venv/lib/python3.12/site-packages/flash_attn/ops/triton/rotary.py:202, in apply_rotary(x, cos, sin, seqlen_offsets, cu_seqlens, max_seqlen, interleaved, inplace, conjugate)
199 # Need this, otherwise Triton tries to launch from cuda:0 and we get
200 # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
201 with torch.cuda.device(x.device.index):
--> 202 rotary_kernel[grid](
203 output, # data ptrs
204 x,
205 cos,
206 sin,
207 cu_seqlens,
208 seqlen_offsets,
209 seqlen, # shapes
210 rotary_dim,
211 seqlen_ro,
212 output.stride(0) if not is_varlen else 0, # batch_strides if not varlen else 0
213 output.stride(-3), # seqlen_stride or total_seqlen_stride
214 output.stride(-2), # nheads_stride
215 output.stride(-1), # headdim_stride
216 x.stride(0) if not is_varlen else 0, # batch_strides if not varlen else 0
217 x.stride(-3), # seqlen stride or total_seqlen_stride
218 x.stride(-2), # nheads stride
219 x.stride(-1), # headdim stride
220 BLOCK_K,
221 isinstance(seqlen_offsets, torch.Tensor),
222 is_varlen,
223 interleaved,
224 conjugate,
225 BLOCK_M,
226 )
227 return output
File ~/dev/.venv/lib/python3.12/site-packages/triton/runtime/jit.py:345, in KernelInterface.__getitem__.<locals>.<lambda>(*args, **kwargs)
339 def __getitem__(self, grid) -> T:
340 """
341 A JIT function is launched with: fn[grid](*args, **kwargs).
342 Hence JITFunction.__getitem__ returns a callable proxy that
343 memorizes the grid.
344 """
--> 345 return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File ~/dev/.venv/lib/python3.12/site-packages/triton/runtime/jit.py:691, in JITFunction.run(self, grid, warmup, *args, **kwargs)
689 # launch kernel
690 launch_metadata = kernel.launch_metadata(grid, stream, *non_constexpr_vals)
--> 691 kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
692 self.CompiledKernel.launch_enter_hook, self.CompiledKernel.launch_exit_hook, *non_constexpr_vals)
693 return kernel
File ~/dev/.venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.py:365, in CudaLauncher.__call__(self, *args, **kwargs)
364 def __call__(self, *args, **kwargs):
--> 365 self.launch(*args, **kwargs)
ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
I have the same problem.
Apologies, please load with `reference_compile=False`, either in `.from_pretrained("...", reference_compile=False)` or in `pipeline("fill-mask", ..., config_kwargs={"reference_compile": False})`, to disable compilation, which is incompatible with CPUs.
- Tom Aarsen
Thanks for the prompt response. I made the suggested changes and still encounter the same error:
File ".../miniconda3/envs/modern-bert/lib/python3.11/site-packages/triton/runtime/jit.py", line 691, in run
kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
File ".../miniconda3/envs/modern-bert/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 365, in __call__
self.launch(*args, **kwargs)
ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
I am loading the model in the following way:
model = AutoModelForSequenceClassification.from_pretrained(
MODEL_DIR,
reference_compile=False
)
Oh, that is odd! A potential solution is uninstalling `triton`. If `reference_compile` is not set manually (even though you're setting it manually), the compilation is enabled if `triton` is installed. So, uninstalling it might help.
pip uninstall triton
- Tom Aarsen
Thanks for the response. Actually, I resolved the issue:
- `reference_compile=False`
- created a new environment and did not install `flash_attn`.
Works just fine. I did not try uninstalling `triton`.