(From Examples) Cohere Model is Non-functional: Chat Templates are not present, cublasLt errors
#10 opened by blevlabs
# pip install transformers accelerate bitsandbytes  (accelerate and bitsandbytes are required for load_in_8bit)
from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "CohereForAI/c4ai-command-r-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, load_in_8bit=True)
# Format message with the command-r chat template
messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
print(input_ids)
Output:
tensor([[     5, 255000, 255006,  28339,     19,   2991,   1955,   1933,     38,
         255001, 255000, 255007]], device='cuda:0')
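As a side note on the "Chat Templates are not present" part of the title: whether the tokenizer actually ships a template can be checked directly. A minimal sketch, reusing the tokenizer and messages defined above:

# Does the tokenizer ship a chat template at all? None here would explain
# a missing-template complaint.
print(tokenizer.chat_template is not None)

# Render the template to a string instead of token IDs to eyeball the
# special tokens directly.
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))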
gen_tokens = model.generate(
input_ids,
do_sample=True,
temperature=0.3,
max_new_tokens=100,
)
gen_text = tokenizer.decode(gen_tokens[0])
print(gen_text)
Output:
A: torch.Size([12, 8192]), B: torch.Size([8192, 8192]), C: (12, 8192); (lda, ldb, ldc): (c_int(384), c_int(262144), c_int(384)); (m, n, k): (c_int(12), c_int(8192), c_int(8192))
cuBLAS API failed with status 15
error detected
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
/tmp/ipykernel_4623/3393745038.py in <module>
----> 1 gen_tokens = model.generate(
2 input_ids,
3 do_sample=True,
4 temperature=0.3,
5 max_new_tokens=100,
/usr/lib/python3/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
116
117 return decorate_context
~/.local/lib/python3.10/site-packages/transformers/generation/utils.py in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1568
1569 # 13. run sample
-> 1570 result = self._sample(
1571 input_ids,
1572 logits_processor=prepared_logits_processor,
~/.local/lib/python3.10/site-packages/transformers/generation/utils.py in _sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, output_logits, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2703
2704 # forward pass to get next token
-> 2705 outputs = self(
2706 **model_inputs,
2707 return_dict=True,
/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
164 output = module._old_forward(*args, **kwargs)
165 else:
--> 166 output = module._old_forward(*args, **kwargs)
167 return module._hf_hook.post_forward(module, output)
168
~/.cache/huggingface/modules/transformers_modules/CohereForAI/c4ai-command-r-v01/38316f324500931c1b47aa45027aafd0d7fdeadc/modeling_cohere.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1142
1143 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1144 outputs = self.model(
1145 input_ids=input_ids,
1146 attention_mask=attention_mask,
/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
164 output = module._old_forward(*args, **kwargs)
165 else:
--> 166 output = module._old_forward(*args, **kwargs)
167 return module._hf_hook.post_forward(module, output)
168
~/.cache/huggingface/modules/transformers_modules/CohereForAI/c4ai-command-r-v01/38316f324500931c1b47aa45027aafd0d7fdeadc/modeling_cohere.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
976 )
977 else:
--> 978 layer_outputs = decoder_layer(
979 hidden_states,
980 attention_mask=causal_mask,
/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
164 output = module._old_forward(*args, **kwargs)
165 else:
--> 166 output = module._old_forward(*args, **kwargs)
167 return module._hf_hook.post_forward(module, output)
168
~/.cache/huggingface/modules/transformers_modules/CohereForAI/c4ai-command-r-v01/38316f324500931c1b47aa45027aafd0d7fdeadc/modeling_cohere.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, **kwargs)
692
693 # Self Attention
--> 694 hidden_states_attention, self_attn_weights, present_key_value = self.self_attn(
695 hidden_states=hidden_states,
696 attention_mask=attention_mask,
/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
164 output = module._old_forward(*args, **kwargs)
165 else:
--> 166 output = module._old_forward(*args, **kwargs)
167 return module._hf_hook.post_forward(module, output)
168
~/.cache/huggingface/modules/transformers_modules/CohereForAI/c4ai-command-r-v01/38316f324500931c1b47aa45027aafd0d7fdeadc/modeling_cohere.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, **kwargs)
303
304 else:
--> 305 query_states = self.q_proj(hidden_states)
306 key_states = self.k_proj(hidden_states)
307 value_states = self.v_proj(hidden_states)
/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
164 output = module._old_forward(*args, **kwargs)
165 else:
--> 166 output = module._old_forward(*args, **kwargs)
167 return module._hf_hook.post_forward(module, output)
168
~/.local/lib/python3.10/site-packages/bitsandbytes/nn/modules.py in forward(self, x)
685 self.bias.data = self.bias.data.to(x.dtype)
686
--> 687 out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
688
689 if not self.state.has_fp16_weights:
~/.local/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py in matmul(A, B, out, state, threshold, bias)
560 if threshold > 0.0:
561 state.threshold = threshold
--> 562 return MatMul8bitLt.apply(A, B, out, bias, state)
563
564
/usr/lib/python3/dist-packages/torch/autograd/function.py in apply(cls, *args, **kwargs)
504 # See NOTE: [functorch vjp and autograd interaction]
505 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506 return super().apply(*args, **kwargs) # type: ignore[misc]
507
508 if cls.setup_context == _SingleLevelFunction.setup_context:
~/.local/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py in forward(ctx, A, B, out, bias, state)
399 if using_igemmlt:
400 C32A, SA = F.transform(CA, "col32")
--> 401 out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
402 if bias is None or bias.dtype == torch.float16:
403 # we apply the fused bias here
~/.local/lib/python3.10/site-packages/bitsandbytes/functional.py in igemmlt(A, B, SA, SB, out, Sout, dtype)
1965 if has_error:
1966 print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
-> 1967 raise Exception('cublasLt ran into an error!')
1968
1969 torch.cuda.set_device(prev_device)
Exception: cublasLt ran into an error!
Any ideas on how to resolve this? In full precision this model cannot even be loaded onto the H100 80GB it is running on, so I tried 8-bit, and I still cannot get the model to work properly.
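For context, cuBLAS status 15 is CUBLAS_STATUS_NOT_SUPPORTED, which suggests the bitsandbytes int8 matmul (igemmlt) at the bottom of the traceback is hitting an unsupported configuration on this GPU, so upgrading bitsandbytes may help. As an alternative to load_in_8bit, below is a minimal, untested sketch of 4-bit loading via BitsAndBytesConfig, which avoids that int8 kernel path (assuming recent transformers/bitsandbytes/accelerate versions):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "CohereForAI/c4ai-command-r-v01"

# NF4 4-bit quantization with bf16 compute; this does not go through the
# int8 igemmlt kernel that raised the cublasLt error above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)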
blevlabs changed discussion title from "Cohere Model is Non-functional: Chat Templates are not present, cublasLt errors" to "(From Examples) Cohere Model is Non-functional: Chat Templates are not present, cublasLt errors"
saurabhdash
Are you able to load the fp16 model?

model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
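Spelled out end to end, that fp16 suggestion would look roughly like this (a sketch, assuming the ~70 GB of fp16 weights for the 35B model fit on the 80 GB card; device_map="auto" could be used instead of .to("cuda") to let accelerate place the weights):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "CohereForAI/c4ai-command-r-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Load in fp16 rather than 8-bit, sidestepping bitsandbytes entirely.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to("cuda")

messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to("cuda")
gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.3, max_new_tokens=100)
print(tokenizer.decode(gen_tokens[0]))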
blevlabs
@saurabhdash Thanks, I will have to try this when I can get access to a machine that can run this model.
ahmetustun changed discussion status to closed