(From Examples) Cohere Model is Non-functional: Chat Templates are not present, cublasLt errors

#10 opened by blevlabs
# pip install transformers bitsandbytes accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "CohereForAI/c4ai-command-r-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, load_in_8bit=True)

# Format message with the command-r chat template
messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
print(input_ids)

Output:

tensor([[     5, 255000, 255006,  28339,     19,   2991,   1955,   1933,     38,
         255001, 255000, 255007]], device='cuda:0')
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.3,
    max_new_tokens=100,
)

gen_text = tokenizer.decode(gen_tokens[0])
print(gen_text)

Output:

A: torch.Size([12, 8192]), B: torch.Size([8192, 8192]), C: (12, 8192); (lda, ldb, ldc): (c_int(384), c_int(262144), c_int(384)); (m, n, k): (c_int(12), c_int(8192), c_int(8192))
cuBLAS API failed with status 15
error detected

---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
/tmp/ipykernel_4623/3393745038.py in <module>
----> 1 gen_tokens = model.generate(
      2 input_ids,
      3 do_sample=True,
      4 temperature=0.3,
      5 max_new_tokens=100,

/usr/lib/python3/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
    113     def decorate_context(*args, **kwargs):
    114         with ctx_factory():
--> 115             return func(*args, **kwargs)
    116 
    117     return decorate_context

~/.local/lib/python3.10/site-packages/transformers/generation/utils.py in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   1568 
   1569             # 13. run sample
-> 1570             result = self._sample(
   1571                 input_ids,
   1572                 logits_processor=prepared_logits_processor,

~/.local/lib/python3.10/site-packages/transformers/generation/utils.py in _sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, output_logits, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2703 
   2704             # forward pass to get next token
-> 2705             outputs = self(
   2706                 **model_inputs,
   2707                 return_dict=True,

/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
    164                 output = module._old_forward(*args, **kwargs)
    165         else:
--> 166             output = module._old_forward(*args, **kwargs)
    167         return module._hf_hook.post_forward(module, output)
    168 

~/.cache/huggingface/modules/transformers_modules/CohereForAI/c4ai-command-r-v01/38316f324500931c1b47aa45027aafd0d7fdeadc/modeling_cohere.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
   1142 
   1143         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1144         outputs = self.model(
   1145             input_ids=input_ids,
   1146             attention_mask=attention_mask,

/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
    164                 output = module._old_forward(*args, **kwargs)
    165         else:
--> 166             output = module._old_forward(*args, **kwargs)
    167         return module._hf_hook.post_forward(module, output)
    168 

~/.cache/huggingface/modules/transformers_modules/CohereForAI/c4ai-command-r-v01/38316f324500931c1b47aa45027aafd0d7fdeadc/modeling_cohere.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
    976                 )
    977             else:
--> 978                 layer_outputs = decoder_layer(
    979                     hidden_states,
    980                     attention_mask=causal_mask,

/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
    164                 output = module._old_forward(*args, **kwargs)
    165         else:
--> 166             output = module._old_forward(*args, **kwargs)
    167         return module._hf_hook.post_forward(module, output)
    168 

~/.cache/huggingface/modules/transformers_modules/CohereForAI/c4ai-command-r-v01/38316f324500931c1b47aa45027aafd0d7fdeadc/modeling_cohere.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, **kwargs)
    692 
    693         # Self Attention
--> 694         hidden_states_attention, self_attn_weights, present_key_value = self.self_attn(
    695             hidden_states=hidden_states,
    696             attention_mask=attention_mask,

/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
    164                 output = module._old_forward(*args, **kwargs)
    165         else:
--> 166             output = module._old_forward(*args, **kwargs)
    167         return module._hf_hook.post_forward(module, output)
    168 

~/.cache/huggingface/modules/transformers_modules/CohereForAI/c4ai-command-r-v01/38316f324500931c1b47aa45027aafd0d7fdeadc/modeling_cohere.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, **kwargs)
    303 
    304         else:
--> 305             query_states = self.q_proj(hidden_states)
    306             key_states = self.k_proj(hidden_states)
    307             value_states = self.v_proj(hidden_states)

/usr/lib/python3/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.10/site-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
    164                 output = module._old_forward(*args, **kwargs)
    165         else:
--> 166             output = module._old_forward(*args, **kwargs)
    167         return module._hf_hook.post_forward(module, output)
    168 

~/.local/lib/python3.10/site-packages/bitsandbytes/nn/modules.py in forward(self, x)
    685             self.bias.data = self.bias.data.to(x.dtype)
    686 
--> 687         out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
    688 
    689         if not self.state.has_fp16_weights:

~/.local/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py in matmul(A, B, out, state, threshold, bias)
    560     if threshold > 0.0:
    561         state.threshold = threshold
--> 562     return MatMul8bitLt.apply(A, B, out, bias, state)
    563 
    564 

/usr/lib/python3/dist-packages/torch/autograd/function.py in apply(cls, *args, **kwargs)
    504             # See NOTE: [functorch vjp and autograd interaction]
    505             args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506             return super().apply(*args, **kwargs)  # type: ignore[misc]
    507 
    508         if cls.setup_context == _SingleLevelFunction.setup_context:

~/.local/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py in forward(ctx, A, B, out, bias, state)
    399         if using_igemmlt:
    400             C32A, SA = F.transform(CA, "col32")
--> 401             out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
    402             if bias is None or bias.dtype == torch.float16:
    403                 # we apply the fused bias here

~/.local/lib/python3.10/site-packages/bitsandbytes/functional.py in igemmlt(A, B, SA, SB, out, Sout, dtype)
   1965     if has_error:
   1966         print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
-> 1967         raise Exception('cublasLt ran into an error!')
   1968 
   1969     torch.cuda.set_device(prev_device)

Exception: cublasLt ran into an error!

Any ideas on how to resolve this? The model cannot even be loaded in full precision on the H100 80GB it is running on, so I tried 8-bit, but I still cannot get the model to work properly.
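For reference, here is the same 8-bit load written with an explicit BitsAndBytesConfig instead of the load_in_8bit shorthand (to my understanding they are equivalent; I have not verified whether it changes the cublasLt behaviour):

# pip install -U transformers bitsandbytes accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "CohereForAI/c4ai-command-r-v01"

# Explicit 8-bit quantization config (same effect as load_in_8bit=True)
quant_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=quant_config,
    device_map="auto",
)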

saurabhdash (Cohere For AI org)

Are you able to load the fp16 model?
import torch
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
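If a single GPU runs out of memory with .to("cuda"), a device_map="auto" load that lets accelerate place the weights may also be worth trying (untested sketch, assuming enough combined GPU/CPU memory is available):

import torch
from transformers import AutoModelForCausalLM

model_id = "CohereForAI/c4ai-command-r-v01"

# fp16 load; accelerate decides where each layer goes
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)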

@saurabhdash Thanks, I will have to try this when I can get access to a machine that can run this model.

ahmetustun changed discussion status to closed
