RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'

#3
by pranavnerurkar - opened

I'm running on a Mac with no GPU and getting the error below.

I added llm_int8_enable_fp32_cpu_offload=True to my quantization_config.
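
For reference, the quantization config is built roughly like this (a sketch; the 4-bit options are assumed from the bitsandbytes docs, not copied verbatim from my notebook):

import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # quantize linear layers to 4-bit with bitsandbytes
    bnb_4bit_compute_dtype=torch.float16,   # compute dtype for the quantized layers (assumed)
    llm_int8_enable_fp32_cpu_offload=True,  # allow fp32 offload for modules mapped to "cpu"
)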

I used a custom device map:

device_maps = {
    "transformer.word_embeddings": "cpu",
    "transformer.word_embeddings_layernorm": "cpu",
    "lm_head": "cpu",
    "transformer.h": "cpu",
    "transformer.ln_f": "cpu",
}

I downloaded your entire model to my laptop:

model_id = "/Users/falcon-7b-instruct-sharded"

model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_maps,
    quantization_config=quantization_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
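
The pipeline itself is created in the usual way (a sketch, since the construction isn't shown above; I'm assuming the setup from the model card):

import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
)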

The error is raised by the line below:

sequences = pipeline(
    "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:")


RuntimeError Traceback (most recent call last)
Cell In[23], line 1
----> 1 sequences = pipeline(
2 "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:")

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/transformers/pipelines/text_generation.py:200, in TextGenerationPipeline.__call__(self, text_inputs, **kwargs)
159 def __call__(self, text_inputs, **kwargs):
160 """
161 Complete the prompt(s) given as inputs.
162
(...)
198 ids of the generated text.
199 """
--> 200 return super().__call__(text_inputs, **kwargs)

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/transformers/pipelines/base.py:1122, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1114 return next(
1115 iter(
1116 self.get_iterator(
(...)
1119 )
1120 )
1121 else:
-> 1122 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/transformers/pipelines/base.py:1129, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1127 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1128 model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1129 model_outputs = self.forward(model_inputs, **forward_params)
1130 outputs = self.postprocess(model_outputs, **postprocess_params)
1131 return outputs

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/transformers/pipelines/base.py:1028, in Pipeline.forward(self, model_inputs, **forward_params)
1026 with inference_context():
1027 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1028 model_outputs = self._forward(model_inputs, **forward_params)
1029 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
1030 else:

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/transformers/pipelines/text_generation.py:261, in TextGenerationPipeline._forward(self, model_inputs, **generate_kwargs)
258 generate_kwargs["min_length"] += prefix_length
260 # BS x SL
--> 261 generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
262 out_b = generated_sequence.shape[0]
263 if self.framework == "pt":

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/transformers/generation/utils.py:1588, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
1580 input_ids, model_kwargs = self._expand_inputs_for_generation(
1581 input_ids=input_ids,
1582 expand_size=generation_config.num_return_sequences,
1583 is_encoder_decoder=self.config.is_encoder_decoder,
1584 **model_kwargs,
1585 )
1587 # 13. run sample
-> 1588 return self.sample(
1589 input_ids,
1590 logits_processor=logits_processor,
1591 logits_warper=logits_warper,
1592 stopping_criteria=stopping_criteria,
1593 pad_token_id=generation_config.pad_token_id,
1594 eos_token_id=generation_config.eos_token_id,
1595 output_scores=generation_config.output_scores,
1596 return_dict_in_generate=generation_config.return_dict_in_generate,
1597 synced_gpus=synced_gpus,
1598 streamer=streamer,
1599 **model_kwargs,
1600 )
1602 elif is_beam_gen_mode:
1603 if generation_config.num_return_sequences > generation_config.num_beams:

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/transformers/generation/utils.py:2642, in GenerationMixin.sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2639 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2641 # forward pass to get next token
-> 2642 outputs = self(
2643 **model_inputs,
2644 return_dict=True,
2645 output_attentions=output_attentions,
2646 output_hidden_states=output_hidden_states,
2647 )
2649 if synced_gpus and this_peer_finished:
2650 continue # don't waste resources running the code we don't need

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b-instruct/c7f670a03d987254220f343c6b026ea0c5147185/modelling_RW.py:753, in RWForCausalLM.forward(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments)
749 raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
751 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--> 753 transformer_outputs = self.transformer(
754 input_ids,
755 past_key_values=past_key_values,
756 attention_mask=attention_mask,
757 head_mask=head_mask,
758 inputs_embeds=inputs_embeds,
759 use_cache=use_cache,
760 output_attentions=output_attentions,
761 output_hidden_states=output_hidden_states,
762 return_dict=return_dict,
763 )
764 hidden_states = transformer_outputs[0]
766 lm_logits = self.lm_head(hidden_states)

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b-instruct/c7f670a03d987254220f343c6b026ea0c5147185/modelling_RW.py:648, in RWModel.forward(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments)
640 outputs = torch.utils.checkpoint.checkpoint(
641 create_custom_forward(block),
642 hidden_states,
(...)
645 head_mask[i],
646 )
647 else:
--> 648 outputs = block(
649 hidden_states,
650 layer_past=layer_past,
651 attention_mask=causal_mask,
652 head_mask=head_mask[i],
653 use_cache=use_cache,
654 output_attentions=output_attentions,
655 alibi=alibi,
656 )
658 hidden_states = outputs[0]
659 if use_cache is True:

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b-instruct/c7f670a03d987254220f343c6b026ea0c5147185/modelling_RW.py:381, in DecoderLayer.forward(self, hidden_states, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions)
370 def forward(
371 self,
372 hidden_states: torch.Tensor,
(...)
378 output_attentions: bool = False,
379 ):
--> 381 layernorm_output = self.input_layernorm(hidden_states)
382 residual = hidden_states
384 # Self attention.

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/nn/modules/normalization.py:190, in LayerNorm.forward(self, input)
189 def forward(self, input: Tensor) -> Tensor:
--> 190 return F.layer_norm(
191 input, self.normalized_shape, self.weight, self.bias, self.eps)

File ~/opt/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/nn/functional.py:2515, in layer_norm(input, normalized_shape, weight, bias, eps)
2511 if has_torch_function_variadic(input, weight, bias):
2512 return handle_torch_function(
2513 layer_norm, (input, weight, bias), input, normalized_shape, weight=weight, bias=bias, eps=eps
2514 )
-> 2515 return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)

RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'

Your problem is that the CPU does not support bitsandbytes (bnb) quantization. The tutorial teaches you to take some layers and offload them to the CPU, but it will not keep those weights quantized.

You should remove quantization and use device_map="auto" (Accelerate will auto-detect the backend, which in your case means the CPU).

https://huggingface.co/docs/transformers/main_classes/quantization
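
A minimal sketch of that suggestion (assuming the same local path; torch_dtype=torch.float32 and the pipeline call are additions to keep everything on CPU-supported dtypes, since the float16 LayerNorm is what raised the 'Half' error):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "/Users/falcon-7b-instruct-sharded"

# No BitsAndBytesConfig here: bnb int8/int4 kernels require CUDA, so they cannot
# run on a CPU-only Mac. Load the weights in float32 instead.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",          # let Accelerate place the model (CPU in this case)
    torch_dtype=torch.float32,  # CPU LayerNorm is not implemented for float16
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("Daniel: Hello, Girafatron!\nGirafatron:", max_new_tokens=50)[0]["generated_text"])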

vilsonrodrigues changed discussion status to closed
