How can this be trained using the Trainer?

#1
by Kabumbus - opened

Getting the following error:

-> 2770 outputs = model(**inputs)
   2771 # Save past state if it exists
   2772 # TODO: this needs to be fixed and made cleaner later.
   2773 if self.args.past_index >= 0:

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:680, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
    679 def forward(*args, **kwargs):
--> 680     return model_forward(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:668, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
    667 def __call__(self, *args, **kwargs):
--> 668     return convert_to_fp32(self.model_forward(*args, **kwargs))

File /usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py:16, in autocast_decorator.<locals>.decorate_autocast(*args, **kwargs)
     13 @functools.wraps(func)
     14 def decorate_autocast(*args, **kwargs):
     15     with autocast_instance:
---> 16         return func(*args, **kwargs)

TypeError: MambaForCausalLM.forward() got an unexpected keyword argument 'attention_mask'

On code like:

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

model_name = "Q-bert/Mamba-130M"  # "euclaise/falcon_1b_stage2", "mosaicml/mpt-7b-storywriter"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

#...

training_args = TrainingArguments(
    output_dir="./results",
    max_steps=100000,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    fp16=True,
    learning_rate=1e-5,
    evaluation_strategy="steps",
    eval_steps=10000,  # run evaluation every 10,000 steps
    save_steps=10000,
    neftune_noise_alpha=5,
)

trainer = Trainer( 
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_data["train"],
    eval_dataset=tokenized_val_data["validation"],
    args=training_args
)

trainer.train()

Also fails on SFTTrainer.
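
For context, the TypeError comes from an attention_mask keyword reaching MambaForCausalLM.forward(), which this remote-code implementation does not accept; depending on the tokenization and data collator, the mask can come from the dataset columns or from padding. Below is a minimal workaround sketch (the collator name is hypothetical): keep everything except input_ids away from the model. Note that this only removes the TypeError; the stock Trainer still has no loss to work with, which is what the custom trainer further down handles.

import torch

# Hypothetical collator: the model sees nothing but input_ids, so no
# attention_mask ever reaches forward(). Assumes the tokenized examples
# already share a fixed length.
def mamba_collator(features):
    input_ids = torch.tensor([f["input_ids"] for f in features], dtype=torch.long)
    return {"input_ids": input_ids}

It would be passed via Trainer(..., data_collator=mamba_collator).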

Yes, I'm working on this. The stock Trainer does not support this model, so I'm writing a new trainer class to solve it; with that class this error won't happen.
But there is still an error in my class that I'm trying to solve right now. I'm having some problems with the loss.

from transformers import Trainer, TrainingArguments
import torch


class MambaTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # This Mamba forward() does not accept attention_mask, so only
        # pass input_ids and compute the causal LM loss manually.
        input_ids = inputs.pop("input_ids")
        outputs = model(input_ids)
        lm_logits = outputs[0]

        # Standard next-token prediction: shift logits and labels by one.
        labels = input_ids.to(lm_logits.device)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        labels = labels[:, 1:].contiguous()

        loss_fct = torch.nn.CrossEntropyLoss()
        lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1))

        return (lm_loss, outputs) if return_outputs else lm_loss

You can use this trainer, but fp16 must be False, otherwise the loss will be NaN or 0.

I will solve the fp16 problem as soon as possible. I'm still investigating why; my guess is that the norm layers are the problem, like in the old T5 models.
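
A minimal usage sketch, wiring MambaTrainer into the setup from the first snippet (model, tokenizer, and the tokenized datasets are assumed from there), with fp16 left off per the note above:

training_args = TrainingArguments(
    output_dir="./results",
    max_steps=100000,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=10000,
    save_steps=10000,
    learning_rate=1e-5,
    fp16=False,  # fp16=True currently gives NaN/0 loss with this trainer
)

trainer = MambaTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train_data["train"],
    eval_dataset=tokenized_val_data["validation"],
)

trainer.train()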

Thank you! That is great news!=)

Tried it out... and oh boy, it is slooooow to fine-tune on a 4090: about 200 h,
vs. 2 h for a similarly sized GPT-2, and even 8 h for falcon-1b...

Owner

Tried it out... and oh boy, it is slooooow to fine-tune on a 4090: about 200 h,
vs. 2 h for a similarly sized GPT-2, and even 8 h for falcon-1b...

I answered your question here: https://huggingface.co/Q-bert/Mamba-370M/discussions/1

I've found that fine-tuning the Mamba model using the approach you recommended works really well, but the only downside is that the training speed is too slow. It would be brilliant if you could speed it up in an update. Thannnnnnks ~♥!
