expected sequence of length 706 at dim 1 (got 220)

#103
by Buggod - opened

It's been bugging me for days: training can't start because of an "expected sequence of length" error.

I'm using a personal dataset, and this is what my code looks like (modified from samsum_dataset):
"

import datasets
from datasets import load_dataset

def get_preprocessed_ultimate_csv(dataset_path, tokenizer, split):
    # Load your custom dataset from a CSV file
    dataset = load_dataset("csv", data_files=dataset_path, split=split)

    prompt = (
        f"Answer this query:\n{{query}}\n---\nResponse:\n"
    )

    def apply_prompt_template(sample):
        return {
            "prompt": prompt.format(query=sample["Query"]),
            "summary": sample["Answer"],  # Using 'summary' to keep consistent with the tokenize function
        }

    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))

    def tokenize_add_label(sample):
        # Encode the prompt and summary using the tokenizer
        prompt = tokenizer.encode(tokenizer.bos_token + sample["prompt"], add_special_tokens=False)
        summary = tokenizer.encode(sample["summary"] + tokenizer.eos_token, add_special_tokens=False)

        sample = {
            "input_ids": prompt + summary,
            "attention_mask": [1] * (len(prompt) + len(summary)),
            "labels": [-100] * len(prompt) + summary,  # Label only the summary part
        }

        return sample

    dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))

    return dataset

# Example of usage

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
dataset_path = "Ultimate_email.csv"

# Assuming you want to process the entire dataset without splitting

processed_dataset = get_preprocessed_ultimate_csv(dataset_path, tokenizer, split='train')

"

I have also set up padding as follows:
"

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id="meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({'pad_token': '<pad>'})  # Add a padding token

model = AutoModelForCausalLM.from_pretrained(model_id)
model.config.pad_token_id = tokenizer.pad_token_id # Set the padding token ID in the model config

# Resize model embeddings to incorporate the new special token

model.resize_token_embeddings(len(tokenizer))

"

Well, I have set up padding to make sure all sequences are the same length.
Is this the underlying problem here?
Is padding everything to the same length necessary to fine-tune Llama 3?
I didn't need to do that to fine-tune Mistral or Gemma.

Please help!
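
In case it helps to show what I mean by padding: one option I'm considering (just a sketch, not verified against the llama-recipes training loop) is to pad per batch with a collator instead of pre-padding the whole dataset, e.g. DataCollatorForSeq2Seq, which pads input_ids/attention_mask with the pad token and labels with -100:

"

from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Pads input_ids/attention_mask up to the longest sequence in each batch,
# and pads labels with -100 so the padded positions are ignored by the loss
collator = DataCollatorForSeq2Seq(tokenizer, padding=True, label_pad_token_id=-100)

processed_dataset = get_preprocessed_ultimate_csv("Ultimate_email.csv", tokenizer, split="train")
loader = DataLoader(processed_dataset, batch_size=4, collate_fn=collator)

batch = next(iter(loader))
print(batch["input_ids"].shape)  # (4, longest_sequence_in_the_batch)

"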

Now I seem to be able to fine-tune the model, but after saving it with

"model.save_pretrained(output_dir)"

I can't load it. Here's the error:

"---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
in
2 from transformers import AutoModelForCausalLM, AutoTokenizer
3 output_dir = "tmp/llama-output"
----> 4 model = AutoModelForCausalLM.from_pretrained(output_dir,ignore_mismatched_sizes=True)

~/.local/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
561 elif type(config) in cls._model_mapping.keys():
562 model_class = _get_model_class(config, cls._model_mapping)
--> 563 return model_class.from_pretrained(
564 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
565 )

~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3740
3741 if _adapter_model_path is not None:
-> 3742 model.load_adapter(
3743 _adapter_model_path,
3744 adapter_name=adapter_name,

~/.local/lib/python3.8/site-packages/transformers/integrations/peft.py in load_adapter(self, peft_model_id, adapter_name, revision, token, device_map, max_memory, offload_folder, offload_index, peft_config, adapter_state_dict, adapter_kwargs)
204
205 # Load state dict
--> 206 incompatible_keys = set_peft_model_state_dict(self, processed_adapter_state_dict, adapter_name)
207
208 if incompatible_keys is not None:

~/.local/lib/python3.8/site-packages/peft/utils/save_and_load.py in set_peft_model_state_dict(model, peft_model_state_dict, adapter_name)
247 raise NotImplementedError
248
--> 249 load_result = model.load_state_dict(peft_model_state_dict, strict=False)
250 if config.is_prompt_learning:
251 model.prompt_encoder[adapter_name].embedding.load_state_dict(

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in load_state_dict(self, state_dict, strict, assign)
2187
2188 if len(error_msgs) > 0:
-> 2189 raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
2190 self.__class__.__name__, "\n\t".join(error_msgs)))
2191 return _IncompatibleKeys(missing_keys, unexpected_keys)

RuntimeError: Error(s) in loading state_dict for LlamaForCausalLM:
size mismatch for model.embed_tokens.weight: copying a param with shape torch.Size([128257, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).
size mismatch for lm_head.weight: copying a param with shape torch.Size([128257, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).
"

I did modify the tokenizer at the beginning, but that shouldn't matter, right?

"import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id="meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({'pad_token': '<pad>'})  # Add a padding token

model = AutoModelForCausalLM.from_pretrained(model_id)
model.config.pad_token_id = tokenizer.pad_token_id # Set the padding token ID in the model config

# Resize model embeddings to incorporate the new special token

model.resize_token_embeddings(len(tokenizer))"
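
Since len(tokenizer) grows by one after adding the pad token, I suspect the saved adapter checkpoint has 128257 rows for embed_tokens/lm_head while the freshly loaded base model still has 128256. A sketch of what I think should line the shapes up (assuming tmp/llama-output contains just the PEFT adapter): resize the base model before attaching the adapter.

"

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B"
output_dir = "tmp/llama-output"

# Rebuild the tokenizer exactly as it was during training (vocab size 128257)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Resize the base model *before* loading the adapter so embed_tokens and
# lm_head match the checkpoint shapes (128257 x 4096)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

model = PeftModel.from_pretrained(model, output_dir)

"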
