t5-spellchecker
A model for automatic correction of spelling and grammatical errors in questions involving marketplace-domain terminology, additionally adapted to handle numeric data.
How to use
import torch
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'wyluilipe/t5-spellchecker'
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
model.eval()

# Misspelled query ("order pickup point" with typos)
prompt = 'пунктут выздачи зазкзыов'
encoded = tokenizer.encode(prompt, return_tensors='pt')

output_sequences = model.generate(
    input_ids=encoded.to(device),
    max_length=128,
    temperature=0.1,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=1,
    pad_token_id=0,
)

decoded = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
print(decoded[0])
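For several queries at once, the tokenizer can pad a batch and the same `generate` call handles it. The sketch below is illustrative: the extra query and the switch to greedy decoding are assumptions, not part of the original snippet.

```python
# Hedged sketch: batch correction with the model and tokenizer loaded above.
# The queries are only illustrative noisy marketplace questions.
queries = ['пунктут выздачи зазкзыов', 'как офрмить возрват товара']
batch = tokenizer(queries, return_tensors='pt', padding=True).to(device)

with torch.no_grad():
    output_sequences = model.generate(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        max_length=128,
        do_sample=False,  # greedy decoding gives deterministic corrections
        pad_token_id=0,
    )

for query, corrected in zip(queries, tokenizer.batch_decode(output_sequences, skip_special_tokens=True)):
    print(query, '->', corrected)
```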
Training procedure
import torch
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

dataset_path = 'SpellChecker/trash_large.csv'  # Change this for your personal dataset
model_name = 'wyluilipe/t5-spellchecker'
use_hf = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length": 128
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

# dataset = load_dataset(dataset_path)
df = pd.read_csv(dataset_path)  # CSV with 'source' and 'correction' columns
dataset = Dataset.from_pandas(df)

tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
model.eval()
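The `tokenize_function` defined below expects the dataset to expose a `source` column (noisy text) and a `correction` column (target text). If you do not have `trash_large.csv`, a tiny stand-in with the same schema looks like this (the rows are purely illustrative, not taken from the real data):

```python
import pandas as pd
from datasets import Dataset

# Illustrative stand-in for the CSV: one noisy query per row plus its correction.
df = pd.DataFrame({
    'source': ['пунктут выздачи зазкзыов', 'сколко стоит доствка'],
    'correction': ['пункт выдачи заказов', 'сколько стоит доставка'],
})
dataset = Dataset.from_pandas(df)
```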
Dataset tokenization functions
def tokenize_text(text):
    tokenizer.pad_token = tokenizer.eos_token
    # First pass: measure the longest example in the batch
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )
    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        128
    )
    # Second pass: pad/truncate everything to that length (capped at 128 tokens)
    tokenizer.truncation_side = "right"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding='max_length',
        truncation=True,
        max_length=max_length
    )
    return tokenized_inputs


def tokenize_function(examples):
    text1 = examples['source']
    text2 = examples['correction']
    tokenized_inputs = tokenize_text(text1)
    labels = tokenize_text(text2)['input_ids']
    tokenized_inputs['labels'] = labels
    return tokenized_inputs
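A quick sanity check of these helpers on a single illustrative row (not from the real dataset) shows the fields the Trainer will consume:

```python
# Single illustrative row; the real data comes from the CSV above.
example = {'source': ['пунктут выздачи зазкзыов'], 'correction': ['пункт выдачи заказов']}
tokenized = tokenize_function(example)
print(sorted(tokenized.keys()))                           # typically: attention_mask, input_ids, labels
print(tokenized['input_ids'].shape, tokenized['labels'].shape)
```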
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = Dataset.from_pandas(dataset['train'].to_pandas()[['source', 'correction']])
test_dataset = Dataset.from_pandas(dataset['test'].to_pandas()[['source', 'correction']])
# Alternative preprocessing helper that prefixes inputs with <phrase>/<answer> markers.
# It is kept for reference and is not used in the training flow below.
def convert_dataset(dataset):
    input_column = 'source'
    output_column = 'correction'
    dataset_phrase = Dataset.from_pandas(
        pd.DataFrame(dataset.to_pandas()[input_column])
    )
    dataset_paraphrase = Dataset.from_pandas(
        pd.DataFrame(dataset.to_pandas()[output_column])
    )
    tokenized_dataset_phrase = dataset_phrase.map(
        tokenize_function,
        batched=True,
        batch_size=1,
        drop_last_batch=True
    )
    tokenized_dataset_paraphrase = dataset_paraphrase.map(
        tokenize_function,
        batched=True,
        batch_size=1,
        drop_last_batch=True
    )
    tokenized_dataset_paraphrase = tokenized_dataset_paraphrase.to_pandas()
    dataset = tokenized_dataset_phrase.to_pandas()
    dataset[input_column] = '<phrase>:' + dataset[input_column] + '\n<answer>:'
    dataset[output_column] = tokenized_dataset_paraphrase[output_column]
    dataset['labels'] = tokenized_dataset_paraphrase['input_ids']
    dataset = dataset[[input_column, output_column, 'input_ids', 'token_type_ids', 'attention_mask', 'labels']]
    return Dataset.from_pandas(dataset)
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
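Note that the labels produced by `tokenize_text` are padded with the regular pad token, so the loss is also computed on padding positions. An optional refinement, not part of the original script, is to mask everything after the first EOS token with -100, which the Hugging Face loss ignores. A hedged sketch (the helper name `mask_label_padding` is introduced here for illustration):

```python
def mask_label_padding(example):
    # Keep label tokens up to and including the first EOS; mask the trailing padding with -100.
    labels = example['labels']
    cut = labels.index(tokenizer.eos_token_id) + 1 if tokenizer.eos_token_id in labels else len(labels)
    example['labels'] = labels[:cut] + [-100] * (len(labels) - cut)
    return example

# Optional: apply to the tokenized splits before training.
# train_dataset = train_dataset.map(mask_label_padding)
# test_dataset = test_dataset.map(mask_label_padding)
```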
max_steps = 100
trained_model_name = "wyluilipe/t5-spellchecker"
output_dir = trained_model_name
training_args = TrainingArguments(
    learning_rate=2.0e-5,
    num_train_epochs=3,
    # Max steps to train for (each step is a batch of data)
    # Overrides num_train_epochs, if not -1
    max_steps=max_steps,
    # Batch size for training
    per_device_train_batch_size=1,
    # Directory to save model checkpoints
    output_dir=output_dir,
    # Other arguments
    overwrite_output_dir=False,  # Do not overwrite the content of the output directory
    disable_tqdm=False,  # Keep progress bars enabled
    eval_steps=120,  # Number of update steps between two evaluations
    save_steps=120,  # After this many steps the model is saved
    warmup_steps=1,  # Number of warmup steps for the learning rate scheduler
    per_device_eval_batch_size=1,  # Batch size for evaluation
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,
    optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    # Parameters for early stopping
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)
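With per_device_train_batch_size=1 and gradient_accumulation_steps=4, each optimizer step effectively processes 1 × 4 = 4 examples (the total_train_batch_size of 4 listed in the hyperparameters below), so the 100 training steps cover on the order of 400 training examples on a single device.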
model_flops = (
    model.floating_point_ops(
        {
            "input_ids": torch.zeros(
                (1, training_config["model"]["max_length"])
            )
        }
    )
    * training_args.gradient_accumulation_steps
)
print(model)
print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
training_output = trainer.train()
trainer.push_to_hub('wyluilipe/t5-spellchecker')
tokenizer.push_to_hub('wyluilipe/t5-spellchecker')
Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 2e-05
- train_batch_size: 1
- eval_batch_size: 1
- seed: 42
- gradient_accumulation_steps: 4
- total_train_batch_size: 4
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 1
- training_steps: 100
Training results
Framework versions
- Transformers 4.33.1
- Pytorch 2.0.1+cu117
- Datasets 2.14.6
- Tokenizers 0.13.3