t5-spellchecker
A model for automatic correction of spelling and grammatical errors in questions involving marketplace-domain terminology, additionally adapted to handle numeric data.
How to use
import torch
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'wyluilipe/t5-spellchecker'
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
model.eval()

# Misspelled query ("order pickup point" with typos)
prompt = 'пунктут выздачи зазкзыов'
encoded = tokenizer.encode(prompt, return_tensors='pt')

output_sequences = model.generate(
    input_ids=encoded.to(device),
    max_length=128,
    temperature=0.1,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=1,
    pad_token_id=0,
)

decoded = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
print(decoded[0])
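For several queries at once, the tokenizer can pad a batch and the same `generate` call handles it. The sketch below is illustrative: the extra query and the switch to greedy decoding are assumptions, not part of the original snippet.

```python
# Hedged sketch: batch correction with the model and tokenizer loaded above.
# The queries are only illustrative noisy marketplace questions.
queries = ['пунктут выздачи зазкзыов', 'как офрмить возрват товара']
batch = tokenizer(queries, return_tensors='pt', padding=True).to(device)

with torch.no_grad():
    output_sequences = model.generate(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        max_length=128,
        do_sample=False,  # greedy decoding gives deterministic corrections
        pad_token_id=0,
    )

for query, corrected in zip(queries, tokenizer.batch_decode(output_sequences, skip_special_tokens=True)):
    print(query, '->', corrected)
```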
Training procedure
import torch
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

dataset_path = 'SpellChecker/trash_large.csv'  # Change this for your personal dataset
model_name = 'wyluilipe/t5-spellchecker'
use_hf = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length": 128
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

# dataset = load_dataset(dataset_path)
df = pd.read_csv(dataset_path)  # CSV with 'source' and 'correction' columns
dataset = Dataset.from_pandas(df)

tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
model.eval()
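The `tokenize_function` defined below expects the dataset to expose a `source` column (noisy text) and a `correction` column (target text). If you do not have `trash_large.csv`, a tiny stand-in with the same schema looks like this (the rows are purely illustrative, not taken from the real data):

```python
import pandas as pd
from datasets import Dataset

# Illustrative stand-in for the CSV: one noisy query per row plus its correction.
df = pd.DataFrame({
    'source': ['пунктут выздачи зазкзыов', 'сколко стоит доствка'],
    'correction': ['пункт выдачи заказов', 'сколько стоит доставка'],
})
dataset = Dataset.from_pandas(df)
```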
Dataset tokenization functions
def tokenize_text(text):
    tokenizer.pad_token = tokenizer.eos_token
    # First pass: measure the longest example in the batch
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )
    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        128
    )
    # Second pass: pad/truncate everything to that length (capped at 128 tokens)
    tokenizer.truncation_side = "right"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding='max_length',
        truncation=True,
        max_length=max_length
    )
    return tokenized_inputs


def tokenize_function(examples):
    text1 = examples['source']
    text2 = examples['correction']
    tokenized_inputs = tokenize_text(text1)
    labels = tokenize_text(text2)['input_ids']
    tokenized_inputs['labels'] = labels
    return tokenized_inputs
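A quick sanity check of these helpers on a single illustrative row (not from the real dataset) shows the fields the Trainer will consume:

```python
# Single illustrative row; the real data comes from the CSV above.
example = {'source': ['пунктут выздачи зазкзыов'], 'correction': ['пункт выдачи заказов']}
tokenized = tokenize_function(example)
print(sorted(tokenized.keys()))                           # typically: attention_mask, input_ids, labels
print(tokenized['input_ids'].shape, tokenized['labels'].shape)
```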
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = Dataset.from_pandas(dataset['train'].to_pandas()[['source', 'correction']])
test_dataset = Dataset.from_pandas(dataset['test'].to_pandas()[['source', 'correction']])
# Alternative preprocessing helper that prefixes inputs with <phrase>/<answer> markers.
# It is kept for reference and is not used in the training flow below.
def convert_dataset(dataset):
    input_column = 'source'
    output_column = 'correction'
    dataset_phrase = Dataset.from_pandas(
        pd.DataFrame(dataset.to_pandas()[input_column])
    )
    dataset_paraphrase = Dataset.from_pandas(
        pd.DataFrame(dataset.to_pandas()[output_column])
    )
    tokenized_dataset_phrase = dataset_phrase.map(
        tokenize_function,
        batched=True,
        batch_size=1,
        drop_last_batch=True
    )
    tokenized_dataset_paraphrase = dataset_paraphrase.map(
        tokenize_function,
        batched=True,
        batch_size=1,
        drop_last_batch=True
    )
    tokenized_dataset_paraphrase = tokenized_dataset_paraphrase.to_pandas()
    dataset = tokenized_dataset_phrase.to_pandas()
    dataset[input_column] = '<phrase>:' + dataset[input_column] + '\n<answer>:'
    dataset[output_column] = tokenized_dataset_paraphrase[output_column]
    dataset['labels'] = tokenized_dataset_paraphrase['input_ids']
    dataset = dataset[[input_column, output_column, 'input_ids', 'token_type_ids', 'attention_mask', 'labels']]
    return Dataset.from_pandas(dataset)
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
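Note that the labels produced by `tokenize_text` are padded with the regular pad token, so the loss is also computed on padding positions. An optional refinement, not part of the original script, is to mask everything after the first EOS token with -100, which the Hugging Face loss ignores. A hedged sketch (the helper name `mask_label_padding` is introduced here for illustration):

```python
def mask_label_padding(example):
    # Keep label tokens up to and including the first EOS; mask the trailing padding with -100.
    labels = example['labels']
    cut = labels.index(tokenizer.eos_token_id) + 1 if tokenizer.eos_token_id in labels else len(labels)
    example['labels'] = labels[:cut] + [-100] * (len(labels) - cut)
    return example

# Optional: apply to the tokenized splits before training.
# train_dataset = train_dataset.map(mask_label_padding)
# test_dataset = test_dataset.map(mask_label_padding)
```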
max_steps = 100
trained_model_name = "wyluilipe/t5-spellchecker"
output_dir = trained_model_name
training_args = TrainingArguments(
    learning_rate=2.0e-5,
    num_train_epochs=3,
    # Max steps to train for (each step is a batch of data)
    # Overrides num_train_epochs, if not -1
    max_steps=max_steps,
    # Batch size for training
    per_device_train_batch_size=1,
    # Directory to save model checkpoints
    output_dir=output_dir,
    # Other arguments
    overwrite_output_dir=False,  # Do not overwrite the content of the output directory
    disable_tqdm=False,  # Keep progress bars enabled
    eval_steps=120,  # Number of update steps between two evaluations
    save_steps=120,  # After this many steps the model is saved
    warmup_steps=1,  # Number of warmup steps for the learning rate scheduler
    per_device_eval_batch_size=1,  # Batch size for evaluation
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,
    optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    # Parameters for early stopping
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)
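With per_device_train_batch_size=1 and gradient_accumulation_steps=4, each optimizer step effectively processes 1 × 4 = 4 examples (the total_train_batch_size of 4 listed in the hyperparameters below), so the 100 training steps cover on the order of 400 training examples on a single device.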
model_flops = (
    model.floating_point_ops(
        {
            "input_ids": torch.zeros(
                (1, training_config["model"]["max_length"])
            )
        }
    )
    * training_args.gradient_accumulation_steps
)
print(model)
print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
training_output = trainer.train()
trainer.push_to_hub('wyluilipe/t5-spellchecker')
tokenizer.push_to_hub('wyluilipe/t5-spellchecker')
Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 2e-05
- train_batch_size: 1
- eval_batch_size: 1
- seed: 42
- gradient_accumulation_steps: 4
- total_train_batch_size: 4
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 1
- training_steps: 100
Training results
Framework versions
- Transformers 4.33.1
- Pytorch 2.0.1+cu117
- Datasets 2.14.6
- Tokenizers 0.13.3