---
language:
- ru
tags:
- summarization
license: apache-2.0
inference:
  parameters:
    no_repeat_ngram_size: 4
---

# RuBertTelegramHeadlines

## Model description

An example model for the [Headline generation competition](https://competitions.codalab.org/competitions/29905): it generates headlines for Russian-language texts.

Based on the [RuBERT](http://docs.deeppavlov.ai/en/master/features/models/bert.html) model.

## Intended uses & limitations

#### How to use

```python
from transformers import AutoTokenizer, EncoderDecoderModel

model_name = "IlyaGusev/rubert_telegram_headlines"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, do_basic_tokenize=False, strip_accents=False)
model = EncoderDecoderModel.from_pretrained(model_name)

article_text = "..."

# Tokenize the article, truncating/padding it to 256 tokens.
input_ids = tokenizer(
    [article_text],
    add_special_tokens=True,
    max_length=256,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)["input_ids"]

# Beam search with 3-gram repetition blocking.
output_ids = model.generate(
    input_ids=input_ids,
    max_length=64,
    no_repeat_ngram_size=3,
    num_beams=10,
    top_p=0.95
)[0]

headline = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
print(headline)
```

## Training data

- Dataset: [ru_all_split.tar.gz](https://www.dropbox.com/s/ykqk49a8avlmnaf/ru_all_split.tar.gz)
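The `train()` function in the next section expects `train_records` and `val_records` as lists of dicts with `text` and `title` keys. Below is a minimal loader sketch, assuming the archive unpacks into JSON Lines files with those fields; the file names are illustrative, not part of the original release:

```python
import json


def read_records(path):
    # Assumes one JSON object per line with at least "text" and "title" fields.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# Hypothetical file names; check the unpacked archive for the actual layout.
train_records = read_records("ru_all_train.jsonl")
val_records = read_records("ru_all_val.jsonl")
```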
## Training procedure

```python
import random

import torch
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments, logging


def convert_to_tensors(
    tokenizer,
    text,
    max_text_tokens_count,
    max_title_tokens_count=None,
    title=None
):
    # Tokenize the article text, truncating/padding it to a fixed length.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_text_tokens_count,
        padding="max_length",
        truncation=True
    )
    result = {
        "input_ids": torch.tensor(inputs["input_ids"]),
        "attention_mask": torch.tensor(inputs["attention_mask"]),
    }

    if title is not None:
        # Tokenize the title for the decoder; padded label positions are set to -100
        # so they are ignored by the loss.
        outputs = tokenizer(
            title,
            add_special_tokens=True,
            max_length=max_title_tokens_count,
            padding="max_length",
            truncation=True
        )
        decoder_input_ids = torch.tensor(outputs["input_ids"])
        decoder_attention_mask = torch.tensor(outputs["attention_mask"])
        labels = decoder_input_ids.clone()
        labels[decoder_attention_mask == 0] = -100
        result.update({
            "labels": labels,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask
        })

    return result


class GetTitleDataset(Dataset):
    def __init__(
        self,
        original_records,
        sample_rate,
        tokenizer,
        max_text_tokens_count,
        max_title_tokens_count
    ):
        self.original_records = original_records
        self.sample_rate = sample_rate
        self.tokenizer = tokenizer
        self.max_text_tokens_count = max_text_tokens_count
        self.max_title_tokens_count = max_title_tokens_count
        self.records = []
        # Pre-tokenize a random subsample of the records (sample_rate controls its size).
        for record in tqdm(original_records):
            if random.random() > self.sample_rate:
                continue
            tensors = convert_to_tensors(
                tokenizer=tokenizer,
                title=record["title"],
                text=record["text"],
                max_title_tokens_count=self.max_title_tokens_count,
                max_text_tokens_count=self.max_text_tokens_count
            )
            self.records.append(tensors)

    def __len__(self):
        return len(self.records)

    def __getitem__(self, index):
        return self.records[index]


def train(
    train_records,
    val_records,
    pretrained_model_path,
    train_sample_rate=1.0,
    val_sample_rate=1.0,
    output_model_path="models",
    checkpoint=None,
    max_text_tokens_count=256,
    max_title_tokens_count=64,
    batch_size=8,
    logging_steps=1000,
    eval_steps=10000,
    save_steps=10000,
    learning_rate=0.00003,
    warmup_steps=2000,
    num_train_epochs=3
):
    logging.set_verbosity_info()

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_path,
        do_lower_case=False,
        do_basic_tokenize=False,
        strip_accents=False
    )

    train_dataset = GetTitleDataset(
        train_records,
        train_sample_rate,
        tokenizer,
        max_text_tokens_count=max_text_tokens_count,
        max_title_tokens_count=max_title_tokens_count
    )
    val_dataset = GetTitleDataset(
        val_records,
        val_sample_rate,
        tokenizer,
        max_text_tokens_count=max_text_tokens_count,
        max_title_tokens_count=max_title_tokens_count
    )

    # Initialize both the encoder and the decoder from the same RuBERT checkpoint.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(pretrained_model_path, pretrained_model_path)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        eval_steps=eval_steps,
        evaluation_strategy="steps",
        save_steps=save_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        num_train_epochs=num_train_epochs,
        max_steps=-1,
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
```
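A hedged usage sketch tying the pieces together: it calls `train()` with the records loaded earlier and a publicly available RuBERT checkpoint, then stores the matching tokenizer next to the trained weights so the output directory can be loaded as in the "How to use" section. The checkpoint name and paths are assumptions, and `read_records` is the hypothetical helper from the Training data section:

```python
from transformers import BertTokenizer, EncoderDecoderModel

# Assumed checkpoint name and output path; adjust to your setup.
pretrained = "DeepPavlov/rubert-base-cased"
output_dir = "models/rubert_telegram_headlines"

train(
    train_records=train_records,
    val_records=val_records,
    pretrained_model_path=pretrained,
    output_model_path=output_dir
)

# train() saves only the model weights, so save the tokenizer alongside them.
tokenizer = BertTokenizer.from_pretrained(pretrained, do_lower_case=False, do_basic_tokenize=False, strip_accents=False)
tokenizer.save_pretrained(output_dir)

model = EncoderDecoderModel.from_pretrained(output_dir)
```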