"""Pre-tokenize a line-by-line text corpus and cache the encoded examples to disk.

The script builds a LineByLineTextDataset with the selected tokenizer (optionally
across several worker processes) and pickles the resulting input ids next to the
input file.
"""

import argparse
import glob
import itertools
import logging
import os
import pickle
import random
import re
import shutil
from copy import deepcopy
from multiprocessing import Pool
from typing import Dict, List, Tuple

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    DNATokenizer,  # provided by the DNABERT fork of transformers, not the upstream package
    CamembertConfig,
    CamembertForMaskedLM,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    OpenAIGPTConfig,
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizer,
    get_linear_schedule_with_warmup,
)

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
}


def convert_line_to_example(tokenizer, lines, max_length, add_special_tokens=True):
    """Tokenize a slice of lines; used as the worker for the multiprocessing Pool below."""
    examples = tokenizer.batch_encode_plus(lines, add_special_tokens=add_special_tokens, max_length=max_length)["input_ids"]
    return examples


class LineByLineTextDataset(Dataset):
    """Dataset of per-line token-id examples, pickled to a cache file next to the input file."""

    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
        )

        print(f"Creating features from dataset file at {file_path}")

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        if args.n_process == 1:
            self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
        else:
            # Split the lines into roughly equal slices and tokenize them in parallel.
            n_proc = args.n_process
            p = Pool(n_proc)
            indexes = [0]
            len_slice = len(lines) // n_proc
            for i in range(1, n_proc + 1):
                if i != n_proc:
                    indexes.append(len_slice * i)
                else:
                    indexes.append(len(lines))
            results = []
            for i in range(n_proc):
                results.append(
                    p.apply_async(convert_line_to_example, [tokenizer, lines[indexes[i]:indexes[i + 1]], block_size])
                )
                print(f"process {i} started")
            p.close()
            p.join()

            self.examples = []
            for result in results:
                self.examples.extend(result.get())

        # The cache file is written here but never read back by this script.
        print(f"Saving features into cached file {cached_features_file}")
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)

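
# Illustrative sketch, not called anywhere in this script: one way the variable-length
# examples produced by LineByLineTextDataset could be batched for masked-LM training.
# The function name and the default pad id of 0 are assumptions; in practice the pad id
# should come from tokenizer.pad_token_id.
def collate_examples(examples: List[torch.Tensor], pad_token_id: int = 0) -> torch.Tensor:
    # Right-pad every sequence in the batch to the length of the longest one.
    return pad_sequence(examples, batch_first=True, padding_value=pad_token_id)
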

def load_and_cache_examples(args, tokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    print(f"Loading data from {file_path}")
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
    # A block-wise TextDataset (as in the upstream run_language_modeling.py example)
    # is not defined in this script, so only --line_by_line input is supported here.
    raise NotImplementedError("Pass --line_by_line; a block-wise TextDataset is not available in this script.")


def main(args, tokenizer):
    if args.eval_data_file:
        eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
        print("eval dataset cached")

    if args.train_data_file:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
        print("train dataset cached")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
    )
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
    )
    parser.add_argument(
        "--line_by_line",
        action="store_true",
        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
    )
    parser.add_argument(
        "--model_type", type=str, required=True, help="The model type used to select the config/tokenizer classes (e.g. 'dna', 'bert').",
    )
    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help="Pretrained tokenizer name or path. Required here: initializing a new tokenizer is not supported.",
    )
    parser.add_argument(
        "--config_name",
        default=None,
        type=str,
        help="Optional pretrained config name or path. If None, a new config is initialized.",
    )
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization. "
        "The training dataset will be truncated into blocks of this size. "
        "Defaults to the model's maximum input length for single-sentence inputs (accounting for special tokens).",
    )
    parser.add_argument(
        "--specialpath",
        type=str,
        help="Not referenced in this script.",
    )
    parser.add_argument("--n_process", type=int, default=1, help="Number of worker processes used to tokenize the input lines.")
    args = parser.parse_args()

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    if args.config_name:
        config = config_class.from_pretrained(args.config_name, cache_dir=None)
    else:
        config = config_class()

    if args.tokenizer_name:
        tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, cache_dir=None)
    else:
        raise ValueError(
            "You are instantiating a new {} tokenizer. This is not supported: train and save the tokenizer "
            "in another script, then load it here with --tokenizer_name.".format(tokenizer_class.__name__)
        )

    if args.block_size <= 0:
        # Fall back to the tokenizer's maximum input length (special tokens included).
        args.block_size = tokenizer.max_len
    else:
        args.block_size = min(args.block_size, tokenizer.max_len)

    main(args, tokenizer)
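
# Example invocation (the script name and paths below are placeholders):
#   python cache_lm_dataset.py \
#       --train_data_file /path/to/train.txt \
#       --eval_data_file /path/to/dev.txt \
#       --model_type dna \
#       --tokenizer_name /path/to/tokenizer \
#       --block_size 512 \
#       --line_by_line \
#       --n_process 8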