import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer

# GPT-2-style tokenizers have no pad token by default; reuse the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained("/tokenizer/loc")
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

out_dir = "/out_dir/ylilauta"
max_length = 1024

#checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
#output_dir = r"H:\Data_temp\checkpoints\tests\yle"

# Ylilauta corpus: get from https://github.com/spyysalo/ylilauta-corpus
path = r"/data/ylilauta-corpus/data/100-percent/train.txt"

# Each line is "<label> <document text>"; split only on the first space.
text = []
labels = []
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.split(" ", maxsplit=1)
        labels.append(parts[0])
        text.append(parts[1])

dataset = Dataset.from_dict({"text": text, "labels": labels})

label_names = dataset.unique("labels")
n_labels = len(label_names)


def to_one_hot(examples):
    # Replace the string label with a one-hot vector over all label names.
    label = np.zeros(n_labels)
    label[label_names.index(examples["labels"])] = 1
    return {"text": examples["text"], "labels": label.tolist()}


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)


# One-hot encode, tokenize, split 90/10, and save the result to disk.
tokenized = dataset.map(to_one_hot).map(tokenize_function).train_test_split(test_size=0.1)
tokenized.save_to_disk(out_dir)
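

# Illustrative follow-up (not called by this script): a minimal sketch of how the saved
# dataset could be consumed downstream for multi-label classification. The model path
# "/model/loc", the classifier output directory, and the hyperparameters are placeholder
# assumptions, not part of the preprocessing above.
def finetune_example():
    from datasets import load_from_disk
    from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

    data = load_from_disk(out_dir)
    # The width of the one-hot label vector equals the number of classes.
    num_labels = len(data["train"][0]["labels"])

    model = AutoModelForSequenceClassification.from_pretrained(
        "/model/loc",  # placeholder checkpoint path
        num_labels=num_labels,
        problem_type="multi_label_classification",  # BCE loss over float one-hot labels
    )
    # GPT-2-style classification heads need a pad token id to locate the last non-padding token.
    model.config.pad_token_id = tokenizer.pad_token_id

    args = TrainingArguments(
        output_dir="/out_dir/ylilauta-cls",  # placeholder output path
        per_device_train_batch_size=4,
        num_train_epochs=1,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=data["train"],
        eval_dataset=data["test"],
    )
    trainer.train()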