from datasets import Dataset, DatasetDict
import pandas as pd
from config import max_length, label2id
from model import tokenizer
import os
import torch


def convert_to_stsb_features(example_batch):
    """Tokenize a batch of documents and attach placeholder labels.

    Encodes ``example_batch['content']`` with the project tokenizer
    (truncated/padded to ``max_length``) and sets every ``labels`` entry
    to 0.  NOTE(review): the real ``label2id`` sentiment mapping is
    currently disabled — all examples get label 0.
    """
    texts = example_batch['content']
    features = tokenizer.batch_encode_plus(
        texts,
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )
    # Placeholder labels: one zero per input example.
    features["labels"] = [0] * len(example_batch["content"])
    return features


def convert_to_features(dataset_dict, convert_func_dict):
    """Apply per-task conversion functions over a dict of DatasetDicts.

    For each task and each phase (e.g. train/validation), maps the
    task's conversion function over the phase dataset (batched, cache
    disabled) and switches the result to torch-tensor format restricted
    to the task's feature columns.  Returns the nested
    ``{task: {phase: dataset}}`` mapping.
    """
    # Columns to expose as torch tensors, per task.
    columns_dict = {
        "document": ['input_ids', 'attention_mask', 'labels'],
    }
    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        print(task_name)
        for phase, phase_dataset in dataset.items():
            encoded = phase_dataset.map(
                convert_func_dict[task_name],
                batched=True,
                load_from_cache_file=False,
            )
            features_dict[task_name][phase] = encoded
            print(task_name, phase, len(phase_dataset), len(encoded))
            encoded.set_format(
                type="torch",
                columns=columns_dict[task_name],
            )
            print("=>", task_name, phase, len(phase_dataset), len(encoded))
    return features_dict