File size: 1,709 Bytes
aba7566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from datasets import Dataset, DatasetDict
import pandas as pd
from config import max_length, label2id
from model import tokenizer
import os
import torch


def convert_to_stsb_features(example_batch):
    """Tokenize the ``content`` field of a batch and attach placeholder labels.

    Args:
        example_batch: Batch mapping that exposes the texts to encode under
            the ``'content'`` key (presumably a batch handed over by a
            batched ``Dataset.map`` call -- confirm against the caller).

    Returns:
        The encoding produced by ``tokenizer.batch_encode_plus`` (truncated
        and padded to ``max_length``) with an extra ``"labels"`` entry that
        holds one ``0`` per input text.
    """
    texts = example_batch['content']

    # Every text is truncated and padded to exactly ``max_length`` tokens.
    encoded = tokenizer.batch_encode_plus(
        texts,
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )

    # NOTE(review): labels are a constant placeholder (0) for every example;
    # the batch's "sentiment" and "nid" fields are not read here -- confirm
    # that dummy labels are what downstream code expects.
    placeholder_labels = [0] * len(texts)
    encoded["labels"] = placeholder_labels
    return encoded




def convert_to_features(dataset_dict, convert_func_dict, columns_dict=None):
    """Convert every phase of every task's dataset into model-ready features.

    For each ``task_name`` in ``dataset_dict`` and each ``phase`` of that
    task, the phase dataset is passed through the task's conversion function
    via a batched ``map`` (with ``load_from_cache_file=False``), and the
    result is formatted with ``set_format(type="torch", ...)`` restricted to
    the task's columns. Progress is printed to stdout.

    Args:
        dataset_dict: Mapping ``task_name -> {phase: dataset}``. Each dataset
            must provide ``map`` and support ``len``; the object returned by
            ``map`` must provide ``set_format`` and support ``len``
            (presumably ``datasets.Dataset`` objects -- confirm with caller).
        convert_func_dict: Mapping ``task_name -> conversion function`` that
            is handed to ``map`` with ``batched=True``.
        columns_dict: Optional mapping ``task_name -> list of column names``
            passed to ``set_format``. When omitted, the built-in table is
            used, which only contains the ``"document"`` task.

    Returns:
        Nested dict ``{task_name: {phase: converted_dataset}}``. A task with
        no phases yields an empty inner dict.

    Raises:
        KeyError: If a task that has at least one phase is missing from
            ``convert_func_dict`` or from the columns table. The check
            happens before ``map`` is called, so no conversion work is done
            for a task that cannot be formatted afterwards.
    """
    if columns_dict is None:
        columns_dict = {
            "document": ['input_ids', 'attention_mask', 'labels'],
        }
    features_dict = {}

    for task_name, dataset in dataset_dict.items():
        task_features = features_dict[task_name] = {}
        print(task_name)
        for phase, phase_dataset in dataset.items():
            # Resolve both per-task lookups up front (conversion function
            # first, as before) so a missing entry fails before the
            # potentially expensive map() call rather than after it.
            convert_func = convert_func_dict[task_name]
            columns = columns_dict[task_name]

            converted = phase_dataset.map(
                convert_func,
                batched=True,
                load_from_cache_file=False,
            )
            task_features[phase] = converted
            print(task_name, phase, len(phase_dataset),
                  len(converted))
            converted.set_format(
                type="torch",
                columns=columns,
            )
            print("=>",task_name, phase, len(phase_dataset),
                  len(converted))
    return features_dict