import os

import pandas as pd
import torch
from datasets import Dataset, DatasetDict

from config import max_length, label2id
from model import tokenizer


def convert_to_stsb_features(example_batch):
    # Tokenize each document to fixed-length input_ids / attention_mask.
    inputs = example_batch['content']
    features = tokenizer.batch_encode_plus(
        inputs, truncation=True, max_length=max_length, padding='max_length')
    # Mapping real sentiment labels via label2id is left disabled; dummy labels
    # keep a "labels" column present for set_format() below.
    # features["labels"] = [label2id[i] for i in example_batch["sentiment"]]
    features["labels"] = [0] * len(example_batch["content"])
    # features["nid"] = [int(i) for i in example_batch["nid"]]
    return features
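
# Note (illustrative, not from the original file): batch_encode_plus with
# padding='max_length' returns a dict-like BatchEncoding, so the features
# built above look roughly like
#   {"input_ids":      [[...max_length token ids...], ...],
#    "attention_mask": [[1, 1, ..., 0, 0], ...],
#    "labels":         [0, 0, ...]}   # one entry per document in the batch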


def convert_to_features(dataset_dict, convert_func_dict):
    # Columns to expose as torch tensors for each task.
    columns_dict = {
        "document": ['input_ids', 'attention_mask', 'labels'],
        # "paragraph": ['input_ids', 'attention_mask', 'labels'],
        # "sentence": ['input_ids', 'attention_mask', 'labels'],
    }
    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        print(task_name)
        for phase, phase_dataset in dataset.items():
            # Tokenize every split with the task-specific conversion function.
            features_dict[task_name][phase] = phase_dataset.map(
                convert_func_dict[task_name],
                batched=True,
                load_from_cache_file=False,
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
            # Expose only the model-facing columns as torch tensors.
            features_dict[task_name][phase].set_format(
                type="torch",
                columns=columns_dict[task_name],
            )
            print("=>", task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
    return features_dict
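

# --- Usage sketch (illustrative, not part of the original file) ---
# Builds a tiny toy dataset_dict and runs the conversion end to end. The example
# texts and the train/validation split below are made up for demonstration; the
# "document" task key matches columns_dict above.
if __name__ == "__main__":
    example_df = pd.DataFrame({"content": ["first example document",
                                           "second example document"]})
    dataset_dict = {
        "document": DatasetDict({
            "train": Dataset.from_pandas(example_df),
            "validation": Dataset.from_pandas(example_df),
        })
    }
    convert_func_dict = {"document": convert_to_stsb_features}
    features_dict = convert_to_features(dataset_dict, convert_func_dict)
    # Each split is now torch-formatted and ready for a DataLoader.
    print(features_dict["document"]["train"][0]["input_ids"].shape)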