# NOTE: page-header residue from the hosting site, kept as comments so the
# module parses:
#   thak123's picture
#   Duplicate from FFZG-cleopatra/Croatian-News-Sentiment-Classifier-V1
#   f08fa03
from datasets import Dataset, DatasetDict
import pandas as pd
from config import max_length, label2id
from model import tokenizer
import os
import torch
def convert_to_stsb_features(example_batch):
    """Tokenize the ``content`` column of a batch and attach dummy labels.

    Args:
        example_batch: Batch mapping with a ``'content'`` entry holding the
            texts to encode.

    Returns:
        The output of ``tokenizer.batch_encode_plus`` (truncated and padded
        to ``max_length``) with an extra ``"labels"`` entry containing one
        ``0`` per input text.
    """
    texts = example_batch['content']
    encoded = tokenizer.batch_encode_plus(
        texts,
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )
    # The label2id lookup on the "sentiment" column is disabled in this
    # version; every example receives the placeholder label 0 instead.
    placeholder_labels = [0] * len(texts)
    encoded["labels"] = placeholder_labels
    return encoded
def convert_to_features(dataset_dict, convert_func_dict, columns_dict=None):
    """Apply each task's conversion function to every phase of its datasets.

    Args:
        dataset_dict: Mapping of task name -> mapping of phase name ->
            dataset object. Each dataset must support ``len()``,
            ``.map(func, batched=..., load_from_cache_file=...)`` and the
            mapped result must support ``len()`` and
            ``.set_format(type=..., columns=...)``.
        convert_func_dict: Mapping of task name -> batched conversion
            function passed to ``.map``.
        columns_dict: Optional mapping of task name -> list of column names
            handed to ``set_format``. When omitted, only the ``"document"``
            task is configured, with ``input_ids``, ``attention_mask`` and
            ``labels``.

    Returns:
        A nested dict ``{task_name: {phase: mapped_dataset}}`` where every
        mapped dataset has been switched to ``"torch"`` format.

    Raises:
        KeyError: If a task that has at least one phase is missing from
            ``convert_func_dict`` or from the column mapping.
    """
    if columns_dict is None:
        columns_dict = {
            "document": ['input_ids', 'attention_mask', 'labels'],
        }

    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        print(task_name)
        for phase, phase_dataset in dataset.items():
            # Resolve both per-task settings before mapping so that a missing
            # entry raises KeyError without first running the whole map pass.
            convert_func = convert_func_dict[task_name]
            columns = columns_dict[task_name]

            features = phase_dataset.map(
                convert_func,
                batched=True,
                load_from_cache_file=False,
            )
            print(task_name, phase, len(phase_dataset), len(features))

            features.set_format(type="torch", columns=columns)
            print("=>", task_name, phase, len(phase_dataset), len(features))

            features_dict[task_name][phase] = features
    return features_dict