|
import pandas as pd |
|
from sklearn.model_selection import train_test_split |
|
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset |
|
import torch |
|
|
|
|
|
# Preprocessed dataset produced by the earlier pipeline step; loaded once at import time.
DATA_FILE = "dataset/processed_new_data.csv"
df = pd.read_csv(DATA_FILE)
|
|
|
def prepare_dataset(df, tokenizer, max_length=512, num_classes=4):
    """Tokenize row descriptions and build a TensorDataset for multi-label training.

    Rows with a missing (NaN) SAS_Class, SDS_Class, or Description are skipped.
    Each kept row contributes:
      - token ids / attention mask from ``tokenizer.encode_plus`` (padded and
        truncated to ``max_length``),
      - a ``2 * num_classes`` multi-hot label vector: the one-hot encoding of
        SAS_Class followed by the one-hot encoding of SDS_Class.

    Args:
        df: DataFrame with 'Description', 'SAS_Class', 'SDS_Class' columns.
        tokenizer: HuggingFace-style tokenizer exposing ``encode_plus``.
        max_length: maximum token sequence length (pad/truncate target).
        num_classes: number of classes per scale. Defaults to 4, matching the
            previously hard-coded label width, so existing callers are unaffected.

    Returns:
        TensorDataset of (input_ids, attention_masks, labels); labels are float
        tensors of shape (n_rows, 2 * num_classes) suitable for BCE-style losses.

    Raises:
        ValueError: if no row survives the NaN filtering — torch.cat on an
            empty list would otherwise raise an opaque RuntimeError.
    """
    input_ids = []
    attention_masks = []
    labels = []
    for _, row in df.iterrows():
        # Skip rows that cannot produce a label or a tokenizable text.
        # (NaN Description would previously crash encode_plus with a float input.)
        if (pd.isna(row['SAS_Class']) or pd.isna(row['SDS_Class'])
                or pd.isna(row['Description'])):
            continue

        encoded = tokenizer.encode_plus(
            row['Description'],
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

        # One-hot encode each scale, then concatenate into one multi-hot target.
        sas_label = [0] * num_classes
        sds_label = [0] * num_classes
        sas_label[int(row['SAS_Class'])] = 1
        sds_label[int(row['SDS_Class'])] = 1
        labels.append(sas_label + sds_label)

    if not input_ids:
        raise ValueError(
            "No usable rows in df: every row had a NaN class or description."
        )

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=torch.float)
    return TensorDataset(input_ids, attention_masks, labels)
|
|
|
|
|
|