Spaces:
Runtime error
Runtime error
import pickle

import numpy as np
from transformers import AutoTokenizer
class Utility:
    """Stateless helpers for tokenizing plot text and splitting a dataset."""

    def __init__(self) -> None:
        """Create the helper; there is no instance state to set up."""

    def tokenize(
        self,
        plot,
        genres,
        model_name="distilbert-base-uncased",
        max_length=512,
    ):
        """Tokenize ``plot`` and build the genre <-> id lookup tables.

        Args:
            plot: Text handed unchanged to the tokenizer.
            genres: Iterable of genre labels; a label's position becomes its id.
            model_name: Checkpoint name passed to
                ``AutoTokenizer.from_pretrained``.
            max_length: Length every sequence is padded / truncated to.

        Returns:
            A tuple ``(id2label, label2id, tokenizer, clean_plot_tokenized)``.
        """
        id2label = dict(enumerate(genres))
        # Invert id2label instead of enumerating ``genres`` a second time, so
        # a one-shot iterable (e.g. a generator) still yields a full mapping.
        label2id = {label: idx for idx, label in id2label.items()}

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        clean_plot_tokenized = tokenizer(
            plot,
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        return (id2label, label2id, tokenizer, clean_plot_tokenized)

    def train_test_split(self, df, y, test_size=0.2):
        """Split the dataset into training and validation sets.

        The split is delegated to ``iterative_train_test_split`` (stratified
        sampling).

        Args:
            df: Table with a ``'clean_plot_tokenized'`` column.
                Assumes the column converts to a 1-D array (one entry per
                row), e.g. a pandas Series -- TODO confirm against caller.
            y: Label matrix forwarded unchanged to the splitter.
            test_size: Fraction of rows placed in the validation set.

        Returns:
            A tuple ``(xtrain, xval, ytrain, yval)``; ``xtrain`` and ``xval``
            are flattened to 1-D arrays.
        """
        # NOTE(review): ``iterative_train_test_split`` is not imported in the
        # visible part of this file; presumably it is
        # ``skmultilearn.model_selection.iterative_train_test_split`` --
        # confirm it is in scope, otherwise this call raises NameError.
        #
        # The splitter is given a 2-D feature matrix, so the column is
        # reshaped to (n_rows, 1). A plain ndarray is used because NumPy
        # discourages ``np.matrix``.
        features = np.asarray(df["clean_plot_tokenized"]).reshape(-1, 1)

        # The results are unpacked as (X_train, y_train, X_val, y_val).
        xtrain, ytrain, xval, yval = iterative_train_test_split(
            features, y, test_size=test_size
        )

        xtrain = np.asarray(xtrain).flatten()
        xval = np.asarray(xval).flatten()
        return (xtrain, xval, ytrain, yval)