# movie-genre / utility.py
# Repository page header (A-M-S): "Update utility.py", commit 79b1554
import pickle
import numpy as np
from transformers import AutoTokenizer
class Utility:
    """Tokenization and dataset-splitting helpers for plot/genre data."""

    def __init__(self) -> None:
        """Create the helper; it keeps no state of its own."""

    def tokenize(self, plot, genres, *, model_name="distilbert-base-uncased", max_length=512):
        """Tokenize ``plot`` and build the genre <-> index lookup tables.

        Args:
            plot: Input passed unchanged to the tokenizer call.
            genres: Iterable of genre labels; a label's position is its id.
            model_name: Checkpoint name given to
                ``AutoTokenizer.from_pretrained``.
            max_length: Value forwarded as the tokenizer's ``max_length``
                (used together with ``padding="max_length"`` and
                ``truncation=True``).

        Returns:
            The tuple ``(id2label, label2id, tokenizer, clean_plot_tokenized)``.
        """
        # Enumerate once so a one-shot iterable still populates both maps.
        id2label = dict(enumerate(genres))
        label2id = {label: idx for idx, label in id2label.items()}

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        clean_plot_tokenized = tokenizer(
            plot,
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        return (id2label, label2id, tokenizer, clean_plot_tokenized)

    def train_test_split(self, df, y, *, test_size=0.2):
        """Split the dataset into training and validation sets.

        The split is delegated to ``iterative_train_test_split`` (stratified
        sampling) rather than a plain random split.

        Args:
            df: Object indexable by ``'clean_plot_tokenized'``
                (presumably a pandas DataFrame -- confirm against the caller).
            y: Labels passed unchanged to the splitter; assumed to be aligned
                row-for-row with ``df``.
            test_size: Fraction forwarded to the splitter for the validation
                part.

        Returns:
            The tuple ``(xtrain, xval, ytrain, yval)``; ``xtrain`` and ``xval``
            are flattened NumPy arrays.
        """
        # The module-level imports never bring this name into scope, so the
        # call used to fail with NameError. Imported locally to keep the fix
        # self-contained; requires scikit-multilearn to be installed.
        from skmultilearn.model_selection import iterative_train_test_split

        # Turn the column into a column matrix (n x 1 for a 1-D column),
        # the 2-D feature layout handed to the splitter.
        features = np.asmatrix(df['clean_plot_tokenized']).transpose()

        # The splitter returns X_train, y_train, X_test, y_test -- note that
        # this order differs from the order this method returns.
        xtrain, ytrain, xval, yval = iterative_train_test_split(
            features, y, test_size=test_size
        )

        # Collapse the single-column feature matrices back to flat arrays.
        xtrain = np.array(xtrain).flatten()
        xval = np.array(xval).flatten()
        return (xtrain, xval, ytrain, yval)