"""IMDB Dataset module for sentiment analysis.""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import numpy as np | |
import tensorflow as tf | |
from data.util import OOV_CHAR | |
from data.util import pad_sentence | |
from data.util import START_CHAR | |
NUM_CLASS = 2 | |
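
# For reference, a plausible sketch of the imported pad_sentence helper
# (an illustrative assumption; the real implementation lives in
# data/util.py and may differ, e.g. in its choice of padding id):
#
#   def pad_sentence(sentence, length, pad_id=0):
#       sentence = list(sentence[:length])
#       return sentence + [pad_id] * (length - len(sentence))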
def load(vocabulary_size, sentence_length):
  """Returns training and evaluation inputs for the IMDB dataset.

  Args:
    vocabulary_size: The number of the most frequent tokens
      to be used from the corpus.
    sentence_length: The number of words in each sentence.
      Longer sentences get cut, shorter ones padded.

  Raises:
    ValueError: if the underlying Keras loader rejects the arguments.

  Returns:
    A tuple of length 4 holding the training and evaluation data,
    each element being a numpy array.
  """
  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
      path="imdb.npz",
      num_words=vocabulary_size,
      skip_top=0,
      maxlen=None,
      seed=113,
      start_char=START_CHAR,
      oov_char=OOV_CHAR,
      index_from=OOV_CHAR + 1)

  # Truncate or pad every review to a fixed length so the resulting
  # arrays are rectangular.
  x_train_processed = np.array(
      [np.array(pad_sentence(sen, sentence_length)) for sen in x_train])
  x_test_processed = np.array(
      [np.array(pad_sentence(sen, sentence_length)) for sen in x_test])

  # One-hot encode the binary labels by indexing into an identity matrix.
  return (x_train_processed, np.eye(NUM_CLASS)[y_train],
          x_test_processed, np.eye(NUM_CLASS)[y_test])
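

# A minimal usage sketch (the vocabulary size and sentence length below
# are illustrative assumptions, not values mandated by this module):
if __name__ == "__main__":
  x_train, y_train, x_test, y_test = load(
      vocabulary_size=10000, sentence_length=200)
  print("train:", x_train.shape, y_train.shape)
  print("test:", x_test.shape, y_test.shape)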