Spaces:
Running
Running
"""Dataset module for sentiment analysis. | |
Currently, only the imdb dataset is available.
""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import data.imdb as imdb | |
# Identifier callers pass as `dataset` to select the imdb dataset; it is the
# only value that load() and get_num_class() accept.
DATASET_IMDB = "imdb"
def load(dataset, vocabulary_size, sentence_length):
  """Returns training and evaluation input for the requested dataset.

  Dispatches to the dataset-specific loader. Only `DATASET_IMDB` is
  currently supported, in which case the call is forwarded to `imdb.load`.

  Args:
    dataset: Identifier of the dataset to be trained and evaluated.
      Currently only imdb is supported.
    vocabulary_size: The number of the most frequent tokens
      to be used from the corpus.
    sentence_length: The number of words in each sentence.
      Longer sentences get cut, shorter ones padded.

  Raises:
    ValueError: if the dataset value is not valid. This also holds for
      non-string values such as None.

  Returns:
    A tuple of length 4, for training sentences, labels,
    evaluation sentences, and evaluation labels,
    each being a numpy array.
  """
  if dataset != DATASET_IMDB:
    # Use format() rather than `+`: concatenating a non-string dataset
    # (e.g. None) would raise TypeError and mask the documented ValueError.
    raise ValueError("unsupported dataset: {}".format(dataset))
  return imdb.load(vocabulary_size, sentence_length)
def get_num_class(dataset):
  """Returns an integer for the number of label classes.

  Only `DATASET_IMDB` is currently supported, in which case the value of
  `imdb.NUM_CLASS` is returned.

  Args:
    dataset: Identifier of the dataset to be trained and evaluated.
      Currently only imdb is supported.

  Raises:
    ValueError: if the dataset value is not valid. This also holds for
      non-string values such as None.

  Returns:
    int: The number of label classes.
  """
  if dataset != DATASET_IMDB:
    # Use format() rather than `+`: concatenating a non-string dataset
    # (e.g. None) would raise TypeError and mask the documented ValueError.
    raise ValueError("unsupported dataset: {}".format(dataset))
  return imdb.NUM_CLASS