''' Split a given dataset into three different datasets: training, validation and testing.

This is achieved by splitting the given list of sentences into three separate
lists, either according to a given ratio (e.g. [0.7, 0.1, 0.2]) or by an
explicit enumeration of indices. The sentences are also tokenised using the
given vocabulary. A given list of dictionaries containing information about
each sentence is split in the same way. An additional parameter,
'extend_with', can be set to extend the given vocabulary with up to
'extend_with' tokens taken from the training dataset.
'''
from __future__ import print_function, unicode_literals
import example_helper
import json
from torchmoji.sentence_tokenizer import SentenceTokenizer

DATASET = [
    'I am sentence 0',
    'I am sentence 1',
    'I am sentence 2',
    'I am sentence 3',
    'I am sentence 4',
    'I am sentence 5',
    'I am sentence 6',
    'I am sentence 7',
    'I am sentence 8',
    'I am sentence 9 newword',
]

INFO_DICTS = [
    {'label': 'sentence 0'},
    {'label': 'sentence 1'},
    {'label': 'sentence 2'},
    {'label': 'sentence 3'},
    {'label': 'sentence 4'},
    {'label': 'sentence 5'},
    {'label': 'sentence 6'},
    {'label': 'sentence 7'},
    {'label': 'sentence 8'},
    {'label': 'sentence 9'},
]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)
st = SentenceTokenizer(vocab, 30)

# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))

# Split according to an explicit enumeration of indices, extending the
# vocabulary with one token taken from the training dataset
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))
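
# A minimal sketch of unpacking the result. This assumes split_train_val_test
# returns three values: the tokenized splits, the matching splits of the info
# dicts, and information about the vocabulary extension (assumed here to be a
# count of added tokens). The variable names below are illustrative only.
splits, info_splits, added = st.split_train_val_test(DATASET,
                                                     INFO_DICTS,
                                                     [0.7, 0.1, 0.2],
                                                     extend_with=1)
train, val, test = splits
train_info, val_info, test_info = info_splits
print('Training examples:', len(train), 'with labels:', train_info)
print('Validation examples:', len(val), 'test examples:', len(test))
print('Tokens added to the vocabulary:', added)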