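# Convert each benchmark dataset's raw pickle into tokenized train/val/test
# splits, saved once per vocabulary setting: the dataset's own vocabulary,
# the pretrained Twitter vocabulary, and the two combined.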
from __future__ import print_function

import json
import math
import pickle
import sys
from io import open
import numpy as np
from os.path import abspath, dirname
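# Add the directory above this script's folder to sys.path so the torchmoji
# package can be imported when running the script directly.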
sys.path.insert(0, dirname(dirname(abspath(__file__))))

from torchmoji.word_generator import WordGenerator
from torchmoji.create_vocab import VocabBuilder
from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
from torchmoji.tokenizer import tokenize

try:
    unicode        # Python 2
except NameError:
    unicode = str  # Python 3

IS_PYTHON2 = sys.version_info[0] == 2

DATASETS = [
    'Olympic',
    'PsychExp',
    'SCv1',
    'SCv2-GEN',
    'SE0714',
    #'SE1604', # Excluded due to Twitter's ToS
    'SS-Twitter',
    'SS-Youtube',
    ]

DIR = '../data'
FILENAME_RAW = 'raw.pickle'
FILENAME_OWN = 'own_vocab.pickle'
FILENAME_OUR = 'twitter_vocab.pickle'
FILENAME_COMBINED = 'combined_vocab.pickle'


def roundup(x):
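    """Round x up to the nearest multiple of 10."""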
    return int(math.ceil(x / 10.0)) * 10


def format_pickle(dset, train_texts, val_texts, test_texts, train_labels, val_labels, test_labels):
    return {'dataset': dset,
            'train_texts': train_texts,
            'val_texts': val_texts,
            'test_texts': test_texts,
            'train_labels': train_labels,
            'val_labels': val_labels,
            'test_labels': test_labels}

def convert_dataset(filepath, extend_with, vocab):
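    """Tokenize the current dataset with `vocab`, optionally extended with up
    to `extend_with` new words, and pickle the train/val/test splits to
    `filepath`. Relies on the module-level `texts`, `labels`, `data`, `maxlen`
    and `dset` set in the loop below."""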
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(texts,
                                                  labels,
                                                  [data['train_ind'],
                                                   data['val_ind'],
                                                   data['test_ind']],
                                                  extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                        dicts[0], dicts[1], dicts[2])
    with open(filepath, 'wb') as f:  # binary mode: pickle writes bytes
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print('     done. Coverage: {}'.format(cover))

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

for dset in DATASETS:
    print('Converting {}'.format(dset))

    PATH_RAW = '{}/{}/{}'.format(DIR, dset, FILENAME_RAW)
    PATH_OWN = '{}/{}/{}'.format(DIR, dset, FILENAME_OWN)
    PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR)
    PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED)

    with open(PATH_RAW, 'rb') as dataset:
        if IS_PYTHON2:
            data = pickle.load(dataset)
        else:
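            # fix_imports maps old Python 2 module names to their Python 3
            # equivalents; pickles holding non-ASCII byte strings may
            # additionally require encoding='latin1'.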
            data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    wg = WordGenerator(texts)
    vb = VocabBuilder(wg)
    vb.count_all_words()

    # Use the 80th percentile of tokenized sentence lengths, rounded up to
    # the nearest 10, as the sequence length. Adjust batch_size downstream
    # accordingly to avoid running out of GPU memory on long sequences.
    lengths = [len(tokenize(t)) for t in texts]
    maxlen = roundup(np.percentile(lengths, 80.0))

    # Extract labels
    labels = [x['label'] for x in data['info']]

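    # Convert with three vocabulary settings: the dataset's own vocabulary
    # (built from scratch, at most 50k words), the pretrained Twitter
    # vocabulary, and the Twitter vocabulary extended with up to 10k
    # dataset-specific words.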
    convert_dataset(PATH_OWN, 50000, {})
    convert_dataset(PATH_OUR, 0, vocab)
    convert_dataset(PATH_COMBINED, 10000, vocab)