File size: 2,624 Bytes
ff43e05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# $ wget https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz -O en_vectors_web_lg-2.1.0.tar.gz
# $ pip install en_vectors_web_lg-2.1.0.tar.gz
import en_vectors_web_lg
import re
import numpy as np
import os
import pickle

def clean(w):
    """Normalize a raw word: lower-case it, delete punctuation, and
    turn '-' and '/' into spaces (so hyphenated/slashed words split)."""
    # One C-level pass: '-' and '/' map to ' ', listed punctuation is deleted.
    table = str.maketrans('-/', '  ', ".,'!?\"()*#:;")
    return w.lower().translate(table)


def tokenize(key_to_word):
    """Clean every word list in *key_to_word*.

    Args:
        key_to_word: dict mapping a key to a list of raw word strings.

    Returns:
        dict with the same keys, each value the list of cleaned words
        with words that clean down to the empty string dropped.
    """
    key_to_sentence = {}
    for key, words in key_to_word.items():
        # Clean each word exactly once (the original called clean() twice
        # per word: once in the filter and once for the kept value).
        key_to_sentence[key] = [c for c in (clean(w) for w in words) if c != '']
    return key_to_sentence


def create_dict(key_to_sentence, dataroot, use_glove=True):
    """Build (or load from cache) the token index and pretrained embeddings.

    Args:
        key_to_sentence: dict mapping keys to lists of cleaned tokens.
        dataroot: directory holding/receiving the cache files
            ``token_to_ix.pkl`` and ``train_glove.npy``.
        use_glove: when True, collect a spaCy GloVe vector per new token.

    Returns:
        (token_to_ix, pretrained_emb): token -> integer index mapping, and
        an ndarray of the collected vectors (empty when use_glove=False
        and no cache existed).
    """
    token_file = dataroot + "/token_to_ix.pkl"
    glove_file = dataroot + "/train_glove.npy"
    if os.path.exists(glove_file) and os.path.exists(token_file):
        print("Loading train language files")
        # Use context managers so the file handle is closed deterministically
        # (the original leaked the handle from open(...) inside the call).
        with open(token_file, "rb") as f:
            token_to_ix = pickle.load(f)
        return token_to_ix, np.load(glove_file)

    print("Creating train language files")
    # NOTE(review): starting at {'UNK': 1} means the FIRST new word also gets
    # index len(token_to_ix) == 1, colliding with 'UNK', and no token maps to
    # index 0 (which sent_to_ix uses as implicit padding). Upstream variants
    # of this code use {'PAD': 0, 'UNK': 1}. Preserved as-is because cached
    # token_to_ix.pkl files and any trained embedding layers depend on the
    # current numbering — confirm before changing.
    token_to_ix = {
        'UNK': 1,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('UNK').vector)

    # Keys are irrelevant here; only the token lists matter.
    for sentence in key_to_sentence.values():
        for word in sentence:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)
    np.save(glove_file, pretrained_emb)
    with open(token_file, "wb") as f:
        pickle.dump(token_to_ix, f)
    return token_to_ix, pretrained_emb

def sent_to_ix(s, token_to_ix, max_token=100):
    """Encode a token sequence as a fixed-length int64 index vector.

    Unknown tokens map to token_to_ix['UNK']; positions past the end of
    *s* stay 0 (implicit padding); *s* is truncated at *max_token*.
    """
    ques_ix = np.zeros(max_token, np.int64)
    unk_ix = token_to_ix['UNK']

    for pos, tok in enumerate(s):
        if pos == max_token:
            break  # vector is full; ignore the rest of the sentence
        ques_ix[pos] = token_to_ix.get(tok, unk_ix)

    return ques_ix


def cmumosei_7(a):
    """Bucket a continuous CMU-MOSEI sentiment score into 7 ordinal classes.

    Boundaries: (-inf,-2) -> 0, [-2,-1) -> 1, [-1,0) -> 2, {0} -> 3,
    (0,1] -> 4, (1,2] -> 5, (2,inf) -> 6.
    """
    if a < -2:
        res = 0
    elif a < -1:        # -2 <= a < -1
        res = 1
    elif a < 0:         # -1 <= a < 0
        res = 2
    elif a == 0:
        res = 3
    elif a <= 1:        # 0 < a <= 1
        res = 4
    elif a <= 2:        # 1 < a <= 2
        res = 5
    elif a > 2:
        res = 6
    # Deliberately no bare 'else': unordered inputs (e.g. NaN) leave res
    # unbound and raise UnboundLocalError, exactly like the original.
    return res

def cmumosei_2(a):
    """Binary sentiment label: 0 for negative scores, 1 for non-negative."""
    is_negative = a < 0
    if is_negative:
        return 0
    if a >= 0:
        return 1
    # Falls through (implicitly returning None) only for values with no
    # ordering, e.g. NaN — same behavior as the original.

def pad_feature(feat, max_len):
    """Force a 2-D feature array to exactly *max_len* rows.

    Longer inputs are truncated; shorter ones are zero-padded at the end.
    Always returns a new array (np.pad copies even with zero pad width).
    """
    trimmed = feat[:max_len] if feat.shape[0] > max_len else feat
    deficit = max_len - trimmed.shape[0]
    return np.pad(
        trimmed,
        ((0, deficit), (0, 0)),
        mode='constant',
        constant_values=0,
    )