import pickle
import numpy as np
from gensim.models import KeyedVectors
import io
import os

def calc_mean_vec_for_lower_mapping(embedd_dict):
    """Add a lower-cased entry for every word, averaging the vectors of all
    words that map to the same lower-cased form (e.g. 'The' and 'the')."""
    lower_groups = {}
    for word in embedd_dict:
        lower_groups.setdefault(word.lower(), []).append(word)
    # per-dimension mean vector over all words that share the same lower-cased form
    for word_lower, words in lower_groups.items():
        embedd_dict[word_lower] = np.mean([embedd_dict[word_] for word_ in words], axis=0)
    return embedd_dict

def load_embedding_dict(embedding, embedding_path, lower_case=False):
    """
    load word embeddings from file
    :param embedding:
    :param embedding_path:
    :return: embedding dict, embedding dimention, caseless
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if lower_case:
        pkl_path = embedding_path + '_lower' + '.pkl'
    else:
        pkl_path = embedding_path + '.pkl'
    if os.path.isfile(pkl_path):
        # load dict and dim from a pickle file
        with open(pkl_path, 'rb') as f:
            embedd_dict, embedd_dim = pickle.load(f)
        print("num dimensions of word embeddings:", embedd_dim)
        return embedd_dict, embedd_dim

    if embedding == 'glove':
        # loading GloVe
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for word '%s': %d != %d" % (k, len(v), embedd_dim))

    elif embedding == 'fasttext':
        # loading fastText (the first line holds vocab size and dimension and is skipped)
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            # skip first line
            for i, line in enumerate(f):
                if i == 0:
                    continue
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for word '%s': %d != %d" % (k, len(v), embedd_dim))

    elif embedding == 'hellwig':
        # loading hellwig
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            # skip first line
            for i, line in enumerate(f):
                if i == 0:
                    continue
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for word '%s': %d != %d" % (k, len(v), embedd_dim))

    elif embedding == 'one_hot':
        # loading one-hot vectors ('@'-separated word and vector)
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            # skip first line
            for i, line in enumerate(f):
                if i == 0:
                    continue
                word, vec = line.split('@', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for word '%s': %d != %d" % (k, len(v), embedd_dim))

    elif embedding == 'word2vec':
        # loading word2vec (binary format) via gensim
        word_vectors = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = word_vectors.vector_size
        # convert to a plain dict so the lower-case averaging works the same way as for the
        # text formats above (uses the gensim 3.x .vocab attribute; key_to_index in gensim 4+)
        embedd_dict = {w: word_vectors[w] for w in word_vectors.vocab}
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)

    else:
        raise ValueError("embedding should be one of [glove, fasttext, hellwig, one_hot, word2vec]")

    print("num dimensions of word embeddings:", embedd_dim)
    # save dict and dim to a pickle file
    with open(pkl_path, 'wb') as f:
        pickle.dump([embedd_dict, embedd_dim], f, pickle.HIGHEST_PROTOCOL)
    return embedd_dict, embedd_dim
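

if __name__ == '__main__':
    # Minimal usage sketch (illustration only, not part of the training pipeline).
    # The path below is a placeholder for a local GloVe-style text file with one
    # "word v1 v2 ... vN" entry per line; adjust it to wherever your embeddings live.
    glove_path = 'data/glove.6B.100d.txt'  # hypothetical path
    embedd_dict, embedd_dim = load_embedding_dict('glove', glove_path, lower_case=True)
    print('loaded %d vectors of dimension %d' % (len(embedd_dict), embedd_dim))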