import pickle
import numpy as np
from gensim.models import KeyedVectors
import gzip
import io
import os


def calc_mean_vec_for_lower_mapping(embedd_dict):
    """
    For every group of words that map to the same lowercased form, store the
    element-wise mean of their vectors under the lowercased key.
    """
    lower_counts = {}
    for word in embedd_dict:
        word_lower = word.lower()
        if word_lower not in lower_counts:
            lower_counts[word_lower] = [word]
        else:
            lower_counts[word_lower] = lower_counts[word_lower] + [word]
    # calculating the mean vector for all words that have the same mapping after lower()
    for word in lower_counts:
        embedd_dict[word] = np.mean([embedd_dict[word_] for word_ in lower_counts[word]], axis=0)
    return embedd_dict
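
# Illustrative example (a sketch, not part of the original module): given
#   {"Apple": np.array([1.0, 0.0]), "apple": np.array([0.0, 1.0])}
# calc_mean_vec_for_lower_mapping stores the element-wise mean [0.5, 0.5]
# under the lowercased key "apple", while the original "Apple" entry keeps
# its own vector.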


def load_embedding_dict(embedding, embedding_path, lower_case=False):
    """
    Load word embeddings from file.
    :param embedding: embedding type, one of [glove, fasttext, hellwig, one_hot, word2vec]
    :param embedding_path: path to the embedding file
    :param lower_case: if True, average the vectors of words that share a lowercased form
    :return: embedding dict, embedding dimension
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if lower_case:
        pkl_path = embedding_path + '_lower' + '.pkl'
    else:
        pkl_path = embedding_path + '.pkl'
    if os.path.isfile(pkl_path):
        # load dict and dim from a previously cached pickle file
        with open(pkl_path, 'rb') as f:
            embedd_dict, embedd_dim = pickle.load(f)
        print("num dimensions of word embeddings:", embedd_dim)
        return embedd_dict, embedd_dim
    if embedding == 'glove':
        # loading GloVe vectors: one word and its vector per line, space separated
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        # sanity check: report any vector whose dimension does not match
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for '%s': %d vs %d" % (k, len(v), embedd_dim))
    elif embedding == 'fasttext':
        # loading fastText vectors; same text format as GloVe plus a header line
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                # skip the first (header) line
                if i == 0:
                    continue
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for '%s': %d vs %d" % (k, len(v), embedd_dim))
    elif embedding == 'hellwig':
        # loading hellwig vectors; same text format as fastText, first line skipped
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                # skip the first line
                if i == 0:
                    continue
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for '%s': %d vs %d" % (k, len(v), embedd_dim))
    elif embedding == 'one_hot':
        # loading one-hot vectors; word and vector are separated by '@', first line skipped
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                # skip the first line
                if i == 0:
                    continue
                word, vec = line.split('@', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for '%s': %d vs %d" % (k, len(v), embedd_dim))
    elif embedding == 'word2vec':
        # loading word2vec binary vectors through gensim
        embedd_dict = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = embedd_dict.vector_size
        if lower_case:
            # KeyedVectors does not support item assignment, so copy the vectors
            # into a plain dict before averaging words that share a lowercased form
            words = getattr(embedd_dict, 'index_to_key', None) or embedd_dict.index2word
            embedd_dict = {w: embedd_dict[w] for w in words}
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
    else:
        raise ValueError("embedding should be one of [glove, fasttext, hellwig, one_hot, word2vec]")
print("num dimensions of word embeddings:", embedd_dim)
# save dict and dim to a pickle file
with open(pkl_path, 'wb') as f:
pickle.dump([embedd_dict, embedd_dim], f, pickle.HIGHEST_PROTOCOL)
return embedd_dict, embedd_dim |
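
# Minimal usage sketch (not part of the original module); the path below is a
# hypothetical placeholder and should point at a real embedding file.
if __name__ == "__main__":
    glove_path = "data/glove.6B.100d.txt"  # hypothetical example path
    embedd_dict, embedd_dim = load_embedding_dict('glove', glove_path, lower_case=True)
    print("loaded %d vectors of dimension %d" % (len(embedd_dict), embedd_dim))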