import pickle
import numpy as np
from gensim.models import KeyedVectors
import io
import os

def calc_mean_vec_for_lower_mapping(embedd_dict):
    """Add a lower-cased entry for every word, averaging the vectors of all
    words that map to the same lower-cased form (e.g. 'The' and 'the')."""
    lower_groups = {}
    for word in embedd_dict:
        lower_groups.setdefault(word.lower(), []).append(word)
    # per-dimension mean vector over all words that share the same lower-cased form
    for word_lower, words in lower_groups.items():
        embedd_dict[word_lower] = np.mean([embedd_dict[word_] for word_ in words], axis=0)
    return embedd_dict

def load_embedding_dict(embedding, embedding_path, lower_case=False):
    """
    load word embeddings from file
    :param embedding:
    :param embedding_path:
    :return: embedding dict, embedding dimention, caseless
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if lower_case:
        pkl_path = embedding_path + '_lower' + '.pkl'
    else:
        pkl_path = embedding_path + '.pkl'
    if os.path.isfile(pkl_path):
        # load dict and dim from a pickle file
        with open(pkl_path, 'rb') as f:
            embedd_dict, embedd_dim = pickle.load(f)
        print("num dimensions of word embeddings:", embedd_dim)
        return embedd_dict, embedd_dim

    if embedding == 'glove':
        # loading GloVe
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for word '%s': %d != %d" % (k, len(v), embedd_dim))

    elif embedding == 'fasttext':
        # loading fastText (the first line holds vocab size and dimension and is skipped)
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            # skip first line
            for i, line in enumerate(f):
                if i == 0:
                    continue
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for word '%s': %d != %d" % (k, len(v), embedd_dim))

    elif embedding == 'hellwig':
        # loading hellwig
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            # skip first line
            for i, line in enumerate(f):
                if i == 0:
                    continue
                word, vec = line.split(' ', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for word '%s': %d != %d" % (k, len(v), embedd_dim))

    elif embedding == 'one_hot':
        # loading one-hot vectors ('@'-separated word and vector)
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            # skip first line
            for i, line in enumerate(f):
                if i == 0:
                    continue
                word, vec = line.split('@', 1)
                embedd_dict[word] = np.fromstring(vec, sep=' ')
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print("dimension mismatch for word '%s': %d != %d" % (k, len(v), embedd_dim))

    elif embedding == 'word2vec':
        # loading word2vec (binary format) via gensim
        word_vectors = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = word_vectors.vector_size
        # convert to a plain dict so the lower-case averaging works the same way as for the
        # text formats above (uses the gensim 3.x .vocab attribute; key_to_index in gensim 4+)
        embedd_dict = {w: word_vectors[w] for w in word_vectors.vocab}
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)

    else:
        raise ValueError("embedding should be one of [glove, fasttext, hellwig, one_hot, word2vec]")

    print("num dimensions of word embeddings:", embedd_dim)
    # save dict and dim to a pickle file
    with open(pkl_path, 'wb') as f:
        pickle.dump([embedd_dict, embedd_dim], f, pickle.HIGHEST_PROTOCOL)
    return embedd_dict, embedd_dim
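

if __name__ == '__main__':
    # Minimal usage sketch (illustration only, not part of the training pipeline).
    # The path below is a placeholder for a local GloVe-style text file with one
    # "word v1 v2 ... vN" entry per line; adjust it to wherever your embeddings live.
    glove_path = 'data/glove.6B.100d.txt'  # hypothetical path
    embedd_dict, embedd_dim = load_embedding_dict('glove', glove_path, lower_case=True)
    print('loaded %d vectors of dimension %d' % (len(embedd_dict), embedd_dim))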