import torch
import numpy as np
import fasttext.util
from gensim import models


def load_word_embeddings(emb_file, vocab):
    """Load GloVe-style text embeddings, averaging token vectors for multi-word entries."""
    embeds = {}
    for line in open(emb_file, 'rb'):
        line = line.decode().strip().split(' ')
        wvec = torch.FloatTensor(list(map(float, line[1:])))
        embeds[line[0]] = wvec

    # For UT-Zappos class names (should account for everything).
    custom_map = {
        'Faux.Fur': 'fake_fur',
        'Faux.Leather': 'fake_leather',
        'Full.grain.leather': 'thick_leather',
        'Hair.Calf': 'hair_leather',
        'Patent.Leather': 'shiny_leather',
        'Nubuck': 'grainy_leather',
        'Boots.Ankle': 'ankle_boots',
        'Boots.Knee.High': 'knee_high_boots',
        'Boots.Mid-Calf': 'midcalf_boots',
        'Shoes.Boat.Shoes': 'boat_shoes',
        'Shoes.Clogs.and.Mules': 'clogs_shoes',
        'Shoes.Flats': 'flats_shoes',
        'Shoes.Heels': 'heels',
        'Shoes.Loafers': 'loafers',
        'Shoes.Oxfords': 'oxford_shoes',
        'Shoes.Sneakers.and.Athletic.Shoes': 'sneakers',
    }
    custom_map_vaw = {
        'selfie': 'photo',
    }

    E = []
    for k in vocab:
        if k in custom_map:
            print(f'Change {k} to {custom_map[k]}')
            k = custom_map[k]
        k = k.lower()
        if '_' in k:
            # Multi-word entry: average the embeddings of its tokens.
            toks = k.split('_')
            emb_tmp = torch.zeros(300).float()
            for tok in toks:
                if tok in custom_map_vaw:
                    tok = custom_map_vaw[tok]
                emb_tmp += embeds[tok]
            emb_tmp /= len(toks)
            E.append(emb_tmp)
        else:
            E.append(embeds[k])
    embeds = torch.stack(E)
    print('Loaded embeddings from file %s' % emb_file, embeds.size())
    return embeds


def load_fasttext_embeddings(emb_file, vocab):
    """Load fastText embeddings, averaging token vectors for multi-word entries."""
    custom_map = {
        'Faux.Fur': 'fake fur',
        'Faux.Leather': 'fake leather',
        'Full.grain.leather': 'thick leather',
        'Hair.Calf': 'hairy leather',
        'Patent.Leather': 'shiny leather',
        'Boots.Ankle': 'ankle boots',
        'Boots.Knee.High': 'kneehigh boots',
        'Boots.Mid-Calf': 'midcalf boots',
        'Shoes.Boat.Shoes': 'boatshoes',
        'Shoes.Clogs.and.Mules': 'clogs shoes',
        'Shoes.Flats': 'flats shoes',
        'Shoes.Heels': 'heels',
        'Shoes.Loafers': 'loafers',
        'Shoes.Oxfords': 'oxford shoes',
        'Shoes.Sneakers.and.Athletic.Shoes': 'sneakers',
        'traffic_light': 'traficlight',
        'trash_can': 'trashcan',
        'dry-erase_board': 'dry_erase_board',
        'black_and_white': 'black_white',
        'eiffel_tower': 'tower',
    }
    # Lowercase the map keys up front: the vocab is lowercased before lookup,
    # so the mixed-case Zappos keys (e.g. 'Faux.Fur') would otherwise never match.
    custom_map = {k.lower(): v for k, v in custom_map.items()}
    vocab = [custom_map.get(v.lower(), v.lower()) for v in vocab]

    ft = fasttext.load_model(emb_file)  # e.g. DATA_FOLDER + '/fast/cc.en.300.bin'
    embeds = []
    for k in vocab:
        if '_' in k:
            # Multi-word entry: average the embeddings of its tokens.
            ks = k.split('_')
            emb = np.stack([ft.get_word_vector(it) for it in ks]).mean(axis=0)
        else:
            emb = ft.get_word_vector(k)
        embeds.append(emb)
    embeds = torch.Tensor(np.stack(embeds))
    print('Fasttext Embeddings loaded, total embeddings: {}'.format(embeds.size()))
    return embeds


def load_word2vec_embeddings(emb_file, vocab):
    """Load word2vec embeddings, averaging token vectors for out-of-vocabulary phrases."""
    # vocab = [v.lower() for v in vocab]
    model = models.KeyedVectors.load_word2vec_format(
        emb_file, binary=True)  # e.g. DATA_FOLDER + '/w2v/GoogleNews-vectors-negative300.bin'

    custom_map = {
        'Faux.Fur': 'fake_fur',
        'Faux.Leather': 'fake_leather',
        'Full.grain.leather': 'thick_leather',
        'Hair.Calf': 'hair_leather',
        'Patent.Leather': 'shiny_leather',
        'Boots.Ankle': 'ankle_boots',
        'Boots.Knee.High': 'knee_high_boots',
        'Boots.Mid-Calf': 'midcalf_boots',
        'Shoes.Boat.Shoes': 'boat_shoes',
        'Shoes.Clogs.and.Mules': 'clogs_shoes',
        'Shoes.Flats': 'flats_shoes',
        'Shoes.Heels': 'heels',
        'Shoes.Loafers': 'loafers',
        'Shoes.Oxfords': 'oxford_shoes',
        'Shoes.Sneakers.and.Athletic.Shoes': 'sneakers',
        'traffic_light': 'traffic_light',
        'trash_can': 'trashcan',
        'dry-erase_board': 'dry_erase_board',
        'black_and_white': 'black_white',
        'eiffel_tower': 'tower',
    }

    embeds = []
    for k in vocab:
        if k in custom_map:
            k = custom_map[k]
        if '_' in k and k not in model:
            # Phrase missing from the model vocabulary: average its word vectors.
            ks = k.split('_')
            emb = np.stack([model[it] for it in ks]).mean(axis=0)
        else:
            emb = model[k]
        embeds.append(emb)
    embeds = torch.Tensor(np.stack(embeds))
    print('Word2Vec Embeddings loaded, total embeddings: {}'.format(embeds.size()))
    return embeds


def initialize_wordembedding_matrix(name, vocab):
    """
    Args:
        name: '+'-separated word embedding names, e.g. 'glove' or 'glove+word2vec'.
        vocab: list of attributes/objects.
    Returns:
        The stacked embedding matrix and its total dimensionality.
    """
    wordembs = name.split('+')

    result = None
    for wordemb in wordembs:
        if wordemb == 'glove':
            wordemb_ = load_word_embeddings('./utils/glove.6B.300d.txt', vocab)
        else:
            # Only GloVe is wired up here; anything else would silently reuse a
            # stale tensor (or raise NameError), so fail loudly instead.
            raise NotImplementedError(f'Unknown word embedding: {wordemb}')
        if result is None:
            result = wordemb_
        else:
            result = torch.cat((result, wordemb_), dim=1)
    dim = 300 * len(wordembs)
    return result, dim
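
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). Assumes the
# GloVe file './utils/glove.6B.300d.txt' (the path hard-coded above) exists
# and covers every token in the demo vocabulary. The fastText and word2vec
# loaders could be wired into initialize_wordembedding_matrix the same way,
# given their binary paths (cc.en.300.bin / GoogleNews-vectors-negative300.bin).
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    demo_vocab = ['red', 'leather', 'Boots.Knee.High']
    embs, dim = initialize_wordembedding_matrix('glove', demo_vocab)
    # 'Boots.Knee.High' is remapped to 'knee_high_boots' and its embedding is
    # the mean of the vectors for 'knee', 'high', and 'boots'.
    print(embs.shape, dim)  # expected: torch.Size([3, 300]) 300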