|
import torch |
|
import numpy as np |
|
import fasttext.util |
|
from gensim import models |
|
|
|
def load_word_embeddings(emb_file, vocab):
    """Load GloVe-style text embeddings for every word in `vocab`.

    Args:
        emb_file: path to a whitespace-separated text embedding file,
            one line per word: `<word> <v1> <v2> ...`.
        vocab: iterable of attribute/object names. Names listed in
            `custom_map` are first remapped; names containing `_` are
            embedded as the mean of their constituent tokens.

    Returns:
        torch.FloatTensor of shape (len(vocab), emb_dim).

    Raises:
        KeyError: if a (mapped, lowercased) vocab token is absent from
            the embedding file.
    """
    embeds = {}
    emb_dim = None
    # Use a context manager so the file handle is always closed
    # (the original left it open).
    with open(emb_file, 'rb') as fh:
        for raw in fh:
            parts = raw.decode().strip().split(' ')
            wvec = torch.FloatTensor(list(map(float, parts[1:])))
            embeds[parts[0]] = wvec
            if emb_dim is None:
                # Infer the dimension from the file instead of
                # hard-coding 300, so non-300d files also work.
                emb_dim = wvec.numel()

    # Dataset-specific renames (UT-Zappos style attribute/object names)
    # to tokens that actually exist in the embedding vocabulary.
    custom_map = {
        'Faux.Fur':'fake_fur', 'Faux.Leather':'fake_leather', 'Full.grain.leather':'thick_leather',
        'Hair.Calf':'hair_leather', 'Patent.Leather':'shiny_leather', 'Nubuck':'grainy_leather',
        'Boots.Ankle':'ankle_boots', 'Boots.Knee.High':'knee_high_boots', 'Boots.Mid-Calf':'midcalf_boots',
        'Shoes.Boat.Shoes':'boat_shoes', 'Shoes.Clogs.and.Mules':'clogs_shoes', 'Shoes.Flats':'flats_shoes',
        'Shoes.Heels':'heels', 'Shoes.Loafers':'loafers', 'Shoes.Oxfords':'oxford_shoes',
        'Shoes.Sneakers.and.Athletic.Shoes':'sneakers'}
    # Per-token renames applied only inside multi-word ('_'-joined) names.
    custom_map_vaw = {
        'selfie': 'photo'
    }

    E = []
    for k in vocab:
        if k in custom_map:
            print(f'Change {k} to {custom_map[k]}')
            k = custom_map[k]
        k = k.lower()
        if '_' in k:
            # Multi-word name: average the embeddings of its tokens.
            toks = k.split('_')
            emb_tmp = torch.zeros(emb_dim).float()
            for tok in toks:
                if tok in custom_map_vaw:
                    tok = custom_map_vaw[tok]
                emb_tmp += embeds[tok]
            emb_tmp /= len(toks)
            E.append(emb_tmp)
        else:
            E.append(embeds[k])

    embeds = torch.stack(E)
    print ('Loaded embeddings from file %s' % emb_file, embeds.size())

    return embeds
|
|
|
def load_fasttext_embeddings(emb_file, vocab):
    """Load fastText embeddings for every word in `vocab`.

    Args:
        emb_file: path to a binary fastText model (loaded via
            `fasttext.load_model`).
        vocab: iterable of attribute/object names. Names listed in
            `custom_map` are remapped first; names containing `_` are
            embedded as the mean of their constituent tokens.

    Returns:
        torch.Tensor of shape (len(vocab), ft_dim).
    """
    custom_map = {
        'Faux.Fur': 'fake fur',
        'Faux.Leather': 'fake leather',
        'Full.grain.leather': 'thick leather',
        'Hair.Calf': 'hairy leather',
        'Patent.Leather': 'shiny leather',
        'Boots.Ankle': 'ankle boots',
        'Boots.Knee.High': 'kneehigh boots',
        'Boots.Mid-Calf': 'midcalf boots',
        'Shoes.Boat.Shoes': 'boatshoes',
        'Shoes.Clogs.and.Mules': 'clogs shoes',
        'Shoes.Flats': 'flats shoes',
        'Shoes.Heels': 'heels',
        'Shoes.Loafers': 'loafers',
        'Shoes.Oxfords': 'oxford shoes',
        'Shoes.Sneakers.and.Athletic.Shoes': 'sneakers',
        # NOTE(review): 'traficlight' is likely a typo for 'trafficlight';
        # fastText subword lookup still returns a vector for it, so the
        # original spelling is preserved — confirm before changing.
        'traffic_light': 'traficlight',
        'trash_can': 'trashcan',
        'dry-erase_board' : 'dry_erase_board',
        'black_and_white' : 'black_white',
        'eiffel_tower' : 'tower'
    }

    # Bug fix: the original lowercased each vocab entry and then looked it
    # up in `custom_map`, whose mixed-case keys ('Faux.Fur', 'Boots.Ankle',
    # ...) could therefore never match. Compare against a lowercase-keyed
    # view of the map so every remapping actually fires.
    custom_map_lower = {key.lower(): val for key, val in custom_map.items()}
    vocab = [custom_map_lower.get(v.lower(), v.lower()) for v in vocab]

    ft = fasttext.load_model(emb_file)
    embeds = []
    for k in vocab:
        if '_' in k:
            # Multi-word name: average the fastText vectors of its tokens.
            ks = k.split('_')
            emb = np.stack([ft.get_word_vector(it) for it in ks]).mean(axis=0)
        else:
            emb = ft.get_word_vector(k)
        embeds.append(emb)

    embeds = torch.Tensor(np.stack(embeds))
    print('Fasttext Embeddings loaded, total embeddings: {}'.format(embeds.size()))
    return embeds
|
|
|
def load_word2vec_embeddings(emb_file, vocab):
    """Load word2vec embeddings for every word in `vocab`.

    Args:
        emb_file: path to a binary word2vec file readable by
            `gensim.models.KeyedVectors.load_word2vec_format`.
        vocab: iterable of attribute/object names. Names listed in
            `custom_map` are remapped first; a name containing `_` that
            is not itself in the model is embedded as the mean of its
            constituent tokens.

    Returns:
        torch.Tensor of shape (len(vocab), w2v_dim).
    """
    model = models.KeyedVectors.load_word2vec_format(emb_file, binary=True)

    # Dataset-specific renames to tokens present in the word2vec vocabulary.
    custom_map = {
        'Faux.Fur': 'fake_fur',
        'Faux.Leather': 'fake_leather',
        'Full.grain.leather': 'thick_leather',
        'Hair.Calf': 'hair_leather',
        'Patent.Leather': 'shiny_leather',
        'Boots.Ankle': 'ankle_boots',
        'Boots.Knee.High': 'knee_high_boots',
        'Boots.Mid-Calf': 'midcalf_boots',
        'Shoes.Boat.Shoes': 'boat_shoes',
        'Shoes.Clogs.and.Mules': 'clogs_shoes',
        'Shoes.Flats': 'flats_shoes',
        'Shoes.Heels': 'heels',
        'Shoes.Loafers': 'loafers',
        'Shoes.Oxfords': 'oxford_shoes',
        'Shoes.Sneakers.and.Athletic.Shoes': 'sneakers',
        'traffic_light': 'traffic_light',
        'trash_can': 'trashcan',
        'dry-erase_board' : 'dry_erase_board',
        'black_and_white' : 'black_white',
        'eiffel_tower' : 'tower'
    }

    def _vector(word):
        # Remap dataset-specific names, then look the word up directly;
        # fall back to averaging its '_'-separated tokens when the joined
        # form is absent from the model.
        word = custom_map.get(word, word)
        if '_' in word and word not in model:
            return np.stack([model[tok] for tok in word.split('_')]).mean(axis=0)
        return model[word]

    embeds = torch.Tensor(np.stack([_vector(w) for w in vocab]))
    print('Word2Vec Embeddings loaded, total embeddings: {}'.format(embeds.size()))
    return embeds
|
|
|
|
|
|
|
def initialize_wordembedding_matrix(name, vocab):
    """Build a word-embedding matrix for `vocab`.

    Args:
        name: '+'-separated word embedding names, e.g. 'glove' or
            'glove+glove'. (The original docstring said hyphen-separated
            'glove-word2vec-conceptnet', but the code splits on '+'.)
            Currently only 'glove' is supported.
        vocab: list of attributes/objects.

    Returns:
        Tuple of (embedding tensor of shape (len(vocab), dim), dim),
        where dim is 300 per requested embedding.

    Raises:
        ValueError: if `name` contains an unsupported embedding name
            (the original silently skipped it, which produced a wrong
            `dim` or a `None` result).
    """
    wordembs = name.split('+')
    result = None

    for wordemb in wordembs:
        if wordemb == 'glove':
            wordemb_ = load_word_embeddings('./utils/glove.6B.300d.txt', vocab)
        else:
            raise ValueError(f'Unsupported word embedding: {wordemb!r}')
        if result is None:
            result = wordemb_
        else:
            # Multiple embeddings are concatenated feature-wise.
            result = torch.cat((result, wordemb_), dim=1)

    dim = 300 * len(wordembs)
    return result, dim
|
|