import os
import zipfile

import numpy as np
import requests
import torch
import torch.nn.functional as F
from tqdm import tqdm

from .glove import GloVe

AD_DIRECTORY = os.path.dirname(__file__)
def obtain_vector(inputs, glove):
    """Look up a GloVe vector, retrying common casings before giving up."""
    vector_im = glove.embedding.get(inputs)
    if vector_im is None:
        vector_im = glove.embedding.get(inputs.lower())
    if vector_im is None:
        vector_im = glove.embedding.get(inputs.title())
    if vector_im is None:
        vector_im = glove.embedding.get(inputs.upper())
    return vector_im
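
# Minimal usage sketch for obtain_vector (hypothetical token; assumes the GloVe
# file has already been downloaded and parsed by the GloVe helper class):
#
#   glove = GloVe('./knowledge/glove.840B.300d.txt')
#   vec = obtain_vector("iPod", glove)  # retried as "ipod" / "Ipod" / "IPOD"
#   if vec is not None:
#       print(len(vec))                 # a 300-d embedding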
def generate_glove():
    print("Generating GloVe similarity...")
    # Download the pretrained GloVe vectors if they are not already cached.
    os.makedirs("./knowledge", exist_ok=True)
    glove_file = './knowledge/glove.840B.300d.txt'
    if not os.path.exists(glove_file):
        print("Downloading GloVe files...")
        print("")
        print("This may take a while: the archive is about 2 GB.")
        print("")
        url_path = "http://nlp.stanford.edu/data/glove.840B.300d.zip"
        filename = './knowledge/glove.840B.300d.zip'
        # Stream the download to disk in chunks rather than holding the whole
        # multi-gigabyte archive in memory (r.content would load it eagerly).
        with requests.get(url_path, stream=True) as r:
            r.raise_for_status()
            with open(filename, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        with zipfile.ZipFile(filename, 'r') as fz:
            fz.extractall('./knowledge')
        os.remove(filename)
    glove = GloVe('./knowledge/glove.840B.300d.txt')
    filepath = os.path.join(AD_DIRECTORY, "label_name.txt")
    vec_list_np = []
    index = 0
    # Each line of label_name.txt is a comma-separated list of synonyms, and
    # each synonym may be a single word or a multi-word phrase. We also tested
    # averaging every word uniformly, but we assume the last word of a phrase
    # (its head) matters most, so the final version gives the last word a
    # weight of 0.9 and averages the front words into the remaining 0.1.
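    # Illustrative sketch of that weighting (hypothetical phrase, not taken
    # from label_name.txt):
    #
    #   v("great white shark") = 0.1 * mean(v("great"), v("white")) + 0.9 * v("shark")
    #
    # and a line listing several comma-separated synonyms averages these
    # per-phrase vectors with equal weight.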
    with open(filepath) as label_file:
        for line in tqdm(label_file):
            a = line.strip('\n')
            b = a.split(',')
            cnt = 0
            vector = torch.zeros(300)
            # Average of the last words over all synonyms on this line; used as
            # a fallback when a phrase's own last word is out of vocabulary.
            vec_b_average = torch.zeros(300)
            cnt_b = 0
            for i in range(len(b)):
                b[i] = b[i].lstrip()
                c = b[i].split(' ')
                last_vec = obtain_vector(c[-1], glove)
                if last_vec is not None:
                    vec_b_average += last_vec
                    cnt_b += 1
            if cnt_b == 0:
                print('index', index, 'generating word_vector failed')
                continue
            vec_b_average = vec_b_average / cnt_b
            for i in range(len(b)):
                c = b[i].split(' ')
                # Reset the front-word accumulator for every phrase; the
                # original accumulated it across phrases, which looks like a
                # bug since cnt_f was reset per phrase but vec_front was not.
                vec_front = torch.zeros(300)
                cnt_f = 0
                for j in range(len(c) - 1):
                    front_vec = obtain_vector(c[j], glove)
                    if front_vec is not None:
                        vec_front += front_vec
                        cnt_f += 1
                vec_back = obtain_vector(c[-1], glove)
                if vec_back is None:
                    vec_back = vec_b_average
                if cnt_f == 0:
                    vector += vec_back
                else:
                    vector += (vec_front / cnt_f) * 0.1 + vec_back * 0.9
                cnt += 1
            vector = torch.div(vector, cnt)
            vec_list_np.append(np.array(vector))
            index += 1
    # Stack the per-label vectors and compute the full pairwise
    # cosine-similarity matrix in a single broadcasted call.
    vec_list_np_stacked = np.stack(vec_list_np)
    vec_list_torch = torch.from_numpy(vec_list_np_stacked)
    cos_similarity = F.cosine_similarity(vec_list_torch[None, :], vec_list_torch[:, None], dim=-1)
    cos_similarity = cos_similarity.numpy()
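    # Broadcasting note: vec_list_torch[None, :] has shape (1, N, 300) and
    # vec_list_torch[:, None] has shape (N, 1, 300), so cosine_similarity over
    # dim=-1 yields the full (N, N) pairwise matrix. A small self-contained
    # check with random data (illustrative only):
    #
    #   x = torch.randn(4, 300)
    #   s = F.cosine_similarity(x[None, :], x[:, None], dim=-1)  # shape (4, 4)
    #   assert torch.allclose(s.diagonal(), torch.ones(4), atol=1e-5)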
    np.save(os.path.join(AD_DIRECTORY, "imagenet_cos_similarity_glove"), cos_similarity)
    print("GloVe cos_similarity finished")
    return cos_similarity
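

# Minimal usage sketch. Because GloVe is imported relatively above, run this
# file as a module inside its package (e.g. `python -m <package>.<module>`;
# the package and module names are assumptions, not part of the original code).
if __name__ == "__main__":
    similarity = generate_glove()  # (N, N) numpy array of cosine similarities
    print("shape:", similarity.shape)
    print("top-left block:\n", similarity[:3, :3])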