|
|
|
|
|
"""Train per-language word-embedding matrices from bag-of-words counts and fixed document embeddings.

For each language, the model learns a word-embedding matrix E and a bias vector b
such that softmax(E @ a_d + b) matches the word distribution of document d, where
a_d is the (fixed) embedding of document d. Training maximizes the multinomial
log-likelihood of the counts with an L2 penalty on E.
"""

import argparse
|
import os |
|
import numpy as np |
|
import scipy
import scipy.sparse
|
import pickle |
|
from scipy.special import log_softmax |
|
from time import time |
|
from packaging import version |
|
import torch |
|
|
|
assert version.parse(scipy.__version__) >= version.parse( |
|
"1.7.0" |
|
), f"Requires scipy >= 1.7.0. Found {scipy.__version__}"
|
|
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') |
|
|
|
class Model(torch.nn.Module): |
|
"""Model defintion, parameters and helper fucntions to compute log-likelihood""" |
|
|
|
def __init__(self, vocab: dict, emb_dim: int): |
|
"""Initialize our model |
|
|
|
Args: |
|
            vocab: vocab size for each language, e.g. {'en': 25000, 'de': 25000}
            emb_dim: embedding dimension, will be the same across all languages
|
""" |
|
|
|
super().__init__() |
|
|
|
self.L = len(vocab) |
|
self.vocab = vocab |
|
self.emb_dim = emb_dim |
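        # Per-language parameters:
        #   E[lang]: (vocab_size, emb_dim) word-embedding matrix
        #   b[lang]: (vocab_size, 1) bias vector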
|
|
|
|
|
|
|
|
|
self.E = torch.nn.ParameterDict() |
|
|
|
|
|
|
|
|
|
self.b = torch.nn.ParameterDict() |
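        # Scale factors for the uniform initialization below:
        # n1 comes from the embedding dimension, n2 from each language's vocabulary size.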
|
|
|
n1 = 1.0 / np.sqrt(emb_dim) |
|
|
|
|
|
for lang, vocab_size in vocab.items(): |
|
n2 = 1.0 / np.sqrt(vocab_size) |
|
|
|
            # Create tensors on the device first, then wrap them in Parameter so that the
            # ParameterDict keeps registered Parameters (Parameter.to() may return a plain Tensor).
            e_init = torch.from_numpy(np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))).float()
            b_init = torch.from_numpy(np.random.randn(vocab_size, 1) * 0.0001).float()
            self.E[lang] = torch.nn.Parameter(e_init.to(device), requires_grad=True)
            self.b[lang] = torch.nn.Parameter(b_init.to(device), requires_grad=True)
|
|
|
def init_bias_with_log_unigram_dist(self, X, lang): |
|
"""We will initialize the bias vector with log of unigram distribution over vocabulary. |
|
This should help us with better initialization. |
|
|
|
b = \log (\sum_d x_d) / (\sum_d \sum_i x_{di}) |
|
""" |
|
|
|
|
|
        # add a small epsilon so that empty counts do not lead to log(0)
        if isinstance(X, np.ndarray):
            X = X + 1e-08
        else:
            X = X.toarray() + 1e-08
|
|
|
|
|
|
|
|
|
|
|
        # write the log-unigram values into the existing Parameter in-place,
        # so it stays registered in the ParameterDict and on the right device
        b_init = np.log(X.sum(axis=0) / X.sum()).reshape(-1, 1)
        with torch.no_grad():
            self.b[lang].copy_(torch.from_numpy(b_init))
|
|
|
def compute_log_thetas(self, lang: str, DE_lang: np.ndarray, sanity_check=False): |
|
"""Compute log of thetas, where theta_d is the unigram distribution over document `d` |
|
estiamted from the current params (word-embedding matrix, bias vector) and document embedding a_d. |
|
|
|
Args: |
|
---- |
|
lang (str): Language ID (eg: en, de, es ...) |
|
DE_lang (np.ndarray): Document embeddings of language |
|
""" |
|
|
|
|
|
mat = self.b[lang] + (self.E[lang].double() @ torch.from_numpy(DE_lang).double().to(device)) |
|
|
|
|
|
mat = mat.T |
|
|
|
|
|
|
|
|
|
|
|
        # detach and move to CPU before converting to numpy (numpy cannot read CUDA tensors)
        log_thetas = log_softmax(mat.detach().cpu().numpy(), axis=1)
|
|
|
if sanity_check: |
|
            n_docs = DE_lang.shape[1]  # DE_lang is (emb_dim, n_docs)
|
|
|
|
|
|
|
print( |
|
"Sanity check for log-thetas:", |
|
np.allclose(np.exp(log_thetas).sum(), n_docs), |
|
) |
|
|
|
return log_thetas |
|
|
|
def compute_log_likelihood(self, lang, DE_lang, X): |
|
"""Compute log-likelihood of the data, given the current parameters / embeddings |
|
|
|
        Each summation could be implemented with a for-loop, but that would be very slow.
        Since everything is stored in matrices and a sparse matrix, we do it via
        matrix multiplications and additions.
|
|
|
Args: |
|
lang: language ID (eg: en, es, fr) |
|
DE_lang: document embeddings for the given language |
|
X: doc-by-word counts in scipy.sparse format for a specific language |
|
|
|
Returns: |
|
float: log-likelihood of the data |
|
""" |
|
|
|
log_thetas = self.compute_log_thetas(lang, DE_lang) |
|
|
|
|
|
if isinstance(X, np.ndarray): |
|
llh = (X * log_thetas).sum() |
|
else: |
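            # For sparse X only the non-zero counts contribute to
            # sum_{d,i} x_{di} * log(theta_{di}), so iterate over them in COO format.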
|
|
|
|
|
|
|
coo = X.tocoo() |
|
|
|
row_ixs = torch.LongTensor(coo.row).to(device) |
|
col_ixs = torch.LongTensor(coo.col).to(device) |
|
data = torch.FloatTensor(coo.data).to(device) |
|
|
|
|
|
|
|
            log_thetas_tensor = torch.from_numpy(log_thetas).to(device)
|
|
|
llh = (log_thetas_tensor[row_ixs, col_ixs] * data).sum() |
|
|
|
|
|
        # return a plain float so the dense and sparse branches behave the same
        return float(llh)
|
|
|
|
|
def gradients_WE(model, lang, DE_lang, X, alpha): |
|
"""Gradient of the log-likelihood with-respect-to language-specific word embedding matrix `E` |
|
|
|
Args: |
|
model (Model): The object of the model |
|
lang (str): Language ID |
|
DE_lang: document embeddings for the given language |
|
X (scipy.sparse_matrix): The doc-by-word counts |
|
alpha (float): L2 reg. weight |
|
|
|
Returns: |
|
        np.ndarray: Gradient of the log-likelihood w.r.t. the word embeddings, i.e., grad of llh w.r.t. model.E[lang]
|
""" |
|
|
|
|
|
|
|
|
|
log_thetas = model.compute_log_thetas(lang, DE_lang) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    # tmp[d, i] = x_{di} - N_d * theta_{di}, where N_d is the total word count of document d
    tmp = np.asarray(
        X - np.multiply(X.sum(axis=1).reshape(-1, 1), np.exp(log_thetas))
    )

    m = model.E[lang].detach().cpu().numpy()

    # (DE_lang @ tmp).T has shape (vocab_size, emb_dim), matching model.E[lang];
    # the second term is the gradient of the L2 penalty
    ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * m)
|
|
|
|
|
|
|
|
|
return ef_grads |
|
|
|
|
|
def update_parameters(params, gradient, learning_rate): |
|
"""Update the parameters |
|
|
|
Args: |
|
        params (torch.nn.Parameter): Word embedding matrix or the document embedding matrix
        gradient (np.ndarray): Gradients of all word embeddings or document embeddings. Must have the same shape as params.
        learning_rate (float): The learning rate can also be seen as the step size, i.e., the size of the step taken
            along the direction of the gradient. Steps that are too big can overshoot our estimate, whereas steps that
            are too small make it take longer to reach the optimum.
|
|
|
Returns: |
|
        torch.nn.Parameter: the updated params
|
""" |
|
|
|
assert ( |
|
params.shape == gradient.shape |
|
    ), "The params and gradient must have the same shape, \
    ({:d}, {:d}) != ({:d}, {:d})".format(
|
*params.shape, *gradient.shape |
|
) |
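    # Gradient-ascent step: move the parameters along the direction of the gradient.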
|
|
|
    grad = torch.from_numpy(np.asarray(gradient)).to(params.device)
    new_params = torch.nn.Parameter(
        (params.detach() + learning_rate * grad).float(), requires_grad=True
    )
    return new_params
|
|
|
|
|
def train(model, bow, DE, args): |
|
"""Training scheme for the model""" |
|
|
|
print("\nTraining started ..") |
|
|
learning_rate = args.lr |
|
llh_0 = 0.0 |
|
for lang, X in bow.items(): |
|
llh_0 += model.compute_log_likelihood(lang, DE[lang].T, X) |
|
print(" Initial log-likelihood: {:16.2f}".format(llh_0)) |
|
|
|
llhs = [llh_0] |
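    # Training scheme: in every epoch, for each language, compute the gradient of the
    # log-likelihood w.r.t. that language's word-embedding matrix and take a
    # gradient-ascent step. The document embeddings DE are kept fixed.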
|
|
|
for i in range(1, args.epochs + 1): |
|
|
|
llh_ei = 0.0 |
|
for lang, X in bow.items(): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
grad_E = gradients_WE(model, lang, DE[lang].T, X, args.alpha) |
|
|
|
model.E[lang] = update_parameters(model.E[lang], grad_E, learning_rate) |
|
|
|
llh_ei += model.compute_log_likelihood(lang, DE[lang].T, X) |
|
|
|
|
|
|
|
|
|
|
print( |
|
"Epoch {:4d} / {:4d} | Log-likelihood: {:16.2f} | Learning rate: {:f}".format( |
|
i, args.epochs, llh_ei, learning_rate |
|
) |
|
) |
|
|
|
        if llh_ei < llhs[-1]:
            print(
                "The log-likelihood should improve after every epoch.",
                "Instead it decreased, which means the updates have overshot.",
                "Halving the learning_rate.",
            )
            learning_rate *= 0.5
|
|
|
|
|
llhs.append(llh_ei) |
|
|
|
|
|
|
|
        if i % 10 == 0:
            print("Reducing the learning rate by 10% every 10 epochs.")
            learning_rate -= learning_rate * 0.1
|
if i % 100 == 0: |
|
with open( |
|
os.path.join(args.out_dir, f"model_{args.alpha}_{i}.pkl"), "wb" |
|
) as fpw: |
|
pickle.dump(model, fpw) |
|
np.savetxt( |
|
os.path.join(args.out_dir, f"llh_{args.alpha}_{args.epochs}.txt"), |
|
np.asarray(llhs), |
|
) |
|
|
|
return model, llhs |
|
|
|
|
|
def main(): |
|
"""main""" |
|
|
|
args = parse_arguments() |
|
|
|
os.makedirs(args.out_dir, exist_ok=True) |
|
|
|
emb_dim = 0 |
|
|
|
doc_embs = {} |
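    # Each line of the key file is expected to look like: "<lang> <path-to-npy-file>"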
|
with open(args.input_embedding_key_file, "r") as fpr: |
|
for line in fpr: |
|
lang, fpath = line.strip().split() |
|
doc_embs[lang] = np.load(fpath) |
|
print("Loaded embeddings:", lang, doc_embs[lang].shape) |
|
|
|
if emb_dim == 0: |
|
emb_dim = doc_embs[lang].shape[1] |
|
|
|
|
|
bows = {} |
|
vocab = {} |
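    # Each line: "<lang> <path-to-npz-file>"; the loaded sparse matrix is docs-by-vocabulary.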
|
with open(args.input_bag_of_words_key_file, "r") as fpr: |
|
for line in fpr: |
|
lang, fpath = line.strip().split() |
|
bows[lang] = scipy.sparse.load_npz(fpath) |
|
print("Loaded bag-of-words:", lang, bows[lang].shape) |
|
|
|
vocab[lang] = bows[lang].shape[1] |
|
|
|
|
|
assert ( |
|
bows[lang].shape[0] == doc_embs[lang].shape[0] |
|
            ), "Number of docs in BoW ({:d}) != number of docs in embeddings ({:d}) for language: {:s}".format(
|
bows[lang].shape[0], doc_embs[lang].shape[0], lang |
|
) |
|
|
|
model = Model(vocab, emb_dim) |
|
model.to(device) |
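    # Initialize each language's bias vector from its unigram distribution before training.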
|
for lang, bow in bows.items(): |
|
model.init_bias_with_log_unigram_dist(bow, lang) |
|
|
|
print("Model params:") |
|
for lang in model.vocab: |
|
print(" ", lang, model.E[lang].shape, model.b[lang].shape) |
|
|
|
if args.resume: |
|
with open(args.resume, "rb") as fpr: |
|
model = pickle.load(fpr) |
|
|
|
|
|
model, llhs = train(model, bows, doc_embs, args) |
|
|
|
with open( |
|
os.path.join(args.out_dir, f"model_{args.alpha}_{args.epochs}.pkl"), "wb" |
|
) as fpw: |
|
pickle.dump(model, fpw) |
|
|
|
np.savetxt( |
|
os.path.join(args.out_dir, f"llh_{args.alpha}_{args.epochs}.txt"), |
|
np.asarray(llhs), |
|
) |
|
|
|
print("Saved in", args.out_dir) |
|
|
|
|
|
def parse_arguments(): |
|
parser = argparse.ArgumentParser( |
|
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter |
|
) |
|
|
|
parser.add_argument( |
|
"input_embedding_key_file", |
|
help="path to file that has paths to embeddings for each language", |
|
) |
|
|
|
parser.add_argument( |
|
"input_bag_of_words_key_file", help="path to input bag of words dictionary file" |
|
) |
|
|
|
parser.add_argument("out_dir", help="out dir to save the model/word embeddings") |
|
|
|
parser.add_argument("--epochs", type=int, default=100, help="number of epochs") |
|
parser.add_argument("--lr", type=float, default=0.0001, help="learning rate") |
|
parser.add_argument( |
|
"--alpha", type=float, default=1e-4, help="L2 reg. weight / weight decay" |
|
) |
|
|
|
parser.add_argument( |
|
"--resume", default="", help="path to trained model to resume training" |
|
) |
|
|
|
args = parser.parse_args() |
|
|
|
return args |
|
|
|
|
|
if __name__ == "__main__": |
|
main() |
|
|