# Page-header residue captured with the file ("Spaces: Runtime error"); not part of the program.
import heapq
import os
from collections import defaultdict

from gensim.models import Word2Vec
def load_all_models():
    '''
    Load the CBOW word2vec model of every time slice.

    Return: list of (time slice name, loaded model) tuples in the order
            archaic, classical, early_roman, hellen, late_roman
    '''
    slice_paths = (
        ('archaic', 'models/archaic_cbow.model'),
        ('classical', 'models/classical_cbow.model'),
        ('early_roman', 'models/early_roman_cbow.model'),
        ('hellen', 'models/hellen_cbow.model'),
        ('late_roman', 'models/late_roman_cbow.model'),
    )
    return [(slice_name, load_word2vec_model(path)) for slice_name, path in slice_paths]
def load_word2vec_model(model_path):
    '''
    Read a saved word2vec model from disk.

    model_path: path of the model file, handed to Word2Vec.load
    Return: the loaded model
    '''
    loaded_model = Word2Vec.load(model_path)
    return loaded_model
def get_word_vector(model, word):
    '''
    Look up the embedding of a single word.

    model: object whose ``wv`` attribute can be indexed by word
    word: the word to look up
    Return: whatever ``model.wv[word]`` yields (the word's vector)
    '''
    keyed_vectors = model.wv
    return keyed_vectors[word]
def iterate_over_words(model):
    '''
    Print every vocabulary word of a model with its index and vector.

    model: object exposing ``wv.key_to_index`` (mapping of word to index)
    '''
    # key_to_index already supplies each word's index, so no separate
    # counter is kept (the old one was overwritten by the loop variable).
    for word, index in model.wv.key_to_index.items():
        vector = get_word_vector(model, word)
        print(f'{index} Word: {word}, Vector: {vector}')
def model_dictionary(model):
    '''
    Build a word -> vector mapping for the whole vocabulary of a model.

    model: object exposing ``wv.key_to_index`` (its keys are the words)
    Return: defaultdict(list) keyed by word; looking up a word that is not
            in the vocabulary yields an empty list instead of raising
    '''
    # Named word_vectors rather than ``dict`` so the builtin is not shadowed.
    word_vectors = defaultdict(list)
    # Only the words are needed, so iterate the keys directly.
    for word in model.wv.key_to_index:
        word_vectors[word] = get_word_vector(model, word)
    return word_vectors
def dot_product(vector_a, vector_b):
    '''
    Compute the dot product of two vectors.

    Components are paired with zip, so any extra components of the longer
    vector are ignored.
    '''
    total = 0
    for component_a, component_b in zip(vector_a, vector_b):
        total += component_a * component_b
    return total
def magnitude(vector):
    '''
    Compute the Euclidean length of a vector
    (square root of the sum of its squared components).
    '''
    squared_total = 0
    for component in vector:
        squared_total += component ** 2
    return squared_total ** 0.5
def cosine_similarity(vector_a, vector_b):
    '''
    Compute the cosine similarity of two vectors.

    Return: dot product divided by the product of both magnitudes,
            or 0.0 when either vector has magnitude zero
    '''
    numerator = dot_product(vector_a, vector_b)
    length_a = magnitude(vector_a)
    length_b = magnitude(vector_b)
    # Only divide when both lengths are non-zero.
    if length_a != 0 and length_b != 0:
        return numerator / (length_a * length_b)
    return 0.0
def get_cosine_similarity(word1, word2, time_slice):
    '''
    Return the cosine similarity of two words within one time slice.

    word1, word2: the words to compare
    time_slice: model name; the model is read from models/<time_slice>.model
    Return: the cosine similarity, 0.0 when either word is not in the
            model's vocabulary, or None when the model file does not exist
    '''
    # TODO: tidy this up (e.g. avoid reloading the model on every call)
    model_path = f'models/{time_slice}.model'
    # Return if path does not exist
    if not os.path.exists(model_path):
        return None
    model = load_word2vec_model(model_path)
    # Look the two words up directly instead of copying the whole
    # vocabulary into a dictionary first.
    vocabulary = model.wv.key_to_index
    if word1 not in vocabulary or word2 not in vocabulary:
        # Same result as before: an unknown word behaved like an empty vector.
        return 0.0
    return cosine_similarity(
        get_word_vector(model, word1),
        get_word_vector(model, word2),
    )
def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
    '''
    Return the cosine similarity of one word in two different time slices.

    word: the word to compare with itself across time slices
    time_slice1, time_slice2: model names; each model is read from
                              models/<time_slice>.model
    Return: the cosine similarity, 0.0 when the word is missing from either
            vocabulary, or None when either model file does not exist
    '''
    # NOTE(review): this compares vectors taken from two separately loaded
    # models; presumably their embedding spaces are comparable - confirm.
    path1 = f'models/{time_slice1}.model'
    path2 = f'models/{time_slice2}.model'
    # Return if path does not exist
    if not os.path.exists(path1) or not os.path.exists(path2):
        return None
    model1 = load_word2vec_model(path1)
    model2 = load_word2vec_model(path2)
    # Look the word up directly instead of copying both vocabularies
    # into dictionaries first.
    if word not in model1.wv.key_to_index or word not in model2.wv.key_to_index:
        # Same result as before: an unknown word behaved like an empty vector.
        return 0.0
    return cosine_similarity(
        get_word_vector(model1, word),
        get_word_vector(model2, word),
    )
def get_nearest_neighbours(word, time_slice_model, n=10, models=None):
    '''
    Return the nearest neighbours of a word across several models.

    word: the word for which the nearest neighbours are calculated
    time_slice_model: the time slice of the input word; either a model name
                      (loaded from models/<name>.model) or an already
                      loaded word2vec model
    n: the number of nearest neighbours to return (default: 10)
    models: list of tuples with the name of the time slice and the word2vec
            model (default: None, meaning all models from load_all_models())

    Return: list of at most n tuples (word, time slice, cosine similarity),
            sorted by descending cosine similarity
    '''
    # Load the default models lazily. A call in the signature would run at
    # definition time, i.e. on import, and crash if a model file is missing.
    if models is None:
        models = load_all_models()

    if isinstance(time_slice_model, str):
        time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
    target_vector = get_word_vector(time_slice_model, word)

    def scored_candidates():
        # Yield (word, time slice, similarity) for every word of every model.
        for model_name, model in models:
            for candidate in model.wv.key_to_index:
                candidate_vector = get_word_vector(model, candidate)
                yield (
                    candidate,
                    model_name,
                    cosine_similarity(target_vector, candidate_vector),
                )

    # nlargest keeps only the n best candidates and returns them sorted
    # best-first; it also returns an empty list for n <= 0.
    return heapq.nlargest(n, scored_candidates(), key=lambda neighbour: neighbour[2])
def main():
    '''
    Demo entry point: print the 5 nearest neighbours of 'πατήρ' (taken from
    the archaic model) across all time slice models.
    '''
    models = load_all_models()

    # Pass the time slice by model name and the remaining arguments by
    # keyword; passing the models list positionally would land in ``n``.
    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5, models=models)
    print(nearest_neighbours)

    # Other ad-hoc checks that were used during development:
    #   model = load_word2vec_model('models/archaic_cbow.model')
    #   archaic_cbow_dict = model_dictionary(model)
    #   print(cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον']))
    #   print(get_word_vector(model, 'ἀνήρ'))
    #   iterate_over_words(model)


if __name__ == "__main__":
    main()