Spaces:

MarkdenOuden
/

Ancient_Greek_Word2Vec

Runtime error

File size: 6,351 Bytes

import heapq
import os
from collections import defaultdict

from gensim.models import Word2Vec


def load_all_models():
    '''
        Load the CBOW word2vec model of every time slice

        Return: list of (time slice name, model) tuples in the order
                archaic, classical, early_roman, hellen, late_roman
    '''
    time_slices = ('archaic', 'classical', 'early_roman', 'hellen', 'late_roman')

    # Each slice is stored as models/<name>_cbow.model
    return [
        (name, load_word2vec_model(f'models/{name}_cbow.model'))
        for name in time_slices
    ]


def load_word2vec_model(model_path):
    '''
        Read a saved word2vec model from disk

        model_path: path of the saved model file

        Return: whatever Word2Vec.load returns for that path
    '''
    loaded_model = Word2Vec.load(model_path)
    return loaded_model


def get_word_vector(model, word):
    '''
        Look up the vector stored for a word

        model: object whose wv attribute maps words to vectors
        word: the word to look up

        Return: model.wv[word]; a lookup failure is raised by model.wv itself
    '''
    keyed_vectors = model.wv
    return keyed_vectors[word]


def iterate_over_words(model):
    '''
        Print every word of the vocabulary with its index and its vector

        model: word2vec model whose vocabulary is printed

        The printed index is the one stored in model.wv.key_to_index.
    '''
    # key_to_index already supplies the index of each word, so no separate
    # counter is needed (the old manual counter was overwritten every iteration).
    for word, index in model.wv.key_to_index.items():
        vector = get_word_vector(model, word)
        print(f'{index} Word: {word}, Vector: {vector}')


def model_dictionary(model):
    '''
        Return the dictionary of the word2vec model
        Key is the word and value is the vector of the word

        The result is a defaultdict(list): looking up a word that is not in
        the vocabulary yields (and stores) an empty list instead of raising.
    '''
    # Named word_vectors rather than dict so the builtin is not shadowed
    word_vectors = defaultdict(list)

    # Only the words are needed here, so iterate the keys directly
    for word in model.wv.key_to_index:
        word_vectors[word] = get_word_vector(model, word)

    return word_vectors
    
    
def dot_product(vector_a, vector_b):
    '''
        Compute the dot product of two vectors

        The vectors are paired element by element with zip, so any surplus
        elements of the longer vector are ignored.
    '''
    total = 0
    for left, right in zip(vector_a, vector_b):
        total = total + left * right
    return total


def magnitude(vector):
    '''
        Compute the Euclidean length of a vector
        (square root of the sum of the squared components)
    '''
    sum_of_squares = 0
    for component in vector:
        sum_of_squares = sum_of_squares + component**2
    return sum_of_squares ** 0.5


def cosine_similarity(vector_a, vector_b):
    '''
        Compute the cosine similarity of two vectors

        Return: dot product divided by the product of both magnitudes,
                or 0.0 when either vector has magnitude zero
    '''
    numerator = dot_product(vector_a, vector_b)
    length_a = magnitude(vector_a)
    length_b = magnitude(vector_b)

    # A zero-length vector has no direction; report 0.0 rather than dividing by zero
    if length_a == 0 or length_b == 0:
        return 0.0

    return numerator / (length_a * length_b)


def get_cosine_similarity(word1, word2, time_slice):
    '''
        Return the cosine similarity of two words within one time slice

        word1, word2: the words to compare
        time_slice: name of the model file without extension; the model is
                    read from models/<time_slice>.model

        Return: the cosine similarity of both word vectors,
                0.0 if either word is not in the model's vocabulary,
                None if the model file does not exist
    '''
    # TODO: tidy this up
    model_path = f'models/{time_slice}.model'

    # Return if path does not exist
    if not os.path.exists(model_path):
        return None

    model = load_word2vec_model(model_path)

    # Look up only the two requested words instead of copying the whole
    # vocabulary into a dictionary first. An unknown word keeps producing 0.0,
    # which is what the empty-list fallback of model_dictionary resulted in.
    vocabulary = model.wv.key_to_index
    if word1 not in vocabulary or word2 not in vocabulary:
        return 0.0

    return cosine_similarity(get_word_vector(model, word1), get_word_vector(model, word2))


def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
    '''
        Return the cosine similarity of one word in two different time slices

        word: the word to compare with itself across time slices
        time_slice1, time_slice2: names of the model files without extension;
                                  the models are read from models/<name>.model

        Return: the cosine similarity of the word's two vectors,
                0.0 if the word is missing from either vocabulary,
                None if either model file does not exist
    '''
    # NOTE(review): comparing vectors of two separately loaded models presumably
    # requires their vector spaces to be aligned - confirm how the models were trained.
    model_path1 = f'models/{time_slice1}.model'
    model_path2 = f'models/{time_slice2}.model'

    # Return if path does not exist
    if not (os.path.exists(model_path1) and os.path.exists(model_path2)):
        return None

    model1 = load_word2vec_model(model_path1)
    model2 = load_word2vec_model(model_path2)

    # Look up the single word directly instead of copying both vocabularies
    # into dictionaries. An unknown word keeps producing 0.0, which is what the
    # empty-list fallback of model_dictionary resulted in.
    if word not in model1.wv.key_to_index or word not in model2.wv.key_to_index:
        return 0.0

    return cosine_similarity(get_word_vector(model1, word), get_word_vector(model2, word))


def get_nearest_neighbours(word, time_slice_model, n=10, models=None):
    '''
        Return the nearest neighbours of a word

        word: the word for which the nearest neighbours are calculated
        time_slice_model: the time slice of the input word, either as the name of
                          the model file without extension (read from
                          models/<name>.model) or as an already loaded word2vec model
        n: the number of nearest neighbours to return (default: 10)
        models: list of tuples with the name of the time slice and the word2vec model
                (default: None, meaning all models returned by load_all_models)

        Return: list of tuples with the word, the time slice and
                the cosine similarity of the nearest neighbours,
                sorted from most to least similar; empty when n <= 0

        The lookup of the input word is not guarded, so a word that is missing
        from time_slice_model raises whatever model.wv raises for it.
    '''
    # Load the default models on demand. Evaluating load_all_models() in the
    # signature ran it once at import time, so merely importing this module
    # required every model file to be present on disk.
    if models is None:
        models = load_all_models()

    # Accept a time slice name as well as a loaded model
    if isinstance(time_slice_model, str):
        time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
    vector_1 = get_word_vector(time_slice_model, word)

    def scored_candidates():
        # Yield (word, time slice, similarity) for every word of every model
        for model_name, model in models:
            for candidate in model.wv.key_to_index:
                vector_2 = get_word_vector(model, candidate)
                yield (candidate, model_name, cosine_similarity(vector_1, vector_2))

    # nlargest keeps only the n best candidates while scanning and returns them
    # already sorted by descending similarity
    return heapq.nlargest(n, scored_candidates(), key=lambda neighbour: neighbour[2])
    


def main():
    '''
        Demo: print the 5 nearest neighbours of 'πατήρ' (taken from the
        archaic model) among the words of all time slices
    '''
    models = load_all_models()

    # get_nearest_neighbours takes the time slice by name, and its third
    # positional parameter is n, so models has to be passed by keyword.
    # Passing it positionally together with n=5 raised a TypeError.
    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5, models=models)
    print(nearest_neighbours)


# Run the demo only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()