# Hugging Face Space: GroNLP/agalma (status: Sleeping) - file size: 14,312 bytes

from gensim.models import Word2Vec
from collections import defaultdict
import os
import tempfile
import pandas as pd
from collections import Counter


def load_all_models():
    '''
        Load the word2vec model of every time slice

        Returns:
        A list of (name, model) tuples in the order archaic, classical,
        early_roman, hellen, late_roman. Each model is read from
        models/<name>_cbow.model.
    '''
    time_slices = ('archaic', 'classical', 'early_roman', 'hellen', 'late_roman')

    loaded = []
    for name in time_slices:
        loaded.append((name, load_word2vec_model('models/' + name + '_cbow.model')))

    return loaded


def load_selected_models(selected_models):
    '''
        Load the word2vec models that were selected

        selected_models: a list of time slice names that should be loaded

        Returns:
        A list of [model_name, model] lists, where model_name is the
        lower-cased file stem ending in "_cbow" and the model is read from
        models/<model_name>.model.
    '''
    # Display names whose file stem is not simply the lower-cased name
    aliases = {
        "Early Roman": "early_roman",
        "Late Roman": "late_roman",
        "Hellenistic": "hellen",
    }

    loaded = []
    for selection in selected_models:
        file_stem = aliases.get(selection, selection).lower() + "_cbow"
        loaded.append([file_stem, load_word2vec_model(f'models/{file_stem}.model')])

    return loaded


def load_word2vec_model(model_path):
    '''
        Read a saved word2vec model from disk

        model_path: relative path to the model file

        Returns:
        The object produced by Word2Vec.load for that path.
    '''
    model = Word2Vec.load(model_path)
    return model


def get_word_vector(model, word):
    '''
        Look up the vector stored for a word

        model: word2vec model object (its .wv attribute is indexed by word)
        word: word whose vector is wanted

        Returns:
        The value of model.wv[word].
    '''
    keyed_vectors = model.wv
    return keyed_vectors[word]


def iterate_over_words(model):
    '''
        Iterate over all words in the vocabulary and print their vectors

        model: word2vec model object

        Prints one line per vocabulary entry in the form
        "<index> Word: <word>, Vector: <vector>", where <index> is the
        value stored for the word in model.wv.key_to_index.
    '''
    # The index comes straight from key_to_index; no separate counter is
    # needed (a manual one would be overwritten by the loop on every pass).
    for word, index in model.wv.key_to_index.items():
        vector = model.wv[word]
        print(f'{index} Word: {word}, Vector: {vector}')


def model_dictionary(model):
    '''
        Build a word -> vector mapping for a word2vec model

        model: word2vec model object

        Returns:
        A defaultdict(list) with one entry per word in
        model.wv.key_to_index. Because it is a defaultdict, looking up a
        word that is not in the vocabulary yields an empty list instead
        of raising KeyError.
    '''
    vectors_by_word = defaultdict(list)
    for word in model.wv.key_to_index:
        vectors_by_word[word] = get_word_vector(model, word)

    return vectors_by_word
    
    
def dot_product(vector_a, vector_b):
    '''
        Compute the dot product of two vectors

        vector_a: A sequence of numbers representing the first vector
        vector_b: A sequence of numbers representing the second vector

        Returns:
        A single number: the sum of the pairwise products. Pairing stops
        at the end of the shorter vector.
    '''
    total = 0
    for a, b in zip(vector_a, vector_b):
        total = total + a * b
    return total


def magnitude(vector):
    '''
        Compute the magnitude (Euclidean length) of a vector

        vector: A sequence of numbers representing the vector

        Returns:
        A single number: the square root of the sum of squared components.
    '''
    sum_of_squares = 0
    for component in vector:
        sum_of_squares = sum_of_squares + component**2
    return sum_of_squares ** 0.5


def cosine_similarity(vector_a, vector_b):
    '''
        Return the cosine similarity of two vectors

        vector_a: A sequence of numbers representing the first vector
        vector_b: A sequence of numbers representing the second vector

        Returns:
        A string holding the cosine similarity formatted to two decimals,
        e.g. "0.87". When either vector has zero magnitude (which includes
        an empty vector) the similarity is undefined and "0.00" is returned.
    '''
    mag_a = magnitude(vector_a)
    mag_b = magnitude(vector_b)

    # Avoid division by zero. The result is formatted like every other
    # path so callers always receive a string and never a mix of
    # float and str.
    if mag_a == 0 or mag_b == 0:
        return "{:.2f}".format(0.0)

    similarity = dot_product(vector_a, vector_b) / (mag_a * mag_b)
    return "{:.2f}".format(similarity)


def get_cosine_similarity(word1, time_slice_1, word2, time_slice_2):
    '''
        Return the cosine similarity of two words

        word1: The first word as a string.
        time_slice_1: The time slice for the first word as a string.
        word2: The second word as a string.
        time_slice_2: The time slice for the second word as a string.

        Returns:
        The result of cosine_similarity for the two word vectors, or None
        when the model file of either time slice does not exist.
    '''
    model_path_1 = f'models/{convert_time_name_to_model(time_slice_1)}.model'
    model_path_2 = f'models/{convert_time_name_to_model(time_slice_2)}.model'

    # Both model files are required; a missing second file is handled the
    # same way as a missing first one instead of failing inside the loader.
    if not os.path.exists(model_path_1) or not os.path.exists(model_path_2):
        return None

    model_1 = load_word2vec_model(model_path_1)
    model_2 = load_word2vec_model(model_path_2)

    dict_1 = model_dictionary(model_1)
    dict_2 = model_dictionary(model_2)

    return cosine_similarity(dict_1[word1], dict_2[word2])


def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
    '''
        Compare one word with itself across two time slices

        word: The word as a string.
        time_slice1: The first time slice as a string; used verbatim as
                     the file stem in models/<time_slice1>.model.
        time_slice2: The second time slice as a string; used the same way.

        Returns:
        The result of cosine_similarity for the word's vector in both
        models, or None when either model file does not exist.
    '''
    first_path = f'models/{time_slice1}.model'
    second_path = f'models/{time_slice2}.model'

    # Nothing to compare unless both model files are present
    if not (os.path.exists(first_path) and os.path.exists(second_path)):
        return None

    first_vectors = model_dictionary(load_word2vec_model(first_path))
    second_vectors = model_dictionary(load_word2vec_model(second_path))

    return cosine_similarity(first_vectors[word], second_vectors[word])



def validate_nearest_neighbours(word, n, models):
    '''
        Check that the inputs of the nearest neighbours function are filled in

        word: The word as a string.
        n: The number of nearest neighbours to find.
        models: A list of model names as strings.

        Returns:
        False when word or n equals the empty string or models equals the
        empty list; True otherwise.
    '''
    has_blank_input = word == '' or n == '' or models == []
    return not has_blank_input


def convert_model_to_time_name(model_name):
    '''
        Translate a model name into its time slice display name

        model_name: The model name as a string, with or without the
                    "_cbow" suffix (e.g. "hellen" or "hellen_cbow").

        Returns:
        The matching display name ("Archaic", "Classical", "Early Roman",
        "Hellenistic" or "Late Roman"), or None for any other input.
    '''
    display_names = {}
    for stem, label in (
        ('archaic', 'Archaic'),
        ('classical', 'Classical'),
        ('early_roman', 'Early Roman'),
        ('hellen', 'Hellenistic'),
        ('late_roman', 'Late Roman'),
    ):
        # Each slice is accepted under its bare stem and its file stem
        display_names[stem] = label
        display_names[stem + '_cbow'] = label

    return display_names.get(model_name)


def convert_time_name_to_model(time_name):
    '''
        Translate a time slice name

        time_name -- The time slice name as a string.

        Returns:
        For a display name ("Archaic", "Classical", "Early Roman",
        "Hellenistic", "Late Roman") the model file stem ending in "_cbow".
        For a lower-case short name ("archaic", "classical", "early_roman",
        "hellen", "late_roman") the display name instead, i.e. the mapping
        runs in the opposite direction for those inputs.
        None for any other input.
    '''
    translations = {
        # display name -> model file stem
        'Archaic': 'archaic_cbow',
        'Classical': 'classical_cbow',
        'Early Roman': 'early_roman_cbow',
        'Hellenistic': 'hellen_cbow',
        'Late Roman': 'late_roman_cbow',
        # short name -> display name
        'classical': 'Classical',
        'early_roman': 'Early Roman',
        'hellen': 'Hellenistic',
        'late_roman': 'Late Roman',
        'archaic': 'Archaic',
    }
    return translations.get(time_name)


def get_nearest_neighbours(target_word, n=10, models=load_all_models()):
    """
    Return the nearest neighbours of a word for the given models

    target_word: the word for which the nearest neighbours are calculated
    n: the number of nearest neighbours to return (default: 10); values of
       zero or less give empty neighbour lists
    models: sequence of (name, model) pairs with the name of the time slice
            and the word2vec model (default: all in ./models)

    NOTE: the default for `models` is evaluated once, when this function is
    defined, so all models are loaded at import time.

    The target word is not excluded from the scan, so it can appear in its
    own neighbour list.

    NOTE(review): the lookup of target_word is not guarded; presumably it
    raises KeyError when the word is missing from a model - confirm against
    the callers.

    Return: { 'model_name': [(word, cosine_similarity), ...], ... } with each
    list sorted from most to least similar.
    """
    limit = max(int(n), 0)
    nearest_neighbours = {}

    for entry in models:
        model_name = convert_model_to_time_name(entry[0])
        model = entry[1]
        target_vector = get_word_vector(model, target_word)

        # Score every vocabulary word against the target
        scored = [
            (word, cosine_similarity(target_vector, get_word_vector(model, word)))
            for word in model.wv.key_to_index
        ]

        # cosine_similarity hands back formatted text; rank on the numeric
        # value, because comparing the strings orders negative similarities
        # the wrong way round and fails when a float is mixed in.
        # The sort is stable, so equal scores keep vocabulary order.
        scored.sort(key=lambda pair: float(pair[1]), reverse=True)

        nearest_neighbours[model_name] = scored[:limit]

    return nearest_neighbours

        
    

def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
    '''
        Return the vectors of the nearest neighbours of a word

        word: the word for which the nearest neighbours are calculated
        time_slice_model: file stem of the word2vec model of the time slice
                          of the input word; the model is loaded from
                          models/<time_slice_model>.model
        n: the number of nearest neighbours to return (default: 15); values
           of zero or less give an empty list

        The input word is not excluded from the scan, so it can appear in
        its own neighbour list.

        Return: list of tuples with the word, the time slice, the vector, and the cosine similarity
                of the nearest neighbours, sorted from most to least similar
    '''
    model_name = convert_model_to_time_name(time_slice_model)
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    target_vector = get_word_vector(model, word)

    candidates = []
    for candidate in model.wv.key_to_index:
        candidate_vector = get_word_vector(model, candidate)
        cosine_sim = cosine_similarity(target_vector, candidate_vector)
        candidates.append((candidate, model_name, candidate_vector, cosine_sim))

    # cosine_similarity hands back formatted text; rank on the numeric value,
    # because comparing the strings orders negative similarities the wrong
    # way round and fails when a float is mixed in.
    # The sort is stable, so equal scores keep vocabulary order.
    candidates.sort(key=lambda item: float(item[3]), reverse=True)

    return candidates[:max(int(n), 0)]


def write_to_file(data):
    '''
        Write the data to a file

        data: the data to be written to the file; it is converted with str()

        Return: the path to the temporary file (created under /tmp with
                prefix "temp_" and suffix ".txt")
    '''
    # Create random tmp file name
    temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir="/tmp")

    # Write through the descriptor mkstemp already opened, and pin the
    # encoding so non-ASCII text does not depend on the locale
    with os.fdopen(temp_file_descriptor, 'w', encoding='utf-8') as temp_file:
        temp_file.write(str(data))

    return temp_file_path


def store_df_in_temp_file(all_dfs):
    '''
        Store the dataframes in a temporary file

        all_dfs: list of tuples with the name of the time slice and the dataframe

        Each dataframe is written to the single sheet "Results", preceded by
        a "Model: <name>" line and followed by blank rows before the next one.
        An empty all_dfs produces a workbook with an empty "Results" sheet.

        Return: the path to the temporary Excel file
    '''
    # Define directory for temporary files
    temp_dir = "./downloads/nn"

    # Create the directory if it doesn't exist
    os.makedirs(temp_dir, exist_ok=True)

    # Create random temporary file name; mkstemp also opens the file, so the
    # descriptor has to be closed here or it leaks on every call
    temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".xlsx", dir=temp_dir)
    os.close(temp_file_descriptor)

    # Create an ExcelWriter object
    with pd.ExcelWriter(temp_file_path, engine='xlsxwriter') as writer:
        # Create a new sheet
        worksheet = writer.book.add_worksheet('Results')

        start_row = 0
        for model, df in all_dfs:
            worksheet.write(start_row, 0, f"Model: {model}")
            df.to_excel(writer, sheet_name='Results', index=False, startrow=start_row + 1, startcol=0)
            start_row += df.shape[0] + 3  # Add some space between models

    return temp_file_path


def check_word_in_models(word):
    '''
        Find the models whose vocabulary contains a word

        word: the word to check

        Return: list with the converted name (via convert_time_name_to_model)
                of every loaded model where the word occurs
    '''
    return [
        convert_time_name_to_model(name)
        for name, model in load_all_models()
        if word in model.wv.key_to_index
    ]

    

def count_lemmas(directory):
    '''
        Count word occurrences in every .txt file of a directory

        directory: the directory containing the text files for the models

        Return: a dictionary where keys are model names derived from the
                file names (underscores become spaces, first letter
                capitalised, both words capitalised for two-word names)
                and values are Counters of the whitespace-separated words
    '''
    counts_by_model = {}

    for file in os.listdir(directory):
        # Only plain text files contribute
        if not file.endswith(".txt"):
            continue

        label = file.split('.')[0].replace('_', ' ').capitalize()
        tokens = label.split()
        if len(tokens) == 2:
            # Also capitalize second part of model name
            label = ' '.join(token.capitalize() for token in tokens)

        with open(os.path.join(directory, file), 'r', encoding='utf-8') as handle:
            counts_by_model[label] = Counter(handle.read().split())

    return counts_by_model