import pickle
import gzip
from word2vec import *


def get_unique_words(corpus_filename):
    """
    Get a list of unique words from a corpus file.

    The file is read as UTF-8, line by line; each line is stripped and split
    on whitespace.

    Args:
        corpus_filename: Path of the text corpus to read.

    Returns:
        A list of the distinct tokens found. The order is arbitrary because
        the tokens are collected in a set.
    """
    unique_words = set()
    with open(corpus_filename, 'r', encoding='utf-8') as file:
        for line in file:
            unique_words.update(line.strip().split())
    return list(unique_words)


def save_compressed_word_list(words, filename):
    """
    Save a list of words to a compressed file.

    Args:
        words: Object to store (pickled as-is).
        filename: Destination path; written as a gzip-compressed pickle.
    """
    with gzip.open(filename, 'wb') as file:
        pickle.dump(words, file)


def load_compressed_word_list(filename):
    """
    Load a list of words from a compressed file.

    Args:
        filename: Path of a gzip-compressed pickle file.

    Returns:
        Whatever object was pickled into the file.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by a trusted source (e.g. save_compressed_word_list).
    with gzip.open(filename, 'rb') as file:
        return pickle.load(file)


def get_autocomplete(input_word=" ", all_words=" "):
    """
    Get a list of words that start with the input word.

    Args:
        input_word: Prefix to match (case-sensitive).
        all_words: Iterable of candidate strings. The default is the string
            " ", which iterates as a single one-space candidate.

    Returns:
        A list of the candidates that start with ``input_word``, in the order
        they appear in ``all_words``.
    """
    return [word for word in all_words if word.startswith(input_word)]


def custom_sort(item):
    """
    Sort key that orders words alphabetically and places numbers last.

    Args:
        item: The word to build a key for.

    Returns:
        ``(2, item)`` for digit-only strings and ``(0, item.lower())`` for
        everything else, so non-numeric words sort case-insensitively before
        all numeric ones. Numeric strings are compared as text, so "10" sorts
        before "9".
    """
    if item.isdigit():
        return (2, item)  # Place numbers last
    return (0, item.lower())


def order_compressed_list(filename):
    """
    Order the compressed list of words alphabetically and put numbers at the end.

    Args:
        filename: Path of a gzip-compressed pickle file containing the words.

    Returns:
        A new list with the words sorted using ``custom_sort``. The file
        itself is not modified.
    """
    words = load_compressed_word_list(filename)
    return sorted(words, key=custom_sort)


def read_compressed_list(filename):
    """
    Read the compressed list of words and print it to standard output.

    Args:
        filename: Path of a gzip-compressed pickle file.
    """
    print(load_compressed_word_list(filename))


def word_in_models_dict(words_file):
    """
    Create a dictionary with words as keys and models in which the word occurs as values.

    Args:
        words_file: Path of a gzip-compressed pickle file containing the words.

    Returns:
        A dict mapping every word to a list of model names (as returned by
        ``convert_model_to_time_name``), in the order the models are yielded
        by ``load_all_models``. Words found in no model map to an empty list.
    """
    words = load_compressed_word_list(words_file)
    # NOTE(review): load_all_models() is defined outside this file; each entry
    # is indexed as entry[0] (passed to convert_model_to_time_name) and
    # entry[1] (an object exposing .wv.key_to_index) -- confirm against word2vec.
    models = load_all_models()

    # Initialize word_models dictionary with empty lists
    word_models = {word: [] for word in words}

    for model in models:
        model_name = convert_model_to_time_name(model[0])
        # Look up the vocabulary once per model instead of once per word.
        vocabulary = model[1].wv.key_to_index
        for word in words:
            if word in vocabulary:
                word_models[word].append(model_name)

    return word_models