|
import pickle |
|
import gzip |
|
from word2vec import * |
|
|
|
|
|
def get_unique_words(corpus_filename):
    """
    Collect the distinct whitespace-separated tokens of a corpus file.

    The file is opened as UTF-8 text and read line by line. The result is
    a list built from a set, so every token appears once and the order of
    the list is arbitrary.
    """
    seen = set()
    with open(corpus_filename, 'r', encoding='utf-8') as corpus:
        for raw_line in corpus:
            # split() without arguments already ignores leading/trailing
            # whitespace and collapses runs of whitespace.
            for token in raw_line.split():
                seen.add(token)
    return list(seen)
|
|
|
|
|
def save_compressed_word_list(words, filename):
    """
    Pickle ``words`` and write the result to a gzip-compressed file.

    The target file is opened in binary write mode, so any existing
    content at ``filename`` is replaced.
    """
    with gzip.open(filename, 'wb') as sink:
        pickle.dump(words, sink)
|
|
|
|
|
def load_compressed_word_list(filename):
    """
    Read a gzip-compressed pickle file and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only files from a
    trusted source should be passed in.
    """
    with gzip.open(filename, 'rb') as source:
        words = pickle.load(source)
    return words
|
|
|
|
|
def get_autocomplete(input_word=" ", all_words=" "):
    """
    Return the entries of ``all_words`` that begin with ``input_word``.

    Matching is case-sensitive and the matches keep the order in which
    they occur in ``all_words``.
    """
    matches = []
    for candidate in all_words:
        if candidate.startswith(input_word):
            matches.append(candidate)
    return matches
|
|
|
|
|
def custom_sort(item):
    """
    Build the sort key that places numeric words after all other words.

    Strings made up only of digits get rank 2 and keep their own text as
    the tie-breaker, so they are ordered as text ("10" sorts before "9").
    Every other string gets rank 0 and is compared by its lowercase form,
    which makes the ordering of words case-insensitive.

    The key function has no side effects: it does not print anything,
    so sorting a large list does not flood stdout.
    """
    if item.isdigit():
        return (2, item)
    return (0, item.lower())
|
|
|
|
|
def order_compressed_list(filename):
    """
    Load a compressed word list and return it sorted, numbers last.

    The gzip-compressed pickle at ``filename`` is read and its words are
    sorted with ``custom_sort`` as the key. A new sorted list is
    returned; the file itself is not modified.
    """
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)

    return sorted(words, key=custom_sort)
|
|
|
|
|
def read_compressed_list(filename):
    """
    Print the unpickled contents of a gzip-compressed file to stdout.

    Nothing is returned; the loaded object is only printed.
    """
    with gzip.open(filename, 'rb') as source:
        contents = pickle.load(source)
        print(contents)
|
|
|
|
|
def word_in_models_dict(words_file):
    """
    Map every word of a compressed word list to the models containing it.

    The words are unpickled from the gzip file ``words_file``. Each entry
    returned by ``load_all_models()`` is indexed as a pair: element 0 is
    turned into a name by ``convert_model_to_time_name`` and element 1 is
    queried through ``wv.key_to_index`` (presumably a gensim word2vec
    model; confirm against ``word2vec``).

    Returns a dict whose keys are the words and whose values are lists of
    the model names in which the word occurs, in the order the models
    were returned. Words found in no model map to an empty list.
    """
    with gzip.open(words_file, 'rb') as source:
        vocabulary = pickle.load(source)

    loaded_models = load_all_models()

    occurrences = {}
    for token in vocabulary:
        occurrences[token] = []

    for entry in loaded_models:
        label = convert_model_to_time_name(entry[0])
        for token in vocabulary:
            if token not in entry[1].wv.key_to_index:
                continue
            occurrences[token].append(label)

    return occurrences
|
|
|
|
|
|
|
|