agalma / autocomplete.py
Mark7549's picture
removed forms for first 2 tabs and used cache to make program faster
cdb0a70
raw
history blame
2.32 kB
import pickle
import gzip
from word2vec import *
def get_unique_words(corpus_filename):
    """
    Collect every distinct whitespace-separated token found in a corpus file

    The file is read line by line as UTF-8 text. The result is a list with
    no duplicates; its order comes from a set and is therefore arbitrary.
    """
    with open(corpus_filename, 'r', encoding='utf-8') as corpus:
        # split() without arguments already ignores leading/trailing whitespace
        distinct = {token for row in corpus for token in row.split()}
    return list(distinct)
def save_compressed_word_list(words, filename):
    """
    Pickle a list of words and write it to a gzip-compressed file

    words: the object to serialise (expected to be a list of words)
    filename: path of the gzip file to create or overwrite
    """
    archive = gzip.open(filename, 'wb')
    with archive:
        pickle.dump(words, archive)
def load_compressed_word_list(filename):
    """
    Read back a pickled word list from a gzip-compressed file

    Returns whatever object was pickled into the file.
    NOTE: unpickling can execute arbitrary code, so only open trusted files.
    """
    with gzip.open(filename, 'rb') as archive:
        words = pickle.load(archive)
    return words
def get_autocomplete(input_word=" ", all_words=" "):
    """
    Return the entries of all_words that begin with input_word

    Matching is case-sensitive and the input order is kept. Both defaults
    are a single-space string, so calling with no arguments iterates over
    the characters of " ".
    """
    matches = []
    for candidate in all_words:
        if candidate.startswith(input_word):
            matches.append(candidate)
    return matches
def custom_sort(item):
    """
    Sort key that orders words case-insensitively and places numbers last

    Returns a (group, text) tuple: digit-only strings get group 2 and keep
    their original text, everything else gets group 0 and is lower-cased.
    Because the text is compared as a string, numbers are ordered
    lexicographically (e.g. "10" sorts before "2").
    """
    # A key function must be free of side effects: it runs once per element
    # on every sort, so it must not write to stdout.
    if item.isdigit():
        return (2, item)  # Place numbers last
    return (0, item.lower())
def order_compressed_list(filename):
    """
    Order the compressed list of words alphabetically and put numbers at the end

    Loads the pickled word list from the gzip file at filename and returns a
    new list sorted with custom_sort as the key. The file itself is not
    modified.
    NOTE: unpickling can execute arbitrary code, so only open trusted files.
    """
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)

    # sorted() builds a new list; the loaded list is left untouched
    return sorted(words, key=custom_sort)
def read_compressed_list(filename):
    """
    Print the object stored in a gzip-compressed pickle file

    Intended for inspecting a saved word list; nothing is returned.
    """
    with gzip.open(filename, 'rb') as archive:
        contents = pickle.load(archive)
        print(contents)
def word_in_models_dict(words_file):
    """
    Map every word of a compressed word list to the models that contain it

    The word list is unpickled from the gzip file words_file. Each entry
    returned by load_all_models() is indexed as entry[0] (passed to
    convert_model_to_time_name to get the reported name) and entry[1]
    (a model whose wv.key_to_index is checked for the word).
    Words found in no model map to an empty list.
    """
    with gzip.open(words_file, 'rb') as archive:
        vocabulary = pickle.load(archive)

    all_models = load_all_models()

    # Start every word with its own empty list so absent words still appear
    occurrences = {}
    for token in vocabulary:
        occurrences[token] = []

    for entry in all_models:
        label = convert_model_to_time_name(entry[0])
        for token in vocabulary:
            if token not in entry[1].wv.key_to_index:
                continue
            occurrences[token].append(label)

    return occurrences