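# NOTE(review): "Spaces: Runtime error / Runtime error" appears to be page
# residue captured together with this file, not code; kept here as a comment.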
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from memory_profiler import profile
class Vocabulary:
    """Word-level lookups over a pre-computed vocabulary table.

    The table is read with ``pd.read_json`` from
    ``data/{subset_name}_vocab_v6.zip`` and kept in ``df_vocab``. The methods
    of this class read the columns ``word``, ``freq``, ``percentile``,
    ``splits`` and ``in_subset``.
    """

    # Lazily built ``word -> first row position`` map (``None`` = not built).
    # Declared at class level so the attribute exists on every instance.
    # NOTE: the map is built from ``df_vocab`` on first lookup and only reset
    # by ``__load``; code that swaps ``df_vocab`` by hand must reset it too.
    _word_index: Optional[Dict[Any, int]] = None

    def __init__(
        self,
        subset_name: str
    ) -> None:
        """Load the vocabulary table of ``subset_name``.

        Args:
            subset_name: Name used to build the dataset path
                ``data/{subset_name}_vocab_v6.zip``.
        """
        # Dataset info
        self.subset_name = subset_name
        self.ds_path = f"data/{subset_name}_vocab_v6.zip"

        # Pandas dataset
        self.df_vocab = None

        # Minimal list of (percentile, freq) tuples, enough to plot the word
        # distribution graph
        self.histogram = None

        # Load vocabulary dataset
        self.__load()

    def __contains__(
        self,
        word: str
    ) -> bool:
        """Return ``True`` when ``word`` appears in the ``word`` column."""
        return word in self._positions()

    def __load(
        self
    ) -> None:
        """Read the table from ``ds_path`` and rebuild ``histogram``."""
        print(f"Preparing {self.subset_name} vocabulary...")

        # --- Read vocab dataset from ds_path ---
        self.df_vocab = pd.read_json(self.ds_path)

        # Any previously built index would describe the old table.
        self._word_index = None

        # --- Create min histogram to plot the word distribution graph ---
        # One entry per distinct percentile. Built in row order, so when a
        # percentile repeats with different freqs the last row wins (the
        # result no longer depends on set iteration order).
        freq_by_percentile = dict(
            zip(
                self.df_vocab['percentile'].to_list(),
                self.df_vocab['freq'].to_list(),
            )
        )

        # Highest percentile first.
        self.histogram = sorted(
            freq_by_percentile.items(),
            key=lambda tup: tup[0],
            reverse=True
        )

    def _positions(
        self
    ) -> Dict[Any, int]:
        """Return the ``word -> row position`` map, building it on first use."""
        if self._word_index is None:
            index: Dict[Any, int] = {}
            for position, entry in enumerate(self.df_vocab['word'].to_list()):
                # setdefault keeps the FIRST occurrence of a repeated word,
                # which is what ``list.index`` would have returned.
                index.setdefault(entry, position)
            self._word_index = index
        return self._word_index

    def __getValue(
        self,
        word: str,
        feature: str
    ) -> Any:
        """Return the ``feature`` cell of ``word``'s row, or ``None``.

        Args:
            word: Word to look up in the ``word`` column.
            feature: Name of the column to read.

        Returns:
            The cell value, or ``None`` when ``word`` is not in the table.

        Raises:
            KeyError: If ``word`` is present but ``feature`` is not a column.
        """
        position = self._positions().get(word)
        if position is None:
            return None

        # Slice a single row and go through ``to_list`` so numeric cells come
        # back as plain Python scalars, without converting the whole column.
        return self.df_vocab[feature].iloc[position:position + 1].to_list()[0]

    def getFreq(
        self,
        word: str
    ) -> Optional[int]:
        """Return the ``freq`` value of ``word`` (``None`` if unknown)."""
        return self.__getValue(word, 'freq')

    def getPercentile(
        self,
        word: str
    ) -> Optional[float]:
        """Return the ``percentile`` value of ``word`` (``None`` if unknown)."""
        return self.__getValue(word, 'percentile')

    def getSplits(
        self,
        word: str
    ) -> Optional[List[str]]:
        """Return the ``splits`` value of ``word`` (``None`` if unknown)."""
        return self.__getValue(word, 'splits')

    def getSubsets(
        self,
        word: str
    ) -> Optional[Dict[str, int]]:
        """Return the ``in_subset`` value of ``word`` (``None`` if unknown)."""
        return self.__getValue(word, 'in_subset')

    def distribution(
        self
    ) -> Tuple:
        """Split ``histogram`` into parallel tuples.

        Returns:
            ``(x_values, y_values)``: the percentiles and their freqs, in
            histogram order. Two empty tuples when the histogram is empty.
        """
        if not self.histogram:
            # zip(*[]) yields nothing, so the unpacking below would fail.
            return (), ()

        x_values, y_values = zip(*self.histogram)
        return x_values, y_values

    def getWordNeighbors(
        self,
        word: str,
        n_neighbors: int = 20
    ) -> Tuple:
        """Return the words surrounding ``word`` in table order.

        Args:
            word: Word whose neighbourhood is requested.
            n_neighbors: Maximum number of rows taken on EACH side of
                ``word``; negative values are treated as 0.

        Returns:
            A tuple ``(dic, l, g)`` where ``dic`` maps every word of the
            window (preceding rows, ``word``, following rows) to its
            ``freq``, ``l`` lists the words of the following rows and ``g``
            the words of the preceding rows.
            NOTE(review): the original comments call these "greater than"
            and "less than", which presumably assumes the table is sorted
            by frequency -- confirm against the dataset.

        Raises:
            ValueError: If ``word`` is not in the vocabulary.
        """
        position = self._positions().get(word)
        if position is None:
            # Same exception type the former ``list.index`` lookup raised.
            raise ValueError(f"{word!r} is not in the vocabulary")

        span = max(0, n_neighbors)
        start = max(0, position - span)
        # +1 skips ``word`` itself, so up to ``span`` rows follow it. (The
        # former upper bound ``position + n_neighbors`` gave one row less
        # after the word than before it.)
        stop = position + 1 + span

        # Only the window is materialised, not the whole columns.
        window_words = self.df_vocab['word'].iloc[start:stop].to_list()
        window_freqs = self.df_vocab['freq'].iloc[start:stop].to_list()

        centre = position - start
        dic = dict(zip(window_words, window_freqs))
        preceding = window_words[:centre]
        following = window_words[centre + 1:]

        return dic, following, preceding