Spaces:
Runtime error
Runtime error
File size: 3,029 Bytes
743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 2d0d0c7 743fd42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from memory_profiler import profile
class Vocabulary:
    """Word vocabulary table loaded from a JSON dataset into a DataFrame.

    The table is read from ``data/{subset_name}_vocab_v6.zip`` and must
    provide the columns accessed below: ``word``, ``freq``, ``percentile``,
    ``splits`` and ``in_subset``.
    """

    # Lazily built ``word -> first row position`` lookup. Kept at class level
    # so the lookup helper works before any table has been indexed.
    _word_to_id: Optional[Dict[str, int]] = None

    def __init__(
        self,
        subset_name: str
    ) -> None:
        """Load the vocabulary of ``subset_name`` from disk.

        Args:
            subset_name: Name used to build the dataset path.
        """
        # Dataset info
        self.subset_name = subset_name
        self.ds_path = f"data/{subset_name}_vocab_v6.zip"

        # Pandas dataset
        self.df_vocab = None

        # Minimal list of (percentile, freq) tuples, enough to plot the
        # word distribution graph
        self.histogram = None

        # Load vocabulary dataset
        self.__load()

    def __contains__(
        self,
        word: str
    ) -> bool:
        """Return True when ``word`` is present in the ``word`` column."""
        try:
            return word in self.__wordIndex()
        except TypeError:
            # An unhashable query can never equal a vocabulary word.
            return False

    def __wordIndex(
        self
    ) -> Dict[str, int]:
        """Return the ``word -> row position`` mapping, building it once.

        Replaces converting the whole ``word`` column to a list and scanning
        it on every lookup.
        """
        if self._word_to_id is None:
            index: Dict[str, int] = {}
            for position, known_word in enumerate(self.df_vocab['word'].to_list()):
                # Keep the first position of a repeated word, which is what
                # ``list.index`` would report.
                index.setdefault(known_word, position)
            self._word_to_id = index
        return self._word_to_id

    def __load(
        self
    ) -> None:
        """Read the dataset at ``self.ds_path`` and derive the histogram."""
        print(f"Preparing {self.subset_name} vocabulary...")

        # --- Read vocab dataset ---
        self.df_vocab = pd.read_json(self.ds_path)

        # Drop any lookup table built for a previously loaded dataset.
        self._word_to_id = None

        # --- Create min histogram to plot the word distribution graph ---
        self.__buildHistogram()

    def __buildHistogram(
        self
    ) -> None:
        """Store one ``(percentile, freq)`` pair per distinct percentile.

        Pairs are sorted by percentile in descending order. When a percentile
        appears with several frequencies, the last row wins, so the result
        does not depend on set iteration order.
        """
        percentiles = self.df_vocab['percentile'].to_list()
        freqs = self.df_vocab['freq'].to_list()

        freq_by_percentile = dict(zip(percentiles, freqs))

        self.histogram = sorted(
            freq_by_percentile.items(),
            key=lambda tup: tup[0],
            reverse=True
        )

    def __getValue(
        self,
        word: str,
        feature: str
    ) -> Any:
        """Return the ``feature`` column value for ``word``.

        Args:
            word: Word to look up.
            feature: Name of the column to read.

        Returns:
            The value stored in the first row whose word equals ``word``,
            or ``None`` when the word is not in the vocabulary.
        """
        if word not in self:
            return None

        word_id = self.__wordIndex()[word]
        # Selecting a one-row slice and calling ``to_list`` yields Python
        # scalars rather than numpy ones.
        return self.df_vocab[feature].iloc[[word_id]].to_list()[0]

    def getFreq(
        self,
        word: str
    ) -> Optional[int]:
        """Return the ``freq`` value of ``word``, or ``None`` if unknown."""
        return self.__getValue(word, 'freq')

    def getPercentile(
        self,
        word: str
    ) -> Optional[float]:
        """Return the ``percentile`` value of ``word``, or ``None`` if unknown."""
        return self.__getValue(word, 'percentile')

    def getSplits(
        self,
        word: str
    ) -> Optional[List[str]]:
        """Return the ``splits`` value of ``word``, or ``None`` if unknown."""
        return self.__getValue(word, 'splits')

    def getSubsets(
        self,
        word: str
    ) -> Optional[Dict[str, int]]:
        """Return the ``in_subset`` value of ``word``, or ``None`` if unknown."""
        return self.__getValue(word, 'in_subset')

    def distribution(
        self
    ) -> Tuple[Tuple, Tuple]:
        """Return the histogram as two parallel tuples.

        Returns:
            ``(percentiles, freqs)``; two empty tuples when the histogram
            is empty or has not been built.
        """
        if not self.histogram:
            return (), ()

        x_values, y_values = zip(*self.histogram)
        return x_values, y_values

    def getWordNeighbors(
        self,
        word: str,
        n_neighbors: int = 20
    ) -> Tuple[Dict[str, int], List[str], List[str]]:
        """Return the rows surrounding ``word`` in table order.

        Args:
            word: Word whose neighbourhood is requested.
            n_neighbors: Maximum number of rows taken on each side of
                ``word``; negative values are treated as 0.

        Returns:
            A tuple ``(freq_by_word, following, preceding)``:
            ``freq_by_word`` maps every word in the window (``word``
            included) to its ``freq``; ``following`` lists the words in the
            rows after ``word``; ``preceding`` lists the words in the rows
            before it. Both lists are in table order.

        Raises:
            ValueError: If ``word`` is not in the vocabulary.
        """
        if word not in self:
            raise ValueError(f"{word!r} is not in the vocabulary")

        word_id = self.__wordIndex()[word]
        n_neighbors = max(0, n_neighbors)

        start = max(0, word_id - n_neighbors)
        # ``+ 1`` skips past the word itself so that the following side
        # holds ``n_neighbors`` rows, matching the preceding side.
        stop = word_id + 1 + n_neighbors

        # Only the window is converted, not the whole columns.
        window_words = self.df_vocab['word'].iloc[start:stop].to_list()
        window_freqs = self.df_vocab['freq'].iloc[start:stop].to_list()

        pivot = word_id - start
        freq_by_word = dict(zip(window_words, window_freqs))
        preceding = window_words[:pivot]
        following = window_words[pivot + 1:]

        return freq_by_word, following, preceding