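"""Vocabulary: a thin pandas wrapper around per-subset word-frequency datasets.

Loads a vocabulary file (data/<subset>_vocab_v6.zip) and exposes per-word
lookups (frequency, percentile, splits, subset membership) plus a
deduplicated (percentile, freq) histogram for plotting the word distribution.
"""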
import pandas as pd
from typing import List, Dict, Tuple, Optional

class Vocabulary:
    def __init__(
        self, 
        subset_name: str
    ) -> None:

        # Dataset info
        self.subset_name = subset_name
        self.ds_path = f"data/{subset_name}_vocab_v6.zip"
        
        # Pandas dataset
        self.df_vocab = None

        # Minimal list of (percentile, freq) tuples used to plot the word distribution graph
        self.histogram = None

        # Load vocabulary dataset
        self.__load()

    def __contains__(
        self, 
        word: str
    ) -> bool:

        # Vectorised membership test over the word column
        return bool((self.df_vocab['word'] == word).any())

    def __load(
        self
    ) -> None:

        print(f"Preparing {self.subset_name} vocabulary...")

        # --- Load vocab dataset (pandas infers the zip compression from the extension) ---
        self.df_vocab = pd.read_json(self.ds_path)

        # --- Build a minimal histogram for the word distribution graph ---
        x_values = self.df_vocab['percentile'].to_list()
        y_values = self.df_vocab['freq'].to_list()

        # Keep a single (percentile, freq) pair per percentile value
        uniques_tups = dict(zip(x_values, y_values))

        # Sort by percentile, descending
        self.histogram = sorted(
            uniques_tups.items(),
            key=lambda tup: tup[0],
            reverse=True
        )
        
    def __getValue(
        self, 
        word: str, 
        feature: str
    ):
        value = None

        # Locate the word's row; None is returned for unknown words
        if word in self:
            word_id = self.df_vocab['word'].to_list().index(word)
            value = self.df_vocab[feature].to_list()[word_id]

        return value

    def getFreq(
        self, 
        word: str
    ) -> Optional[int]:

        return self.__getValue(word, 'freq')

    def getPercentile(
        self, 
        word: str
    ) -> Optional[float]:

        return self.__getValue(word, 'percentile')

    def getSplits(
        self, 
        word: str
    ) -> Optional[List[str]]:

        return self.__getValue(word, 'splits')
    
    def getSubsets(
        self, 
        word: str
    ) -> Optional[Dict[str, int]]:

        return self.__getValue(word, 'in_subset')

    def distribution(
        self
    ) -> Tuple:

        x_values, y_values = zip(*self.histogram)
        return x_values, y_values
     
    def getWordNeighbors(
        self, 
        word: str, 
        n_neighbors: int = 20
    ) -> Tuple:

        words = self.df_vocab['word'].to_list()
        freqs = self.df_vocab['freq'].to_list()
        word_id = words.index(word)                          # raises ValueError for unknown words
        l_sorted = list(zip(words, freqs))

        g = l_sorted[max(0, word_id-n_neighbors):word_id]    # n_neighbors entries preceding the word
        e = l_sorted[word_id]                                # the word itself
        l = l_sorted[word_id+1:word_id+1+n_neighbors]        # n_neighbors entries following the word

        dic = dict(g + [e] + l)
        l = [x[0] for x in l]
        g = [x[0] for x in g]

        return dic, l, g
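
# ----------------------------------------------------------------------
# Usage sketch. The subset name "train" and the probe word "example" are
# assumptions for illustration; any subset with a matching
# data/<subset>_vocab_v6.zip file works the same way.
# ----------------------------------------------------------------------
if __name__ == "__main__":
    vocab = Vocabulary("train")

    word = "example"
    if word in vocab:                       # __contains__
        print(f"freq:       {vocab.getFreq(word)}")
        print(f"percentile: {vocab.getPercentile(word)}")
        print(f"splits:     {vocab.getSplits(word)}")
        print(f"subsets:    {vocab.getSubsets(word)}")

        # Words adjacent to `word` in the vocabulary ordering
        neighbors, following, preceding = vocab.getWordNeighbors(word, n_neighbors=5)
        print(f"neighborhood: {neighbors}")

    # (percentile, freq) series ready to plot
    x_values, y_values = vocab.distribution()
    print(f"{len(x_values)} unique percentile points")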