nanom committed
Commit 2d0d0c7
1 Parent(s): 4b0aadc

Code correction and typing added

app.py CHANGED
@@ -1,26 +1,32 @@
 # --- Imports modules ---
 from modules.module_vocabulary import Vocabulary
 
+
 # --- Imports interfaces ---
 from interfaces.interface_datos import interface as interface_datos
 
+
 # --- Tool config ---
-AVAILABLE_LOGS = True # [True | False]
-LANGUAGE = "spanish" # [spanish]
-VOCABULARY_SUBSET = "full" # [full]
 # ToDo: Change the owner of the context dataset from nanom to vialibre
 CONTEXTS_DATASET = "nanom/splittedspanish3bwc"
+AVAILABLE_WORDCLOUD = False # [True | False]
+AVAILABLE_LOGS = True # [True | False]
+LANGUAGE = "spanish" # [spanish]
+VOCABULARY_SUBSET = "full" # [full]
+
 
 # --- Init classes ---
 vocabulary = Vocabulary(
     subset_name=VOCABULARY_SUBSET
 )
 
+
 # --- Main App ---
 iface = interface_datos(
     vocabulary=vocabulary,
     contexts=CONTEXTS_DATASET,
     available_logs=AVAILABLE_LOGS,
+    available_wordcloud=AVAILABLE_WORDCLOUD,
     lang=LANGUAGE
 )
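Note: the new AVAILABLE_WORDCLOUD flag travels from this config block into interface_datos, where it sets the initial visibility of the word-cloud plot. A condensed, runnable sketch of that pattern (assumes gradio 3.x; the component names here are illustrative, not the app's):

import gradio as gr

AVAILABLE_WORDCLOUD = False  # [True | False], same convention as the config above

def build(available_wordcloud: bool) -> gr.Blocks:
    with gr.Blocks() as demo:
        # A component created with visible=False is laid out but hidden,
        # so the flag disables the word cloud without further code changes.
        dist_plot = gr.Plot(show_label=False)
        wc_plot = gr.Plot(show_label=False, visible=available_wordcloud)
    return demo

demo = build(AVAILABLE_WORDCLOUD)
# demo.launch()
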
interfaces/interface_datos.py CHANGED
@@ -4,7 +4,14 @@ from tool_info import TOOL_INFO
 import gradio as gr
 import pandas as pd
 
-def interface(vocabulary, contexts, available_logs, lang="spanish"):
+
+def interface(
+    vocabulary,                 # Vocabulary class instance
+    contexts: str,
+    available_logs: bool,
+    available_wordcloud: bool,
+    lang: str="spanish"
+) -> gr.Blocks:
 
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
@@ -12,58 +19,112 @@ def interface(vocabulary, contexts, available_logs, lang="spanish"):
     )
 
     # --- Init Class ---
-    connector = Word2ContextExplorerConnector(vocabulary=vocabulary, context=contexts)
-    labels = pd.read_json(f"language/{lang}.json")["DataExplorer_interface"]
+    connector = Word2ContextExplorerConnector(
+        vocabulary=vocabulary,
+        context=contexts
+    )
+
+    # --- Load language ---
+    labels = pd.read_json(
+        f"language/{lang}.json"
+    )["DataExplorer_interface"]
 
     # --- Interface ---
-    iface = gr.Blocks(css=".container { max-width: 90%; margin: auto;}")
+    iface = gr.Blocks(
+        css=".container { max-width: 90%; margin: auto;}"
+    )
 
     with iface:
         with gr.Row():
             with gr.Column():
                 with gr.Group():
-                    gr.Markdown(labels["step1"])
-                    with gr.Row(): input_word = gr.Textbox(label=labels["inputWord"]["title"],
-                                                           show_label=False,
-                                                           placeholder=labels["inputWord"]["placeholder"])
-                    with gr.Row(): btn_get_w_info = gr.Button(labels["wordInfoButton"])
+                    gr.Markdown(
+                        value=labels["step1"]
+                    )
+                    with gr.Row():
+                        input_word = gr.Textbox(
+                            label=labels["inputWord"]["title"],
+                            show_label=False,
+                            placeholder=labels["inputWord"]["placeholder"]
+                        )
+                    with gr.Row():
+                        btn_get_w_info = gr.Button(
+                            value=labels["wordInfoButton"]
+                        )
 
                 with gr.Group():
-                    gr.Markdown(labels["step2"])
-                    n_context = gr.Slider(label="",
-                                          step=1, minimum=1, maximum=30, value=5,
-                                          visible=True, interactive=True)
+                    gr.Markdown(
+                        value=labels["step2"]
+                    )
+                    n_context = gr.Slider(
+                        label="",
+                        step=1, minimum=1, maximum=30, value=5,
+                        visible=True,
+                        interactive=True
+                    )
                 with gr.Group():
-                    gr.Markdown(labels["step3"])
-                    subsets_choice = gr.CheckboxGroup(label="",
-                                                      interactive=True,
-                                                      visible=True)
-                    with gr.Row(): btn_get_contexts = gr.Button(labels["wordContextButton"], visible=True)
+                    gr.Markdown(
+                        value=labels["step3"]
+                    )
+                    subsets_choice = gr.CheckboxGroup(
+                        label="",
+                        interactive=True,
+                        visible=True
+                    )
+                    with gr.Row():
+                        btn_get_contexts = gr.Button(
+                            value=labels["wordContextButton"],
+                            visible=True
+                        )
 
-                with gr.Row(): out_msj = gr.Markdown(label="", visible=True)
+                with gr.Row():
+                    out_msj = gr.Markdown(
+                        label="",
+                        visible=True
+                    )
 
             with gr.Column():
                 with gr.Group():
-                    gr.Markdown(labels["wordDistributionTitle"])
-                    dist_plot = gr.Plot(label="", show_label=False)
-                    # Set visibility to "true" if you want to see cloud of related words by frequency
-                    wc_plot = gr.Plot(label="", show_label=False, visible=False)
+                    gr.Markdown(
+                        value=labels["wordDistributionTitle"]
+                    )
+                    dist_plot = gr.Plot(
+                        label="",
+                        show_label=False
+                    )
+                    wc_plot = gr.Plot(
+                        label="",
+                        show_label=False,
+                        visible=available_wordcloud
+                    )
 
                 with gr.Group():
-                    gr.Markdown(labels["frequencyPerSetTitle"])
-                    subsets_freq = gr.HTML(label="")
+                    gr.Markdown(
+                        value=labels["frequencyPerSetTitle"]
+                    )
+                    subsets_freq = gr.HTML(
+                        label=""
+                    )
 
         with gr.Row():
             with gr.Group():
-                with gr.Row(): gr.Markdown(labels["contextList"])
-                with gr.Row(): out_context = gr.Dataframe(label="",
-                                                          interactive=False,
-                                                          value=pd.DataFrame([], columns=['']),
-                                                          wrap=True,
-                                                          datatype=['str','markdown','str','markdown'])
+                with gr.Row():
+                    gr.Markdown(
+                        value=labels["contextList"]
+                    )
+                with gr.Row():
+                    out_context = gr.Dataframe(
+                        label="",
+                        interactive=False,
+                        value=pd.DataFrame([], columns=['']),
+                        wrap=True,
+                        datatype=['str','markdown','str','markdown']
+                    )
 
             with gr.Group():
-                gr.Markdown(TOOL_INFO)
+                gr.Markdown(
+                    value=TOOL_INFO
+                )
 
     btn_get_w_info.click(
         fn=connector.get_word_info,
@@ -73,10 +134,11 @@ def interface(vocabulary, contexts, available_logs, lang="spanish"):
             subsets_freq,
             dist_plot,
             wc_plot,
-            subsets_choice]
+            subsets_choice
+        ]
     )
 
-    btn_get_contexts.click(
+    btn_get_contexts.click(
         fn=connector.get_word_context,
         inputs=[input_word, n_context, subsets_choice],
         outputs=[out_msj, out_context]
@@ -84,13 +146,16 @@ def interface(vocabulary, contexts, available_logs, lang="spanish"):
 
     # --- Logs ---
     save_field = [input_word, subsets_choice]
-    log_callback.setup(components=save_field, flagging_dir="edia_datos_es")
+    log_callback.setup(
+        components=save_field,
+        flagging_dir=f"edia_datos_{lang}"
+    )
 
     btn_get_contexts.click(
         fn=lambda *args: log_callback.flag(
-            flag_data=args,
-            flag_option="datos",
-            username="vialibre"
+            flag_data=args,
+            flag_option="datos",
+            username="vialibre"
         ),
         inputs=save_field,
         outputs=None,
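
Note: two behavioral details in this file beyond the reformatting. First, wc_plot visibility is now driven by the new available_wordcloud parameter instead of a hardcoded visible=False. Second, the flagging directory is derived from lang, so the default lang="spanish" now logs under edia_datos_spanish rather than the previous hardcoded edia_datos_es. A reduced sketch of the click-to-flag wiring with a stand-in logger (StubLogger is hypothetical; the real class is HuggingFaceDatasetSaver from module_logsManager, and gradio 3.x is assumed):

import gradio as gr

class StubLogger:
    # Stand-in for HuggingFaceDatasetSaver: prints instead of pushing to HF.
    def setup(self, components, flagging_dir: str) -> None:
        self.flagging_dir = flagging_dir
    def flag(self, flag_data, flag_option=None, username=None) -> int:
        print(f"[{self.flagging_dir}] {flag_option} / {username}: {flag_data}")
        return 0

log_callback = StubLogger()
lang = "spanish"

with gr.Blocks() as demo:
    input_word = gr.Textbox(label="word")
    btn_get_contexts = gr.Button("search")
    save_field = [input_word]
    log_callback.setup(components=save_field, flagging_dir=f"edia_datos_{lang}")

    # Same pattern as the commit: the search click also fires a flag() call.
    btn_get_contexts.click(
        fn=lambda *args: log_callback.flag(
            flag_data=args, flag_option="datos", username="vialibre"
        ),
        inputs=save_field,
        outputs=None,
    )
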
modules/module_connection.py CHANGED
@@ -1,37 +1,64 @@
+from modules.module_word2Context import Word2Context
+from typing import List, Tuple
 import pandas as pd
 import gradio as gr
 from abc import ABC
-from modules.module_word2Context import Word2Context
 
 class Connector(ABC):
-    def parse_word(self, word : str):
+    def parse_word(
+        self,
+        word: str
+    ) -> str:
+
         return word.lower().strip()
 
-    def parse_words(self, array_in_string : str):
+    def parse_words(
+        self,
+        array_in_string: str
+    ) -> List[str]:
+
         words = array_in_string.strip()
         if not words:
             return []
-        words = [self.parse_word(word) for word in words.split(',') if word.strip() != '']
+        words = [
+            self.parse_word(word)
+            for word in words.split(',') if word.strip() != ''
+        ]
         return words
 
-    def process_error(self, err: str):
-        if err is None:
-            return
-        return "<center><h3>" + err + "</h3></center>"
+    def process_error(
+        self,
+        err: str
+    ) -> str:
 
+        if err:
+            err = "<center><h3>" + err + "</h3></center>"
+        return err
 
 class Word2ContextExplorerConnector(Connector):
-    def __init__(self, **kwargs):
+    def __init__(
+        self,
+        **kwargs
+    ) -> None:
+
         vocabulary = kwargs.get('vocabulary', None)
         context = kwargs.get('context', None)
 
         if vocabulary is None and context is None:
             raise KeyError
-        self.word2context_explorer = Word2Context(context, vocabulary)
 
-    def get_word_info(self, word):
+        self.word2context_explorer = Word2Context(
+            context,    # Context dataset HF name | path
+            vocabulary  # Vocabulary class instance
+        )
+
+    def get_word_info(
+        self,
+        word: str
+    ) -> Tuple:
+
         err = ""
-        contexts = pd.DataFrame([],columns=[''])
+        contexts = pd.DataFrame([], columns=[''])
         subsets_info = ""
         distribution_plot = None
         word_cloud_plot = None
@@ -53,7 +80,13 @@ class Word2ContextExplorerConnector(Connector):
 
         return self.process_error(err), contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
 
-    def get_word_context(self, word, n_context, subset_choice):
+    def get_word_context(
+        self,
+        word: str,
+        n_context: int,
+        subset_choice: List[str]
+    ) -> Tuple:
+
         word = self.parse_word(word)
         n_context = int(n_context)
         err = ""
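
Note: process_error changed behavior, not just shape. The old version returned None when err was None and otherwise wrapped even an empty string in the heading markup; the new one wraps only a non-empty message and passes falsy input through unchanged, so downstream components receive "" instead of None. The committed logic, as a standalone function:

def process_error(err: str) -> str:
    # Wrap only a non-empty message; "" (no error) passes through as-is.
    if err:
        err = "<center><h3>" + err + "</h3></center>"
    return err

assert process_error("") == ""
assert process_error("Word not found") == "<center><h3>Word not found</h3></center>"
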
modules/module_customSubsetsLabel.py CHANGED
@@ -1,5 +1,10 @@
+from typing import List, Dict
+
 class CustomSubsetsLabel:
-    def __init__(self):
+    def __init__(
+        self
+    ) -> None:
+
         self.html_head = """
         <html>
         <head>
@@ -50,7 +55,14 @@ class CustomSubsetsLabel:
            'UN': "http://opus.nlpl.eu/UN.php",
        }
 
-    def __progressbar(self, percentage, subset, freq, size=15):
+    def __progressbar(
+        self,
+        percentage: float,
+        subset: str,
+        freq: int,
+        size: int=15
+    ) -> str:
+
         html = f"""
         <div id="myturn">
             <progress value="{int(percentage)}" max="100"></progress>
@@ -66,7 +78,13 @@ class CustomSubsetsLabel:
         """
         return html
 
-    def __render(self, subsets, freqs, percentages):
+    def __render(
+        self,
+        subsets: List[str],
+        freqs: List[int],
+        percentages: List[float]
+    ) -> str:
+
         html = ""
         for subset, freq, perc in zip(subsets, freqs, percentages):
             html += self.__progressbar(
@@ -77,13 +95,24 @@ class CustomSubsetsLabel:
 
         return self.html_head + html + self.html_footer
 
-    def compute(self, subsets_dic):
+    def compute(
+        self,
+        subsets_dic: Dict[str, int]
+    ) -> str:
+
         subsets_dic_info = {
             k.split()[0]:{'freq':int(k.split()[1][1:-1]),'perc':round(v*100,2)}
             for k,v in subsets_dic.items()
         }
 
         subsets = list(subsets_dic_info.keys())
-        freqs = [d['freq'] for d in subsets_dic_info.values()]
-        percentages = [d['perc'] for d in subsets_dic_info.values()]
+        freqs = [
+            d['freq']
+            for d in subsets_dic_info.values()
+        ]
+        percentages = [
+            d['perc']
+            for d in subsets_dic_info.values()
+        ]
+
         return self.__render(subsets, freqs, percentages)
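
Note: compute still expects keys of the form "<subset> (<freq>)" mapped to a fraction of the total frequency; the commit only adds types and unrolls the comprehensions. (The annotation Dict[str, int] is arguably loose, since the values passed in from getSubsetsInfo are float fractions.) A standalone illustration of the key parsing, with made-up subset names and counts:

subsets_dic = {"CC100 (120)": 0.6, "OSCAR (80)": 0.4}  # hypothetical input

subsets_dic_info = {
    k.split()[0]: {'freq': int(k.split()[1][1:-1]), 'perc': round(v * 100, 2)}
    for k, v in subsets_dic.items()
}
print(subsets_dic_info)
# {'CC100': {'freq': 120, 'perc': 60.0}, 'OSCAR': {'freq': 80, 'perc': 40.0}}
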
modules/module_logsManager.py CHANGED
@@ -1,4 +1,3 @@
-from distutils.log import debug
 from gradio.flagging import FlaggingCallback, _get_dataset_features_info
 from gradio.components import IOComponent
 from gradio import utils
@@ -14,14 +13,24 @@ load_dotenv()
 
 # --- Classes declaration ---
 class DateLogs:
-    def __init__(self, zone="America/Argentina/Cordoba"):
+    def __init__(
+        self,
+        zone: str="America/Argentina/Cordoba"
+    ) -> None:
+
         self.time_zone = pytz.timezone(zone)
 
-    def full(self):
+    def full(
+        self
+    ) -> str:
+
         now = datetime.now(self.time_zone)
         return now.strftime("%H:%M:%S %d-%m-%Y")
 
-    def day(self):
+    def day(
+        self
+    ) -> str:
+
         now = datetime.now(self.time_zone)
         return now.strftime("%d-%m-%Y")
 
@@ -41,12 +50,12 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
 
     def __init__(
         self,
-        hf_token: str = os.getenv('HF_TOKEN'),
-        dataset_name: str = os.getenv('DS_LOGS_NAME'),
-        organization: Optional[str] = os.getenv('ORG_NAME'),
-        private: bool = True,
-        available_logs: bool = False
-    ):
+        hf_token: str=os.getenv('HF_TOKEN'),
+        dataset_name: str=os.getenv('DS_LOGS_NAME'),
+        organization: Optional[str]=os.getenv('ORG_NAME'),
+        private: bool=True,
+        available_logs: bool=False
+    ) -> None:
         """
         Parameters:
         hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
@@ -66,10 +75,10 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
 
 
     def setup(
-        self,
-        components: List[IOComponent],
-        flagging_dir: str
-    ):
+        self,
+        components: List[IOComponent],
+        flagging_dir: str
+    ) -> None:
         """
         Params:
         flagging_dir (str): local directory where the dataset is cloned,
@@ -113,9 +122,9 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
     def flag(
         self,
         flag_data: List[Any],
-        flag_option: Optional[str] = None,
-        flag_index: Optional[int] = None,
-        username: Optional[str] = None,
+        flag_option: Optional[str]=None,
+        flag_index: Optional[int]=None,
+        username: Optional[str]=None,
     ) -> int:
 
         if self.available_logs:
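
Note: one subtlety with the os.getenv defaults kept in HuggingFaceDatasetSaver.__init__ (unchanged here apart from spacing and the -> None annotation): Python evaluates default arguments once, when the def executes, so the environment must already be populated at import time. That is what the module-level load_dotenv() guarantees. A standalone demonstration of the pitfall (make_saver is a hypothetical stand-in):

import os

os.environ["HF_TOKEN"] = "hf_first"   # set BEFORE the def below executes

def make_saver(hf_token: str = os.getenv("HF_TOKEN")) -> str:
    # The default was captured when the def ran, not when it is called.
    return hf_token

os.environ["HF_TOKEN"] = "hf_second"  # too late to change the default
print(make_saver())  # -> hf_first
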
modules/module_segmentedWordCloud.py CHANGED
@@ -1,6 +1,6 @@
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
-
+from typing import Dict, Tuple, List
 
 class SimpleGroupedColorFunc(object):
     """Create a color function object which assigns EXACT colors
@@ -16,7 +16,12 @@ class SimpleGroupedColorFunc(object):
     of any value from color_to_words.
     """
 
-    def __init__(self, color_to_words, default_color):
+    def __init__(
+        self,
+        color_to_words: Dict,
+        default_color: str
+    ) -> Dict:
+
         self.word_to_color = {
             word: color
             for (color, words) in color_to_words.items()
@@ -30,7 +35,13 @@ class SimpleGroupedColorFunc(object):
 
 
 class SegmentedWordCloud:
-    def __init__(self, freq_dic, less_group, greater_group):
+    def __init__(
+        self,
+        freq_dic: Dict[str, int],
+        less_group: List[str],
+        greater_group: List[str]
+    ) :
+
         colors = {
             'less': '#529ef3',
             'salient':'#d35400',
@@ -56,7 +67,10 @@ class SegmentedWordCloud:
 
         self.wc.recolor(color_func=grouped_color_func)
 
-    def plot(self, figsize):
+    def plot(
+        self,
+        figsize: Tuple[int,int]
+    ):
         fig, ax = plt.subplots(figsize=figsize)
         ax.imshow(self.wc, interpolation="bilinear")
         ax.axis("off")
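
Note: usage is unchanged; the two word groups drive the segment coloring ('less' and 'salient' are visible in the colors dict above). A sketch under stated assumptions (requires wordcloud and matplotlib; the frequencies, and the grouping that the app derives from Vocabulary.getWordNeighbors, are invented here, and plot is assumed to return the matplotlib Figure it builds):

from modules.module_segmentedWordCloud import SegmentedWordCloud

# Hypothetical frequency neighborhood of a query word.
freq_dic = {"casa": 120, "calle": 80, "zaguan": 15}
less_group = ["zaguan"]            # less frequent than the query word
greater_group = ["casa", "calle"]  # more frequent than the query word

wc = SegmentedWordCloud(freq_dic, less_group, greater_group)
fig = wc.plot(figsize=(9, 3))
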
modules/module_vocabulary.py CHANGED
@@ -1,9 +1,14 @@
 from memory_profiler import profile
 import pandas as pd
+from typing import List, Dict, Tuple
 
 class Vocabulary:
     @profile
-    def __init__(self, subset_name):
+    def __init__(
+        self,
+        subset_name: str
+    ) -> None:
+
         # Dataset info
         self.subset_name = subset_name
         self.ds_path = f"data/{subset_name}_vocab_v6.zip"
@@ -17,10 +22,17 @@ class Vocabulary:
         # Load vocabulary dataset
         self.__load()
 
-    def __contains__(self, word):
+    def __contains__(
+        self,
+        word: str
+    ) -> bool:
+
         return word in self.df_vocab['word'].to_list()
 
-    def __load(self):
+    def __load(
+        self
+    ) -> None:
+
         print(f"Preparing {self.subset_name} vocabulary...")
 
         # --- Download vocab dataset ---
@@ -41,7 +53,11 @@ class Vocabulary:
             reverse=True
         )
 
-    def __getValue(self, word, feature):
+    def __getValue(
+        self,
+        word: str,
+        feature: str
+    ):
         word_id, value = None, None
 
         if word in self:
@@ -52,23 +68,47 @@ class Vocabulary:
 
         return value
 
-    def getFreq(self, word):
+    def getFreq(
+        self,
+        word
+    ) -> int:
+
         return self.__getValue(word, 'freq')
 
-    def getPercentile(self, word):
+    def getPercentile(
+        self,
+        word:str
+    ) -> float:
+
         return self.__getValue(word, 'percentile')
 
-    def getSplits(self, word):
+    def getSplits(
+        self,
+        word: str
+    ) -> List[str]:
+
         return self.__getValue(word, 'splits')
 
-    def getSubsets(self, word):
+    def getSubsets(
+        self,
+        word: str
+    ) -> Dict[str, int]:
+
         return self.__getValue(word, 'in_subset')
 
-    def distribution(self):
+    def distribution(
+        self
+    ) -> Tuple:
+
         x_values, y_values = zip(*self.histogram)
         return x_values, y_values
 
-    def getWordNeighbors(self, word, n_neighbors=20):
+    def getWordNeighbors(
+        self,
+        word: str,
+        n_neighbors: int=20
+    )-> Tuple:
+
         word_id = self.df_vocab['word'].to_list().index(word)
         words = self.df_vocab['word'].to_list()
         freqs = self.df_vocab['freq'].to_list()
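
Note: the new return annotations (int, float, List[str], Dict[str, int]) document the happy path, but __getValue still yields None for out-of-vocabulary words, so in practice callers should treat the results as optional unless they check membership first. A usage sketch (assumes the data/ vocabulary zip is present; the query word is arbitrary):

from modules.module_vocabulary import Vocabulary

vocab = Vocabulary(subset_name="full")

word = "casa"  # hypothetical query
if word in vocab:                     # __contains__ scans the 'word' column
    print(vocab.getFreq(word))        # total corpus frequency
    print(vocab.getPercentile(word))  # position in the frequency distribution
    print(vocab.getSubsets(word))     # per-subset frequencies
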
modules/module_word2Context.py CHANGED
@@ -1,8 +1,8 @@
 from datasets import load_dataset, interleave_datasets
 from modules.module_segmentedWordCloud import SegmentedWordCloud
 from modules.module_customSubsetsLabel import CustomSubsetsLabel
-
 from random import sample as random_sample
+from typing import Tuple, List, Dict
 import re
 
 import matplotlib as mpl
@@ -11,7 +11,12 @@ import matplotlib.pyplot as plt
 
 
 class Word2Context:
-    def __init__(self, context_ds_name, vocabulary):
+    def __init__(
+        self,
+        context_ds_name: str,
+        vocabulary  # Vocabulary class instance
+    ) -> None:
+
         self.context_ds_name = context_ds_name
 
         # Vocabulary class
@@ -20,7 +25,11 @@ class Word2Context:
         # Custom Label component
         self.Label = CustomSubsetsLabel()
 
-    def errorChecking(self, word):
+    def errorChecking(
+        self,
+        word: str
+    ) -> str:
+
         out_msj = ""
 
         if not word:
@@ -31,19 +40,33 @@ class Word2Context:
 
         return out_msj
 
-    def genWebLink(self,text):
+    def genWebLink(
+        self,
+        text: str
+    ) -> str:
+
         text = text.replace("\"", "'")
         text = text.replace("<u><b>", "")
         text = text.replace("</b></u>", "")
         url = "https://www.google.com.tr/search?q={}".format(text)
         return '<a href="{}" rel="noopener noreferrer" target="_blank"><center>πŸŒπŸ”</center></a>'.format(url)
 
-    def genWordCloudPlot(self, word, figsize=(9,3)):
+    def genWordCloudPlot(
+        self,
+        word: str,
+        figsize: Tuple[int,int]=(9,3)
+    ): # ToDO: Figure typing
+
         freq_dic, l_group, g_group = self.vocab.getWordNeighbors(word, n_neighbors=10)
         wc = SegmentedWordCloud(freq_dic, l_group, g_group)
         return wc.plot(figsize)
 
-    def genDistributionPlot(self, word, figsize=(6,1)):
+    def genDistributionPlot(
+        self,
+        word: str,
+        figsize: Tuple[int,int]=(6,1)
+    ): # ToDO: Figure typing
+
         x_values, y_values = self.vocab.distribution()
         w_percentile = self.vocab.getPercentile(word)
         w_freq = self.vocab.getFreq(word)
@@ -52,19 +75,20 @@ class Word2Context:
         ax.plot(x_values, y_values, color='green')
         ax.fill_between(x_values, y_values, color='lightgreen',)
 
-        # -- Uncomment if wordcloud is enabled in the application interface --
-        # ax.axvline(x=max(0,w_percentile-.01),
-        #     color='blue',
-        #     linewidth=7,
-        #     alpha=.2,
-        #     linestyle='-'
-        # )
-        # ax.axvline(x=min(100,w_percentile+.01),
-        #     color='black',
-        #     linewidth=7,
-        #     alpha=.2,
-        #     linestyle='-'
-        # )
+        ax.axvline(x=max(0,w_percentile-.01),
+            color='blue',
+            linewidth=7,
+            alpha=.1,
+            linestyle='-'
+        )
+
+        ax.axvline(x=min(100,w_percentile+.01),
+            color='black',
+            linewidth=7,
+            alpha=.1,
+            linestyle='-'
+        )
+
         ax.axvline(x=w_percentile,
             color='#d35400',
             linewidth=2,
@@ -76,7 +100,12 @@ class Word2Context:
         plt.legend(loc='upper left', prop={'size': 7})
         return fig
 
-    def findSplits(self, word, subsets_list):
+    def findSplits(
+        self,
+        word: str,
+        subsets_list: List[str]
+    ):
+
         w_splits = self.vocab.getSplits(word)
 
         splits_list = []
@@ -102,7 +131,12 @@ class Word2Context:
 
         return datasets
 
-    def findContexts(self, sample, word):
+    def findContexts(
+        self,
+        sample: str,
+        word: str
+    ) -> Dict[str,str]:
+
         sample = sample['text'].strip()
         context = ""
         m = re.search(r'\b{}\b'.format(word), sample)
@@ -112,7 +146,11 @@ class Word2Context:
             context = sample[:init]+"<u><b>"+word+"</b></u>"+sample[end:]
         return {'context':context}
 
-    def getSubsetsInfo(self, word):
+    def getSubsetsInfo(
+        self,
+        word: str
+    ) -> Tuple:
+
         total_freq = self.vocab.getFreq(word)
         subsets_name_list = list(self.vocab.getSubsets(word).keys())
         subsets_freq_list = list(self.vocab.getSubsets(word).values())
@@ -127,73 +165,21 @@ class Word2Context:
         subsets_info = self.Label.compute(subsets_origin_info)
         return subsets_info, subsets_origin_info
 
-    def getContexts(self, word, n_context, ds):
-        ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
-        only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
-        shuffle_contexts = only_contexts.shuffle(buffer_size=10)
-
-        list_of_dict = list(shuffle_contexts.take(n_context))
-        list_of_contexts = [(i,dic['context'],dic['subset']) for i,dic in enumerate(list_of_dict)]
-
-        return list_of_contexts
-
-    # TODO: The next methods can be removed, or keep them as a wrapper method of several ones
-    '''
-    def getWordInfo(self, word):
-        errors = ""
-        contexts = pd.DataFrame([],columns=[''])
-        subsets_info = ""
-        distribution_plot = None
-        word_cloud_plot = None
-        subsets_choice = gr.CheckboxGroup.update(choices=[])
-
-        errors = self.errorChecking(word)
-        if errors:
-            return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+    def getContexts(
+        self,
+        word: str,
+        n_context: int,
+        ds
+    ) -> List:
 
-        total_freq = self.vocab.getFreq(word)
-        subsets_name_list = list(self.vocab.getSubsets(word).keys())
-        subsets_freq_list = list(self.vocab.getSubsets(word).values())
-
-        # Create subset frequency dict to subset_freq component
-        subsets_info = {
-            s_name + f" ({s_freq})": s_freq/total_freq
-            for s_name, s_freq in zip(subsets_name_list, subsets_freq_list)
-        }
-        subsets_origin_info = dict(sorted(subsets_info.items(), key=lambda x: x[1], reverse=True))
-        subsets_info = self.Label.compute(subsets_origin_info)
-
-        # Create sort list to subsets_choice component
-        clean_keys = [key.split(" ")[0].strip() for key in subsets_origin_info]
-        subsets_choice = gr.CheckboxGroup.update(choices=clean_keys)
-
-        # Get word distribution, and wordcloud graph
-        distribution_plot = self.genDistributionPlot(word)
-        word_cloud_plot = self.genWordCloudPlot(word)
-
-        return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
-
-    def getWordContext(self, word, n_context, subset_choice):
-        n_context = int(n_context)
-        errors = ""
-
-        if len(subset_choice) > 0:
-            ds = self.findSplits(word, subset_choice)
-
-        else:
-            errors = "Error: Palabra no ingresada y/o conjunto/s de interΓ©s no seleccionado/s!"
-            errors = "<center><h3>"+errors+"</h3></center>"
-            return errors, pd.DataFrame([], columns=[''])
-
         ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
         only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
         shuffle_contexts = only_contexts.shuffle(buffer_size=10)
 
         list_of_dict = list(shuffle_contexts.take(n_context))
-        list_of_contexts = [(i,dic['context'],dic['subset']) for i,dic in enumerate(list_of_dict)]
-
-        contexts = pd.DataFrame(list_of_contexts, columns=['#','contexto','conjunto'])
-        contexts["buscar"] = contexts.contexto.apply(lambda text: self.genWebLink(text))
+        list_of_contexts = [
+            (i, dic['context'], dic['subset'])
+            for i,dic in enumerate(list_of_dict)
+        ]
 
-        return errors, contexts
-    '''
+        return list_of_contexts
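
Note: genDistributionPlot now always draws the two faint percentile guide lines that were previously commented out, with alpha lowered from .2 to .1. findContexts keeps its logic unchanged (the sample parameter is annotated str, but the body clearly consumes a dict with a 'text' key, so Dict would be more accurate). A standalone rerun of the highlighting, with the span extraction that the lines elided from this diff presumably take from m.span():

import re

def find_contexts(sample: dict, word: str) -> dict:
    text = sample['text'].strip()
    context = ""
    m = re.search(r'\b{}\b'.format(word), text)
    if m:
        init, end = m.span()  # assumed source of init/end in the elided lines
        context = text[:init] + "<u><b>" + word + "</b></u>" + text[end:]
    return {'context': context}

print(find_contexts({'text': "la casa azul"}, "casa"))
# -> {'context': 'la <u><b>casa</b></u> azul'}
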