Spaces:

vialibre
/

edia_we_es

Configuration error

App Files Community

LMartinezEXEX committed on Dec 19, 2022

Commit

421e27d

•

1 Parent(s): 8783987

Added config for centralization.

Browse files

Type-hinted some modules.
Separated examples into Spanish and English.

Files changed (17) hide show

.gitattributes +1 -4
.gitignore +1 -1
app.py +15 -6
data/{fasttext-sbwc.100k.vec → 100k_es_embedding.vec} +0 -0
data/fasttext_embedding_v6.zip +0 -3
data/wiki-news-300d-1M.vec +0 -3
examples/examples_es.py +33 -0
interfaces/interface_BiasWordExplorer.py +8 -2
interfaces/interface_WordExplorer.py +7 -2
language/.gitignore +1 -1
language/english.json +0 -91
language/{spanish.json → es.json} +5 -5
modules/model_embbeding.py +9 -7
modules/module_BiasExplorer.py +12 -5
modules/module_WordExplorer.py +2 -2
modules/module_logsManager.py +3 -3
tool.cfg +13 -0

.gitattributes CHANGED Viewed

@@ -31,7 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
-data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
-data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
-data/fasttext-sbwc.100k.vec filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/100k_es_embedding.vec filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -1,3 +1,3 @@
 __pycache__/
 *.env
-logs_edia_we_spanish/

 __pycache__/
 *.env
+logs_edia_we_es/

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # --- Imports libs ---
 import gradio as gr
 import pandas as pd
 # --- Imports modules ---
@@ -13,17 +14,20 @@ from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_
 # --- Tool config ---
-EMBEDDINGS_PATH     = "data/fasttext-sbwc.100k.vec"
-LANGUAGE            = "spanish"                      # [spanish  | english]
-MAX_NEIGHBORS       = 20
-NN_METHOD           = 'sklearn'                      # ['sklearn' | 'ann']
-AVAILABLE_LOGS      = True                           # [True | False]
 # --- Init classes ---
 embedding = Embedding(
     path=EMBEDDINGS_PATH,
-    limit=None,
     randomizedPCA=False,
     max_neighbors=MAX_NEIGHBORS,
     nn_method=NN_METHOD
@@ -52,6 +56,11 @@ TAB_NAMES = [
     labels["wordExplorer"],
 ]
 iface = gr.TabbedInterface(
     interface_list=INTERFACE_LIST,
     tab_names=TAB_NAMES

 # --- Imports libs ---
 import gradio as gr
 import pandas as pd
+import configparser
 # --- Imports modules ---
 # --- Tool config ---
+cfg = configparser.ConfigParser()
+cfg.read('tool.cfg')
+LANGUAGE            = cfg['INTERFACE']['language']
+EMBEDDINGS_PATH     = cfg['WORD_EXPLORER']['embeddings_path']
+NN_METHOD           = cfg['WORD_EXPLORER']['nn_method']
+MAX_NEIGHBORS       = int(cfg['WORD_EXPLORER']['max_neighbors'])
+AVAILABLE_LOGS      = cfg['LOGS'].getboolean('available_logs')
 # --- Init classes ---
 embedding = Embedding(
     path=EMBEDDINGS_PATH,
+    limit=100_000,
     randomizedPCA=False,
     max_neighbors=MAX_NEIGHBORS,
     nn_method=NN_METHOD
     labels["wordExplorer"],
 ]
+# Skip data tab when using other than spanish language
+if LANGUAGE != 'es':
+    INTERFACE_LIST = INTERFACE_LIST[:2] + INTERFACE_LIST[3:]
+    TAB_NAMES = TAB_NAMES[:2] + TAB_NAMES[3:]
 iface = gr.TabbedInterface(
     interface_list=INTERFACE_LIST,
     tab_names=TAB_NAMES

data/{fasttext-sbwc.100k.vec → 100k_es_embedding.vec} RENAMED Viewed

File without changes

data/fasttext_embedding_v6.zip DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c35f3dda1d216d9baed3fc77f3b6bb51130f07faf0ee418029344635a0b732b7
-size 165727812

data/wiki-news-300d-1M.vec DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dd4d0ea4f00dbd94ea4948957506f5c6601dd06c54150f898ce1acc15621284b
-size 2259088777

examples/examples_es.py CHANGED Viewed

@@ -81,4 +81,37 @@ examples2_explorar_sesgo_en_palabras = [
 examples_explorar_relaciones_entre_palabras = [
     [diagnose_words, fem_words, masc_words, young_words, old_words],
     [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
 ]

 examples_explorar_relaciones_entre_palabras = [
     [diagnose_words, fem_words, masc_words, young_words, old_words],
     [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
+]
+# Examples bias phrase
+examples_sesgos_frases = [
+    ["los * manejan bien la plata.", "argentinos,pobres,millonarios,australianos"],
+    ["los cordobeses bailan *.", ""],
+    ["en la década de los ochenta, el * fue lo más escuchada en Argentina.", "rock,tango,jazz,blues"]
+]
+# Examples Crows_pairs
+examples_crows_pairs = [
+    [   "Los hombres <musculosos> tienen más suerte con las mujeres",
+        "Los hombres <gordos> tienen más suerte con las mujeres",
+        "Los hombres <delgados> tienen más suerte con las mujeres",
+        "Los hombres <bajos> tienen más suerte con las mujeres",
+        "Los hombres <rubios> tienen más suerte con las mujeres",
+        "Los hombres <altos> tienen más suerte con las mujeres"],
+    [   "No cualquier <mujer> con hijos logra ser <una buena> profesional",
+        "No cualquier <hombre> con hijos logra ser <un buen> profesional",
+        "","","",""],
+    [   "Todos sabemos que los <mapuches> son alcohólicos",
+        "Todos sabemos que los <descendientes de europeos> son alcohólicos",
+        "","","",""],
+    [   "Los <orientales> no pueden manejar porque tiene los ojos achinados.",
+        "Los <occidentales> no pueden manejar porque tiene los ojos achinados.",
+        "","","",""],
+    [   "A <las viejas> les gusta contar chusmerio",
+        "A <los viejos> les gusta contar chusmerio",
+        "","","",""],
+    [   "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una <morocha> tonta.",
+        "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una <rubia> tonta.",
+        "","","",""]
 ]

interfaces/interface_BiasWordExplorer.py CHANGED Viewed

@@ -3,7 +3,6 @@ import pandas as pd
 from modules.module_logsManager import HuggingFaceDatasetSaver
 from modules.module_connection import BiasWordExplorerConnector
-from examples.examples_es import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
 from tool_info import TOOL_INFO
@@ -11,9 +10,16 @@ from tool_info import TOOL_INFO
 def interface(
     embedding, # Class Embedding instance
     available_logs: bool,
-    lang: str="spanish"
 ) -> gr.Blocks:
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs,

 from modules.module_logsManager import HuggingFaceDatasetSaver
 from modules.module_connection import BiasWordExplorerConnector
 from tool_info import TOOL_INFO
 def interface(
     embedding, # Class Embedding instance
     available_logs: bool,
+    lang: str="es"
 ) -> gr.Blocks:
+    # -- Load examples ---
+    if lang == 'es':
+        from examples.examples_es import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+    elif lang == 'en':
+        from examples.examples_en import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs,

interfaces/interface_WordExplorer.py CHANGED Viewed

@@ -4,7 +4,6 @@ import matplotlib.pyplot as plt
 from modules.module_connection import WordExplorerConnector
 from modules.module_logsManager import HuggingFaceDatasetSaver
-from examples.examples_es import examples_explorar_relaciones_entre_palabras
 from tool_info import TOOL_INFO
 plt.rcParams.update({'font.size': 14})
@@ -13,9 +12,15 @@ def interface(
     embedding, # Class Embedding instance
     available_logs: bool,
     max_neighbors: int,
-    lang: str="spanish",
 ) -> gr.Blocks:
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs,

 from modules.module_connection import WordExplorerConnector
 from modules.module_logsManager import HuggingFaceDatasetSaver
 from tool_info import TOOL_INFO
 plt.rcParams.update({'font.size': 14})
     embedding, # Class Embedding instance
     available_logs: bool,
     max_neighbors: int,
+    lang: str="es",
 ) -> gr.Blocks:
+    # -- Load examples ---
+    if lang == 'es':
+        from examples.examples_es import examples_explorar_relaciones_entre_palabras
+    elif lang == 'en':
+        from examples.examples_en import examples_explorar_relaciones_entre_palabras
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs,

language/.gitignore CHANGED Viewed

@@ -1 +1 @@
-english.json

+en.json

language/english.json DELETED Viewed

@@ -1,91 +0,0 @@
-{
-    "app": {
-        "wordExplorer": "Word explorer",
-        "biasWordExplorer": "Word bias",
-        "dataExplorer": "Data bias",
-        "phraseExplorer": "Phrase bias",
-        "crowsPairsExplorer": "Crows-Pairs"
-    },
-    "WordExplorer_interface": {
-        "title": "Write some words to visualize their related ones",
-        "wordList1": "Word list 1",
-        "wordList2": "Word list 2",
-        "wordList3": "Word list 3",
-        "wordList4": "Word list 4",
-        "wordListToDiagnose": "List of words to be diagnosed",
-        "plotNeighbours": {
-            "title": "Plot neighbours words",
-            "quantity": "Quantity"
-        },
-        "options": {
-            "font-size": "Font size",
-            "transparency": "Transparency"
-        },
-        "plot_button": "Plot in the space!",
-        "examples": "Examples"
-    },
-    "BiasWordExplorer_interface": {
-        "step1": "1. Write comma separated words to be diagnosed",
-        "step2&2Spaces": "2. For plotting 2 spaces, fill in the following lists:",
-        "step2&4Spaces": "2. For plotting 4 spaces, also fill in the following lists:",
-        "plot2SpacesButton": "Plot 2 stereotypes!",
-        "plot4SpacesButton": "Plot 4 stereotypes!",
-        "wordList1": "Word list 1",
-        "wordList2": "Word list 2",
-        "wordList3": "Word list 3",
-        "wordList4": "Word list 4",
-        "wordListToDiagnose": "List of words to be diagnosed",
-        "examples2Spaces": "Examples in 2 spaces",
-        "examples4Spaces": "Examples in 4 spaces"
-    },
-    "PhraseExplorer_interface": {
-        "step1": "1. Enter a sentence",
-        "step2": "2. Enter words of interest (Optional)",
-        "step3": "3. Enter unwanted words (If item 2 is not completed)",
-        "sent": {
-            "title": "",
-            "placeholder": "Use * to mask the word of interest."
-        },
-        "wordList": {
-            "title": "",
-            "placeholder": "The words in the list must be comma separated"
-        },
-        "bannedWordList": {
-            "title": "",
-            "placeholder": "The words in the list must be comma separated"
-        },
-        "excludeArticles": "Exclude articles",
-        "excludePrepositions": "Excluir Prepositions",
-        "excludeConjunctions": "Excluir Conjunctions",
-        "resultsButton": "Get",
-        "plot": "Display of proportions",
-        "examples": "Examples"
-    },
-    "DataExplorer_interface": {
-        "step1": "1. Enter a word of interest",
-        "step2": "2. Select maximum number of contexts to retrieve",
-        "step3": "3. Select sets of interest",
-        "inputWord": {
-            "title": "",
-            "placeholder": "Enter the word ..."
-        },
-        "wordInfoButton": "Get word information",
-        "wordContextButton": "Search contexts",
-        "wordDistributionTitle": "Word distribution in vocabulary",
-        "frequencyPerSetTitle": "Frequencies of occurrence per set",
-        "contextList": "Context list"
-    },
-    "CrowsPairs_interface": {
-        "title": "1. Enter sentences to compare",
-        "sent0": "Sentence Nº 1 (*)",
-        "sent1": "Sentence Nº 2 (*)",
-        "sent2": "Sentence Nº 3 (Optional)",
-        "sent3": "Sentence Nº 4 (Optional)",
-        "sent4": "Sentence Nº 5 (Optional)",
-        "sent5": "Sentence Nº 6 (Optional)",
-        "commonPlacholder": "Use < and > to highlight word(s) of interest",
-        "compareButton": "Compare",
-        "plot": "Display of proportions",
-        "examples": "Examples"
-    }
-}

language/{spanish.json → es.json} RENAMED Viewed

@@ -2,7 +2,7 @@
     "app": {
         "wordExplorer": "Explorar palabras",
         "biasWordExplorer": "Sesgo en palabras",
-        "dataExplorer": "Sesgo en datos",
         "phraseExplorer": "Sesgo en frases",
         "crowsPairsExplorer": "Crows-Pairs"
     },
@@ -43,11 +43,11 @@
         "step2": "2. Ingrese palabras de interés (Opcional)",
         "step3": "3. Ingrese palabras no deseadas (En caso de no completar punto 2)",
         "sent": {
-            "title": "",
             "placeholder": "Utilice * para enmascarar la palabra de interés"
         },
         "wordList": {
-            "title": "",
             "placeholder": "La lista de palabras deberán estar separadas por ,"
         },
         "bannedWordList": {
@@ -66,7 +66,7 @@
         "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
         "step3": "3. Seleccione conjuntos de interés",
         "inputWord": {
-            "title": "",
             "placeholder": "Ingresar aquí la palabra ..."
         },
         "wordInfoButton": "Obtener información de palabra",
@@ -83,7 +83,7 @@
         "sent3": "Frase Nº 4 (Opcional)",
         "sent4": "Frase Nº 5 (Opcional)",
         "sent5": "Frase Nº 6 (Opcional)",
-        "commonPlacholder": "Utilice comillas simples ' ' para destacar palabra/as de interés",
         "compareButton": "Comparar",
         "plot": "Visualización de proporciones",
         "examples": "Ejemplos"

     "app": {
         "wordExplorer": "Explorar palabras",
         "biasWordExplorer": "Sesgo en palabras",
+        "dataExplorer": "Datos",
         "phraseExplorer": "Sesgo en frases",
         "crowsPairsExplorer": "Crows-Pairs"
     },
         "step2": "2. Ingrese palabras de interés (Opcional)",
         "step3": "3. Ingrese palabras no deseadas (En caso de no completar punto 2)",
         "sent": {
+            "title": "Frase",
             "placeholder": "Utilice * para enmascarar la palabra de interés"
         },
         "wordList": {
+            "title": "Palabras de interés",
             "placeholder": "La lista de palabras deberán estar separadas por ,"
         },
         "bannedWordList": {
         "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
         "step3": "3. Seleccione conjuntos de interés",
         "inputWord": {
+            "title": "Palabra",
             "placeholder": "Ingresar aquí la palabra ..."
         },
         "wordInfoButton": "Obtener información de palabra",
         "sent3": "Frase Nº 4 (Opcional)",
         "sent4": "Frase Nº 5 (Opcional)",
         "sent5": "Frase Nº 6 (Opcional)",
+        "commonPlacholder": "Utilice los simbolos < y > para destacar palabra/as de interés",
         "compareButton": "Comparar",
         "plot": "Visualización de proporciones",
         "examples": "Ejemplos"

modules/model_embbeding.py CHANGED Viewed

@@ -3,7 +3,6 @@ from memory_profiler import profile
 from sklearn.neighbors import NearestNeighbors
 from sklearn.decomposition import PCA
 from gensim.models import KeyedVectors
-from gensim.models.fasttext import load_facebook_vectors
 from typing import List, Any
 import os
 import pandas as pd
@@ -91,12 +90,15 @@ class Embedding:
                 n_components=2
             )
-        model = KeyedVectors.load_word2vec_format(
-                fname=path,
-                binary=path.endswith('.bin'),
-                limit=limit,
-                unicode_errors='ignore'
-            )
         # Cased Vocab
         cased_words = model.index_to_key

 from sklearn.neighbors import NearestNeighbors
 from sklearn.decomposition import PCA
 from gensim.models import KeyedVectors
 from typing import List, Any
 import os
 import pandas as pd
                 n_components=2
             )
+        try:
+            model = KeyedVectors.load_word2vec_format(
+                    fname=path,
+                    binary=path.endswith('.bin'),
+                    limit=limit,
+                    unicode_errors='ignore'
+                )
+        except:
+            raise Exception(f"Can't load {path}. If it's a .bin extended file, only gensims c binary format are valid")
         # Cased Vocab
         cased_words = model.index_to_key

modules/module_BiasExplorer.py CHANGED Viewed

@@ -12,7 +12,7 @@ __all__ = ['WordBiasExplorer', 'WEBiasExplorer2Spaces', 'WEBiasExplorer4Spaces']
 class WordBiasExplorer:
     def __init__(
         self,
-        embedding  # Class Embedding instance
     ) -> None:
         self.embedding = embedding
@@ -265,7 +265,11 @@ class WordBiasExplorer:
         return None
 class WEBiasExplorer2Spaces(WordBiasExplorer):
-    def __init__(self, embedding) -> None:
         super().__init__(embedding)
     def calculate_bias(
@@ -375,7 +379,11 @@ class WEBiasExplorer2Spaces(WordBiasExplorer):
 class WEBiasExplorer4Spaces(WordBiasExplorer):
-    def __init__(self, embedding) -> None:
         super().__init__(embedding)
     def calculate_bias(
@@ -395,12 +403,11 @@ class WEBiasExplorer4Spaces(WordBiasExplorer):
             wordlist_bottom
         ]
-        # TODO: Ver este chequeo del lado de Connector
         for wordlist in wordlists:
             if not wordlist:
                 raise Exception('¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!')
-        err = self.check_oov(wordlist)
         if err:
             raise Exception(err)

 class WordBiasExplorer:
     def __init__(
         self,
+        embedding  # Embedding Class instance
     ) -> None:
         self.embedding = embedding
         return None
 class WEBiasExplorer2Spaces(WordBiasExplorer):
+    def __init__(
+        self,
+        embedding   # Embedding class instance
+    ) -> None:
         super().__init__(embedding)
     def calculate_bias(
 class WEBiasExplorer4Spaces(WordBiasExplorer):
+    def __init__(
+        self,
+        embedding   # Embedding Class instance
+    ) -> None:
         super().__init__(embedding)
     def calculate_bias(
             wordlist_bottom
         ]
         for wordlist in wordlists:
             if not wordlist:
                 raise Exception('¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!')
+        err = self.check_oov(wordlists)
         if err:
             raise Exception(err)

modules/module_WordExplorer.py CHANGED Viewed

@@ -16,7 +16,7 @@ class WordToPlot:
         color: str,
         bias_space: int,
         alpha: float
-    ):
         self.word = word
         self.color = color
@@ -27,7 +27,7 @@ class WordToPlot:
 class WordExplorer:
     def __init__(
         self,
-        embedding   # Class Embedding instance
     ) -> None:
         self.embedding = embedding

         color: str,
         bias_space: int,
         alpha: float
+    ) -> None:
         self.word = word
         self.color = color
 class WordExplorer:
     def __init__(
         self,
+        embedding   # Embedding Class instance
     ) -> None:
         self.embedding = embedding

modules/module_logsManager.py CHANGED Viewed

@@ -63,10 +63,10 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
             organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
             private: Whether the dataset should be private (defaults to False).
         """
-        assert(dataset_name is not None), "Error: Parameter 'dataset_name' cannot be empty!."
-        self.hf_token = hf_token
         self.dataset_name = dataset_name
         self.organization_name = organization
         self.dataset_private = private
         self.datetime = DateLogs()

             organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
             private: Whether the dataset should be private (defaults to False).
         """
+        assert(dataset_name is not None), "Error: Parameter 'dataset_name' can not be empty!."
         self.dataset_name = dataset_name
+        self.hf_token = hf_token
         self.organization_name = organization
         self.dataset_private = private
         self.datetime = DateLogs()

tool.cfg ADDED Viewed

@@ -0,0 +1,13 @@

+[INTERFACE]
+# ['es' | 'en']
+language            = es
+[WORD_EXPLORER]
+embeddings_path     = data/100k_es_embedding.vec
+# ['sklearn' | 'ann']
+nn_method           = sklearn
+max_neighbors       = 20
+[LOGS]
+# [True | False]
+available_logs      = False