nanom committed
Commit 743fd42
1 Parent(s): 5a3c3c7

Upload datos app

.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__/
+ .env
+ bias_tool_logs/
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 Fundación Vía Libre
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,28 @@
+ # --- Import modules ---
+ from modules.module_vocabulary import Vocabulary
+
+ # --- Import interfaces ---
+ from interfaces.interface_datos import interface as interface_datos
+
+ # --- Tool config ---
+ AVAILABLE_LOGS = True       # [True | False]
+ LANGUAGE = "spanish"        # [spanish]
+ VOCABULARY_SUBSET = "full"  # [full]
+ # ToDo: Change context dataset owner from nanom to vialibre
+ CONTEXTS_DATASET = "nanom/splittedspanish3bwc"
+
+ # --- Init classes ---
+ vocabulary = Vocabulary(
+     subset_name=VOCABULARY_SUBSET
+ )
+
+ # --- Main App ---
+ iface = interface_datos(
+     vocabulary=vocabulary,
+     contexts=CONTEXTS_DATASET,
+     available_logs=AVAILABLE_LOGS,
+     lang=LANGUAGE
+ )
+
+ iface.queue(concurrency_count=8)
+ iface.launch(debug=False)
data/full_vocab_v6.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:478fa3e953fbc65746681b1b9770e726f0cd28a0a9992735c00001a09d04b42a
+ size 205538236
interfaces/.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
interfaces/interface_datos.py ADDED
@@ -0,0 +1,100 @@
+ from modules.module_logsManager import HuggingFaceDatasetSaver
+ from modules.module_connection import Word2ContextExplorerConnector
+ from tool_info import TOOL_INFO
+ import gradio as gr
+ import pandas as pd
+
+ def interface(vocabulary, contexts, available_logs, lang="spanish"):
+
+     # --- Init logs ---
+     log_callback = HuggingFaceDatasetSaver(
+         available_logs=available_logs
+     )
+
+     # --- Init Class ---
+     connector = Word2ContextExplorerConnector(vocabulary=vocabulary, context=contexts)
+     labels = pd.read_json(f"language/{lang}.json")["DataExplorer_interface"]
+
+     # --- Interface ---
+     iface = gr.Blocks(css=".container { max-width: 90%; margin: auto;}")
+
+     with iface:
+         with gr.Row():
+             with gr.Column():
+                 with gr.Group():
+                     gr.Markdown(labels["step1"])
+                     with gr.Row(): input_word = gr.Textbox(label=labels["inputWord"]["title"],
+                                                            show_label=False,
+                                                            placeholder=labels["inputWord"]["placeholder"])
+                     with gr.Row(): btn_get_w_info = gr.Button(labels["wordInfoButton"])
+
+                 with gr.Group():
+                     gr.Markdown(labels["step2"])
+                     n_context = gr.Slider(label="",
+                                           step=1, minimum=1, maximum=30, value=5,
+                                           visible=True, interactive=True)
+                 with gr.Group():
+                     gr.Markdown(labels["step3"])
+                     subsets_choice = gr.CheckboxGroup(label="",
+                                                       interactive=True,
+                                                       visible=True)
+                     with gr.Row(): btn_get_contexts = gr.Button(labels["wordContextButton"], visible=True)
+
+                 with gr.Row(): out_msj = gr.Markdown(label="", visible=True)
+
+             with gr.Column():
+                 with gr.Group():
+                     gr.Markdown(labels["wordDistributionTitle"])
+                     dist_plot = gr.Plot(label="", show_label=False)
+                     # Set visible=True if you want to see the cloud of related words by frequency
+                     wc_plot = gr.Plot(label="", show_label=False, visible=False)
+
+                 with gr.Group():
+                     gr.Markdown(labels["frequencyPerSetTitle"])
+                     subsets_freq = gr.HTML(label="")
+
+         with gr.Row():
+             with gr.Group():
+                 with gr.Row(): gr.Markdown(labels["contextList"])
+                 with gr.Row(): out_context = gr.Dataframe(label="",
+                                                           interactive=False,
+                                                           value=pd.DataFrame([], columns=['']),
+                                                           wrap=True,
+                                                           datatype=['str','markdown','str','markdown'])
+
+         with gr.Group():
+             gr.Markdown(TOOL_INFO)
+
+         btn_get_w_info.click(
+             fn=connector.get_word_info,
+             inputs=[input_word],
+             outputs=[out_msj,
+                      out_context,
+                      subsets_freq,
+                      dist_plot,
+                      wc_plot,
+                      subsets_choice]
+         )
+
+         btn_get_contexts.click(
+             fn=connector.get_word_context,
+             inputs=[input_word, n_context, subsets_choice],
+             outputs=[out_msj, out_context]
+         )
+
+         # --- Logs ---
+         save_field = [input_word, subsets_choice]
+         log_callback.setup(components=save_field, flagging_dir="edia_datos_es")
+
+         btn_get_contexts.click(
+             fn=lambda *args: log_callback.flag(
+                 flag_data=args,
+                 flag_option="datos",
+                 username="vialibre"
+             ),
+             inputs=save_field,
+             outputs=None,
+             preprocess=False
+         )
+
+     return iface
language/spanish.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "DataExplorer_interface": {
+         "step1": "1. Ingrese una palabra de interés",
+         "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
+         "step3": "3. Seleccione conjuntos de interés",
+         "inputWord": {
+             "title": "",
+             "placeholder": "Ingresar aquí la palabra ..."
+         },
+         "wordInfoButton": "Obtener información de palabra",
+         "wordContextButton": "Buscar contextos",
+         "wordDistributionTitle": "Distribución de palabra en vocabulario",
+         "frequencyPerSetTitle": "Frecuencias de aparición por conjunto",
+         "contextList": "Lista de contextos"
+     }
+ }
modules/.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
modules/module_connection.py ADDED
@@ -0,0 +1,73 @@
+ import pandas as pd
+ import gradio as gr
+ from abc import ABC
+ from modules.module_word2Context import Word2Context
+
+ class Connector(ABC):
+     def parse_word(self, word: str):
+         return word.lower().strip()
+
+     def parse_words(self, array_in_string: str):
+         words = array_in_string.strip()
+         if not words:
+             return []
+         words = [self.parse_word(word) for word in words.split(',') if word.strip() != '']
+         return words
+
+     def process_error(self, err: str):
+         if err is None:
+             return
+         return "<center><h3>" + err + "</h3></center>"
+
+
+ class Word2ContextExplorerConnector(Connector):
+     def __init__(self, **kwargs):
+         vocabulary = kwargs.get('vocabulary', None)
+         context = kwargs.get('context', None)
+
+         if vocabulary is None or context is None:
+             raise KeyError
+         self.word2context_explorer = Word2Context(context, vocabulary)
+
+     def get_word_info(self, word):
+         err = ""
+         contexts = pd.DataFrame([], columns=[''])
+         subsets_info = ""
+         distribution_plot = None
+         word_cloud_plot = None
+         subsets_choice = gr.CheckboxGroup.update(choices=[])
+
+         err = self.word2context_explorer.errorChecking(word)
+         if err:
+             return self.process_error(err), contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+
+         word = self.parse_word(word)
+
+         subsets_info, subsets_origin_info = self.word2context_explorer.getSubsetsInfo(word)
+
+         clean_keys = [key.split(" ")[0].strip() for key in subsets_origin_info]
+         subsets_choice = gr.CheckboxGroup.update(choices=clean_keys)
+
+         distribution_plot = self.word2context_explorer.genDistributionPlot(word)
+         word_cloud_plot = self.word2context_explorer.genWordCloudPlot(word)
+
+         return self.process_error(err), contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+
+     def get_word_context(self, word, n_context, subset_choice):
+         word = self.parse_word(word)
+         n_context = int(n_context)
+         err = ""
+         contexts = pd.DataFrame([], columns=[''])
+
+         if len(subset_choice) > 0:
+             ds = self.word2context_explorer.findSplits(word, subset_choice)
+         else:
+             err = self.process_error("Error: Palabra no ingresada y/o conjunto/s de interés no seleccionado/s!")
+             return err, contexts
+
+         list_of_contexts = self.word2context_explorer.getContexts(word, n_context, ds)
+
+         contexts = pd.DataFrame(list_of_contexts, columns=['#','contexto','conjunto'])
+         contexts["buscar"] = contexts.contexto.apply(lambda text: self.word2context_explorer.genWebLink(text))
+
+         return self.process_error(err), contexts
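The connector above is what the Gradio callbacks in `interfaces/interface_datos.py` call into. A hedged usage sketch follows; the word `"casa"` and the `DGT` subset are illustrative values, not taken from this commit, and assume the word exists in the loaded vocabulary.

```python
# Hypothetical standalone use of the connector (values are examples only).
from modules.module_vocabulary import Vocabulary
from modules.module_connection import Word2ContextExplorerConnector

vocabulary = Vocabulary(subset_name="full")
connector = Word2ContextExplorerConnector(
    vocabulary=vocabulary,
    context="nanom/splittedspanish3bwc",
)

# Returns: message HTML, contexts DataFrame, subset-frequency HTML,
# distribution plot, word-cloud plot, and the CheckboxGroup update.
outputs = connector.get_word_info("casa")

# Up to 5 highlighted contexts drawn from the selected subsets.
msg, contexts = connector.get_word_context("casa", 5, ["DGT"])
```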
modules/module_customSubsetsLabel.py ADDED
@@ -0,0 +1,89 @@
+ class CustomSubsetsLabel:
+     def __init__(self):
+         self.html_head = """
+             <html>
+             <head>
+             <meta charset="utf-8">
+             <meta name="viewport" content="width=device-width, initial-scale=1">
+             <style>
+                 progress {
+                     -webkit-appearance: none;
+                 }
+                 progress::-webkit-progress-bar {
+                     background-color: #666;
+                     border-radius: 7px;
+                 }
+                 progress {
+                     width:100%;
+                     height:4px;
+                     border-radius: 1px;
+                 }
+                 #myturn {
+                     display: block;
+                     position: relative;
+                     margin: auto;
+                     width: 90%;
+                     padding: 2px;
+                 }
+             </style>
+             </head>
+             <body>
+         """
+
+         self.html_footer = "</body></html>"
+
+         self.subset_links = {
+             'allwikis': "https://github.com/josecannete/wikiextractorforBERT",
+             'DGT': "http://opus.nlpl.eu/DGT.php",
+             'DOGC': "http://opus.nlpl.eu/DOGC.php",
+             'ECB': "http://opus.nlpl.eu/ECB.php",
+             'EMEA': "http://opus.nlpl.eu/EMEA.php",
+             'EUBookShop': "http://opus.nlpl.eu/EUbookshop.php",
+             'Europarl': "http://opus.nlpl.eu/Europarl.php",
+             'GlobalVoices': "http://opus.nlpl.eu/GlobalVoices.php",
+             'JRC': "http://opus.nlpl.eu/JRC-Acquis.php",
+             'multiUN': "http://opus.nlpl.eu/MultiUN.php",
+             'NewsCommentary11': "http://opus.nlpl.eu/News-Commentary-v11.php",
+             'OpenSubtitles2018': "http://opus.nlpl.eu/OpenSubtitles-v2018.php",
+             'ParaCrawl': "http://opus.nlpl.eu/ParaCrawl.php",
+             'TED': "http://opus.nlpl.eu/TED2013.php",
+             'UN': "http://opus.nlpl.eu/UN.php",
+         }
+
+     def __progressbar(self, percentage, subset, freq, size=15):
+         html = f"""
+             <div id="myturn">
+                 <progress value="{int(percentage)}" max="100"></progress>
+                 <p style="text-align:left; font-size:{size}px; padding:0px;">
+                     <a href="{self.subset_links[subset]}" target="_blank">
+                         <strong>{subset}</strong> <span style="font-size:{size-2}px">(Frecuencia: {freq})</span>
+                     </a>
+                     <span style="float:right;">
+                         <strong>{percentage}%</strong>
+                     </span>
+                 </p>
+             </div>
+         """
+         return html
+
+     def __render(self, subsets, freqs, percentages):
+         html = ""
+         for subset, freq, perc in zip(subsets, freqs, percentages):
+             html += self.__progressbar(
+                 percentage=perc,
+                 subset=subset,
+                 freq=freq
+             )
+
+         return self.html_head + html + self.html_footer
+
+     def compute(self, subsets_dic):
+         subsets_dic_info = {
+             k.split()[0]: {'freq': int(k.split()[1][1:-1]), 'perc': round(v*100, 2)}
+             for k, v in subsets_dic.items()
+         }
+
+         subsets = list(subsets_dic_info.keys())
+         freqs = [d['freq'] for d in subsets_dic_info.values()]
+         percentages = [d['perc'] for d in subsets_dic_info.values()]
+         return self.__render(subsets, freqs, percentages)
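`compute()` expects keys in the `"<subset> (<freq>)"` format produced by `Word2Context.getSubsetsInfo()`, mapped to fractions of the word's total frequency. A minimal sketch with made-up numbers:

```python
from modules.module_customSubsetsLabel import CustomSubsetsLabel

label = CustomSubsetsLabel()
html = label.compute({
    "DGT (120)": 0.6,        # 60% of the word's occurrences come from DGT
    "Europarl (80)": 0.4,    # remaining 40% from Europarl
})
# `html` is a standalone HTML fragment with one progress bar per subset,
# ready to be rendered by the gr.HTML component in interface_datos.py.
```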
modules/module_logsManager.py ADDED
@@ -0,0 +1,175 @@
+ from distutils.log import debug
+ from gradio.flagging import FlaggingCallback, _get_dataset_features_info
+ from gradio.components import IOComponent
+ from gradio import utils
+ from typing import Any, List, Optional
+ from dotenv import load_dotenv
+ from datetime import datetime
+ import csv, os, pytz
+
+
+ # --- Load environment vars ---
+ load_dotenv()
+
+
+ # --- Classes declaration ---
+ class DateLogs:
+     def __init__(self, zone="America/Argentina/Cordoba"):
+         self.time_zone = pytz.timezone(zone)
+
+     def full(self):
+         now = datetime.now(self.time_zone)
+         return now.strftime("%H:%M:%S %d-%m-%Y")
+
+     def day(self):
+         now = datetime.now(self.time_zone)
+         return now.strftime("%d-%m-%Y")
+
+ class HuggingFaceDatasetSaver(FlaggingCallback):
+     """
+     A callback that saves each flagged sample (both the input and output data)
+     to a HuggingFace dataset.
+     Example:
+         import gradio as gr
+         hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "image-classification-mistakes")
+         def image_classifier(inp):
+             return {'cat': 0.3, 'dog': 0.7}
+         demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                             allow_flagging="manual", flagging_callback=hf_writer)
+     Guides: using_flagging
+     """
+
+     def __init__(
+         self,
+         hf_token: str = os.getenv('HF_TOKEN'),
+         dataset_name: str = os.getenv('DS_LOGS_NAME'),
+         organization: Optional[str] = os.getenv('ORG_NAME'),
+         private: bool = True,
+         available_logs: bool = False
+     ):
+         """
+         Parameters:
+             hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
+             dataset_name: The name of the dataset to save the data to, e.g. "image-classifier-1"
+             organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
+             private: Whether the dataset should be private (defaults to True).
+         """
+         self.hf_token = hf_token
+         self.dataset_name = dataset_name
+         self.organization_name = organization
+         self.dataset_private = private
+         self.datetime = DateLogs()
+         self.available_logs = available_logs
+
+         if not available_logs:
+             print("Push: logs DISABLED!...")
+
+
+     def setup(
+         self,
+         components: List[IOComponent],
+         flagging_dir: str
+     ):
+         """
+         Params:
+             flagging_dir (str): local directory where the dataset is cloned,
+             updated, and pushed from.
+         """
+         if self.available_logs:
+
+             try:
+                 import huggingface_hub
+             except (ImportError, ModuleNotFoundError):
+                 raise ImportError(
+                     "Package `huggingface_hub` not found. It is needed "
+                     "for HuggingFaceDatasetSaver. Try 'pip install huggingface_hub'."
+                 )
+
+             path_to_dataset_repo = huggingface_hub.create_repo(
+                 repo_id=os.path.join(self.organization_name, self.dataset_name),
+                 token=self.hf_token,
+                 private=self.dataset_private,
+                 repo_type="dataset",
+                 exist_ok=True,
+             )
+
+             self.path_to_dataset_repo = path_to_dataset_repo
+             self.components = components
+             self.flagging_dir = flagging_dir
+             self.dataset_dir = self.dataset_name
+
+             self.repo = huggingface_hub.Repository(
+                 local_dir=self.dataset_dir,
+                 clone_from=path_to_dataset_repo,
+                 use_auth_token=self.hf_token,
+             )
+
+             self.repo.git_pull(lfs=True)
+
+             # Should filename be user-specified?
+             # log_file_name = self.datetime.day()+"_"+self.flagging_dir+".csv"
+             self.log_file = os.path.join(self.dataset_dir, self.flagging_dir + ".csv")
+
+     def flag(
+         self,
+         flag_data: List[Any],
+         flag_option: Optional[str] = None,
+         flag_index: Optional[int] = None,
+         username: Optional[str] = None,
+     ) -> int:
+
+         if self.available_logs:
+             self.repo.git_pull(lfs=True)
+
+             is_new = not os.path.exists(self.log_file)
+
+             with open(self.log_file, "a", newline="", encoding="utf-8") as csvfile:
+                 writer = csv.writer(csvfile)
+
+                 # File previews for certain input and output types
+                 infos, file_preview_types, headers = _get_dataset_features_info(
+                     is_new, self.components
+                 )
+
+                 # Generate the headers and dataset_infos
+                 if is_new:
+                     headers = [
+                         component.label or f"component {idx}"
+                         for idx, component in enumerate(self.components)
+                     ] + [
+                         "flag",
+                         "username",
+                         "timestamp",
+                     ]
+                     writer.writerow(utils.sanitize_list_for_csv(headers))
+
+                 # Generate the row corresponding to the flagged sample
+                 csv_data = []
+                 for component, sample in zip(self.components, flag_data):
+                     save_dir = os.path.join(
+                         self.dataset_dir,
+                         utils.strip_invalid_filename_characters(component.label),
+                     )
+                     filepath = component.deserialize(sample, save_dir, None)
+                     csv_data.append(filepath)
+                     if isinstance(component, tuple(file_preview_types)):
+                         csv_data.append(
+                             "{}/resolve/main/{}".format(self.path_to_dataset_repo, filepath)
+                         )
+
+                 csv_data.append(flag_option if flag_option is not None else "")
+                 csv_data.append(username if username is not None else "")
+                 csv_data.append(self.datetime.full())
+                 writer.writerow(utils.sanitize_list_for_csv(csv_data))
+
+             with open(self.log_file, "r", encoding="utf-8") as csvfile:
+                 line_count = len([None for row in csv.reader(csvfile)]) - 1
+
+             self.repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))
+
+         else:
+             line_count = 0
+             print("Logs: Virtual push...")
+
+         return line_count
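The saver reads its credentials from environment variables through python-dotenv (note that `.env` is git-ignored in this commit). A minimal sketch of the expected configuration, with the variable names taken from the `os.getenv` calls above and placeholder values:

```python
import os

# Local stand-in for a .env file; values are placeholders, not real credentials.
os.environ.setdefault("HF_TOKEN", "hf_xxxxxxxxxxxxxxxx")   # token with write access to the logs dataset
os.environ.setdefault("DS_LOGS_NAME", "edia-datos-logs")   # dataset repo that receives the flagged rows
os.environ.setdefault("ORG_NAME", "vialibre")              # organization the dataset repo is created under
```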
modules/module_segmentedWordCloud.py ADDED
@@ -0,0 +1,64 @@
+ from wordcloud import WordCloud
+ import matplotlib.pyplot as plt
+
+
+ class SimpleGroupedColorFunc(object):
+     """Create a color function object which assigns EXACT colors
+     to certain words based on the color to words mapping
+
+     Parameters
+     ----------
+     color_to_words : dict(str -> list(str))
+         A dictionary that maps a color to the list of words.
+
+     default_color : str
+         Color that will be assigned to a word that's not a member
+         of any value from color_to_words.
+     """
+
+     def __init__(self, color_to_words, default_color):
+         self.word_to_color = {
+             word: color
+             for (color, words) in color_to_words.items()
+             for word in words
+         }
+
+         self.default_color = default_color
+
+     def __call__(self, word, **kwargs):
+         return self.word_to_color.get(word, self.default_color)
+
+
+ class SegmentedWordCloud:
+     def __init__(self, freq_dic, less_group, greater_group):
+         colors = {
+             'less': '#529ef3',
+             'salient': '#d35400',
+             'greater': '#5d6d7e',
+         }
+
+         color_to_words = {
+             colors['greater']: greater_group,
+             colors['less']: less_group,
+         }
+
+         grouped_color_func = SimpleGroupedColorFunc(
+             color_to_words=color_to_words,
+             default_color=colors['salient']
+         )
+
+         self.wc = WordCloud(
+             background_color="white",
+             width=900,
+             height=300,
+             random_state=None).generate_from_frequencies(freq_dic)
+
+         self.wc.recolor(color_func=grouped_color_func)
+
+     def plot(self, figsize):
+         fig, ax = plt.subplots(figsize=figsize)
+         ax.imshow(self.wc, interpolation="bilinear")
+         ax.axis("off")
+         fig.tight_layout()
+         return fig
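A hedged usage sketch of the segmented cloud, mirroring how `Word2Context.genWordCloudPlot()` calls it; the frequencies and group membership below are illustrative only. Words in `less_group` / `greater_group` take the blue / grey colors defined above, and any other word keeps the salient (orange) default.

```python
from modules.module_segmentedWordCloud import SegmentedWordCloud

freq_dic = {"casa": 120, "casona": 40, "caza": 90}   # made-up frequencies
wc = SegmentedWordCloud(freq_dic, less_group=["casona"], greater_group=["caza"])
fig = wc.plot(figsize=(9, 3))   # matplotlib Figure, as returned to gr.Plot
```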
modules/module_vocabulary.py ADDED
@@ -0,0 +1,85 @@
+ from memory_profiler import profile
+ import pandas as pd
+
+ class Vocabulary:
+     @profile
+     def __init__(self, subset_name):
+         # Dataset info
+         self.subset_name = subset_name
+         self.ds_path = f"data/{subset_name}_vocab_v6.zip"
+
+         # Pandas dataset
+         self.df_vocab = None
+
+         # Minimal list of (percentile, freq) tuples used to plot the word distribution graph
+         self.histogram = None
+
+         # Load vocabulary dataset
+         self.__load()
+
+     def __contains__(self, word):
+         return word in self.df_vocab['word'].to_list()
+
+     def __load(self):
+         print(f"Preparing {self.subset_name} vocabulary...")
+
+         # --- Download vocab dataset ---
+         self.df_vocab = pd.read_json(self.ds_path)
+
+         # --- Create min histogram to plot the word distribution graph ---
+         x_values = self.df_vocab['percentile'].to_list()
+         y_values = self.df_vocab['freq'].to_list()
+
+         # Delete duplicated tuples
+         uniques_tups_list = set(list(zip(x_values, y_values)))
+         # Keep only tuples with distinct first elements
+         uniques_tups_list = dict(uniques_tups_list)
+
+         self.histogram = sorted(
+             uniques_tups_list.items(),
+             key=lambda tup: tup[0],
+             reverse=True
+         )
+
+     def __getValue(self, word, feature):
+         word_id, value = None, None
+
+         if word in self:
+             word_id = self.df_vocab['word'].to_list().index(word)
+
+         if word_id is not None:
+             value = self.df_vocab[feature].to_list()[word_id]
+
+         return value
+
+     def getFreq(self, word):
+         return self.__getValue(word, 'freq')
+
+     def getPercentile(self, word):
+         return self.__getValue(word, 'percentile')
+
+     def getSplits(self, word):
+         return self.__getValue(word, 'splits')
+
+     def getSubsets(self, word):
+         return self.__getValue(word, 'in_subset')
+
+     def distribution(self):
+         x_values, y_values = zip(*self.histogram)
+         return x_values, y_values
+
+     def getWordNeighbors(self, word, n_neighbors=20):
+         word_id = self.df_vocab['word'].to_list().index(word)
+         words = self.df_vocab['word'].to_list()
+         freqs = self.df_vocab['freq'].to_list()
+         l_sorted = list(zip(words, freqs))
+
+         g = l_sorted[max(0, word_id-n_neighbors):word_id]   # vocabulary positions before the word
+         e = l_sorted[word_id]                               # the word itself
+         l = l_sorted[word_id+1:word_id+n_neighbors]         # vocabulary positions after the word
+
+         dic = dict(g + [e] + l)
+         l = [x[0] for x in l]
+         g = [x[0] for x in g]
+
+         return dic, l, g
+
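The class above only tells us the vocabulary's shape indirectly, through the columns it accesses (`word`, `freq`, `percentile`, `splits`, `in_subset`). A hedged two-row sketch of that shape, with illustrative values rather than real entries from `data/full_vocab_v6.zip`:

```python
import pandas as pd

# Hypothetical frame in the layout Vocabulary.__load() reads; values are examples only.
example_vocab = pd.DataFrame({
    "word":       ["casa", "casona"],
    "freq":       [12345, 678],                                        # total occurrences
    "percentile": [99.2, 87.5],                                        # x-axis of the distribution plot
    "splits":     [["DGT_0", "Europarl_3"], ["OpenSubtitles2018_1"]],  # dataset splits containing the word
    "in_subset":  [{"DGT": 9000, "Europarl": 3345},                    # per-subset frequencies
                   {"OpenSubtitles2018": 678}],
})
```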
modules/module_word2Context.py ADDED
@@ -0,0 +1,199 @@
+ from datasets import load_dataset, interleave_datasets
+ from modules.module_segmentedWordCloud import SegmentedWordCloud
+ from modules.module_customSubsetsLabel import CustomSubsetsLabel
+
+ from random import sample as random_sample
+ import re
+
+ import matplotlib as mpl
+ mpl.use('Agg')
+ import matplotlib.pyplot as plt
+
+
+ class Word2Context:
+     def __init__(self, context_ds_name, vocabulary):
+         self.context_ds_name = context_ds_name
+
+         # Vocabulary class
+         self.vocab = vocabulary
+
+         # Custom Label component
+         self.Label = CustomSubsetsLabel()
+
+     def errorChecking(self, word):
+         out_msj = ""
+
+         if not word:
+             out_msj = "Error: Primero debe ingresar una palabra!"
+         else:
+             if word not in self.vocab:
+                 out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
+
+         return out_msj
+
+     def genWebLink(self, text):
+         text = text.replace("\"", "'")
+         text = text.replace("<u><b>", "")
+         text = text.replace("</b></u>", "")
+         url = "https://www.google.com.tr/search?q={}".format(text)
+         return '<a href="{}" rel="noopener noreferrer" target="_blank"><center>🌐🔍</center></a>'.format(url)
+
+     def genWordCloudPlot(self, word, figsize=(9,3)):
+         freq_dic, l_group, g_group = self.vocab.getWordNeighbors(word, n_neighbors=10)
+         wc = SegmentedWordCloud(freq_dic, l_group, g_group)
+         return wc.plot(figsize)
+
+     def genDistributionPlot(self, word, figsize=(6,1)):
+         x_values, y_values = self.vocab.distribution()
+         w_percentile = self.vocab.getPercentile(word)
+         w_freq = self.vocab.getFreq(word)
+
+         fig, ax = plt.subplots(figsize=figsize)
+         ax.plot(x_values, y_values, color='green')
+         ax.fill_between(x_values, y_values, color='lightgreen',)
+
+         # -- Uncomment if wordcloud is enabled in the application interface --
+         # ax.axvline(x=max(0,w_percentile-.01),
+         #     color='blue',
+         #     linewidth=7,
+         #     alpha=.2,
+         #     linestyle='-'
+         # )
+         # ax.axvline(x=min(100,w_percentile+.01),
+         #     color='black',
+         #     linewidth=7,
+         #     alpha=.2,
+         #     linestyle='-'
+         # )
+         ax.axvline(x=w_percentile,
+             color='#d35400',
+             linewidth=2,
+             linestyle='--',
+             label=f'{w_freq}\n(frecuencia total)'
+         )
+
+         ax.axis('off')
+         plt.legend(loc='upper left', prop={'size': 7})
+         return fig
+
+     def findSplits(self, word, subsets_list):
+         w_splits = self.vocab.getSplits(word)
+
+         splits_list = []
+         for subset in subsets_list:
+             current_split_list = []
+             for s in w_splits:
+                 if (subset == s.split("_")[0]):
+                     current_split_list.append(s)
+
+             if current_split_list:
+                 splits_list.append(current_split_list)
+
+         splits_list = [random_sample(s_list, 1)[0] for s_list in splits_list]
+
+         ds_list = [
+             load_dataset(path=self.context_ds_name, name=split, streaming=True, split='all')
+             for split in splits_list
+         ]
+
+         datasets = ds_list[0]
+         if len(ds_list) > 1:
+             datasets = interleave_datasets(ds_list, probabilities=None)
+
+         return datasets
+
+     def findContexts(self, sample, word):
+         sample = sample['text'].strip()
+         context = ""
+         m = re.search(r'\b{}\b'.format(word), sample)
+         if m:
+             init = m.span()[0]
+             end = init + len(word)
+             context = sample[:init] + "<u><b>" + word + "</b></u>" + sample[end:]
+         return {'context': context}
+
+     def getSubsetsInfo(self, word):
+         total_freq = self.vocab.getFreq(word)
+         subsets_name_list = list(self.vocab.getSubsets(word).keys())
+         subsets_freq_list = list(self.vocab.getSubsets(word).values())
+
+         # Create the subset frequency dict for the subsets_freq component
+         subsets_info = {
+             s_name + f" ({s_freq})": s_freq/total_freq
+             for s_name, s_freq in zip(subsets_name_list, subsets_freq_list)
+         }
+
+         subsets_origin_info = dict(sorted(subsets_info.items(), key=lambda x: x[1], reverse=True))
+         subsets_info = self.Label.compute(subsets_origin_info)
+         return subsets_info, subsets_origin_info
+
+     def getContexts(self, word, n_context, ds):
+         ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
+         only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
+         shuffle_contexts = only_contexts.shuffle(buffer_size=10)
+
+         list_of_dict = list(shuffle_contexts.take(n_context))
+         list_of_contexts = [(i, dic['context'], dic['subset']) for i, dic in enumerate(list_of_dict)]
+
+         return list_of_contexts
+
+     # TODO: The methods below can be removed, or kept as wrappers around the methods above
+     '''
+     def getWordInfo(self, word):
+         errors = ""
+         contexts = pd.DataFrame([], columns=[''])
+         subsets_info = ""
+         distribution_plot = None
+         word_cloud_plot = None
+         subsets_choice = gr.CheckboxGroup.update(choices=[])
+
+         errors = self.errorChecking(word)
+         if errors:
+             return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+
+         total_freq = self.vocab.getFreq(word)
+         subsets_name_list = list(self.vocab.getSubsets(word).keys())
+         subsets_freq_list = list(self.vocab.getSubsets(word).values())
+
+         # Create the subset frequency dict for the subsets_freq component
+         subsets_info = {
+             s_name + f" ({s_freq})": s_freq/total_freq
+             for s_name, s_freq in zip(subsets_name_list, subsets_freq_list)
+         }
+         subsets_origin_info = dict(sorted(subsets_info.items(), key=lambda x: x[1], reverse=True))
+         subsets_info = self.Label.compute(subsets_origin_info)
+
+         # Create the sorted list for the subsets_choice component
+         clean_keys = [key.split(" ")[0].strip() for key in subsets_origin_info]
+         subsets_choice = gr.CheckboxGroup.update(choices=clean_keys)
+
+         # Get the word distribution and wordcloud graphs
+         distribution_plot = self.genDistributionPlot(word)
+         word_cloud_plot = self.genWordCloudPlot(word)
+
+         return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+
+     def getWordContext(self, word, n_context, subset_choice):
+         n_context = int(n_context)
+         errors = ""
+
+         if len(subset_choice) > 0:
+             ds = self.findSplits(word, subset_choice)
+
+         else:
+             errors = "Error: Palabra no ingresada y/o conjunto/s de interés no seleccionado/s!"
+             errors = "<center><h3>" + errors + "</h3></center>"
+             return errors, pd.DataFrame([], columns=[''])
+
+         ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
+         only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
+         shuffle_contexts = only_contexts.shuffle(buffer_size=10)
+
+         list_of_dict = list(shuffle_contexts.take(n_context))
+         list_of_contexts = [(i, dic['context'], dic['subset']) for i, dic in enumerate(list_of_dict)]
+
+         contexts = pd.DataFrame(list_of_contexts, columns=['#','contexto','conjunto'])
+         contexts["buscar"] = contexts.contexto.apply(lambda text: self.genWebLink(text))
+
+         return errors, contexts
+     '''
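A hedged end-to-end sketch of the context-search path that the connector drives: pick a word, sample one split per chosen subset, stream it, and extract highlighted contexts. The word `"casa"` and the `DGT` subset are illustrative values, and the sketch assumes the vocabulary from app.py is available locally.

```python
from modules.module_vocabulary import Vocabulary
from modules.module_word2Context import Word2Context

vocabulary = Vocabulary(subset_name="full")
w2c = Word2Context("nanom/splittedspanish3bwc", vocabulary)

word = "casa"                                              # must be present in the vocabulary
subsets_html, subsets_origin = w2c.getSubsetsInfo(word)    # per-subset frequency summary (HTML + dict)
ds = w2c.findSplits(word, ["DGT"])                         # streaming dataset over the chosen subsets
rows = w2c.getContexts(word, n_context=5, ds=ds)           # [(idx, highlighted context, subset), ...]
```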
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ regex
+ torch
+ transformers
+ wordcloud
+ matplotlib
+ numpy
+ uuid
+ python-dotenv
+ memory_profiler
tool_info.py ADDED
@@ -0,0 +1,23 @@
+ TOOL_INFO = """
+ > ### A tool to overcome technical barriers for bias assessment in human language technologies
+
+ * [Read Full Paper](https://arxiv.org/abs/2207.06591)
+
+ > ### Licensing Information
+ * [MIT Licence](https://huggingface.co/spaces/vialibre/vialibre/edia_datos_es/resolve/main/LICENSE)
+
+ > ### Citation Information
+ ```c
+ @misc{https://doi.org/10.48550/arxiv.2207.06591,
+     doi = {10.48550/ARXIV.2207.06591},
+     url = {https://arxiv.org/abs/2207.06591},
+     author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
+     keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI),
+     FOS: Computer and information sciences, FOS: Computer and information sciences},
+     title = {A tool to overcome technical barriers for bias assessment in human language technologies},
+     publisher = {arXiv},
+     year = {2022},
+     copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
+ }
+ ```
+ """