import random
from mtranslate import translate
import streamlit as st
import seaborn as sns
from spacy import displacy
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    pipeline
)


LOGO = "https://huggingface.co/bertin-project/bertin-roberta-base-spanish/resolve/main/images/bertin.png"
WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""

PROMPT_LIST = [
    "Fui a la librería a comprar un <mask>.",
    "¡Qué buen <mask> hace hoy!",
    "Hoy empiezan las vacaciones así que vamos a la <mask>.",
    "Mi color favorito es el <mask>.",
    "Voy a <mask> porque estoy muy cansada.",
    "Mañana vienen mis amigos de <mask>.",
    "¿Te apetece venir a <mask> conmigo?",
    "En verano hace mucho <mask>.",
    "En el bosque había <mask>.",
    "El ministro dijo que <mask> los impuestos.",
    "Si no estuviera afónica, <mask> esa canción.",
    "Parece que ha salido el <mask>, por eso hace tanto calor.",
    "Al pan, pan, y al vino, <mask>.",
]

PAWS_X_PROMPT_LIST = [
    "Te amo.</s>Te adoro.",
    "Te amo.</s>Te detesto.",
    "Te amo.</s>Voy a caminar al campo."
]


@st.cache(show_spinner=False, persist=True)
def load_model(masked_text, model_url):
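    """Load the fill-mask pipeline for `model_url`, run it on `masked_text` and
    return the predictions (cached by Streamlit across reruns)."""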
    model = AutoModelForMaskedLM.from_pretrained(model_url)
    tokenizer = AutoTokenizer.from_pretrained(model_url)
    nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
    result = nlp(masked_text)
    return result


@st.cache(show_spinner=False, persist=True)
def load_model_pair_classification(text, model_url_pair_classification):
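    """Run sentence-pair classification with the XNLI fine-tuned model and return
    a formatted entailment/neutral/contradiction score string."""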
    model = AutoModelForSequenceClassification.from_pretrained(model_url_pair_classification)
    tokenizer = AutoTokenizer.from_pretrained(model_url_pair_classification)
    nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)
    result = nlp(f"{text}</s>")
    if result[0]["label"] == "entailment":
        return f"Entailment: {result[0]['score']:.2f}"
    if result[0]["label"] == "neutral":
        return f"Neutral: {result[0]['score']:.2f}"
    return f"Contradiction: {result[0]['score']:.2f}"


# Page
st.set_page_config(page_title="BERTIN Demo", page_icon=LOGO)
st.title("BERTIN")

# Sidebar
st.sidebar.markdown(f"""
<div align=center>
<img src="{LOGO}" width=200/>

# BERTIN

</div>

BERTIN is a series of BERT-based models for Spanish.

The models are trained with Flax on TPUs sponsored by Google as part of the
[Flax/JAX Community Week](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104)
organised by HuggingFace.

Please read our [full report](https://huggingface.co/bertin-project/bertin-roberta-base-spanish) for more details on the methodology and metrics on downstream tasks.

""", unsafe_allow_html=True)

# Body
st.markdown(
    """
    All models are variations of **RoBERTa-base** trained from scratch in **Spanish** using a sample from the **mc4 dataset**.
    We reduced the dataset size to 50 million documents to keep training times short, and also to be able to bias the selection of training examples based on their perplexity.

    The idea is to favour examples with perplexities that are neither too low (short, repetitive texts) nor too high (potentially poor quality). There are three versions of the sampling procedure (producing three different series of models):
    * **Random** sampling is the control baseline and simply takes documents at random with uniform probability to reduce the dataset size.
    * **Gaussian** sampling rejects documents with higher probability at very low and very high perplexities, by weighting the perplexity distribution with a Gaussian function.
    * **Stepwise** sampling applies a different sampling probability to each of the four quartiles of the perplexity distribution.

    The first models were trained for 250,000 steps with sequence length 128; training for the Gaussian model then switched to sequence length 512 for the last 25,000 steps, yielding another version.

    Please read our [full report](https://huggingface.co/bertin-project/bertin-roberta-base-spanish) for more details on the methodology and metrics on downstream tasks.

    ### Masked language modeling

    Here you can play with the fill-mask objective of all the models.

    """
)
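
# Illustrative sketch only (not used by the app): one possible implementation of the
# Gaussian perplexity-weighted subsampling described above. The shape of the weighting
# follows that description, but the mean/std values here are placeholder assumptions,
# not the parameters actually used to train BERTIN.
import math


def gaussian_keep_document(perplexity, mean=8000.0, std=3000.0):
    # Weight peaks near the centre of the perplexity distribution and decays
    # for very low and very high perplexities.
    weight = math.exp(-((perplexity - mean) ** 2) / (2 * std ** 2))
    # Keep the document with probability equal to its weight.
    return random.random() < weight
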

col1, col2, col3 = st.columns(3)
strategy = col1.selectbox("Sampling strategy", ["Gaussian", "Stepwise", "Random"])
seq_len = col2.selectbox("Sequence length", [128, 512])

if seq_len == 128:
    model_url = f"bertin-project/bertin-base-{str(strategy).lower()}"
else:
    model_url = f"bertin-project/bertin-base-{str(strategy).lower()}-exp-512seqlen"

prompt = col3.selectbox("Prompt", ["Random", "Custom"])
if prompt == "Custom":
    prompt_box = "Enter your masked text here..."
else:
    prompt_box = random.choice(PROMPT_LIST)
text = st.text_area("Enter text", prompt_box)

if st.button("Fill the mask"):
    with st.spinner(text="Filling the mask..."):
        st.subheader("Result")
        result = load_model(text, model_url)
        result_sequence = result[0]["sequence"]
        st.write(result_sequence)
        st.write("_English_ _translation:_", translate(result_sequence, "en", "es"))
        st.write(result)


def make_color_palette(labels):
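    """Build a label -> hex colour mapping from a seaborn colour palette."""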
    color_palette = sns.color_palette(n_colors=len(labels))
    color_map = {x: rgb2hex(*y) for x, y in zip(labels, color_palette)}
    return color_map


@st.cache(allow_output_mutation=True)
def get_colormap(labels):
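    """Cached wrapper around make_color_palette so label colours stay stable across reruns."""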
    color_map = make_color_palette(labels)
    return color_map


def add_colormap(labels):
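    """Extend the cached colour map with a random colour for any label it is missing."""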
    color_map = get_colormap(labels)
    for label in labels:
        if label not in color_map:
            rand_color = "#" + "%06x" % random.randint(0, 0xFFFFFF)
            color_map[label] = rand_color
    return color_map



def load_model_ner(model_url):
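    """Build a token-classification (NER) pipeline for the model at `model_url`."""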
    config = AutoConfig.from_pretrained(model_url)
    model = AutoModelForTokenClassification.from_pretrained(
        model_url, config=config
    )
    tokenizer = AutoTokenizer.from_pretrained(model_url, use_fast=True)
    return pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        ignore_labels=[],
        aggregation_strategy="simple",
    )


def display(entities):
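    """Render the recognised entities with spaCy's displacy and embed the HTML in the page."""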
    doc = model_entities_to_displacy_format(entities, ignore_entities=["O"])
    labels = list(set([ent["label"] for ent in doc["ents"]]))
    color_map = add_colormap(labels)
    html = displacy.render(
        doc,
        manual=True,
        style="ent",
        options={"colors": color_map}
    )
    html = html.replace("\n", " ")
    st.write(WRAPPER.format(html), unsafe_allow_html=True)


def rgb2hex(r, g, b):
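    """Convert RGB components in the [0, 1] range to a hex colour string."""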
    return "#{:02x}{:02x}{:02x}".format(
        int(r * 255), int(g * 255), int(b * 255)
    )


def model_entities_to_displacy_format(ents, ignore_entities=[]):
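    """Convert pipeline entity dicts into the manual "ents" format expected by displacy."""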
    s_ents = {}
    s_ents["text"] = " ".join([e["word"] for e in ents])
    spacy_ents = []
    start_pointer = 0
    if isinstance(ents, list) and "entity_group" in ents[0]:
        entity_key = "entity_group"
    else:
        entity_key = "entity"
    for i, ent in enumerate(ents):
        if ent[entity_key] not in ignore_entities:
            spacy_ents.append({
                "start": start_pointer,
                "end": start_pointer + len(ent["word"]),
                "label": ent[entity_key],
            })
        start_pointer = start_pointer + len(ent["word"]) + 1
    s_ents["ents"] = spacy_ents
    s_ents["title"] = None
    return s_ents

st.markdown("""

### Fine-tuning on CoNLL 2002 (es) for Named Entity Recognition (NER)

Here you can play with the RoBERTa Base Gaussian Seq Len 512 model fine-tuned on conll2002-es.

""")
text_input = str(st.text_input(
    "Text",
    "Mi nombre es Íñigo Montoya. Viajo a Los Acantilados de la Locura "
))
ner_model_url = "bertin-project/bertin-base-ner-conll2002-es"
label2id = AutoConfig.from_pretrained(ner_model_url, cache=False).label2id
color_map = get_colormap(list(label2id.keys()))
if st.button("Recognize named entities"):
    with st.spinner(text="Recognizing named entities..."):
        ner = load_model_ner(ner_model_url)
        entities = ner(str(text_input))
        st.write("_English_ _translation:_", translate(str(text_input), "en", "es"))
        if entities:
            if isinstance(entities, dict) and "error" in entities:
                st.write(entities)
            else:
                display(entities)
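                # Copy the entities, casting offsets and scores to built-in Python
                # types (the pipeline may return numpy scalars) before displaying.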
                raw_entities = []
                for entity in entities:
                    raw_entity = entity
                    raw_entity["start"] = int(raw_entity["start"])
                    raw_entity["end"] = int(raw_entity["end"])
                    raw_entity["score"] = float(raw_entity["score"])
                    raw_entities.append(raw_entity)
                st.write(raw_entities)
        else:
            st.write("No entities found")

st.markdown(
    """
    ### Fine-tuning on XNLI
    Here you can play with the RoBERTa Base Gaussian Seq Len 512 model fine-tuned on XNLI.
    """
)

pawsx_model_url = "bertin-project/bertin-base-xnli-es"
paraphrase_prompt = st.selectbox("Paraphrase Prompt", ["Random", "Custom"])
if paraphrase_prompt == "Custom":
    paraphrase_prompt_box = "Enter two sentences separated by </s> here..."
else:
    paraphrase_prompt_box = random.choice(PAWS_X_PROMPT_LIST)
text = st.text_area("Enter text", paraphrase_prompt_box)
if st.button("Classify paraphrasing"):
    with st.spinner(text="Classifying paraphrasing..."):
        st.subheader("Classification result")
        paraphrase_score = load_model_pair_classification(text, pawsx_model_url)
        st.write("_English_ _translation:_", translate(text, "en", "es"))
        st.write(paraphrase_score)

st.markdown(
    """
    ### Team members
    - Eduardo González ([edugp](https://huggingface.co/edugp))
    - Javier de la Rosa ([versae](https://huggingface.co/versae))
    - Manu Romero ([mrm8488](https://huggingface.co/mrm8488))
    - María Grandury ([mariagrandury](https://huggingface.co/mariagrandury))
    - Pablo González de Prado ([Pablogps](https://huggingface.co/Pablogps))
    - Paulo Villegas ([paulo](https://huggingface.co/paulo))

    ### More information
    You can find more information about these models
    [here](https://huggingface.co/bertin-project/bertin-roberta-base-spanish).
    """
)