versae committed
Commit 2388248
1 Parent(s): 9ed2311

New changes to demo

Files changed (2)
  1. app.py +177 -16
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,25 +1,20 @@
 import random
 from mtranslate import translate
 import streamlit as st
-from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
+import seaborn as sns
+from spacy import displacy
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    AutoModelForMaskedLM,
+    AutoModelForSequenceClassification,
+    AutoModelForTokenClassification,
+    pipeline
+)
 
 
 LOGO = "https://huggingface.co/bertin-project/bertin-roberta-base-spanish/resolve/main/images/bertin.png"
-
-MODELS = {
-    "RoBERTa Base Gaussian Seq Len 128": {
-        "url": "bertin-project/bertin-base-gaussian"
-    },
-    "RoBERTa Base Gaussian Seq Len 512": {
-        "url": "bertin-project/bertin-base-gaussian-exp-512seqlen"
-    },
-    "RoBERTa Base Random Seq Len 128": {
-        "url": "bertin-project/bertin-base-random"
-    },
-    "RoBERTa Base Stepwise Seq Len 128": {
-        "url": "bertin-project/bertin-base-stepwise"
-    },
-}
+WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
 
 PROMPT_LIST = [
     "Fui a la librería a comprar un <mask>.",
@@ -37,6 +32,12 @@ PROMPT_LIST = [
     "Al pan, pan, y al vino, <mask>.",
 ]
 
+PAWS_X_PROMPT_LIST = [
+    "Te amo.</s>Te adoro.",
+    "Te odio.</s>Te detesto.",
+    "Me gusta montar en bicicleta.</s>París es una ciudad francesa."
+]
+
 
 @st.cache(show_spinner=False, persist=True)
 def load_model(masked_text, model_url):
@@ -47,6 +48,26 @@ def load_model(masked_text, model_url):
     return result
 
 
+@st.cache(show_spinner=False, persist=True)
+def load_model(masked_text, model_url):
+    model = AutoModelForMaskedLM.from_pretrained(model_url)
+    tokenizer = AutoTokenizer.from_pretrained(model_url)
+    nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+    result = nlp(masked_text)
+    return result
+
+
+@st.cache(show_spinner=False, persist=True)
+def load_model_pair_classification(text, model_url_pair_classification):
+    model = AutoModelForSequenceClassification.from_pretrained(model_url_pair_classification)
+    tokenizer = AutoTokenizer.from_pretrained(model_url_pair_classification)
+    nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)
+    result = nlp(f"{text}</s>")
+    if result[0]["label"] == "LABEL_0":
+        return f"Different meaning: {result[0]['score']:.2f}"
+    return f"Paraphrase: {result[0]['score']:.2f}"
+
+
 # Page
 st.set_page_config(page_title="BERTIN Demo", page_icon=LOGO)
 st.title("BERTIN")
@@ -84,6 +105,11 @@ st.markdown(
 The first models were trained for 250,000 steps at sequence length 128; training of the Gaussian model then continued at sequence length 512 for a final 25,000 steps to yield another version.
 
 Please read our [full report](https://huggingface.co/bertin-project/bertin-roberta-base-spanish) for more details on the methodology and metrics on downstream tasks.
+
+### Masked language modeling
+
+Here you can play with the fill-mask objective of all the models.
+
 """
 )
 
@@ -112,6 +138,141 @@ if st.button("Fill the mask"):
         st.write("_English_ _translation:_", translate(result_sequence, "en", "es"))
         st.write(result)
 
+st.markdown(
+    """
+### Fine-tuning to PAWS-X for paraphrase identification
+Here you can play with the RoBERTa Base Gaussian Seq Len 512 model fine-tuned to PAWS-X.
+    """
+)
+
+pawsx_model_url = "bertin-project/bertin-base-paws-x-es"
+paraphrase_prompt = st.selectbox("Paraphrase Prompt", ["Random", "Custom"])
+if paraphrase_prompt == "Custom":
+    paraphrase_prompt_box = "Enter two sentences separated by </s> here..."
+else:
+    paraphrase_prompt_box = random.choice(PAWS_X_PROMPT_LIST)
+text = st.text_area("Enter text", paraphrase_prompt_box)
+if st.button("Classify paraphrasing"):
+    with st.spinner(text="Classifying paraphrasing..."):
+        st.subheader("Classification result")
+        paraphrase_score = load_model_pair_classification(text, pawsx_model_url)
+        st.write("_English_ _translation:_", translate(text, "en", "es"))
+        st.write(paraphrase_score)
+
+
+def make_color_palette(labels):
+    color_palette = sns.color_palette(n_colors=len(labels))
+    color_map = {x: rgb2hex(*y) for x, y in zip(labels, color_palette)}
+    return color_map
+
+
+@st.cache(allow_output_mutation=True)
+def get_colormap(labels):
+    color_map = make_color_palette(labels)
+    return color_map
+
+
+def add_colormap(labels):
+    color_map = get_colormap(labels)
+    for label in labels:
+        if label not in color_map:
+            rand_color = "#" + "%06x" % random.randint(0, 0xFFFFFF)
+            color_map[label] = rand_color
+    return color_map
+
+
+
+def load_model_ner(model_url):
+    config = AutoConfig.from_pretrained(model_url)
+    model = AutoModelForTokenClassification.from_pretrained(
+        model_url, config=config
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_url, use_fast=True)
+    return pipeline(
+        "ner",
+        model=model,
+        tokenizer=tokenizer,
+        ignore_labels=[],
+        aggregation_strategy="simple",
+    )
+
+
+def display(entities):
+    doc = model_entities_to_displacy_format(entities, ignore_entities=["O"])
+    labels = list(set([ent["label"] for ent in doc["ents"]]))
+    color_map = add_colormap(labels)
+    html = displacy.render(
+        doc,
+        manual=True,
+        style="ent",
+        options={"colors": color_map}
+    )
+    html = html.replace("\n", " ")
+    st.write(WRAPPER.format(html), unsafe_allow_html=True)
+
+
+def rgb2hex(r, g, b):
+    return "#{:02x}{:02x}{:02x}".format(
+        int(r * 255), int(g * 255), int(b * 255)
+    )
+
+
+def model_entities_to_displacy_format(ents, ignore_entities=[]):
+    s_ents = {}
+    s_ents["text"] = " ".join([e["word"] for e in ents])
+    spacy_ents = []
+    start_pointer = 0
+    if isinstance(ents, list) and "entity_group" in ents[0]:
+        entity_key = "entity_group"
+    else:
+        entity_key = "entity"
+    for i, ent in enumerate(ents):
+        if ent[entity_key] not in ignore_entities:
+            spacy_ents.append({
+                "start": start_pointer,
+                "end": start_pointer + len(ent["word"]),
+                "label": ent[entity_key],
+            })
+        start_pointer = start_pointer + len(ent["word"]) + 1
+    s_ents["ents"] = spacy_ents
+    s_ents["title"] = None
+    return s_ents
+
+st.markdown("""
+
+### Fine-tuning to CoNLL 2002 es for Named Entity Recognition (NER)
+
+Here you can play with the RoBERTa Base Gaussian Seq Len 512 model fine-tuned to conll2002-es.
+
+""")
+text_input = str(st.text_input(
+    "Text",
+    "Mi nombre es Íñigo Montoya. Viajo a Los Acantilados de la Locura "
+))
+ner_model_url = "bertin-project/bertin-base-ner-conll2002-es"
+label2id = AutoConfig.from_pretrained(ner_model_url, cache=False).label2id
+color_map = get_colormap(list(label2id.keys()))
+if st.button("Recognize named entities"):
+    with st.spinner(text="Recognizing named entities..."):
+        ner = load_model_ner(ner_model_url)
+        entities = ner(str(text_input))
+        st.write("_English_ _translation:_", translate(str(text_input), "en", "es"))
+        if entities:
+            if isinstance(entities, dict) and "error" in entities:
+                st.write(entities)
+            else:
+                display(entities)
+                raw_entities = []
+                for entity in entities:
+                    raw_entity = entity
+                    raw_entity["start"] = int(raw_entity["start"])
+                    raw_entity["end"] = int(raw_entity["end"])
+                    raw_entity["score"] = float(raw_entity["score"])
+                    raw_entities.append(raw_entity)
+                st.write(raw_entities)
+        else:
+            st.write("No entities found")
+
 st.markdown(
     """
 ### Team members
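
The pair-classification path added in this commit can be sanity-checked outside Streamlit. Below is a minimal sketch, assuming the `bertin-project/bertin-base-paws-x-es` checkpoint ships without an `id2label` mapping, so `transformers` falls back to the generic `LABEL_0`/`LABEL_1` names that the demo's label check relies on:

```python
# Minimal sketch: exercising the PAWS-X paraphrase classifier outside Streamlit.
# Mirrors load_model_pair_classification above; the label-to-meaning mapping
# (LABEL_0 = different meaning, LABEL_1 = paraphrase) follows the demo's check.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

model_url = "bertin-project/bertin-base-paws-x-es"
model = AutoModelForSequenceClassification.from_pretrained(model_url)
tokenizer = AutoTokenizer.from_pretrained(model_url)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# The demo encodes a sentence pair as one string joined by the RoBERTa
# separator token </s>, appending a trailing separator before inference.
result = classifier("Te amo.</s>Te adoro.</s>")
label, score = result[0]["label"], result[0]["score"]
print("Paraphrase" if label == "LABEL_1" else "Different meaning", f"{score:.2f}")
```

Joining the pair with `</s>` reproduces the two-segment encoding the model saw during PAWS-X fine-tuning, which is why the demo prompts use that separator rather than two input boxes.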
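The NER section renders entities through displaCy's manual mode rather than a spaCy pipeline. A minimal sketch of that path, fed hand-written entities (illustrative words and scores, not real model output) and mirroring the word-joining and offset arithmetic of `model_entities_to_displacy_format`:

```python
# Minimal sketch of the displaCy "manual" rendering path used by display() above.
from spacy import displacy

# Shape of pipeline("ner", ..., aggregation_strategy="simple") output with
# ignore_labels=[]; "O" spans are kept by the pipeline but skipped when drawing.
entities = [
    {"entity_group": "PER", "word": "Íñigo Montoya", "score": 0.99},
    {"entity_group": "O", "word": "viaja a", "score": 0.98},
    {"entity_group": "LOC", "word": "Florin", "score": 0.97},
]

# Same conversion the demo performs: join the words into one string and emit
# character offsets for every non-"O" group, advancing past ignored words too.
text, ents, pointer = " ".join(e["word"] for e in entities), [], 0
for e in entities:
    if e["entity_group"] != "O":
        ents.append({"start": pointer, "end": pointer + len(e["word"]), "label": e["entity_group"]})
    pointer += len(e["word"]) + 1

doc = {"text": text, "ents": ents, "title": None}
html = displacy.render(doc, style="ent", manual=True)
print(html)  # the demo wraps this HTML in WRAPPER and hands it to st.write
```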
requirements.txt CHANGED
@@ -2,3 +2,5 @@ streamlit
 mtranslate
 transformers
 torch
+seaborn
+spacy