alvp committed
Commit f78244e
1 Parent(s): db97ce2

Better UI and model comparison

Files changed (3)
  1. app.py +165 -16
  2. poems.py +173 -0
  3. requirements.txt +3 -2
app.py CHANGED
@@ -1,33 +1,182 @@
- from transformers import pipeline
  import streamlit as st
- import pandas as pd



- def filter_candidates(candidates):
-     df = pd.DataFrame(columns=["Candidates", "Probability"])
      cand_list = []
      score_list = []
      for candidate in candidates:
-         if candidate["token_str"][:2] != "##":
              cand = candidate["sequence"]
              score = candidate["score"]
              cand_list.append(cand)
              score_list.append('{0:.5f}'.format(score))
-         if len(score_list) == 5:
              break
-     df["Candidates"] = cand_list
-     df["Probability"] = score_list
-
-     df.index = [1,2,3,4,5]

-     return df

- nlp = pipeline("fill-mask", model="flax-community/alberti-bert-base-multilingual-cased")

- user_input = st.text_input("Mask token: [MASK]", "Me encanta escribir [MASK].")

- if st.button("Guess!"):
-     results = filter_candidates(nlp(user_input, top_k=20))
-     st.table(results)

+ import random
+ import re
+ from poems import SAMPLE_POEMS
+
+ import langid
+ import numpy as np
  import streamlit as st
+ import torch
+
+ from icu_tokenizer import Tokenizer
+ from transformers import pipeline
+
+ MODELS = {
+     "ALBERTI": "flax-community/alberti-bert-base-multilingual-cased",
+     "mBERT": "bert-base-multilingual-cased"
+ }
+
+ TOPK = 50
+ st.set_page_config(layout="wide")
+

+ def mask_line(line, language="es", restrictive=True):
+     tokenizer = Tokenizer(lang=language)
+     token_list = tokenizer.tokenize(line)
+     if language != "zh":
+         restrictive = not all([len(token) <= 3 for token in token_list])
+     random_num = random.randint(0, len(token_list) - 1)
+     random_word = token_list[random_num]
+     if not restrictive:
+         token_list[random_num] = "[MASK]"
+         masked_l = " ".join(token_list)
+         return masked_l
+     elif len(random_word) > 3 or (language == "zh" and random_word.isalpha()):
+         token_list[random_num] = "[MASK]"
+         masked_l = " ".join(token_list)
+         return masked_l
+     else:
+         return mask_line(line, language)


+ def filter_candidates(candidates, get_any_candidate=False):
      cand_list = []
      score_list = []
      for candidate in candidates:
+         if not get_any_candidate and candidate["token_str"][:2] != "##" and candidate["token_str"].isalpha():
              cand = candidate["sequence"]
              score = candidate["score"]
              cand_list.append(cand)
              score_list.append('{0:.5f}'.format(score))
+         elif get_any_candidate:
+             cand = candidate["sequence"]
+             score = candidate["score"]
+             cand_list.append(cand)
+             score_list.append('{0:.5f}'.format(score))
+         if len(score_list) == TOPK:
              break
+     if len(cand_list) < 1:
+         return filter_candidates(candidates, get_any_candidate=True)
+     else:
+         return cand_list[0]
+
+
+ def infer_candidates(nlp, line):
+     line = re.sub("’", "'", line)
+     line = re.sub("…", "...", line)
+     inputs = nlp._parse_and_tokenize(line)
+     outputs = nlp._forward(inputs, return_tensors=True)
+     input_ids = inputs["input_ids"][0]
+     masked_index = torch.nonzero(input_ids == nlp.tokenizer.mask_token_id,
+                                  as_tuple=False)
+     logits = outputs[0, masked_index.item(), :]
+     probs = logits.softmax(dim=0)
+     values, predictions = probs.topk(TOPK)
+     result = []
+     for v, p in zip(values.tolist(), predictions.tolist()):
+         tokens = input_ids.numpy()
+         tokens[masked_index] = p
+         # Filter padding out:
+         tokens = tokens[np.where(tokens != nlp.tokenizer.pad_token_id)]
+         l = []
+         token_list = [nlp.tokenizer.decode([token], skip_special_tokens=True) for token in tokens]
+         print(token_list)
+         for idx, token in enumerate(token_list):
+             if token.startswith('##'):
+                 l[-1] += token[2:]
+             elif idx == masked_index.item():
+                 l += ['<b style="color: #ff0000;">', token, "</b>"]
+             else:
+                 l += [token]
+         sequence = " ".join(l).strip()
+         result.append(
+             {
+                 "sequence": sequence,
+                 "score": v,
+                 "token": p,
+                 "token_str": nlp.tokenizer.decode(p),
+                 "masked_index": masked_index.item()
+             }
+         )
+     return result
+
+
+ def rewrite_poem(poem, ml_model=MODELS["ALBERTI"], masking=True, language="es"):
+     nlp = pipeline("fill-mask", model=ml_model)
+     unmasked_lines = []
+     masked_lines = []
+     for line in poem:
+         if line == "":
+             unmasked_lines.append("")
+             masked_lines.append("")
+             continue
+         if masking:
+             masked_line = mask_line(line, language)
+         else:
+             masked_line = line
+         masked_lines.append(masked_line)
+         unmasked_line_candidates = infer_candidates(nlp, masked_line)
+         unmasked_line = filter_candidates(unmasked_line_candidates)
+         unmasked_lines.append(unmasked_line)
+     unmasked_poem = "<br>".join(unmasked_lines)
+     return unmasked_poem, masked_lines
+
+
+ instructions_text_0 = st.sidebar.markdown(
+     """# ALBERTI vs BERT 🥊
+
+ We present ALBERTI, our BERT-based multilingual model for poetry.""")
+
+ instructions_text_1 = st.sidebar.markdown(
+     """We have trained BERT on a huge (for poetry, that is) corpus of
+ multilingual poetry to try to get a more 'poetic' model. This is the result
+ of our work.
+
+ You can find more information on the [project's site](https://huggingface.co/flax-community/alberti-bert-base-multilingual-cased)""")
+
+ sample_chooser = st.sidebar.selectbox(
+     "Choose a poem",
+     (SAMPLE_POEMS.keys())
+ )
+
+ instructions_text_2 = st.sidebar.markdown("""# How to use
+
+ You can choose from a list of example poems in Spanish, English, French, German,
+ Chinese and Arabic, but you can also paste a poem or write it yourself!
+
+ Then click on 'Rewrite!' to do the masking and the fill-mask task on the chosen
+ poem.""")
+
+ col1, col2, col3 = st.beta_columns(3)
+
+ st.markdown(
+     """
+     <style>
+     label {
+         font-size: 1rem !important;
+         font-weight: bold !important;
+     }
+     </style>
+     """, unsafe_allow_html=True)

+ if sample_chooser:
+     model_list = set(MODELS.values())
+     user_input = col1.text_area("Input poem",
+                                 "\n".join(SAMPLE_POEMS[sample_chooser]),
+                                 height=600)
+     poem = user_input.split("\n")
+     rewrite_button = col1.button("Rewrite!")
+     if "[MASK]" in user_input or "<mask>" in user_input:
+         col1.error("You don't have to mask the poem, we'll do it for you!")

+     if rewrite_button:
+         lang = langid.classify(user_input)[0]
+         unmasked_poem, masked_poem = rewrite_poem(poem, language=lang)
+         user_input_2 = col2.write(f"""<b>Output poem from ALBERTI</b>
+
+ {unmasked_poem}""", unsafe_allow_html=True)
+         unmasked_poem_2, _ = rewrite_poem(masked_poem, ml_model=MODELS["mBERT"],
+                                           masking=False)
+         user_input_3 = col3.write(f"""<b>Output poem from mBERT</b>
+
+ {unmasked_poem_2}""", unsafe_allow_html=True)
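
For readers who just want the gist of the comparison: one token per verse is masked, both checkpoints fill the same gap, and the top-scoring candidate is kept. Below is a minimal sketch of that idea using only the public fill-mask pipeline API; the helper name `fill_one_line`, the whitespace tokenization (instead of `icu_tokenizer`), and the example verse are illustrative assumptions, not the committed implementation.

```python
import random

from transformers import pipeline

# Hypothetical helper, not part of app.py: mask one word of a verse and let a
# fill-mask model restore it, keeping only the top-scoring candidate.
def fill_one_line(nlp, line):
    tokens = line.split()  # app.py uses icu_tokenizer.Tokenizer instead
    idx = random.randrange(len(tokens))
    tokens[idx] = nlp.tokenizer.mask_token  # "[MASK]" for BERT-style models
    masked = " ".join(tokens)
    best = nlp(masked, top_k=1)[0]  # dict with "sequence", "score", ...
    return masked, best["sequence"]


alberti = pipeline("fill-mask", model="flax-community/alberti-bert-base-multilingual-cased")
mbert = pipeline("fill-mask", model="bert-base-multilingual-cased")

# Illustrative verse; reuse the same masked line for both models so they
# compete on the same gap, as app.py does via the masked_lines it returns.
masked, alberti_guess = fill_one_line(alberti, "Me encanta escribir poesía.")
mbert_guess = mbert(masked, top_k=1)[0]["sequence"]
print(masked)
print("ALBERTI:", alberti_guess)
print("mBERT:  ", mbert_guess)
```

The committed app instead goes through the pipeline's private `_parse_and_tokenize`/`_forward` helpers so it can wrap the filled-in token in red `<b>` tags for the HTML output.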
poems.py ADDED
@@ -0,0 +1,173 @@
+ SAMPLE_POEMS = {
+     "es_1": [
+         "A través del follaje perenne",
+         "Que oír deja rumores extraños,",
+         "Y entre un mar de ondulante verdura,",
+         "Amorosa mansión de los pájaros,",
+         "Desde mis ventanas veo",
+         "El templo que quise tanto.",
+         "",
+         "El templo que tanto quise...",
+         "Pues no sé decir ya si le quiero,",
+         "Que en el rudo vaivén que sin tregua",
+         "Se agitan mis pensamientos,",
+         "Dudo si el rencor adusto",
+         "Vive unido al amor en mi pecho."],
+     "es_2": [
+         "Es hielo abrasador, es fuego helado,",
+         "es herida que duele y no se siente,",
+         "es un soñado bien, un mal presente,",
+         "es un breve descanso muy cansado.",
+         "",
+         "Es un descuido que nos da cuidado,",
+         "un cobarde con nombre de valiente,",
+         "un andar solitario entre la gente,",
+         "un amar solamente ser amado.",
+         "",
+         "Es una libertad encarcelada,",
+         "que dura hasta el postrero paroxismo;",
+         "enfermedad que crece si es curada.",
+         "Éste es el niño Amor, éste es su abismo.",
+         "¿Mirad cuál amistad tendrá con nada",
+         "el que en todo es contrario de sí mismo!"],
+     "en_1": [
+         "Two roads diverged in a yellow wood,",
+         "And sorry I could not travel both",
+         "And be one traveler, long I stood",
+         "And looked down one as far as I could",
+         "To where it bent in the undergrowth;",
+         "",
+         "Then took the other, as just as fair,",
+         "And having perhaps the better claim,",
+         "Because it was grassy and wanted wear;",
+         "Though as for that the passing there",
+         "Had worn them really about the same,",
+         "",
+         "And both that morning equally lay",
+         "In leaves no step had trodden black.",
+         "Oh, I kept the first for another day!",
+         "Yet knowing how way leads on to way,",
+         "I doubted if I should ever come back.",
+         "",
+         "I shall be telling this with a sigh",
+         "Somewhere ages and ages hence:",
+         "Two roads diverged in a wood, and I—",
+         "I took the one less traveled by,",
+         "And that has made all the difference."],
+     "en_2": [
+         "April is the cruellest month, breeding",
+         "Lilacs out of the dead land, mixing",
+         "Memory and desire, stirring",
+         "Dull roots with spring rain.",
+         "Winter kept us warm, covering",
+         "Earth in forgetful snow, feeding",
+         "A little life with dried tubers.",
+         "Summer surprised us, coming over the Starnbergersee",
+         "With a shower of rain; we stopped in the colonnade,",
+         "And went on in sunlight, into the Hofgarten,",
+         "And drank coffee, and talked for an hour.",
+         "Bin gar keine Russin, stamm' aus Litauen, echt deutsch.",
+         "And when we were children, staying at the arch-duke's,",
+         "My cousin's, he took me out on a sled,",
+         "And I was frightened. He said, Marie,",
+         "Marie, hold on tight. And down we went.",
+         "In the mountains, there you feel free.",
+         "I read, much of the night, and go south in the winter."],
+     "fr_1": [
+         "Demain, dès l'aube, à l'heure où blanchit la campagne,",
+         "Je partirai. Vois-tu, je sais que tu m'attends.",
+         "J'irai par la forêt, j'irai par la montagne.",
+         "Je ne puis demeurer loin de toi plus longtemps.",
+         "",
+         "Je marcherai les yeux fixés sur mes pensées,",
+         "Sans rien voir au dehors, sans entendre aucun bruit,",
+         "Seul, inconnu, le dos courbé, les mains croisées,",
+         "Triste, et le jour pour moi sera comme la nuit.",
+         "",
+         "Je ne regarderai ni l'or du soir qui tombe,",
+         "Ni les voiles au loin descendant vers Harfleur,",
+         "Et quand j'arriverai, je mettrai sur ta tombe",
+         "Un bouquet de houx vert et de bruyère en fleur."],
+     "fr_2": [
+         "Cheminement de tous les clochers",
+         "sur le ciel",
+         "guet-apens très doux",
+         "des aéroplanes",
+         "sur ton cœur",
+         "comme les hirondelles",
+         "que tu apprivoises",
+         "avec ton ombre",
+         "",
+         "Tu peux t'éloigner",
+         "dans la magie",
+         "des fleurs nocturnes",
+         "tu peux prendre la tempête",
+         "pour amie",
+         "je serai ce lac de brume",
+         "à ton arrivée",
+         "ce lac de brume",
+         "et tu diras que tu aimes",
+         "toutes les lumières",
+         "de la ville."],
+     "de_1": [
+         "Der du von dem Himmel bist,",
+         "Alles Leid und Schmerzen stillest,",
+         "Den, der doppelt elend ist,",
+         "Doppelt mit Erquickung füllest;",
+         "Ach, ich bin des Treibens müde!",
+         "Was soll all der Schmerz und Lust?",
+         "Süßer Friede,",
+         "Komm, ach komm in meine Brust!"],
+     "de_2": [
+         "Wieder duftet der Wald. ",
+         "Es heben die schwebenden Lerchen",
+         "mit sich den Himmel empor, der unseren Schultern schwer war; ",
+         "zwar sah man noch durch die Äste den Tag, wie er leer war,- ",
+         "aber nach langen, regnenden Nachmittagen ",
+         "kommen die goldübersonnten ",
+         "neueren Stunden, ",
+         "vor denen flüchtend an fernen Häuserfronten ",
+         "alle die wunden Fenster furchtsam mit Flügeln schlagen. ",
+         "Dann wird es still. Sogar der Regen geht leiser",
+         "über der Steine ruhig dunkelnden Glanz.",
+         "Alle Geräusche ducken sich ganz",
+         "in die glänzenden Knospen der Reiser."],
+     "zh_1": [
+         "春眠不觉晓,",
+         "处处闻啼鸟。",
+         "",
+         "夜来风雨声,",
+         "花落知多少"],
+     "zh_2": [
+         "关关雎鸠,在河之洲。",
+         "窈窕淑女,君子好逑。",
+         "",
+         "参差荇菜,左右流之。",
+         "窈窕淑女,寤寐求之。",
+         "",
+         "求之不得,寤寐思服。",
+         "悠哉悠哉,辗转反侧。",
+         "",
+         "参差荇菜,左右采之。",
+         "窈窕淑女,琴瑟友之。",
+         "",
+         "参差荇菜,左右毛之。",
+         "窈窕淑女,钟鼓乐之。"],
+     "ar_1": [
+         "داب نعشق لأليمه نجيمه",
+         "من يحبك ويموت فيك",
+         "إن قتلت عاد يكون بيك",
+         "لو قدر قلبي يخليك",
+         "لم يدبّر ذا النُّغيمة",
+         "يا مطرنَنِ شِلِباطُ (يا مذهول)",
+         "تُن حزين تنِ بناطُ (إنك مكروب)",
+         "ترى اليوم وَشْطاطُ (ضائعاً)",
+         "لم تذقي فيه غير لقيمة"],
+     "ar_2": [
+         "حَيّوا تُماضِرَ وَاِربَعوا صَحبي\t\tوَقِفوا فَإِنَّ وُقوفَكُم حَسبي",
+         "أَخُناسُ قَد هامَ الفُؤادُ بِكُم\t\tوَأَصابَهُ تَبَلٌ مِنَ الحُبِّ",
+         "ما إِن رَأَيتُ وَلا سَمِعتُ بِهِ\t\tكَاليَومِ طالي أَينُقٍ جُربِ",
+         "مُتَبَذِّلاً تَبدو مَحاسِنُهُ\t\tضَعُ الهِناءَ مَواضِعَ النُقبِ",
+         "مُتَحَسِّراً نَضَحَ الهِناءَ بِهِ\t\tضحَ العَبيرِ بِرَيطَةِ العَصبِ",
+         "فَسَليهُمُ عَنّي خُناسُ إِذا\t\tعَضَّ الجَميعَ الخَطبُ ما خَطبي"]
+ }
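
poems.py is pure data: a dict keyed by language tags ("es_1" through "ar_2"), each value a list of verse strings in which "" marks a stanza break. A small sketch of how app.py consumes it (the variable names here are illustrative):

```python
from poems import SAMPLE_POEMS

# The dict keys feed the sidebar selectbox in app.py.
print(list(SAMPLE_POEMS.keys()))  # ['es_1', 'es_2', 'en_1', ...]

# app.py prefills the text area with the verses joined by "\n" and later splits
# the edited text back into lines; empty strings survive as stanza breaks.
text = "\n".join(SAMPLE_POEMS["en_1"])
poem = text.split("\n")
assert poem == SAMPLE_POEMS["en_1"]
```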
requirements.txt CHANGED
@@ -1,4 +1,5 @@
  transformers
- pandas
  torch
- tensorflow

  transformers
  torch
+ streamlit
+ icu_tokenizer
+ langid