Spaces:
Runtime error
Runtime error
Better UI and model comparison
Browse files- app.py +165 -16
- poems.py +173 -0
- requirements.txt +3 -2
app.py
CHANGED
@@ -1,33 +1,182 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
2 |
import streamlit as st
|
3 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
|
7 |
-
def filter_candidates(candidates):
|
8 |
-
df = pd.DataFrame(columns=["Candidates", "Probability"])
|
9 |
cand_list = []
|
10 |
score_list = []
|
11 |
for candidate in candidates:
|
12 |
-
if candidate["token_str"][:2] != "##":
|
13 |
cand = candidate["sequence"]
|
14 |
score = candidate["score"]
|
15 |
cand_list.append(cand)
|
16 |
score_list.append('{0:.5f}'.format(score))
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
18 |
break
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
|
|
|
|
|
|
|
27 |
|
28 |
-
user_input = st.text_input("Mask token: [MASK]", "Me encanta escribir [MASK].")
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
|
|
|
|
1 |
+
import random
|
2 |
+
import re
|
3 |
+
from poems import SAMPLE_POEMS
|
4 |
+
|
5 |
+
import langid
|
6 |
+
import numpy as np
|
7 |
import streamlit as st
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from icu_tokenizer import Tokenizer
|
11 |
+
from transformers import pipeline
|
12 |
+
|
13 |
+
# Hugging Face model identifiers compared by the demo, keyed by display name.
MODELS = {
    "ALBERTI": "flax-community/alberti-bert-base-multilingual-cased",
    "mBERT": "bert-base-multilingual-cased",
}

# Number of highest-probability predictions requested for the masked position.
TOPK = 50

# Use the full browser width so the three poem columns fit side by side.
st.set_page_config(layout="wide")
|
20 |
+
|
21 |
|
22 |
+
def mask_line(line, language="es", restrictive=True):
    """Replace one randomly chosen token of ``line`` with ``[MASK]``.

    The line is tokenized with an ICU ``Tokenizer`` for ``language`` and the
    tokens are re-joined with single spaces, so the original spacing around
    punctuation is not preserved.

    Args:
        line: Text of a single poem line.
        language: Language code handed to the tokenizer. ``"zh"`` is treated
            specially (see below).
        restrictive: Whether to avoid masking short tokens. Only honoured for
            ``"zh"``; for every other language it is recomputed from the
            tokens (restrictive unless all tokens have 3 characters or fewer).

    Returns:
        The line with exactly one token replaced by ``[MASK]``.

    Raises:
        ValueError: If the tokenizer yields no tokens for ``line``.
    """
    tokenizer = Tokenizer(lang=language)
    token_list = tokenizer.tokenize(line)
    if not token_list:
        # random.randint(0, -1) used to fail here with an opaque message.
        raise ValueError("Cannot mask a line that contains no tokens")

    # Decide from the ``language`` argument, not from a module-level global.
    is_chinese = language == "zh"
    if not is_chinese:
        restrictive = not all(len(token) <= 3 for token in token_list)

    positions = range(len(token_list))
    if restrictive:
        # Prefer "real" words: tokens longer than 3 characters, or (Chinese
        # only) any alphabetic token, since Chinese words are short.
        eligible = [
            index
            for index in positions
            if len(token_list[index]) > 3
            or (is_chinese and token_list[index].isalpha())
        ]
        # Picking uniformly among eligible tokens matches the previous
        # retry-until-eligible sampling, without unbounded recursion. When
        # nothing is eligible, fall back to masking any token.
        positions = eligible or positions

    token_list[random.choice(positions)] = "[MASK]"
    return " ".join(token_list)
|
39 |
|
40 |
|
41 |
+
def filter_candidates(candidates, get_any_candidate=False):
    """Choose the rewritten line to keep among fill-mask candidates.

    Candidates are examined in iteration order. Unless ``get_any_candidate``
    is true, the first candidate whose predicted token is a whole alphabetic
    word is chosen, i.e. ``token_str`` does not start with ``##`` and
    ``token_str.isalpha()`` is true. When no candidate qualifies, the first
    candidate of any kind is used instead.

    Args:
        candidates: Re-iterable collection of dicts providing at least the
            ``"sequence"`` and ``"token_str"`` keys.
        get_any_candidate: Skip the word filter and take the first candidate.

    Returns:
        The ``"sequence"`` value of the chosen candidate, or an empty string
        when ``candidates`` is empty (this case used to recurse forever).
    """
    if not get_any_candidate:
        for candidate in candidates:
            token_str = candidate["token_str"]
            if not token_str.startswith("##") and token_str.isalpha():
                return candidate["sequence"]

    # Nothing passed the filter (or filtering was disabled): take whatever
    # comes first, if anything.
    first = next(iter(candidates), None)
    if first is None:
        return ""
    return first["sequence"]
|
61 |
+
|
62 |
+
|
63 |
+
def infer_candidates(nlp, line):
    """Return the ``TOPK`` fill-mask predictions for the mask in ``line``.

    Args:
        nlp: Fill-mask pipeline object; ``rewrite_poem`` passes the result of
            ``pipeline("fill-mask", ...)``. Its ``tokenizer`` attribute and
            its ``_parse_and_tokenize`` / ``_forward`` methods are used.
            NOTE(review): those two methods are private pipeline API and may
            differ between transformers versions -- confirm against the
            installed release.
        line: Text containing the mask token. ``masked_index.item()`` below
            requires exactly one masked position.

    Returns:
        A list of dicts, one per prediction in the order given by ``topk``,
        with the keys ``"sequence"`` (the line with the prediction wrapped in
        a red ``<b>`` tag), ``"score"``, ``"token"`` (token id),
        ``"token_str"`` and ``"masked_index"``.
    """
    # Replace typographic apostrophes and ellipses with ASCII equivalents.
    line = line.replace("’", "'").replace("…", "...")

    inputs = nlp._parse_and_tokenize(line)
    outputs = nlp._forward(inputs, return_tensors=True)
    input_ids = inputs["input_ids"][0]
    masked_index = torch.nonzero(
        input_ids == nlp.tokenizer.mask_token_id, as_tuple=False
    )
    # Position of the mask token; .item() fails unless there is exactly one.
    mask_pos = masked_index.item()

    logits = outputs[0, mask_pos, :]
    probs = logits.softmax(dim=0)
    values, predictions = probs.topk(TOPK)

    pad_token_id = nlp.tokenizer.pad_token_id
    base_tokens = input_ids.numpy()
    result = []
    for score, token_id in zip(values.tolist(), predictions.tolist()):
        # Work on a copy: .numpy() shares memory with the input tensor.
        tokens = base_tokens.copy()
        tokens[mask_pos] = token_id
        # Filter padding out:
        tokens = tokens[np.where(tokens != pad_token_id)]
        token_list = [
            nlp.tokenizer.decode([token], skip_special_tokens=True)
            for token in tokens
        ]
        pieces = []
        for idx, token in enumerate(token_list):
            if token.startswith("##"):
                # Word-piece continuation: glue it onto the previous piece.
                pieces[-1] += token[2:]
            elif idx == mask_pos:
                # Highlight the predicted token in the rendered HTML.
                pieces += ['<b style="color: #ff0000;">', token, "</b>"]
            else:
                pieces += [token]
        sequence = " ".join(pieces).strip()
        result.append(
            {
                "sequence": sequence,
                "score": score,
                "token": token_id,
                "token_str": nlp.tokenizer.decode(token_id),
                "masked_index": mask_pos,
            }
        )
    return result
|
101 |
+
|
102 |
+
|
103 |
+
def rewrite_poem(poem, ml_model=MODELS["ALBERTI"], masking=True, language="es"):
    """Rewrite a poem line by line with a fill-mask model.

    Args:
        poem: Iterable of line strings; empty strings are kept as blank lines.
        ml_model: Model identifier handed to ``pipeline("fill-mask", ...)``.
        masking: When true each line is masked with ``mask_line``; when false
            the lines are used as given (they should already hold a mask).
        language: Language code forwarded to ``mask_line``.

    Returns:
        A ``(html, masked_lines)`` tuple: the rewritten lines joined with
        ``<br>``, and the list of masked lines that were fed to the model.
    """
    fill_mask = pipeline("fill-mask", model=ml_model)
    rewritten = []
    masked_lines = []
    for verse in poem:
        if verse == "":
            # Blank lines pass through untouched in both outputs.
            rewritten.append("")
            masked_lines.append("")
            continue
        masked = mask_line(verse, language) if masking else verse
        masked_lines.append(masked)
        candidates = infer_candidates(fill_mask, masked)
        rewritten.append(filter_candidates(candidates))
    return "<br>".join(rewritten), masked_lines
|
122 |
+
|
123 |
+
|
124 |
+
# Sidebar: project introduction.
instructions_text_0 = st.sidebar.markdown(
    """# ALBERTI vs BERT 🥊

We present ALBERTI, our BERT-based multilingual model for poetry.""")

instructions_text_1 = st.sidebar.markdown(
    """We have trained bert on a huge (for poetry, that is) corpus of
multilingual poetry to try to get a more 'poetic' model. This is the result
of our work.

You can find more information on the [project's site](https://huggingface.co/flax-community/alberti-bert-base-multilingual-cased)""")

# Sidebar: sample selector; the options are the keys of SAMPLE_POEMS.
sample_chooser = st.sidebar.selectbox(
    "Choose a poem",
    list(SAMPLE_POEMS),
)

# Sidebar: usage instructions (fixes the "o write" typo in the shown text).
instructions_text_2 = st.sidebar.markdown("""# How to use

You can choose from a list of example poems in Spanish, English, French, German,
Chinese and Arabic, but you can also paste a poem or write it yourself!

Then click on 'Rewrite!' to do the masking and the fill-mask task on the chosen
poem.""")
|
148 |
+
|
149 |
+
# Three side-by-side columns: input poem, ALBERTI output, mBERT output.
# ``st.beta_columns`` was renamed to ``st.columns`` and later removed, so
# prefer the stable name and only fall back to the beta one on old releases.
_make_columns = getattr(st, "columns", None) or st.beta_columns
col1, col2, col3 = _make_columns(3)

# Make widget labels larger and bold.
st.markdown(
    """
<style>
label {
    font-size: 1rem !important;
    font-weight: bold !important;
}
</style>
""", unsafe_allow_html=True)
|
160 |
|
161 |
+
if sample_chooser:
    # Pre-fill the editor with the chosen sample; the user may edit it.
    user_input = col1.text_area(
        "Input poem",
        "\n".join(SAMPLE_POEMS[sample_chooser]),
        height=600,
    )
    poem = user_input.split("\n")
    rewrite_button = col1.button("Rewrite!")

    if "[MASK]" in user_input or "<mask>" in user_input:
        # Masking is done by the app; do not rewrite text that is already
        # masked, since the masking step would then add a second mask.
        col1.error("You don't have to mask the poem, we'll do it for you!")
    elif rewrite_button:
        lang = langid.classify(user_input)[0]
        unmasked_poem, masked_poem = rewrite_poem(poem, language=lang)
        user_input_2 = col2.write(
            f"<b>Output poem from ALBERTI</b>\n\n\n{unmasked_poem}",
            unsafe_allow_html=True,
        )
        # Feed mBERT the very same masked lines so both models fill in the
        # same blanks and the outputs are directly comparable.
        unmasked_poem_2, _ = rewrite_poem(
            masked_poem, ml_model=MODELS["mBERT"], masking=False
        )
        user_input_3 = col3.write(
            f"<b>Output poem from mBERT</b>\n\n{unmasked_poem_2}",
            unsafe_allow_html=True,
        )
|
poems.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SAMPLE_POEMS = {
|
2 |
+
"es_1": [
|
3 |
+
"A través del follaje perenne",
|
4 |
+
"Que oír deja rumores extraños,",
|
5 |
+
"Y entre un mar de ondulante verdura,",
|
6 |
+
"Amorosa mansión de los pájaros,",
|
7 |
+
"Desde mis ventanas veo",
|
8 |
+
"El templo que quise tanto.",
|
9 |
+
"",
|
10 |
+
"El templo que tanto quise...",
|
11 |
+
"Pues no sé decir ya si le quiero,",
|
12 |
+
"Que en el rudo vaivén que sin tregua",
|
13 |
+
"Se agitan mis pensamientos,",
|
14 |
+
"Dudo si el rencor adusto",
|
15 |
+
"Vive unido al amor en mi pecho."],
|
16 |
+
"es_2": [
|
17 |
+
"Es hielo abrasador, es fuego helado,",
|
18 |
+
"es herida que duele y no se siente,",
|
19 |
+
"es un soñado bien, un mal presente,",
|
20 |
+
"es un breve descanso muy cansado.",
|
21 |
+
"",
|
22 |
+
"Es un descuido que nos da cuidado,",
|
23 |
+
"un cobarde con nombre de valiente,",
|
24 |
+
"un andar solitario entre la gente,",
|
25 |
+
"un amar solamente ser amado.",
|
26 |
+
"",
|
27 |
+
"Es una libertad encarcelada,",
|
28 |
+
"que dura hasta el postrero paroxismo;",
|
29 |
+
"enfermedad que crece si es curada.",
|
30 |
+
"Éste es el niño Amor, éste es su abismo.",
|
31 |
+
"¿Mirad cuál amistad tendrá con nada",
|
32 |
+
"el que en todo es contrario de sí mismo!"],
|
33 |
+
"en_1": [
|
34 |
+
"Two roads diverged in a yellow wood,",
|
35 |
+
"And sorry I could not travel both",
|
36 |
+
"And be one traveler, long I stood",
|
37 |
+
"And looked down one as far as I could",
|
38 |
+
"To where it bent in the undergrowth;",
|
39 |
+
"",
|
40 |
+
"Then took the other, as just as fair,",
|
41 |
+
"And having perhaps the better claim,",
|
42 |
+
"Because it was grassy and wanted wear;",
|
43 |
+
"Though as for that the passing there",
|
44 |
+
"Had worn them really about the same,",
|
45 |
+
"",
|
46 |
+
"And both that morning equally lay",
|
47 |
+
"In leaves no step had trodden black.",
|
48 |
+
"Oh, I kept the first for another day!",
|
49 |
+
"Yet knowing how way leads on to way,",
|
50 |
+
"I doubted if I should ever come back.",
|
51 |
+
"",
|
52 |
+
"I shall be telling this with a sigh",
|
53 |
+
"Somewhere ages and ages hence:",
|
54 |
+
"Two roads diverged in a wood, and I—",
|
55 |
+
"I took the one less traveled by,",
|
56 |
+
"And that has made all the difference."],
|
57 |
+
"en_2": [
|
58 |
+
"April is the cruellest month, breeding",
|
59 |
+
"Lilacs out of the dead land, mixing",
|
60 |
+
"Memory and desire, stirring",
|
61 |
+
"Dull roots with spring rain.",
|
62 |
+
"Winter kept us warm, covering",
|
63 |
+
"Earth in forgetful snow, feeding",
|
64 |
+
"A little life with dried tubers.",
|
65 |
+
"Summer surprised us, coming over the Starnbergersee",
|
66 |
+
"With a shower of rain; we stopped in the colonnade,",
|
67 |
+
"And went on in sunlight, into the Hofgarten,",
|
68 |
+
"And drank coffee, and talked for an hour.",
|
69 |
+
"Bin gar keine Russin, stamm' aus Litauen, echt deutsch.",
|
70 |
+
"And when we were children, staying at the arch-duke's,",
|
71 |
+
"My cousin's, he took me out on a sled,",
|
72 |
+
"And I was frightened. He said, Marie,",
|
73 |
+
"Marie, hold on tight. And down we went.",
|
74 |
+
"In the mountains, there you feel free.",
|
75 |
+
"I read, much of the night, and go south in the winter."],
|
76 |
+
"fr_1": [
|
77 |
+
"Demain, dès l'aube, à l'heure où blanchit la campagne,",
|
78 |
+
"Je partirai. Vois-tu, je sais que tu m'attends.",
|
79 |
+
"J'irai par la forêt, j'irai par la montagne.",
|
80 |
+
"Je ne puis demeurer loin de toi plus longtemps.",
|
81 |
+
"",
|
82 |
+
"Je marcherai les yeux fixés sur mes pensées,",
|
83 |
+
"Sans rien voir au dehors, sans entendre aucun bruit,",
|
84 |
+
"Seul, inconnu, le dos courbé, les mains croisées,",
|
85 |
+
"Triste, et le jour pour moi sera comme la nuit.",
|
86 |
+
"",
|
87 |
+
"Je ne regarderai ni l'or du soir qui tombe,",
|
88 |
+
"Ni les voiles au loin descendant vers Harfleur,",
|
89 |
+
"Et quand j'arriverai, je mettrai sur ta tombe",
|
90 |
+
"Un bouquet de houx vert et de bruyère en fleur."],
|
91 |
+
"fr_2": [
|
92 |
+
"Cheminement de tous les clochers",
|
93 |
+
"sur le ciel",
|
94 |
+
"guet-apens très doux",
|
95 |
+
"des aéroplanes",
|
96 |
+
"sur ton cœur",
|
97 |
+
"comme les hirondelles",
|
98 |
+
"que tu apprivoises",
|
99 |
+
"avec ton ombre",
|
100 |
+
"",
|
101 |
+
"Tu peux t'éloigner",
|
102 |
+
"dans la magie",
|
103 |
+
"des fleurs nocturnes",
|
104 |
+
"tu peux prendre la tempête",
|
105 |
+
"pour amie",
|
106 |
+
"je serai ce lac de brume",
|
107 |
+
"à ton arrivée",
|
108 |
+
"ce lac de brume",
|
109 |
+
"et tu diras que tu aimes",
|
110 |
+
"toutes les lumières",
|
111 |
+
"de la ville."],
|
112 |
+
"de_1": [
|
113 |
+
"Der du von dem Himmel bist,",
|
114 |
+
"Alles Leid und Schmerzen stillest,",
|
115 |
+
"Den, der doppelt elend ist,",
|
116 |
+
"Doppelt mit Erquickung füllest;",
|
117 |
+
"Ach, ich bin des Treibens müde!",
|
118 |
+
"Was soll all der Schmerz und Lust?",
|
119 |
+
"Süßer Friede,",
|
120 |
+
"Komm, ach komm in meine Brust!"],
|
121 |
+
"de_2": [
|
122 |
+
"Wieder duftet der Wald. ",
|
123 |
+
"Es heben die schwebenden Lerchen",
|
124 |
+
"mit sich den Himmel empor, der unseren Schultern schwer war; ",
|
125 |
+
"zwar sah man noch durch die Äste den Tag, wie er leer war,- ",
|
126 |
+
"aber nach langen, regnenden Nachmittagen ",
|
127 |
+
"kommen die goldübersonnten ",
|
128 |
+
"neueren Stunden, ",
|
129 |
+
"vor denen flüchtend an fernen Häuserfronten ",
|
130 |
+
"alle die wunden Fenster furchtsam mit Flügeln schlagen. ",
|
131 |
+
"Dann wird es still. Sogar der Regen geht leiser",
|
132 |
+
"über der Steine ruhig dunkelnden Glanz.",
|
133 |
+
"Alle Geräusche ducken sich ganz",
|
134 |
+
"in die glänzenden Knospen der Reiser."],
|
135 |
+
"zh_1": [
|
136 |
+
"春眠不觉晓,",
|
137 |
+
"处处闻啼鸟。",
|
138 |
+
"",
|
139 |
+
"夜来风雨声,",
|
140 |
+
"花落知多少"],
|
141 |
+
"zh_2": [
|
142 |
+
"关关雎鸠,在河之洲。",
|
143 |
+
"窈窕淑女,君子好逑。",
|
144 |
+
"",
|
145 |
+
"参差荇菜,左右流之。",
|
146 |
+
"窈窕淑女,寤寐求之。",
|
147 |
+
"",
|
148 |
+
"求之不得,寤寐思服。",
|
149 |
+
"悠哉悠哉,辗转反侧。",
|
150 |
+
"",
|
151 |
+
"参差荇菜,左右采之。",
|
152 |
+
"窈窕淑女,琴瑟友之。",
|
153 |
+
"",
|
154 |
+
"参差荇菜,左右毛之。",
|
155 |
+
"窈窕淑女,钟鼓乐之。"],
|
156 |
+
"ar_1": [
|
157 |
+
"داب نعشق لأليمه نجيمه",
|
158 |
+
"من يحبك ويموت فيك",
|
159 |
+
"إن قتلت عاد يكون بيك",
|
160 |
+
"لو قدر قلبي يخليك",
|
161 |
+
"لم يدبّر ذا النُّغيمة",
|
162 |
+
"يا مطرنَنِ شِلِباطُ (يا مذهول)",
|
163 |
+
"تُن حزين تنِ بناطُ (إنك مكروب)",
|
164 |
+
"ترى اليوم وَشْطاطُ (ضائعاً)",
|
165 |
+
"لم تذقي فيه غير لقيمة"],
|
166 |
+
"ar_2": [
|
167 |
+
"حَيّوا تُماضِرَ وَاِربَعوا صَحبي\t\tوَقِفوا فَإِنَّ وُقوفَكُم حَسبي",
|
168 |
+
"أَخُناسُ قَد هامَ الفُؤادُ بِكُم\t\tوَأَصابَهُ تَبَلٌ مِنَ الحُبِّ",
|
169 |
+
"ما إِن رَأَيتُ وَلا سَمِعتُ بِهِ\t\tكَاليَومِ طالي أَينُقٍ جُربِ",
|
170 |
+
"مُتَبَذِّلاً تَبدو مَحاسِنُهُ\t\tضَعُ الهِناءَ مَواضِعَ النُقبِ",
|
171 |
+
"مُتَحَسِّراً نَضَحَ الهِناءَ بِهِ\t\tضحَ العَبيرِ بِرَيطَةِ العَصبِ",
|
172 |
+
"فَسَليهُمُ عَنّي خُناسُ إِذا\t\tعَضَّ الجَميعَ الخَطبُ ما خَطبي"]
|
173 |
+
}
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
transformers
|
2 |
-
pandas
|
3 |
torch
|
4 |
-
|
|
|
|
|
|
1 |
transformers
|
|
|
2 |
torch
|
3 |
+
streamlit
|
4 |
+
icu_tokenizer
|
5 |
+
langid
|