Elron Bandel committed on
Commit
3411193
1 Parent(s): c467c21
Files changed (3) hide show
  1. README.md +3 -3
  2. app.py +192 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: AlephBERT
3
- emoji: 📊
4
- colorFrom: indigo
5
- colorTo: indigo
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
 
1
  ---
2
  title: AlephBERT
3
+ emoji: 🥙
4
+ colorFrom: pink
5
+ colorTo: pink
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html

import pandas as pd
import requests
import streamlit as st
import tokenizers
from transformers import pipeline
from transformers.tokenization_utils import TruncationStrategy
8
+
9
# Browser-tab title and icon for the demo; the sidebar starts expanded.
st.set_page_config(
    page_title="AlephBERT Demo",
    page_icon="🥙",
    initial_sidebar_state="expanded",
)
14
+
15
+ # st.markdown(
16
+ # """
17
+ # <style>
18
+
19
+ # .sidebar .sidebar-content {
20
+ # background-image: linear-gradient(#3377ff, #80aaff);
21
+ # }
22
+
23
+ # footer {
24
+ # color:white;
25
+ # visibility: hidden;
26
+ # }
27
+ # input {
28
+ # direction: rtl;
29
+ # }
30
+ # .stTextInput .instructions {
31
+ # color: grey;
32
+ # font-size: 9px;}
33
+
34
+ # </style>
35
+ # <div style="color:white; font-size:13px; font-family:monospace;position: fixed; z-index: 1; bottom: 0; right:0; background-color: #f63766;margin:3px;padding:8px;border-radius: 5px;"><a href="https://huggingface.co/onlplab/alephbert-base" target="_blank" style="text-decoration: none;color: white;">Use aleph-bert in your project </a></div>
36
+ # """,
37
+ # unsafe_allow_html=True,
38
+ # )
39
+
40
# Registry of selectable models. The key is the label shown in the sidebar
# (it is also split on "-" to build the badges above the input box);
# "name_or_path" is the identifier handed to `pipeline(...)` in `load_model`.
models = {
    "AlephBERT-base": {
        "name_or_path": "onlplab/alephbert-base",
        "description": "AlephBERT base model",
    },
    "HeBERT-base-TAU": {
        "name_or_path": "avichr/heBERT",
        "description": "HeBERT model created by TAU",
    },
    "mBERT-base-multilingual-cased": {
        "name_or_path": "bert-base-multilingual-cased",
        "description": "Multilingual BERT model",
    },
}
54
+
55
@st.cache(show_spinner=False)
def get_json_from_url(url):
    """Fetch and decode a JSON document, falling back to the built-in registry.

    The previous body returned ``models`` unconditionally, which left the
    HTTP request below it unreachable and the ``url`` argument ignored.

    Args:
        url: Address of the JSON document to download.

    Returns:
        The decoded JSON payload on success. If the request fails, the server
        answers with an error status, or the body is not valid JSON, the
        module-level ``models`` dict is returned instead, so callers always
        receive a usable registry.

    Note:
        The function is wrapped in ``st.cache``, so whichever value is
        returned (including the fallback) is reused for later calls with the
        same ``url``.
    """
    try:
        # A timeout keeps a stalled connection from hanging the app forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError):
        # RequestException: connection/timeout/HTTP-status problems.
        # ValueError: the response body could not be decoded as JSON.
        return models
59
+
60
+ # models = get_json_from_url('https://huggingface.co/spaces/biu-nlp/AlephBERT/raw/main/models.json')
61
+
62
+
63
+
64
@st.cache(show_spinner=False, hash_funcs={tokenizers.Tokenizer: str})
def load_model(model):
    """Create the fill-mask pipeline for one entry of ``models``.

    Args:
        model: A key of the module-level ``models`` dict; its
            ``'name_or_path'`` value is passed to ``pipeline``.

    Returns:
        A ``(pipe, do_tokenize)`` tuple: the fill-mask pipeline, and a helper
        that runs the pipeline's tokenizer on raw text.

    The pipeline's ``_parse_and_tokenize`` attribute is replaced so that a
    call made with ``tokenized=True`` uses its input as-is instead of
    tokenizing it again; this lets the caller hand over an encoding it has
    already edited.
    """
    name_or_path = models[model]['name_or_path']
    fill_mask = pipeline('fill-mask', name_or_path)

    def do_tokenize(inputs):
        """Encode ``inputs`` with the pipeline's tokenizer, without truncation."""
        tokenizer_options = {
            'add_special_tokens': True,
            'return_tensors': fill_mask.framework,
            'padding': True,
            'truncation': TruncationStrategy.DO_NOT_TRUNCATE,
        }
        return fill_mask.tokenizer(inputs, **tokenizer_options)

    def _parse_and_tokenize(inputs, tokenized=False, **kwargs):
        """Return ``inputs`` untouched when already tokenized, else encode them."""
        if tokenized:
            return inputs
        return do_tokenize(inputs)

    # Instance-level override; extra keyword arguments are accepted and ignored.
    fill_mask._parse_and_tokenize = _parse_and_tokenize

    return fill_mask, do_tokenize
86
+
87
+
88
+
89
+
90
+
91
# Page heading.
st.title('AlephBERT🥙')

# Lab logo (linking to the lab page) and a credit line, shown at the top of
# the sidebar. Raw HTML is needed for the image styling, hence
# unsafe_allow_html; the markup is a fixed literal.
sidebar_header_html = """<div><a target="_blank" href="https://nlp.biu.ac.il/~rtsarfaty/onlp#"><img src="https://nlp.biu.ac.il/~rtsarfaty/static/landing_static/img/onlp_logo.png" style="filter: invert(100%);display: block;margin-left: auto;margin-right: auto;
width: 70%;"></a>
<p style="color:white; font-size:13px; font-family:monospace; text-align: center">AlephBERT Demo &bull; <a href="https://nlp.biu.ac.il/~rtsarfaty/onlp#" style="text-decoration: none;color: white;" target="_blank">ONLP Lab</a></p></div>
<br>"""
st.sidebar.markdown(sidebar_header_html, unsafe_allow_html=True)
99
+
100
# Only one mode exists today; the variable is kept so further modes can be
# added without restructuring the script.
mode = 'Models'

if mode == 'Models':
    # ---- Sidebar controls -------------------------------------------------
    model = st.sidebar.selectbox('Select Model', list(models.keys()))
    masking_level = st.sidebar.selectbox('Masking Level:', ['Tokens', 'SubWords'])
    n_res = st.sidebar.number_input(
        'Number Of Results',
        format='%d',
        value=5,
        min_value=1,
        max_value=100,
    )

    # ---- Model badges -----------------------------------------------------
    # The model label is split on "-" and each part is rendered as a badge.
    # The labels come from the hard-coded `models` keys, not from user input.
    model_tags = model.split('-')
    model_tags[0] = 'Model:' + model_tags[0]
    st.markdown(
        ''.join(
            f'<span style="color:white; font-size:13px; font-family:monospace; background-color: #f63766;margin:3px;padding:8px;border-radius: 5px;">{tag}</span>'
            for tag in model_tags
        ),
        unsafe_allow_html=True,
    )
    st.markdown('___')

    # ---- Prepare the model ------------------------------------------------
    unmasker, tokenize = load_model(model)
    # Ask the tokenizer for its mask token instead of assuming "[MASK]".
    mask_token = unmasker.tokenizer.mask_token

    # ---- Get inputs -------------------------------------------------------
    input_text = st.text_input('Insert text you want to mask', '')
    if input_text:
        input_masked = None   # what is handed to the pipeline (str or encoding)
        display_input = None  # human-readable masked sentence

        if masking_level == 'Tokens':
            tokens = str(input_text).split()
            # Select by POSITION rather than by text: option 0 means "nothing
            # selected", option i stands for tokens[i - 1]. Selecting by text
            # would mask every occurrence of a repeated word and produce an
            # input with several mask tokens.
            choice = st.selectbox(
                'Select token to mask:',
                list(range(len(tokens) + 1)),
                format_func=lambda i: tokens[i - 1] if i else '',
            )
            if choice:
                input_masked = ' '.join(
                    mask_token if position == choice else token
                    for position, token in enumerate(tokens, start=1)
                )
                display_input = input_masked
        elif masking_level == 'SubWords':
            # Tokenization is only needed for sub-word masking.
            tokenized = tokenize(input_text)
            ids = tokenized['input_ids'].tolist()[0]
            tokens = unmasker.tokenizer.convert_ids_to_tokens(ids)
            # Index 0 (the first special token) is shown as an empty label and
            # doubles as "nothing selected"; the final special token is
            # excluded from the options.
            idx = st.selectbox(
                'Select token to mask:',
                list(range(0, len(tokens) - 1)),
                format_func=lambda i: tokens[i] if i else '',
            )
            if idx:
                # Only touch the encoding once a real sub-word is chosen.
                tokenized['input_ids'][0][idx] = unmasker.tokenizer.mask_token_id
                masked_ids = tokenized['input_ids'].tolist()[0]
                # Drop the leading/trailing special tokens for display.
                display_input = ' '.join(
                    unmasker.tokenizer.convert_ids_to_tokens(masked_ids[1:-1])
                )
                input_masked = tokenized

        # ---- Predict and show results --------------------------------------
        if input_masked is not None:
            st.markdown('#### Input:')
            # The sentence is user-supplied text rendered as raw HTML, so it
            # must be escaped before interpolation.
            st.markdown(
                f'<p dir="rtl">{html.escape(display_input)}</p>',
                unsafe_allow_html=True,
            )
            st.markdown('#### Outputs:')
            # tokenized=True tells the patched pipeline that `input_masked`
            # is already an encoding and must not be tokenized again.
            res = unmasker(
                input_masked,
                tokenized=masking_level == 'SubWords',
                top_k=n_res,
            )
            if res:
                rows = [
                    {
                        'Prediction': r['token_str'],
                        'Completed Sentence': r['sequence'].replace('[SEP]', '').replace('[CLS]', ''),
                        'Score': r['score'],
                    }
                    for r in res
                ]
                st.table(pd.DataFrame(rows))
165
+
166
+
167
+
168
+ # cols = st.beta_columns(len(tokens))
169
+ # genre = st.radio(
170
+ # 'Select token to mask:', tokens)
171
+ # for col, token in zip(cols, reversed(tokens)):
172
+ # col.text(token)
173
+
174
+ # st.text(tokens)
175
+ # res = unmasker(input_text)
176
+ # res_table = pd.DataFrame(res)
177
+ # st.table(res_table)
178
+ # st.text(res)
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ sentencepiece
3
+ transformers==4.4.2
4
+ tokenizers
5
+ pandas