Owos committed on
Commit
c06d96e
1 Parent(s): 5d3dcaa

updated progress bar

Browse files
Files changed (1) hide show
  1. app.py +101 -65
app.py CHANGED
@@ -9,12 +9,10 @@ from huggingface_hub import HfApi, hf_hub_download
9
  from torch.utils.data import Dataset, DataLoader
10
 
11
  st.set_page_config(
12
- page_title="Koya Recommendation System",
13
- initial_sidebar_state="auto",
14
  )
15
 
16
 
17
-
18
  st.markdown(
19
  """
20
 
@@ -23,88 +21,122 @@ st.markdown(
23
  You can try it below"""
24
  )
25
 
 
26
  @st.cache
27
  def get_model_infos(multilingual="multilingual"):
28
  api = HfApi()
29
  model_infos = api.list_models(filter=["fill-mask", multilingual], cardData=True)
30
- data = [['id','task', 'lang', 'sha']]
31
- count =0
32
  for model in model_infos:
33
  try:
34
- data.append([model.modelId, model.pipeline_tag, model.cardData['language'], model.sha])
 
 
 
 
 
 
 
35
  except:
36
- data.append([model.modelId, model.pipeline_tag, None, model.sha])
37
 
38
  df = pd.DataFrame.from_records(data[1:], columns=data[0])
39
  return df
40
 
 
41
  class MLMDataset(Dataset):
42
- def __init__(self,sentence,tokenizer,MLM_MASK_TOKEN,MLM_UNK_TOKEN):
43
  self.sentence = sentence
44
  self.tokenizer = tokenizer
45
 
46
- self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
47
 
48
  self.num_samples = self.tensor_input.size()[-1] - 2
49
 
50
  self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
51
- self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],self.num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
 
 
 
 
52
  self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
53
 
54
  # Added by Chris Emezue on 29.01.2023
55
- # Add a term called unk_mask, such that p(w|...) is 0 if w is unk and p(w|...) otherwise
56
- unk_mask = torch.ones(self.batch_input.size()[0],self.batch_input.size()[1],self.tokenizer.vocab_size)
 
 
 
 
57
  batch_input_for_unk = self.batch_input.unsqueeze(-1).expand(unk_mask.size())
58
  self.unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)
59
 
60
-
61
  self.mask = torch.zeros(self.batch_input.size())
62
- src = torch.ones(self.batch_input.size(0)).unsqueeze(0).T
63
 
64
  self.mask.scatter_(1, self.random_ids, src)
65
  self.masked_input = self.batch_input.masked_fill(self.mask == 1, MLM_MASK_TOKEN)
66
- self.labels = self.batch_input.masked_fill(self.masked_input != MLM_MASK_TOKEN, -100)
 
 
67
  # If logits change when labels is not set to -100:
68
  # If we are using the logits, this does not change it then. but if are using the loss,
69
  # then this has an effect.
70
 
71
- assert self.masked_input.shape[0]==self.labels.shape[0] == self.mask.shape[0] == self.unk_mask.shape[0]
 
 
 
 
 
72
 
73
  def __len__(self):
74
  return self.masked_input.shape[0]
75
 
76
-
77
- def __getitem__(self,idx):
78
- return self.masked_input[idx], self.mask[idx],self.labels[idx], self.unk_mask[idx]
 
 
 
 
79
 
80
 
81
- def get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,BATCH_SIZE):
82
- mlm_dataset = MLMDataset(sentence,tokenizer,MLM_MASK_TOKEN,MLM_UNK_TOKEN)
83
- dataloader = DataLoader(mlm_dataset,batch_size=BATCH_SIZE)
 
 
84
 
85
- score =1
86
 
87
- for i,batch in enumerate(dataloader):
88
- masked_input, mask,labels, unk_mask = batch
89
  output = model(masked_input, labels=labels)
90
 
91
- logits_ = output['logits']
92
- logits = logits_ * unk_mask # Penalizing the unk tokens by setting their probs to zero
 
 
93
 
94
  indices = torch.nonzero(mask)
95
- logits_of_interest = logits[indices[:,0],indices[:,1],:]
96
 
97
- labels_of_interest = labels[indices[:,0],indices[:,1]]
98
- log_probs = logits_of_interest.gather(1,labels_of_interest.view(-1,1))
99
-
100
- batch_score = (log_probs.sum()/(-1 *mlm_dataset.num_samples)).exp().item() # exp(x+y) = exp(x)*exp(y)
 
 
101
  score *= batch_score
102
  return score
103
 
104
 
105
-
106
- def get_sense_score(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,num_samples):
107
- '''
 
108
  IDEA
109
  -----------------
110
  PP = perplexity(P) where perplexity(P) function just computes:
@@ -116,68 +148,71 @@ def get_sense_score(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,num_sa
116
 
117
  Note: everytime you run this function, the results change slightly (but the ordering should be relatively the same),
118
  because the tokens to mask are chosen randomly.
119
- '''
120
-
121
- tensor_input = tokenizer(sentence, return_tensors='pt')['input_ids']
122
  batch_input = tensor_input.repeat(num_samples, 1)
123
-
124
- random_ids = np.random.choice([i for i in range(1,tensor_input.size(1)-1)],num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
 
 
125
  random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T
126
 
127
  # Added by Chris Emezue on 29.01.2023
128
- # Add a term called unk_mask, such that p(w|...) is 0 if w is unk and p(w|...) otherwise
129
- unk_mask = torch.ones(batch_input.size()[0],batch_input.size()[1],tokenizer.vocab_size)
 
 
130
  batch_input_for_unk = batch_input.unsqueeze(-1).expand(unk_mask.size())
131
  unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)
132
 
133
-
134
  mask = torch.zeros(batch_input.size())
135
- src = torch.ones(batch_input.size(0)).unsqueeze(0).T
136
 
137
  mask.scatter_(1, random_ids, src)
138
  masked_input = batch_input.masked_fill(mask == 1, MLM_MASK_TOKEN)
139
- labels = batch_input.masked_fill( masked_input != MLM_MASK_TOKEN, -100)
140
  # If logits change when labels is not set to -100:
141
  # If we are using the logits, this does not change it then. but if are using the loss,
142
  # then this has an effect.
143
 
144
-
145
  output = model(masked_input, labels=labels)
146
 
147
- logits_ = output['logits']
148
- logits = logits_ * unk_mask # Penalizing the unk tokens by setting their probs to zero
 
 
149
 
150
  indices = torch.nonzero(mask)
151
- logits_of_interest = logits[indices[:,0],indices[:,1],:]
152
 
153
- labels_of_interest = labels[indices[:,0],indices[:,1]]
154
- log_probs = logits_of_interest.gather(1,labels_of_interest.view(-1,1))
155
-
156
- score = (log_probs.sum()/(-1 *num_samples)).exp().item()
157
 
158
  return score
159
 
160
 
161
  def sort_dictionary(dict):
162
-
163
  keys = list(dict.keys())
164
  values = list(dict.values())
165
  sorted_value_index = np.argsort(values)
166
  sorted_dict = {keys[i]: values[i] for i in sorted_value_index}
167
  return sorted_dict
168
 
 
169
  def set_seed():
170
  np.random.seed(2023)
171
  torch.manual_seed(2023)
172
 
173
 
174
-
175
-
176
  sentence = st.text_input("Please input a sample sentence in the target language")
177
 
178
  models = get_model_infos(multilingual=None)
179
- selected_models = st.multiselect("Select of number of models you would like to compare", models['id']
180
-
181
  )
182
 
183
  run = st.button("Get Scores")
@@ -187,19 +222,20 @@ if run:
187
  st.write(progress_text)
188
  my_bar = st.progress(0)
189
 
190
-
191
- scores={}
192
  for index, model_id in enumerate(selected_models):
193
  tokenizer = AutoTokenizer.from_pretrained(model_id)
194
  model = AutoModelWithLMHead.from_pretrained(model_id)
195
  if model_id.startswith("castorini"):
196
- tokenizer.model_max_length = 512
197
- MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
198
  MLM_UNK_TOKEN = tokenizer.unk_token_id
199
 
200
  BATCH_SIZE = 1
201
- score = get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,BATCH_SIZE)
 
 
202
  scores[model_id] = score
203
- my_bar.progress(index + 100/len(selected_models))
204
  scores = sort_dictionary(scores)
205
- st.write("Our recommendation is:", scores)
 
9
  from torch.utils.data import Dataset, DataLoader
10
 
11
  st.set_page_config(
12
+ page_title="Koya Recommendation System", initial_sidebar_state="auto",
 
13
  )
14
 
15
 
 
16
  st.markdown(
17
  """
18
 
 
21
  You can try it below"""
22
  )
23
 
24
+
@st.cache
def get_model_infos(multilingual="multilingual"):
    """Fetch Hub metadata for fill-mask models and return it as a DataFrame.

    Args:
        multilingual: Optional extra Hub filter tag combined with
            ``"fill-mask"``. Pass ``None`` to filter on ``"fill-mask"`` only.

    Returns:
        pandas.DataFrame with the columns ``id``, ``task``, ``lang`` and
        ``sha``, one row per listed model. ``lang`` is ``None`` when the
        language cannot be read from the model's card data.
    """
    columns = ["id", "task", "lang", "sha"]
    # Drop a missing tag instead of sending ``None`` as a filter value.
    tags = [tag for tag in ("fill-mask", multilingual) if tag is not None]

    api = HfApi()
    model_infos = api.list_models(filter=tags, cardData=True)

    rows = []
    for model in model_infos:
        try:
            language = model.cardData["language"]
        except (AttributeError, KeyError, TypeError):
            # No card data at all, or card data without a "language" entry.
            language = None
        rows.append([model.modelId, model.pipeline_tag, language, model.sha])

    return pd.DataFrame.from_records(rows, columns=columns)
46
 
47
+
class MLMDataset(Dataset):
    """One-token-masked copies of a single sentence for masked-LM scoring.

    The sentence is tokenized once and repeated ``num_samples`` times. Each
    row has exactly one position replaced by the mask token, and every
    position between the first and the last token is masked in exactly one
    row (the order of the rows is random).

    Args:
        sentence: Text to tokenize.
        tokenizer: Callable used as ``tokenizer(sentence, return_tensors="pt")``
            returning a mapping with ``"input_ids"``; must expose
            ``vocab_size``.
        MLM_MASK_TOKEN: Token id written at the masked position.
        MLM_UNK_TOKEN: Id of the unknown token, or ``None`` when the
            tokenizer does not define one.
    """

    def __init__(self, sentence, tokenizer, MLM_MASK_TOKEN, MLM_UNK_TOKEN):
        self.sentence = sentence
        self.tokenizer = tokenizer

        # assumes input_ids has shape (1, seq_len) with one leading and one
        # trailing special token -- TODO confirm for tokenizers without them
        self.tensor_input = self.tokenizer(sentence, return_tensors="pt")["input_ids"]
        seq_len = self.tensor_input.size(-1)

        # One sample per maskable token; never negative for very short inputs.
        self.num_samples = max(seq_len - 2, 0)

        self.batch_input = self.tensor_input.repeat(self.num_samples, 1)

        # Shuffle the maskable positions. The first and last token are left
        # out since they are not connected to the sentence itself.
        positions = np.random.choice(
            np.arange(1, seq_len - 1), self.num_samples, replace=False
        )
        # Column vector of shape (num_samples, 1): the position masked per row.
        self.random_ids = torch.from_numpy(positions).long().unsqueeze(1)

        # Added by Chris Emezue on 29.01.2023
        # unk_mask is 0 wherever the input token is the unknown token and 1
        # elsewhere. It is an expanded (broadcast) view along the vocabulary
        # axis, so it has shape (num_samples, seq_len, vocab_size) without
        # allocating a dense tensor of that size.
        if MLM_UNK_TOKEN is None:
            keep = torch.ones(self.batch_input.size())
        else:
            keep = (self.batch_input != MLM_UNK_TOKEN).to(torch.get_default_dtype())
        self.unk_mask = keep.unsqueeze(-1).expand(
            self.batch_input.size(0),
            self.batch_input.size(1),
            self.tokenizer.vocab_size,
        )

        # mask[row, col] == 1 exactly at the position masked in that row.
        self.mask = torch.zeros(self.batch_input.size())
        self.mask.scatter_(1, self.random_ids, torch.ones(self.num_samples, 1))

        self.masked_input = self.batch_input.masked_fill(self.mask == 1, MLM_MASK_TOKEN)
        # Every position except the masked one is set to -100. Labels follow
        # self.mask rather than a comparison with MLM_MASK_TOKEN, so a mask
        # token that was already present in the input is not kept as a target.
        # The -100 entries only matter when the model's loss is used, not
        # when the logits are read.
        self.labels = self.batch_input.masked_fill(self.mask != 1, -100)

        assert (
            self.masked_input.shape[0]
            == self.labels.shape[0]
            == self.mask.shape[0]
            == self.unk_mask.shape[0]
        )

    def __len__(self):
        """Return the number of masked copies (one per maskable token)."""
        return self.masked_input.shape[0]

    def __getitem__(self, idx):
        """Return ``(masked_input, mask, labels, unk_mask)`` for row ``idx``."""
        return (
            self.masked_input[idx],
            self.mask[idx],
            self.labels[idx],
            self.unk_mask[idx],
        )
104
 
105
 
def get_sense_score_batched(
    sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, BATCH_SIZE
):
    """Score ``sentence`` under a masked language model, batch by batch.

    An ``MLMDataset`` is built for the sentence and fed to ``model`` in
    batches of ``BATCH_SIZE`` rows. For every masked position the model
    output at the original token id is collected.

    Args:
        sentence: Text to score.
        tokenizer: Tokenizer handed to ``MLMDataset``.
        model: Callable used as ``model(masked_input, labels=labels)`` whose
            result exposes ``["logits"]``.
        MLM_MASK_TOKEN: Id of the mask token.
        MLM_UNK_TOKEN: Id of the unknown token.
        BATCH_SIZE: Number of masked copies per forward pass.

    Returns:
        The product over all batches of
        ``exp(-sum(collected values) / num_samples)``; ``1`` when the dataset
        is empty.
    """
    mlm_dataset = MLMDataset(sentence, tokenizer, MLM_MASK_TOKEN, MLM_UNK_TOKEN)
    dataloader = DataLoader(mlm_dataset, batch_size=BATCH_SIZE)

    score = 1
    # Scoring only: no gradients are needed, so do not build the autograd graph.
    with torch.no_grad():
        for masked_input, mask, labels, unk_mask in dataloader:
            output = model(masked_input, labels=labels)

            # Zero the outputs at positions whose input token is the unknown token.
            logits = output["logits"] * unk_mask

            # (row, column) of every masked token in this batch.
            rows, cols = torch.nonzero(mask, as_tuple=True)
            target_ids = labels[rows, cols]
            # NOTE(review): these values are raw model logits, not log-softmax
            # outputs -- confirm that this is the intended quantity.
            log_probs = logits[rows, cols, :].gather(1, target_ids.view(-1, 1))

            # exp(x + y) = exp(x) * exp(y), so per-batch factors are multiplied.
            score *= (log_probs.sum() / (-1 * mlm_dataset.num_samples)).exp().item()
    return score
134
 
135
 
136
+ def get_sense_score(
137
+ sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, num_samples
138
+ ):
139
+ """
140
  IDEA
141
  -----------------
142
  PP = perplexity(P) where perplexity(P) function just computes:
 
148
 
149
  Note: everytime you run this function, the results change slightly (but the ordering should be relatively the same),
150
  because the tokens to mask are chosen randomly.
151
+ """
152
+
153
+ tensor_input = tokenizer(sentence, return_tensors="pt")["input_ids"]
154
  batch_input = tensor_input.repeat(num_samples, 1)
155
+
156
+ random_ids = np.random.choice(
157
+ [i for i in range(1, tensor_input.size(1) - 1)], num_samples, replace=False
158
+ ) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
159
  random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T
160
 
161
  # Added by Chris Emezue on 29.01.2023
162
+ # Add a term called unk_mask, such that p(w|...) is 0 if w is unk and p(w|...) otherwise
163
+ unk_mask = torch.ones(
164
+ batch_input.size()[0], batch_input.size()[1], tokenizer.vocab_size
165
+ )
166
  batch_input_for_unk = batch_input.unsqueeze(-1).expand(unk_mask.size())
167
  unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)
168
 
 
169
  mask = torch.zeros(batch_input.size())
170
+ src = torch.ones(batch_input.size(0)).unsqueeze(0).T
171
 
172
  mask.scatter_(1, random_ids, src)
173
  masked_input = batch_input.masked_fill(mask == 1, MLM_MASK_TOKEN)
174
+ labels = batch_input.masked_fill(masked_input != MLM_MASK_TOKEN, -100)
175
  # If logits change when labels is not set to -100:
176
  # If we are using the logits, this does not change it then. but if are using the loss,
177
  # then this has an effect.
178
 
 
179
  output = model(masked_input, labels=labels)
180
 
181
+ logits_ = output["logits"]
182
+ logits = (
183
+ logits_ * unk_mask
184
+ ) # Penalizing the unk tokens by setting their probs to zero
185
 
186
  indices = torch.nonzero(mask)
187
+ logits_of_interest = logits[indices[:, 0], indices[:, 1], :]
188
 
189
+ labels_of_interest = labels[indices[:, 0], indices[:, 1]]
190
+ log_probs = logits_of_interest.gather(1, labels_of_interest.view(-1, 1))
191
+
192
+ score = (log_probs.sum() / (-1 * num_samples)).exp().item()
193
 
194
  return score
195
 
196
 
def sort_dictionary(dict):
    """Return a new dictionary with the entries of ``dict`` ordered by value.

    Args:
        dict: Mapping whose values can be ordered by ``numpy.argsort``. The
            parameter name shadows the builtin; it is kept so that existing
            keyword callers keep working.

    Returns:
        A new dict in ascending value order. Entries with equal values keep
        their original relative order.
    """
    keys = list(dict)
    values = list(dict.values())
    # A stable sort keeps the order of tied values deterministic.
    order = np.argsort(values, kind="stable")
    return {keys[i]: values[i] for i in order}
204
 
205
+
def set_seed(seed=2023):
    """Seed NumPy's and PyTorch's global random number generators.

    Args:
        seed: Value passed to ``np.random.seed`` and ``torch.manual_seed``.
            Defaults to 2023, the value that was previously hard-coded.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
209
 
210
 
 
 
211
  sentence = st.text_input("Please input a sample sentence in the target language")
212
 
213
  models = get_model_infos(multilingual=None)
214
+ selected_models = st.multiselect(
215
+ "Select of number of models you would like to compare", models["id"]
216
  )
217
 
218
  run = st.button("Get Scores")
 
222
  st.write(progress_text)
223
  my_bar = st.progress(0)
224
 
# Score the input sentence with every selected model and report the ranking.
scores = {}
total_models = len(selected_models)
for index, model_id in enumerate(selected_models):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelWithLMHead.from_pretrained(model_id)
    if model_id.startswith("castorini"):
        tokenizer.model_max_length = 512
    MLM_MASK_TOKEN = tokenizer.mask_token_id  # [(103, '[MASK]')]
    MLM_UNK_TOKEN = tokenizer.unk_token_id

    BATCH_SIZE = 1
    score = get_sense_score_batched(
        sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, BATCH_SIZE
    )
    scores[model_id] = score
    # Fraction of models processed so far, in (0, 1]. The parentheses matter:
    # ``index + 1 / total`` grows past 1.0 from the second model onward.
    my_bar.progress((index + 1) / total_models)
scores = sort_dictionary(scores)
st.write("Our recommendation is:", scores)