jed-tiotuico committed on
Commit
8a1f10f
1 Parent(s): a49ac49

fixed return value

Browse files
Files changed (1) hide show
  1. handler.py +56 -4
handler.py CHANGED
@@ -146,6 +146,51 @@ def get_model():
146
  return AurelioRNN
147
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  class EndpointHandler:
150
  def __init__(self, path=""):
151
  # load the optimized model
@@ -183,7 +228,6 @@ class EndpointHandler:
183
  "embedding_dim": embedding_dim,
184
  "rnn_units": rnn_units,
185
  }
186
- # load_from_hub
187
  lstm = get_model()
188
  model = lstm.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
189
  model.eval() # Set the model to evaluation mode
@@ -191,7 +235,6 @@ class EndpointHandler:
191
  dir_path = os.path.abspath(os.path.dirname(__file__))
192
  # Load the Kapampangan vocabulary
193
  kapampangan_vocabulary = load_vocabulary_from_file(os.path.join(dir_path, "kapampangan.txt"))
194
- # Define the source and destination paths
195
  seq_length = 64
196
 
197
  tokenizer = ByteLevelBPETokenizer(
@@ -210,7 +253,16 @@ class EndpointHandler:
210
  vocabulary=kapampangan_vocabulary,
211
  tokenizer=tokenizer,
212
  )
213
- predictions.append(generated_text)
 
 
 
 
 
 
 
 
 
 
214
 
215
- # return predictions as concatenated string
216
  return predictions
 
146
  return AurelioRNN
147
 
148
 
149
def calculate_perplexity_on_text(model, text, seq_length, tokenizer):
    """Compute the perplexity of ``model`` on ``text``.

    The text is tokenized and scored in overlapping windows of
    ``seq_length`` tokens; each window predicts the next-token targets
    shifted by one position.

    Args:
        model: LSTM language model exposing ``init_state(batch)`` and a
            ``forward(input, (h, c)) -> (logits, (h, c))`` call
            (assumed from usage — confirm against AurelioRNN).
        text: Raw string to evaluate.
        seq_length: Window length in tokens.
        tokenizer: Tokenizer whose ``encode(text).ids`` yields token ids
            (ByteLevelBPETokenizer-style interface).

    Returns:
        float: ``exp(average per-token cross-entropy)``; ``inf`` when the
        text is too short to form even one window.
    """
    loss_fn = nn.CrossEntropyLoss()  # default reduction='mean': per-token mean loss

    model.eval()
    total_loss = 0.0
    total_windows = 0

    # Tokenize the text
    encoded = tokenizer.encode(text)
    ids = encoded.ids
    # Need at least seq_length + 1 ids to build one (input, target) pair.
    if len(ids) <= seq_length:
        print(
            "Input text is too short to calculate perplexity. length:",
            len(ids),
            "seq_length:",
            seq_length,
        )
        return float(
            "inf"
        )

    inputs = [ids[i : i + seq_length] for i in range(len(ids) - seq_length)]
    targets = [ids[i + 1 : i + seq_length + 1] for i in range(len(ids) - seq_length)]

    state_h, state_c = model.init_state(1)

    with torch.no_grad():
        for i in range(len(inputs)):
            # NOTE(review): `device` is a module-level global defined elsewhere
            # in handler.py — confirm it is set before this is called.
            input_tensor = torch.tensor(inputs[i]).unsqueeze(0).to(device)
            target_tensor = torch.tensor(targets[i]).unsqueeze(0).to(device)

            # Detach carried state so no graph accumulates across windows.
            output, (state_h, state_c) = model(
                input_tensor, (state_h.detach(), state_c.detach())
            )
            loss = loss_fn(output.transpose(1, 2), target_tensor)
            total_loss += loss.item()
            total_windows += 1

    # BUG FIX: loss.item() is already the MEAN over the seq_length tokens of a
    # window (CrossEntropyLoss reduction='mean'), so the correct average
    # per-token loss is the mean over windows. The previous code divided by
    # total tokens (windows * seq_length), dividing by seq_length twice and
    # reporting a perplexity roughly seq_length times too small.
    average_loss = total_loss / total_windows

    perplexity = np.exp(average_loss)
    return perplexity
191
+
192
+
193
+
194
  class EndpointHandler:
195
  def __init__(self, path=""):
196
  # load the optimized model
 
228
  "embedding_dim": embedding_dim,
229
  "rnn_units": rnn_units,
230
  }
 
231
  lstm = get_model()
232
  model = lstm.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
233
  model.eval() # Set the model to evaluation mode
 
235
  dir_path = os.path.abspath(os.path.dirname(__file__))
236
  # Load the Kapampangan vocabulary
237
  kapampangan_vocabulary = load_vocabulary_from_file(os.path.join(dir_path, "kapampangan.txt"))
 
238
  seq_length = 64
239
 
240
  tokenizer = ByteLevelBPETokenizer(
 
253
  vocabulary=kapampangan_vocabulary,
254
  tokenizer=tokenizer,
255
  )
256
+ perplexity = calculate_perplexity_on_text(
257
+ model, generated_text, seq_length=seq_length - 1, tokenizer=tokenizer
258
+ )
259
+ predictions.append(
260
+ {
261
+ "label": error_rate,
262
+ "score": 1 - error_rate,
263
+ "generated_text": generated_text,
264
+ "perplexity": perplexity
265
+ }
266
+ )
267
 
 
268
  return predictions