Commit
·
8a1f10f
1
Parent(s):
a49ac49
fixed return value
Browse files- handler.py +56 -4
handler.py
CHANGED
@@ -146,6 +146,51 @@ def get_model():
|
|
146 |
return AurelioRNN
|
147 |
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
class EndpointHandler:
|
150 |
def __init__(self, path=""):
|
151 |
# load the optimized model
|
@@ -183,7 +228,6 @@ class EndpointHandler:
|
|
183 |
"embedding_dim": embedding_dim,
|
184 |
"rnn_units": rnn_units,
|
185 |
}
|
186 |
-
# load_from_hub
|
187 |
lstm = get_model()
|
188 |
model = lstm.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
|
189 |
model.eval() # Set the model to evaluation mode
|
@@ -191,7 +235,6 @@ class EndpointHandler:
|
|
191 |
dir_path = os.path.abspath(os.path.dirname(__file__))
|
192 |
# Load the Kapampangan vocabulary
|
193 |
kapampangan_vocabulary = load_vocabulary_from_file(os.path.join(dir_path, "kapampangan.txt"))
|
194 |
-
# Define the source and destination paths
|
195 |
seq_length = 64
|
196 |
|
197 |
tokenizer = ByteLevelBPETokenizer(
|
@@ -210,7 +253,16 @@ class EndpointHandler:
|
|
210 |
vocabulary=kapampangan_vocabulary,
|
211 |
tokenizer=tokenizer,
|
212 |
)
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
|
215 |
-
# return predictions as a concatenated string
|
216 |
return predictions
|
|
|
146 |
return AurelioRNN
|
147 |
|
148 |
|
149 |
+
def calculate_perplexity_on_text(model, text, seq_length, tokenizer):
    """Compute the perplexity of ``model`` on ``text``.

    The text is tokenized and split into overlapping windows of
    ``seq_length`` tokens; the model's mean cross-entropy over all window
    tokens is exponentiated to give perplexity.

    Args:
        model: LSTM-style language model exposing ``init_state(batch)`` and
            whose forward pass returns ``(logits, (h, c))``. Logits are
            expected shaped (batch, seq, vocab) — this is what the
            ``output.transpose(1, 2)`` below relies on.
        text: Raw string to evaluate.
        seq_length: Window length in tokens.
        tokenizer: Tokenizer whose ``encode(text)`` returns an object with
            an ``.ids`` list (e.g. a ByteLevelBPETokenizer encoding).

    Returns:
        float: the perplexity, or ``float("inf")`` when the text is too
        short to form a single (input, target) window.
    """
    loss_fn = nn.CrossEntropyLoss()

    model.eval()
    total_loss = 0.0
    total_tokens = 0

    # Tokenize the text
    encoded = tokenizer.encode(text)
    ids = encoded.ids
    if len(ids) <= seq_length:
        print(
            "Input text is too short to calculate perplexity. length:",
            len(ids),
            "seq_length:",
            seq_length,
        )
        return float(
            "inf"
        )

    # Derive the device from the model itself instead of reading a
    # module-level `device` global (the original relied on an undeclared
    # free name, breaking the function when used standalone).
    device = next(model.parameters()).device

    # Overlapping windows; each target is its input shifted right by one token.
    inputs = [ids[i : i + seq_length] for i in range(len(ids) - seq_length)]
    targets = [ids[i + 1 : i + seq_length + 1] for i in range(len(ids) - seq_length)]

    state_h, state_c = model.init_state(1)

    with torch.no_grad():
        for window, target in zip(inputs, targets):
            input_tensor = torch.tensor(window).unsqueeze(0).to(device)
            target_tensor = torch.tensor(target).unsqueeze(0).to(device)

            output, (state_h, state_c) = model(
                input_tensor, (state_h.detach(), state_c.detach())
            )
            # CrossEntropyLoss returns the MEAN over the window's tokens,
            # so weight it by the token count before accumulating. The
            # original summed per-window means and then divided by
            # windows * seq_length — dividing by the window length twice
            # and understating perplexity.
            loss = loss_fn(output.transpose(1, 2), target_tensor)
            total_loss += loss.item() * target_tensor.numel()
            total_tokens += target_tensor.numel()

    average_loss = total_loss / total_tokens

    perplexity = np.exp(average_loss)
    return perplexity
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
class EndpointHandler:
|
195 |
def __init__(self, path=""):
|
196 |
# load the optimized model
|
|
|
228 |
"embedding_dim": embedding_dim,
|
229 |
"rnn_units": rnn_units,
|
230 |
}
|
|
|
231 |
lstm = get_model()
|
232 |
model = lstm.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
|
233 |
model.eval() # Set the model to evaluation mode
|
|
|
235 |
dir_path = os.path.abspath(os.path.dirname(__file__))
|
236 |
# Load the Kapampangan vocabulary
|
237 |
kapampangan_vocabulary = load_vocabulary_from_file(os.path.join(dir_path, "kapampangan.txt"))
|
|
|
238 |
seq_length = 64
|
239 |
|
240 |
tokenizer = ByteLevelBPETokenizer(
|
|
|
253 |
vocabulary=kapampangan_vocabulary,
|
254 |
tokenizer=tokenizer,
|
255 |
)
|
256 |
+
perplexity = calculate_perplexity_on_text(
|
257 |
+
model, generated_text, seq_length=seq_length - 1, tokenizer=tokenizer
|
258 |
+
)
|
259 |
+
predictions.append(
|
260 |
+
{
|
261 |
+
"label": error_rate,
|
262 |
+
"score": 1 - error_rate,
|
263 |
+
"generated_text": generated_text,
|
264 |
+
"perplexity": perplexity
|
265 |
+
}
|
266 |
+
)
|
267 |
|
|
|
268 |
return predictions
|