Update app.py
app.py
CHANGED
@@ -232,47 +232,49 @@ def add_new_eval(
 
             #print(reference_set, flush=True)
             if len(line['cqs']) < 3 or type(line['cqs']) is not list: # make sure there are at least 3 cqs
+                num_cqs += 3
                 #return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
- [11 deleted lines not shown]
-                winner = np.argmax(sims)
-                # make sure the similarity of the winning reference sentence is at least 0.65
-                if sims[winner] > 0.65:
-                    label = references[indx][winner]['label']
-                else:
-                    label = 'not_able_to_evaluate'
+                continue
+            for cq in line['cqs'][:3]: # here only take the first 3 cqs
+                if type(cq) is not dict:
+                    num_cqs += 1
+                    continue
+                cq_text = cq['cq']
+
+                if METRIC == 'similarity':
+                    sentence_embedding = similarity_model.encode(cq_text)
+                    #reference_embedding = similarity_model.encode(reference_set) # TODO: here have the embeddings directly, do not calculate each time
+                    sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
 
- [14 deleted lines not shown]
+                    winner = np.argmax(sims)
+                    # make sure the similarity of the winning reference sentence is at least 0.65
+                    if sims[winner] > 0.65:
+                        label = references[indx][winner]['label']
+                    else:
+                        label = 'not_able_to_evaluate'
+
+                if METRIC == 'gemma':
+                    prompts = get_prompts(cq_text, '\n'.join(reference_set))
+                    winner = run_model(model, tokenizer, prompts['compare'])
+                    try: # here make sure the output is the id of a reference cq
+                        if winner.strip() != 'Similar reference not found.':
+                            label = references[index][int(winner)]['label']
+                        else:
+                            label = 'not_able_to_evaluate'
                             print(winner, flush=True)
- [7 deleted lines not shown]
+                    except IndexError:
+                        label = 'evaluation_issue'
+                        print(winner, flush=True)
+                    except ValueError:
+                        label = 'evaluation_issue'
+                        print(winner, flush=True)
+
+                #print(label, flush=True)
+                num_cqs += 1
+                if label == 'Useful':
+                    intervention_score += 1/3
+                if label == 'not_able_to_evaluate':
+                    nae += 1
 
             #print(id_to_eval, intervention_score, flush=True)
             scores.append(intervention_score)
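For readers skimming the commit, the similarity branch reduces to the standalone sketch below. It assumes a sentence-transformers model (v3+, which provides encode and similarity); the model name, the reference list, and the label_cq helper are illustrative placeholders, not values taken from app.py.

# Minimal sketch of the similarity-based labelling step introduced above.
# Assumes sentence-transformers >= 3.0; model name and references are illustrative only.
import numpy as np
from sentence_transformers import SentenceTransformer

similarity_model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model

# Hypothetical reference CQs, each carrying a gold usefulness label.
references = [
    {"cq": "What evidence supports this claim?", "label": "Useful"},
    {"cq": "Is the cited statistic from a reliable source?", "label": "Useful"},
    {"cq": "What colour is the speaker's shirt?", "label": "Unhelpful"},
]
reference_embeddings = similarity_model.encode([r["cq"] for r in references])

def label_cq(cq_text: str) -> str:
    """Label a generated CQ by its closest reference, requiring similarity > 0.65."""
    sentence_embedding = similarity_model.encode(cq_text)
    sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
    winner = int(np.argmax(sims))
    if sims[winner] > 0.65:
        return references[winner]["label"]
    return "not_able_to_evaluate"

print(label_cq("Which evidence backs up this claim?"))  # likely 'Useful' for a close paraphrase

In the actual loop, only the first three CQs per intervention are scored, and each CQ labelled Useful adds 1/3 to that intervention's score, so three useful questions yield a full point.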