Update app.py
app.py CHANGED
@@ -215,7 +215,7 @@ def add_new_eval(
 
     call_start()
 
-    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
+    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             data = json.load(f)
         scores = []
@@ -231,45 +231,48 @@ def add_new_eval(
             # TODO: here upload the embedding that I have saved, so they can be used in similarity evaluation
 
             #print(reference_set, flush=True)
-            if len(line['cqs']) < 3: # make sure there are at least 3 cqs
-                return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    prompts = get_prompts(cq_text, '\n'.join(reference_set))
-                    winner = run_model(model, tokenizer, prompts['compare'])
-                    try: # here make sure the output is the id of a reference cq
-                        if winner.strip() != 'Similar reference not found.':
-                            label = references[index][int(winner)]['label']
-                        else:
+            if len(line['cqs']) < 3 and type(line['cqs']) is list: # make sure there are at least 3 cqs
+                #return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
+                intervention_score = 0
+                nae = 3
+            else:
+                for cq in line['cqs'][:3]: # here only take the first 3 cqs
+                    cq_text = cq['cq']
+
+                    if METRIC == 'similarity':
+                        sentence_embedding = similarity_model.encode(cq_text)
+                        #reference_embedding = similarity_model.encode(reference_set) # TODO: here have the embeddings directly, do no calculate each time
+                        sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
+
+                        winner = np.argmax(sims)
+                        # make sure the similarity of the winning reference sentence is at least 0.65
+                        if sims[winner] > 0.65:
+                            label = references[indx][winner]['label']
+                        else:
                             label = 'not_able_to_evaluate'
+
+                    if METRIC == 'gemma':
+                        prompts = get_prompts(cq_text, '\n'.join(reference_set))
+                        winner = run_model(model, tokenizer, prompts['compare'])
+                        try: # here make sure the output is the id of a reference cq
+                            if winner.strip() != 'Similar reference not found.':
+                                label = references[index][int(winner)]['label']
+                            else:
+                                label = 'not_able_to_evaluate'
+                                print(winner, flush=True)
+                        except IndexError:
+                            label = 'evaluation_issue'
                             print(winner, flush=True)
-
-
-
-
-
-
-
-
-
-
-                    intervention_score += 1/3
-                    if label == 'not_able_to_evaluate':
-                        nae += 1
+                        except ValueError:
+                            label = 'evaluation_issue'
+                            print(winner, flush=True)
+
+                    #print(label, flush=True)
+                    num_cqs += 1
+                    if label == 'Useful':
+                        intervention_score += 1/3
+                    if label == 'not_able_to_evaluate':
+                        nae += 1
 
             #print(id_to_eval, intervention_score, flush=True)
             scores.append(intervention_score)
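
A minimal sketch of the new similarity path, assuming sentence-transformers v3+ (which provides model.similarity); the model name, the toy reference CQs, and the label values other than those in the diff are placeholders, while the argmax-plus-0.65-threshold logic mirrors the change above:

import numpy as np
from sentence_transformers import SentenceTransformer

similarity_model = SentenceTransformer("all-MiniLM-L6-v2")  # hypothetical model choice

# Hypothetical per-intervention reference CQs, each carrying a usefulness label.
references = [
    {"cq": "What evidence supports this claim?", "label": "Useful"},
    {"cq": "Is the cited source reliable?", "label": "Useful"},
    {"cq": "Could the speaker be biased?", "label": "Unhelpful"},
]
reference_set = [r["cq"] for r in references]
reference_embeddings = similarity_model.encode(reference_set)  # precomputed once, as the TODO suggests

def label_by_similarity(cq_text):
    """Label a generated CQ with the label of its most similar reference CQ."""
    sentence_embedding = similarity_model.encode(cq_text)
    sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
    winner = int(np.argmax(sims))
    # Only trust the match when the best cosine similarity clears the 0.65 threshold.
    if sims[winner] > 0.65:
        return references[winner]["label"]
    return "not_able_to_evaluate"

print(label_by_similarity("Which evidence backs this statement up?"))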
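The Gemma branch depends on the app's own get_prompts and run_model helpers, so only the reply-parsing step is sketched; parse_judge_reply is a hypothetical name, but the accepted replies (a reference index or the literal 'Similar reference not found.') and the IndexError/ValueError fallbacks follow the diff:

def parse_judge_reply(winner, references):
    """Map the judge model's raw reply onto a reference label."""
    try:
        if winner.strip() != 'Similar reference not found.':
            return references[int(winner)]['label']  # reply is expected to be a reference index
        return 'not_able_to_evaluate'
    except (IndexError, ValueError):  # non-numeric reply or out-of-range index
        return 'evaluation_issue'

# e.g. parse_judge_reply('2', references) -> label of the third reference CQ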
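And a sketch of the per-intervention scoring that wraps either metric: interventions with fewer than three CQs score 0 with all three counted as not evaluable, otherwise each of the first three CQs adds 1/3 when labelled 'Useful'. The function name, the injected label_fn, and the defensive isinstance check are assumptions for illustration:

def score_intervention(cqs, label_fn):
    """Return (intervention_score, nae) for one intervention's submitted CQs."""
    if not isinstance(cqs, list) or len(cqs) < 3:
        return 0, 3  # too few questions: no credit, all three marked not evaluable
    intervention_score, nae = 0.0, 0
    for cq in cqs[:3]:  # only the first three CQs are scored
        label = label_fn(cq['cq'])
        if label == 'Useful':
            intervention_score += 1 / 3
        if label == 'not_able_to_evaluate':
            nae += 1
    return intervention_score, nae

# e.g. score_intervention(line['cqs'], label_by_similarity)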