Blanca committed
Commit b7146d7 · verified · 1 Parent(s): 61c5e3f

Update app.py

Files changed (1)
  1. app.py +41 -38
app.py CHANGED
@@ -215,7 +215,7 @@ def add_new_eval(

    call_start()

-    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # check where is this saved
+    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
    with open(file_path, 'r') as f:
        data = json.load(f)
    scores = []
@@ -231,45 +231,48 @@ def add_new_eval(
    # TODO: here upload the embedding that I have saved, so they can be used in similarity evaluation

    #print(reference_set, flush=True)
-    if len(line['cqs']) < 3: # make sure there are at least 3 cqs
-        return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
-    for cq in line['cqs'][:3]: # here only take the first 3 cqs
-        cq_text = cq['cq']
-
-        if METRIC == 'similarity':
-            sentence_embedding = similarity_model.encode(cq_text)
-            #reference_embedding = similarity_model.encode(reference_set) # TODO: here have the embeddings directly, do no calculate each time
-            sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
-
-            winner = np.argmax(sims)
-            # make sure the similarity of the winning reference sentence is at least 0.65
-            if sims[winner] > 0.65:
-                label = references[indx][winner]['label']
-            else:
-                label = 'not_able_to_evaluate'
-
-        if METRIC == 'gemma':
-            prompts = get_prompts(cq_text, '\n'.join(reference_set))
-            winner = run_model(model, tokenizer, prompts['compare'])
-            try: # here make sure the output is the id of a reference cq
-                if winner.strip() != 'Similar reference not found.':
-                    label = references[index][int(winner)]['label']
-                else:
+    if len(line['cqs']) < 3 and type(line['cqs']) is list: # make sure there are at least 3 cqs
+        #return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
+        intervention_score = 0
+        nae = 3
+    else:
+        for cq in line['cqs'][:3]: # here only take the first 3 cqs
+            cq_text = cq['cq']
+
+            if METRIC == 'similarity':
+                sentence_embedding = similarity_model.encode(cq_text)
+                #reference_embedding = similarity_model.encode(reference_set) # TODO: here have the embeddings directly, do no calculate each time
+                sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
+
+                winner = np.argmax(sims)
+                # make sure the similarity of the winning reference sentence is at least 0.65
+                if sims[winner] > 0.65:
+                    label = references[indx][winner]['label']
+                else:
                    label = 'not_able_to_evaluate'
+
+            if METRIC == 'gemma':
+                prompts = get_prompts(cq_text, '\n'.join(reference_set))
+                winner = run_model(model, tokenizer, prompts['compare'])
+                try: # here make sure the output is the id of a reference cq
+                    if winner.strip() != 'Similar reference not found.':
+                        label = references[index][int(winner)]['label']
+                    else:
+                        label = 'not_able_to_evaluate'
+                        print(winner, flush=True)
+                except IndexError:
+                    label = 'evaluation_issue'
                    print(winner, flush=True)
-            except IndexError:
-                label = 'evaluation_issue'
-                print(winner, flush=True)
-            except ValueError:
-                label = 'evaluation_issue'
-                print(winner, flush=True)
-
-        #print(label, flush=True)
-        num_cqs += 1
-        if label == 'Useful':
-            intervention_score += 1/3
-        if label == 'not_able_to_evaluate':
-            nae += 1
+                except ValueError:
+                    label = 'evaluation_issue'
+                    print(winner, flush=True)
+
+            #print(label, flush=True)
+            num_cqs += 1
+            if label == 'Useful':
+                intervention_score += 1/3
+            if label == 'not_able_to_evaluate':
+                nae += 1

    #print(id_to_eval, intervention_score, flush=True)
    scores.append(intervention_score)
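
The substantive change in this commit is the handling of interventions with fewer than three critical questions: rather than aborting the whole submission via format_warning, such entries now score 0 with nae set to 3, and evaluation of well-formed entries proceeds inside the new else branch. Below is a minimal sketch of that per-intervention scoring, with the CQ-labelling step abstracted into a caller-supplied function; the helper names and example data are illustrative, and the guard here also treats a non-list 'cqs' field as unscorable, which is slightly stricter than the committed condition len(line['cqs']) < 3 and type(line['cqs']) is list.

from typing import Callable, Dict, Tuple

def score_intervention(line: Dict, label_fn: Callable[[str], str]) -> Tuple[float, int]:
    """Score one intervention on its first three critical questions.

    label_fn maps a CQ string to a label such as 'Useful' or
    'not_able_to_evaluate'. Each 'Useful' CQ adds 1/3 to the score;
    nae counts CQs that could not be matched to any reference.
    """
    cqs = line.get('cqs')
    # Guard: a short (or malformed) CQ list no longer aborts the run;
    # the intervention simply scores 0 and all three slots count as nae.
    if not isinstance(cqs, list) or len(cqs) < 3:
        return 0.0, 3
    intervention_score, nae = 0.0, 0
    for cq in cqs[:3]:  # only the first three CQs are evaluated
        label = label_fn(cq['cq'])
        if label == 'Useful':
            intervention_score += 1 / 3
        if label == 'not_able_to_evaluate':
            nae += 1
    return intervention_score, nae

# Illustrative usage with a stand-in labeller that marks everything 'Useful'.
score, nae = score_intervention(
    {'cqs': [{'cq': 'What evidence supports the claim?'},
             {'cq': 'Is the cited source reliable?'},
             {'cq': 'Could there be relevant exceptions?'}]},
    label_fn=lambda cq_text: 'Useful',
)
print(score, nae)  # ~1.0 0 (floating-point accumulation of 1/3)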
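
For the similarity metric above, the pattern is the standard sentence-transformers one: embed the candidate CQ, compare it against precomputed embeddings of the reference CQs, take the argmax, and accept the winning reference's label only when the similarity clears the 0.65 threshold. A minimal self-contained sketch follows; the model name, reference questions, and labels are illustrative assumptions rather than values from app.py, and the model.similarity() call (the same one the app uses) requires a recent sentence-transformers release.

import numpy as np
from sentence_transformers import SentenceTransformer

similarity_model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed model, not necessarily the one in app.py

references = [  # hypothetical reference CQs with usefulness labels
    {'cq': 'What evidence supports the claim?', 'label': 'Useful'},
    {'cq': 'Is the cited source reliable?', 'label': 'Useful'},
    {'cq': "What colour is the speaker's shirt?", 'label': 'Unhelpful'},
]
# Precompute the reference embeddings once, as the TODO in the diff suggests.
reference_embeddings = similarity_model.encode([r['cq'] for r in references])

def label_by_similarity(cq_text: str, threshold: float = 0.65) -> str:
    """Return the label of the most similar reference CQ, or
    'not_able_to_evaluate' when the best similarity is below the threshold."""
    sentence_embedding = similarity_model.encode(cq_text)
    sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
    winner = int(np.argmax(sims))
    return references[winner]['label'] if sims[winner] > threshold else 'not_able_to_evaluate'

print(label_by_similarity('Is there any evidence for this?'))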