Update app.py
app.py CHANGED
@@ -215,7 +215,7 @@ def add_new_eval(
 
     call_start()
 
-    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
+    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             data = json.load(f)
         scores = []
@@ -231,45 +231,48 @@ def add_new_eval(
             # TODO: here upload the embedding that I have saved, so they can be used in similarity evaluation
 
             #print(reference_set, flush=True)
-            if len(line['cqs']) < 3: # make sure there are at least 3 cqs
-                return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    prompts = get_prompts(cq_text, '\n'.join(reference_set))
-                    winner = run_model(model, tokenizer, prompts['compare'])
-                    try: # here make sure the output is the id of a reference cq
-                        if winner.strip() != 'Similar reference not found.':
-                            label = references[index][int(winner)]['label']
-                        else:
+            if len(line['cqs']) < 3 and type(line['cqs']) is list: # make sure there are at least 3 cqs
+                #return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
+                intervention_score = 0
+                nae = 3
+            else:
+                for cq in line['cqs'][:3]: # here only take the first 3 cqs
+                    cq_text = cq['cq']
+
+                    if METRIC == 'similarity':
+                        sentence_embedding = similarity_model.encode(cq_text)
+                        #reference_embedding = similarity_model.encode(reference_set) # TODO: here have the embeddings directly, do no calculate each time
+                        sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
+
+                        winner = np.argmax(sims)
+                        # make sure the similarity of the winning reference sentence is at least 0.65
+                        if sims[winner] > 0.65:
+                            label = references[indx][winner]['label']
+                        else:
                             label = 'not_able_to_evaluate'
+
+                    if METRIC == 'gemma':
+                        prompts = get_prompts(cq_text, '\n'.join(reference_set))
+                        winner = run_model(model, tokenizer, prompts['compare'])
+                        try: # here make sure the output is the id of a reference cq
+                            if winner.strip() != 'Similar reference not found.':
+                                label = references[index][int(winner)]['label']
+                            else:
+                                label = 'not_able_to_evaluate'
+                                print(winner, flush=True)
+                        except IndexError:
+                            label = 'evaluation_issue'
                             print(winner, flush=True)
-
-
-
-
-
-
-
-
-
-
-                    intervention_score += 1/3
-                    if label == 'not_able_to_evaluate':
-                        nae += 1
+                        except ValueError:
+                            label = 'evaluation_issue'
+                            print(winner, flush=True)
+
+                    #print(label, flush=True)
+                    num_cqs += 1
+                    if label == 'Useful':
+                        intervention_score += 1/3
+                    if label == 'not_able_to_evaluate':
+                        nae += 1
 
             #print(id_to_eval, intervention_score, flush=True)
             scores.append(intervention_score)
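
A minimal sketch of the new similarity path, assuming sentence-transformers v3+ (which provides model.similarity); the model name, the toy reference CQs, and the label values other than those in the diff are placeholders, while the argmax-plus-0.65-threshold logic mirrors the change above:

import numpy as np
from sentence_transformers import SentenceTransformer

similarity_model = SentenceTransformer("all-MiniLM-L6-v2")  # hypothetical model choice

# Hypothetical per-intervention reference CQs, each carrying a usefulness label.
references = [
    {"cq": "What evidence supports this claim?", "label": "Useful"},
    {"cq": "Is the cited source reliable?", "label": "Useful"},
    {"cq": "Could the speaker be biased?", "label": "Unhelpful"},
]
reference_set = [r["cq"] for r in references]
reference_embeddings = similarity_model.encode(reference_set)  # precomputed once, as the TODO suggests

def label_by_similarity(cq_text):
    """Label a generated CQ with the label of its most similar reference CQ."""
    sentence_embedding = similarity_model.encode(cq_text)
    sims = similarity_model.similarity(sentence_embedding, reference_embeddings).tolist()[0]
    winner = int(np.argmax(sims))
    # Only trust the match when the best cosine similarity clears the 0.65 threshold.
    if sims[winner] > 0.65:
        return references[winner]["label"]
    return "not_able_to_evaluate"

print(label_by_similarity("Which evidence backs this statement up?"))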
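The Gemma branch depends on the app's own get_prompts and run_model helpers, so only the reply-parsing step is sketched; parse_judge_reply is a hypothetical name, but the accepted replies (a reference index or the literal 'Similar reference not found.') and the IndexError/ValueError fallbacks follow the diff:

def parse_judge_reply(winner, references):
    """Map the judge model's raw reply onto a reference label."""
    try:
        if winner.strip() != 'Similar reference not found.':
            return references[int(winner)]['label']  # reply is expected to be a reference index
        return 'not_able_to_evaluate'
    except (IndexError, ValueError):  # non-numeric reply or out-of-range index
        return 'evaluation_issue'

# e.g. parse_judge_reply('2', references) -> label of the third reference CQ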
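And a sketch of the per-intervention scoring that wraps either metric: interventions with fewer than three CQs score 0 with all three counted as not evaluable, otherwise each of the first three CQs adds 1/3 when labelled 'Useful'. The function name, the injected label_fn, and the defensive isinstance check are assumptions for illustration:

def score_intervention(cqs, label_fn):
    """Return (intervention_score, nae) for one intervention's submitted CQs."""
    if not isinstance(cqs, list) or len(cqs) < 3:
        return 0, 3  # too few questions: no credit, all three marked not evaluable
    intervention_score, nae = 0.0, 0
    for cq in cqs[:3]:  # only the first three CQs are scored
        label = label_fn(cq['cq'])
        if label == 'Useful':
            intervention_score += 1 / 3
        if label == 'not_able_to_evaluate':
            nae += 1
    return intervention_score, nae

# e.g. score_intervention(line['cqs'], label_by_similarity)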