mmahesh873 committed
Commit 9101fdb
1 Parent(s): 4d8a229

minor changes to structure

Files changed (1):
  1. app.py +12 -12
app.py CHANGED
@@ -16,6 +16,15 @@ import os
 prefix_post_processing = os.environ["POST_PROCESSING_JSON"]
 
 
+st.header('Evaluation dataset')
+st.write(other_info_dict['data_description'])
+
+
+# %%
+st.header("Prompt")
+st.write("For each data point in the evaluation dataset, we create a prompt for LLM by adding the context and the question to the below prompt template, while following the same structure of the prompt template.")
+
+
 prompt_options_dict = {
     'Prompt option 0 (with typos and grammatical errors, two shot prompting)': 'prompt_option_0.json',
     'Prompt option 1 (Zero shot prompting)': 'prompt_option_1.json',
@@ -23,27 +32,16 @@ prompt_options_dict = {
     'Prompt option 3 (Prompt option 0 with minor fixes)': 'prompt_option_3.json'
 }
 t_result_file = st.selectbox(
-    'Select result file:',
+    'Select the prompt:',
     list(prompt_options_dict.keys()))
 
 result_file = prefix_post_processing + prompt_options_dict[t_result_file]
 
 prompt_option = int(prompt_options_dict[t_result_file].split('_')[-1].split('.')[0])
 
-# with open(result_file, 'r') as f:
-#     data_dict = json.load(f)
-
 with urllib.request.urlopen(result_file) as url:
     data_dict = json.load(url)
 
-st.header('Evaluation dataset')
-st.write(other_info_dict['data_description'])
-overall_performance = round(data_dict["Overall performance"]*100, 2)
-
-
-# %%
-st.header("Prompt")
-st.write("For each data point in the evaluation dataset, we create a prompt for LLM by adding the context and the question to the below prompt template, while following the same structure of the prompt template.")
 # File uploader
 with open(f'prompt_{prompt_option}.txt', "r") as file:
     file_contents = file.read()
@@ -54,6 +52,8 @@ st.write("In the case that the LLM answers <NO ANSWER>, the output is set to an
 
 # 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'
 # %%
+overall_performance = round(data_dict["Overall performance"]*100, 2)
+
 st.header('Performance metric')
 st.write("""The performance metric used is an estimation of the percentage of correctly answered questions, i.e. the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see compute_exact function in SQuAD2.0 official evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong), and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered to be correctly answered (by the model), if the predicted answer after normalization (text is converted to lowercase, and punctuation, articles and extra whitespace are removed) matches exactly with any of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer.""")
 with st.container():
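
The commented-out 'Context: ... Answer:' line in the last hunk hints at how each prompt is assembled before being sent to the LLM. A minimal sketch of that assembly, assuming a hypothetical build_prompt helper and made-up context/question values; the exact concatenation used in app.py is not shown in this diff:

# Sketch only: build_prompt is a hypothetical helper, not a function from app.py.
# It appends a data point's context and question to the template read from
# prompt_{prompt_option}.txt, following the commented-out pattern in the diff.
def build_prompt(template: str, context: str, question: str) -> str:
    return (
        template.rstrip()
        + "\n\n" + "Context: " + context
        + "\n\n" + "Question: " + question
        + "\n\n" + "Answer:"
    )

# Illustrative usage with made-up inputs:
# prompt = build_prompt(file_contents, "Paris is the capital of France.", "What is the capital of France?")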
 
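The 'Performance metric' text describes SQuAD-style exact match after normalization (lowercase; punctuation, articles and extra whitespace removed). A short sketch of that computation, mirroring the steps named in the description; compute_exact is the function cited from the SQuAD2.0 evaluation script, while normalize_answer and overall_accuracy are illustrative names, not taken from app.py:

import re
import string

def normalize_answer(s: str) -> str:
    # Lowercase, drop punctuation and the articles a/an/the, collapse whitespace,
    # as described in the performance-metric text above.
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def compute_exact(gold: str, pred: str) -> int:
    # 1 if the normalized prediction matches the normalized ground truth, else 0.
    return int(normalize_answer(gold) == normalize_answer(pred))

def overall_accuracy(gold_answer_lists, predictions) -> float:
    # Fraction of data points whose prediction matches any ground-truth answer;
    # for unanswerable questions the empty string is the only ground truth.
    correct = 0
    for golds, pred in zip(gold_answer_lists, predictions):
        golds = golds or [""]
        correct += max(compute_exact(g, pred) for g in golds)
    return correct / len(predictions)

# Illustrative usage with made-up data (punctuation and case are ignored):
# overall_accuracy([["Paris"], [""]], ["paris!", ""])  # -> 1.0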