mmahesh873 committed
Commit 9101fdb
1 Parent(s): 4d8a229
minor changes to structure
app.py CHANGED
@@ -16,6 +16,15 @@ import os
 prefix_post_processing = os.environ["POST_PROCESSING_JSON"]
 
 
+st.header('Evaluation dataset')
+st.write(other_info_dict['data_description'])
+
+
+# %%
+st.header("Prompt")
+st.write("For each data point in the evaluation dataset, we create a prompt for LLM by adding the context and the question to the below prompt template, while following the same structure of the prompt template.")
+
+
 prompt_options_dict = {
     'Prompt option 0 (with typos and grammatical errors, two shot prompting)': 'prompt_option_0.json',
     'Prompt option 1 (Zero shot prompting)': 'prompt_option_1.json',
@@ -23,27 +32,16 @@ prompt_options_dict = {
     'Prompt option 3 (Prompt option 0 with minor fixes)': 'prompt_option_3.json'
 }
 t_result_file = st.selectbox(
-    'Select
+    'Select the prompt:',
     list(prompt_options_dict.keys()))
 
 result_file = prefix_post_processing + prompt_options_dict[t_result_file]
 
 prompt_option = int(prompt_options_dict[t_result_file].split('_')[-1].split('.')[0])
 
-# with open(result_file, 'r') as f:
-#     data_dict = json.load(f)
-
 with urllib.request.urlopen(result_file) as url:
     data_dict = json.load(url)
 
-st.header('Evaluation dataset')
-st.write(other_info_dict['data_description'])
-overall_performance = round(data_dict["Overall performance"]*100, 2)
-
-
-# %%
-st.header("Prompt")
-st.write("For each data point in the evaluation dataset, we create a prompt for LLM by adding the context and the question to the below prompt template, while following the same structure of the prompt template.")
 # File uploader
 with open(f'prompt_{prompt_option}.txt', "r") as file:
     file_contents = file.read()
@@ -54,6 +52,8 @@ st.write("In the case that the LLM answers <NO ANSWER>, the output is set to an
 
 # 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'
 # %%
+overall_performance = round(data_dict["Overall performance"]*100, 2)
+
 st.header('Performance metric')
 st.write("""The performance metric used is an estimation of the percentage of correctly answered questions, i.e. the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see compute_exact function in SQuAD2.0 official evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong), and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered to be correctly answered (by the model), if the predicted answer after normalization (text is converted to lowercase, and punctuation, articles and extra whitespace are removed) matches exactly with any of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer.""")
 with st.container():
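
Note on the prompt-construction step shown above: the app text says each prompt is built by adding a data point's context and question to the selected prompt template, and the commented-out line hints at the layout. The sketch below is illustrative only and not part of this commit; build_prompt is a hypothetical helper, and the assumption is that the template read from prompt_{prompt_option}.txt is simply followed by the data point in the same Context/Question/Answer layout.

# Hypothetical sketch, not code from app.py: assembling the prompt for one data point.
# The names `context` and `t_question` mirror the commented-out line in the diff;
# `build_prompt` itself is an assumption, not a function defined in the app.
def build_prompt(template: str, context: str, t_question: str) -> str:
    return (template + '\n\n'
            + 'Context: ' + context + '\n\n'
            + 'Question: ' + t_question + '\n\n'
            + 'Answer:')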
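
To make the "Performance metric" description concrete: the normalization and exact-match check it describes correspond to the compute_exact logic of the linked SQuAD 2.0 evaluation script. The snippet below is a sketch of that logic rather than code from this Space; exact_match_accuracy is a hypothetical helper showing how the overall score (correct answers divided by data points) could be computed.

import re
import string

def normalize_answer(s):
    # Lowercase, then remove punctuation, articles and extra whitespace,
    # as described in the metric text above.
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def compute_exact(a_gold, a_pred):
    # 1 if the normalized prediction matches this normalized ground truth, else 0.
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def exact_match_accuracy(predictions, gold_answers):
    # Hypothetical helper: fraction of questions answered correctly.
    # gold_answers holds a list of ground-truth strings per question
    # (just [''] for unanswerable questions).
    scores = [max(compute_exact(g, p) for g in golds)
              for p, golds in zip(predictions, gold_answers)]
    return sum(scores) / len(scores)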