mmahesh873 committed on
Commit
ad2544a
1 Parent(s): c54a69d

added new updates

Files changed (2)
  1. app.py +21 -16
  2. config.py +3 -3
app.py CHANGED
@@ -9,11 +9,11 @@ from config import other_info_dict
9
  st.title("Microsoft Phi-2 LLM assessment")
10
  # st.image('model_card.png', caption='Hugging face description', use_column_width=True)
11
  st.write("""
12
- Microsoft Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical LLM models, Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
13
 
14
  import urllib.request
15
  import os
16
- result_file = os.environ.getattribute("POST_PROCESSING_JSON")
17
 
18
  with urllib.request.urlopen(result_file) as url:
19
  data_dict = json.load(url)
@@ -25,26 +25,24 @@ overall_performance = round(data_dict["Overall performance"]*100, 2)
25
 
26
  # %%
27
  st.header("Prompt")
28
-
29
  # File uploader
30
  with open('prompt_0.txt', "r") as file:
31
  file_contents = file.read()
32
  # st.write(file_contents)
33
- st.text_area("", value=file_contents, height=300)
34
- st.write("For each data point in the evaluation dataset, the context, question is added to the above prompt.")
35
-
36
- st.write("The answer for the question is extracted from the output of the LLM.")
37
- st.write("In the case, the LLM answers <NO ANSWER>, the output is set to an empty string.")
38
 
39
  # 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'
40
  # %%
41
  st.header('Performance metric')
42
- st.write(""" The performance metric used is an estimation of the percentage of correctly answered questions, i.e. the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see compute_exact function in SQuAD2.0 official evaluation script at https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/), taking values in [0,1], where 0 is worst (model always wrong), and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered to be correctly answered (by the model), if the predicted answer after normalization (text is converted to lowercase, and punctuation, articles and extra whitespace are removed) matches exactly with any of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer. In this assessment, the minimum performance threshold is set to 0.65. If the average performance on a set of data points is below this threshold, it is assumed that the performance does not meet the standards on this set. Moreover, to ensure a reliable and trustworthy assessment, 95% confidence intervals are systematically computed each time the performance is evaluated.""")
43
  with st.container():
44
  st.write(f"**Overall performance: {overall_performance}%**")
45
  # %%
46
  st.header("Bias ratios")
47
- st.write('Bias ratio is defined as the ratio of the highest performance to the lowest performance among reliable categories for a characteristic.')
48
  fairness_results = data_dict['Fairness results']
49
 
50
  characteristic_list = []
@@ -64,10 +62,17 @@ st.dataframe(ch_df)
64
 
65
 
66
  # %%
67
- st.header("Perturber families performance")
 
 
 
 
 
 
 
68
 
69
- st.write(f"ProbTypos: {other_info_dict['ProbTypos_description']}")
70
- st.write(f"MaxTypo: {other_info_dict['MaxTypo_description']}")
71
 
72
  global_perturber_families = data_dict['Perturber Families']
73
  t_pert_fig = None
@@ -105,12 +110,12 @@ st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
105
 
106
 
107
  # %%
108
- st.header("Performance, Fairness, Robustness")
109
 
110
  embedder_categories = data_dict['Embedder categories']
111
 
112
  option = st.selectbox(
113
- 'Select higher-level categorization/characteristic:',
114
  list(embedder_categories.keys()))
115
 
116
 
@@ -209,6 +214,6 @@ for item in global_perturber_families:
209
  t_pert_fig.add_trace(px.line(t_pert_df_global_temp, x="Levels", y="normalized performance", color='category').data[0])
210
  t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
211
 
212
- st.write(f'The following plot illustrates the normalized performance of the model across different categories for perturbation family: {family_name}.')
213
  st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
214
  st.markdown("---")
 
9
  st.title("Microsoft Phi-2 LLM assessment")
10
  # st.image('model_card.png', caption='Hugging face description', use_column_width=True)
11
  st.write("""
12
+ Microsoft Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical Large Language Models (LLMs), Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
13
 
14
  import urllib.request
15
  import os
16
+ result_file = os.environ["POST_PROCESSING_JSON"]
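For context on the change above: os.environ has no getattribute method, so the removed line would fail with AttributeError at runtime, and dictionary-style access is the supported API. A slightly more defensive variant (an illustration, not what the commit does) could look like:

    # os.environ behaves like a dict; .get() avoids a KeyError when the variable is unset.
    result_file = os.environ.get("POST_PROCESSING_JSON")
    if result_file is None:
        raise RuntimeError("POST_PROCESSING_JSON is not set")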
17
 
18
  with urllib.request.urlopen(result_file) as url:
19
  data_dict = json.load(url)
 
25
 
26
  # %%
27
  st.header("Prompt")
28
+ st.write("For each data point in the evaluation dataset, we create a prompt for LLM by adding the context and the question to the below prompt template, while following the same structure of the prompt template.")
29
  # File uploader
30
  with open('prompt_0.txt', "r") as file:
31
  file_contents = file.read()
32
  # st.write(file_contents)
33
+ st.text_area("Prompt template:", value=file_contents, height=300)
34
+ st.write("The answer to the question is obtained by post-processing the output of the LLM, wherein any additional content starting from the first 'Context: ' is disregarded.")
35
+ st.write("In the case that the LLM answers <NO ANSWER>, the output is set to an empty string.")
 
 
36
 
37
  # 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'
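A minimal, self-contained sketch of the prompt construction described above, assuming (per the comment) that the context and question are appended to the template in the 'Context / Question / Answer' format; the sample values are illustrative only:

    with open('prompt_0.txt', 'r') as file:
        prompt_template = file.read()
    context = "Phi-2 is a Transformer model with 2.7 billion parameters."  # illustrative value
    t_question = "How many parameters does Phi-2 have?"                    # illustrative value
    # Append the data point to the template in the commented format above.
    full_prompt = prompt_template + '\n\n' + 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'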
38
  # %%
39
  st.header('Performance metric')
40
+ st.write("""The performance metric used is an estimation of the percentage of correctly answered questions, i.e. the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see compute_exact function in SQuAD2.0 official evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong), and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered to be correctly answered (by the model), if the predicted answer after normalization (text is converted to lowercase, and punctuation, articles and extra whitespace are removed) matches exactly with any of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer.""")
41
  with st.container():
42
  st.write(f"**Overall performance: {overall_performance}%**")
43
  # %%
44
  st.header("Bias ratios")
45
+ st.write('Bias ratio is defined as the ratio of the lowest performance to the highest performance among categories that have sufficient data (more than 50 data points) for a characteristic. The following table shows the bias ratio for each of the considered characteristics.')
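A minimal sketch of the bias ratio computation defined above (function and variable names are illustrative, not taken from the app):

    def bias_ratio(performance_by_category, count_by_category, min_points=50):
        # Keep only categories with sufficient data, then divide the lowest
        # performance by the highest one.
        reliable = [perf for cat, perf in performance_by_category.items()
                    if count_by_category[cat] > min_points]
        return min(reliable) / max(reliable)

    # Example: performances {'A': 0.70, 'B': 0.63} with enough data in both categories give 0.63 / 0.70 = 0.9.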
46
  fairness_results = data_dict['Fairness results']
47
 
48
  characteristic_list = []
 
62
 
63
 
64
  # %%
65
+ st.header("Robustness")
66
+
67
+ st.write(f"""We evaluate the robustness of the LLM by assessing the variation in performance when perturbations are introduced to the content outside of the prompt template. The following plot shows the performance across different levels of perturbation within a perturbation family that consists of a series of perturbation methods. We consider the following perturbation families.
68
+
69
+ - ProbTypos: {other_info_dict['ProbTypos_description']}
70
+
71
+ - MaxTypo: {other_info_dict['MaxTypo_description']}
72
+ """)
73
 
74
+ # st.write(f"ProbTypos: {other_info_dict['ProbTypos_description']}")
75
+ # st.write(f"MaxTypo: {other_info_dict['MaxTypo_description']}")
76
 
77
  global_perturber_families = data_dict['Perturber Families']
78
  t_pert_fig = None
 
110
 
111
 
112
  # %%
113
+ st.header("Characteristic results")
114
 
115
  embedder_categories = data_dict['Embedder categories']
116
 
117
  option = st.selectbox(
118
+ 'Select characteristic:',
119
  list(embedder_categories.keys()))
120
 
121
 
 
214
  t_pert_fig.add_trace(px.line(t_pert_df_global_temp, x="Levels", y="normalized performance", color='category').data[0])
215
  t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
216
 
217
+ st.write(f'The following plot illustrates the normalized performance of the model across different categories for the perturbation family: {family_name}.')
218
  st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
219
  st.markdown("---")
config.py CHANGED
@@ -1,7 +1,7 @@
1
  other_info_dict = {
2
- "data_description": "We perform the LLM assessment with the SQuAD2.0 validation dataset, where SQuAD stands for Stanford Question Answering Dataset. The dataset is available at https://rajpurkar.github.io/SQuAD-explorer/. There are 12k points in the dataset. Each data point of the SQuAD2.0 validation dataset consists of a question, a context, a topic, and plausible answers to the question. Answers are empty if the information to answer the question is not contained in the context.",
3
- "ProbTypos_description" : "Typo perturber adds typing mistakes (Typo) to the input question. Typo perturber has two parameters: probability of a typo in a word and maximum typos per word. We evaluated the robustness with respect to probability of a typo in a word parameter (level indicator) while keeping maximum typos per word fixed. Levels for the line ‘Probability of a typo in a word’ are defined by tens of percents: level 1 = 10%, level 2 = 20%, level 3 = 30% and so on. Maximum typos per word equals 1 everywhere. We evaluated the robustness on a level 1, 3, 5.",
4
- "MaxTypo_description" : "We use the Typo perturber as detailed above, however we evaluated the robustness with respect to maximum typos per word parameter (level indicator) while keeping probability of a typo in a word fixed. Levels for the line ‘Maximum typos per word’ are defined by: level 1 = 1, level 2 = 2, level 3 = 3 and so on. Probability of a typo in a word equals 10% everywhere. We evaluated the robustness on a level 1, 3, 5.",
5
  "ethnicity_categories_text": """
6
  Datapoints are categorized based on specific keywords appearing in the text, with the following list outlining the considered categories and their respective keywords.
7
 
 
1
  other_info_dict = {
2
+ "data_description": "We perform the LLM assessment across different trustworthy dimensions such as performance, robustness and bias with the SQuAD2.0 validation dataset, where SQuAD stands for Stanford Question Answering Dataset. The dataset is available at https://rajpurkar.github.io/SQuAD-explorer/. There are 12k points in the dataset. Each data point of the SQuAD2.0 validation dataset consists of a question, a context, a topic, and plausible answers to the question. Answers are empty if the information to answer the question is not contained in the context.",
3
+ "ProbTypos_description" : "Typo perturber adds typing mistakes (Typo) to the input question. Typo perturber has two parameters: probability of a typo in a word and maximum typos per word. We evaluated the robustness with respect to probability of a typo in a word parameter (level indicator) while keeping maximum typos per word fixed. Levels for the line ‘Probability of a typo in a word’ are defined by: level 1 = 10%, level 2 = 30%, level 3 = 50%. Maximum typos per word equals 1 everywhere. ",
4
+ "MaxTypo_description" : "We use the Typo perturber as detailed above, however we evaluated the robustness with respect to maximum typos per word parameter (level indicator) while keeping probability of a typo in a word fixed. Levels for the line ‘Maximum typos per word’ are defined by: level 1 = 1 typo per word, level 2 = 3 typos per word, level 3 = 5 typos per word. Probability of a typo in a word equals 10% everywhere. ",
5
  "ethnicity_categories_text": """
6
  Datapoints are categorized based on specific keywords appearing in the text, with the following list outlining the considered categories and their respective keywords.
7