Commit ad2544a
Parent: c54a69d
added new updates
app.py
CHANGED

@@ -9,11 +9,11 @@ from config import other_info_dict
 st.title("Microsoft Phi-2 LLM assessment")
 # st.image('model_card.png', caption='Hugging face description', use_column_width=True)
 st.write("""
-Microsoft Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical LLM…
+Microsoft Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with fewer than 13 billion parameters. Unlike typical Large Language Models (LLMs), Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
 
 import urllib.request
 import os
-result_file = os.environ…
+result_file = os.environ["POST_PROCESSING_JSON"]
 
 with urllib.request.urlopen(result_file) as url:
     data_dict = json.load(url)
@@ -25,26 +25,24 @@ overall_performance = round(data_dict["Overall performance"]*100, 2)
 
 # %%
 st.header("Prompt")
-
+st.write("For each data point in the evaluation dataset, we create a prompt for the LLM by inserting the context and the question into the prompt template below, following the template's structure.")
 # File uploader
 with open('prompt_0.txt', "r") as file:
     file_contents = file.read()
 # st.write(file_contents)
-st.text_area("", value=file_contents, height=300)
-st.write("…
-
-st.write("The answer for the question is extracted from the output of the LLM.")
-st.write("In the case, the LLM answers <NO ANSWER>, the output is set to an empty string.")
+st.text_area("Prompt template:", value=file_contents, height=300)
+st.write("The answer to the question is obtained by post-processing the output of the LLM, wherein any additional content starting from the first 'Context: ' is disregarded.")
+st.write("If the LLM answers <NO ANSWER>, the output is set to an empty string.")
 
 # 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'
 # %%
 st.header('Performance metric')
-st.write("""…
+st.write("""The performance metric used is an estimate of the percentage of correctly answered questions, i.e., cases where the output of the model coincides with one of the ground-truth answers. It can also be interpreted as the probability that the model correctly answers a question. The model is evaluated with the exact-match accuracy metric (see the compute_exact function in the SQuAD2.0 official evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong) and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. A question is considered correctly answered if the predicted answer, after normalization (text is converted to lowercase; punctuation, articles, and extra whitespace are removed), exactly matches any of the normalized ground-truth answers. For unanswerable questions, the empty string is considered the only ground-truth answer.""")
 with st.container():
     st.write(f"**Overall performance: {overall_performance}%**")
 # %%
 st.header("Bias ratios")
-st.write('Bias ratio is defined as the ratio of the…
+st.write('Bias ratio is defined as the ratio of the lowest performance to the highest performance among categories that have sufficient data (more than 50 data points) for a characteristic. The following table shows the bias ratio for each of the considered characteristics.')
 fairness_results = data_dict['Fairness results']
 
 characteristic_list = []
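The post-processing and scoring described in this hunk reduce to a short sketch. The function names below are illustrative rather than the Space's actual code; only the normalization steps (lowercase; strip punctuation, articles, and extra whitespace) follow the official SQuAD2.0 evaluation script:

```python
import re
import string

def postprocess_output(raw_output: str) -> str:
    """Truncate the raw LLM output at the first 'Context: ' and
    map <NO ANSWER> to the empty string, as described above."""
    answer = raw_output.split("Context: ")[0].strip()
    return "" if "<NO ANSWER>" in answer else answer

def normalize_answer(s: str) -> str:
    """SQuAD-style normalization: lowercase, remove punctuation,
    articles, and extra whitespace."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in string.punctuation)
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def exact_match(prediction: str, ground_truths: list[str]) -> int:
    """1 if the normalized prediction equals any normalized ground truth;
    unanswerable questions carry [''] as their only ground truth."""
    pred = normalize_answer(prediction)
    return int(any(pred == normalize_answer(gt) for gt in ground_truths))
```

The overall performance shown above is then the mean of exact_match over the dataset, scaled to a percentage.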
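Likewise, the bias ratio defined in the new text is a minimal computation; the data layout below is an assumption, since the structure of data_dict['Fairness results'] is not visible in this diff:

```python
def bias_ratio(categories: dict) -> float:
    """Lowest performance divided by highest performance among the
    categories of one characteristic with more than 50 data points."""
    perf = [c["performance"] for c in categories.values() if c["count"] > 50]
    return min(perf) / max(perf)

# Illustrative input for a single characteristic:
stats = {
    "category A": {"performance": 0.78, "count": 812},
    "category B": {"performance": 0.81, "count": 997},
    "category C": {"performance": 0.40, "count": 12},  # excluded: too few points
}
print(round(bias_ratio(stats), 3))  # 0.78 / 0.81 -> 0.963
```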
@@ -64,10 +62,17 @@ st.dataframe(ch_df)
 
 
 # %%
-st.header("…
+st.header("Robustness")
+
+st.write(f"""We evaluate the robustness of the LLM by assessing the variation in performance when perturbations are introduced to the content outside of the prompt template. The following plot shows the performance across different levels of perturbation within a perturbation family that consists of a series of perturbation methods. We consider the following perturbation families.
+
+- ProbTypos: {other_info_dict['ProbTypos_description']}
+
+- MaxTypo: {other_info_dict['MaxTypo_description']}
+""")
 
-st.write(f"ProbTypos: {other_info_dict['ProbTypos_description']}")
-st.write(f"MaxTypo: {other_info_dict['MaxTypo_description']}")
+# st.write(f"ProbTypos: {other_info_dict['ProbTypos_description']}")
+# st.write(f"MaxTypo: {other_info_dict['MaxTypo_description']}")
 
 global_perturber_families = data_dict['Perturber Families']
 t_pert_fig = None
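This hunk only names the two Typo-perturber parameters; the diff never shows how a typo is injected. A stand-in perturber, assuming a typo is an adjacent-character swap, could look like this:

```python
import random

def typo_perturb(question, p_typo=0.1, max_typos=1, seed=None):
    """With probability p_typo per word, apply 1..max_typos typos to the
    word; a 'typo' here is a swap of two neighbouring characters."""
    rng = random.Random(seed)
    words = question.split()
    for i, word in enumerate(words):
        if len(word) > 1 and rng.random() < p_typo:
            chars = list(word)
            for _ in range(rng.randint(1, max_typos)):
                j = rng.randrange(len(chars) - 1)
                chars[j], chars[j + 1] = chars[j + 1], chars[j]
            words[i] = "".join(chars)
    return " ".join(words)

# ProbTypos level 2 (per config.py): 30% typo probability, at most 1 typo per word.
print(typo_perturb("When was the tower built?", p_typo=0.3, max_typos=1, seed=0))
```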
@@ -105,12 +110,12 @@ st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
 
 
 # %%
-st.header("…
+st.header("Characteristic results")
 
 embedder_categories = data_dict['Embedder categories']
 
 option = st.selectbox(
-    'Select…
+    'Select characteristic:',
     list(embedder_categories.keys()))
 
 
@@ -209,6 +214,6 @@ for item in global_perturber_families:
     t_pert_fig.add_trace(px.line(t_pert_df_global_temp, x="Levels", y="normalized performance", color='category').data[0])
     t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
 
-    st.write(f'The following plot illustrates the normalized performance of the model across different categories for perturbation family: {family_name}.')
+    st.write(f'The following plot illustrates the normalized performance of the model across different categories for the perturbation family: {family_name}.')
     st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
     st.markdown("---")
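The y-axis label "normalized performance" is not defined anywhere in the visible code. One plausible reading, dividing each level's performance by the unperturbed level-0 performance, reproduces the plotting calls from this hunk; the numbers are invented for illustration:

```python
import pandas as pd
import plotly.express as px

# Hypothetical per-level performance for one perturber family;
# index 0 is the unperturbed baseline.
raw = {"ProbTypos": [0.81, 0.74, 0.69, 0.62],
       "MaxTypo":   [0.81, 0.74, 0.70, 0.66]}

rows = [{"category": cat, "Levels": lvl,
         "normalized performance": perf / series[0]}
        for cat, series in raw.items()
        for lvl, perf in enumerate(series)]

t_pert_df = pd.DataFrame(rows)
t_pert_fig = px.line(t_pert_df, x="Levels", y="normalized performance",
                     color="category")
t_pert_fig.update_xaxes(tickmode="linear", dtick=1)
```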
config.py
CHANGED

@@ -1,7 +1,7 @@
 other_info_dict = {
-    "data_description": "We perform the LLM assessment with the SQuAD2.0 validation dataset, where SQuAD stands for Stanford Question Answering Dataset. The dataset is available at https://rajpurkar.github.io/SQuAD-explorer/. There are 12k points in the dataset. Each data point of the SQuAD2.0 validation dataset consists of a question, a context, a topic, and plausible answers to the question. Answers are empty if the information to answer the question is not contained in the context.",
-    "ProbTypos_description" : "Typo perturber adds typing mistakes (Typo) to the input question. Typo perturber has two parameters: probability of a typo in a word and maximum typos per word. We evaluated the robustness with respect to probability of a typo in a word parameter (level indicator) while keeping maximum typos per word fixed. Levels for the line ‘Probability of a typo in a word’ are defined by…
-    "MaxTypo_description" : "We use the Typo perturber as detailed above, however we evaluated the robustness with respect to maximum typos per word parameter (level indicator) while keeping probability of a typo in a word fixed. Levels for the line ‘Maximum typos per word’ are defined by: level 1 = 1, level 2 =…
+    "data_description": "We perform the LLM assessment across different trustworthiness dimensions, such as performance, robustness, and bias, with the SQuAD2.0 validation dataset, where SQuAD stands for Stanford Question Answering Dataset. The dataset is available at https://rajpurkar.github.io/SQuAD-explorer/. There are 12k points in the dataset. Each data point consists of a question, a context, a topic, and plausible answers to the question. Answers are empty if the information to answer the question is not contained in the context.",
+    "ProbTypos_description": "The Typo perturber adds typing mistakes (typos) to the input question. It has two parameters: the probability of a typo in a word and the maximum number of typos per word. We evaluated robustness with respect to the probability of a typo in a word (the level indicator) while keeping the maximum typos per word fixed. Levels for the line ‘Probability of a typo in a word’ are defined by: level 1 = 10%, level 2 = 30%, level 3 = 50%. The maximum number of typos per word equals 1 everywhere.",
+    "MaxTypo_description": "We use the Typo perturber as detailed above; however, we evaluated robustness with respect to the maximum typos per word parameter (the level indicator) while keeping the probability of a typo in a word fixed. Levels for the line ‘Maximum typos per word’ are defined by: level 1 = 1 typo per word, level 2 = 3 typos per word, level 3 = 5 typos per word. The probability of a typo in a word equals 10% everywhere.",
     "ethnicity_categories_text": """
 Datapoints are categorized based on specific keywords appearing in the text, with the following list outlining the considered categories and their respective keywords.
 
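The unchanged ethnicity_categories_text context describes keyword-based categorization; a sketch of that mechanism, with placeholder categories and keywords (the real lists live in config.py but are not shown in this diff):

```python
# Placeholder keyword map; the actual categories and keywords are in config.py.
category_keywords = {
    "category A": ["keyword1", "keyword2"],
    "category B": ["keyword3"],
}

def categorize(text, keyword_map):
    """Assign a data point to every category whose keywords occur in its text."""
    lowered = text.lower()
    return [cat for cat, kws in keyword_map.items()
            if any(kw.lower() in lowered for kw in kws)]

print(categorize("... keyword3 appears in this context ...", category_keywords))
# -> ['category B']
```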