mmahesh873 committed on
Commit
ad2544a
1 Parent(s): c54a69d

added new updates

Files changed (2)
  1. app.py +21 -16
  2. config.py +3 -3
app.py CHANGED
@@ -9,11 +9,11 @@ from config import other_info_dict
9
  st.title("Microsoft Phi-2 LLM assessment")
10
  # st.image('model_card.png', caption='Hugging face description', use_column_width=True)
11
  st.write("""
12
- Microsoft Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical LLM models, Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
13
 
14
  import urllib.request
15
  import os
16
- result_file = os.environ.getattribute("POST_PROCESSING_JSON")
17
 
18
  with urllib.request.urlopen(result_file) as url:
19
  data_dict = json.load(url)
@@ -25,26 +25,24 @@ overall_performance = round(data_dict["Overall performance"]*100, 2)
25
 
26
  # %%
27
  st.header("Prompt")
28
-
29
  # File uploader
30
  with open('prompt_0.txt', "r") as file:
31
  file_contents = file.read()
32
  # st.write(file_contents)
33
- st.text_area("", value=file_contents, height=300)
34
- st.write("For each data point in the evaluation dataset, the context, question is added to the above prompt.")
35
-
36
- st.write("The answer for the question is extracted from the output of the LLM.")
37
- st.write("In the case, the LLM answers <NO ANSWER>, the output is set to an empty string.")
38
 
39
  # 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'
40
  # %%
41
  st.header('Performance metric')
42
- st.write(""" The performance metric used is an estimation of the percentage of correctly answered questions, i.e. the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see compute_exact function in SQuAD2.0 official evaluation script at https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/), taking values in [0,1], where 0 is worst (model always wrong), and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered to be correctly answered (by the model), if the predicted answer after normalization (text is converted to lowercase, and punctuation, articles and extra whitespace are removed) matches exactly with any of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer. In this assessment, the minimum performance threshold is set to 0.65. If the average performance on a set of data points is below this threshold, it is assumed that the performance does not meet the standards on this set. Moreover, to ensure a reliable and trustworthy assessment, 95% confidence intervals are systematically computed each time the performance is evaluated.""")
43
  with st.container():
44
  st.write(f"**Overall performance: {overall_performance}%**")
45
  # %%
46
  st.header("Bias ratios")
47
- st.write('Bias ratio is defined as the ratio of the highest performance to the lowest performance among reliable categories for a characteristic.')
48
  fairness_results = data_dict['Fairness results']
49
 
50
  characteristic_list = []
@@ -64,10 +62,17 @@ st.dataframe(ch_df)
64
 
65
 
66
  # %%
67
- st.header("Perturber families performance")
 
 
 
 
 
 
 
68
 
69
- st.write(f"ProbTypos: {other_info_dict['ProbTypos_description']}")
70
- st.write(f"MaxTypo: {other_info_dict['MaxTypo_description']}")
71
 
72
  global_perturber_families = data_dict['Perturber Families']
73
  t_pert_fig = None
@@ -105,12 +110,12 @@ st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
105
 
106
 
107
  # %%
108
- st.header("Performance, Fairness, Robustness")
109
 
110
  embedder_categories = data_dict['Embedder categories']
111
 
112
  option = st.selectbox(
113
- 'Select higher-level categorization/characteristic:',
114
  list(embedder_categories.keys()))
115
 
116
 
@@ -209,6 +214,6 @@ for item in global_perturber_families:
209
  t_pert_fig.add_trace(px.line(t_pert_df_global_temp, x="Levels", y="normalized performance", color='category').data[0])
210
  t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
211
 
212
- st.write(f'The following plot illustrates the normalized performance of the model across different categories for perturbation family: {family_name}.')
213
  st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
214
  st.markdown("---")
 
9
  st.title("Microsoft Phi-2 LLM assessment")
10
  # st.image('model_card.png', caption='Hugging face description', use_column_width=True)
11
  st.write("""
12
+ Microsoft Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical Large Language Models (LLMs), Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
13
 
14
  import urllib.request
15
  import os
16
+ result_file = os.environ["POST_PROCESSING_JSON"]
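For context on the change above: os.environ has no getattribute method, so the removed line would fail with AttributeError at runtime, and dictionary-style access is the supported API. A slightly more defensive variant (an illustration, not what the commit does) could look like:

    # os.environ behaves like a dict; .get() avoids a KeyError when the variable is unset.
    result_file = os.environ.get("POST_PROCESSING_JSON")
    if result_file is None:
        raise RuntimeError("POST_PROCESSING_JSON is not set")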
17
 
18
  with urllib.request.urlopen(result_file) as url:
19
  data_dict = json.load(url)
 
25
 
26
  # %%
27
  st.header("Prompt")
28
+ st.write("For each data point in the evaluation dataset, we create a prompt for LLM by adding the context and the question to the below prompt template, while following the same structure of the prompt template.")
29
  # File uploader
30
  with open('prompt_0.txt', "r") as file:
31
  file_contents = file.read()
32
  # st.write(file_contents)
33
+ st.text_area("Prompt template:", value=file_contents, height=300)
34
+ st.write("The answer to the question is obtained by post-processing the output of the LLM, wherein any additional content starting from the first 'Context: ' is disregarded.")
35
+ st.write("In the case that the LLM answers <NO ANSWER>, the output is set to an empty string.")
 
 
36
 
37
  # 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'
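A minimal, self-contained sketch of the prompt construction described above, assuming (per the comment) that the context and question are appended to the template in the 'Context / Question / Answer' format; the sample values are illustrative only:

    with open('prompt_0.txt', 'r') as file:
        prompt_template = file.read()
    context = "Phi-2 is a Transformer model with 2.7 billion parameters."  # illustrative value
    t_question = "How many parameters does Phi-2 have?"                    # illustrative value
    # Append the data point to the template in the commented format above.
    full_prompt = prompt_template + '\n\n' + 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'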
38
  # %%
39
  st.header('Performance metric')
40
+ st.write("""The performance metric used is an estimation of the percentage of correctly answered questions, i.e. the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see compute_exact function in SQuAD2.0 official evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong), and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered to be correctly answered (by the model), if the predicted answer after normalization (text is converted to lowercase, and punctuation, articles and extra whitespace are removed) matches exactly with any of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer.""")
41
  with st.container():
42
  st.write(f"**Overall performance: {overall_performance}%**")
43
  # %%
44
  st.header("Bias ratios")
45
+ st.write('Bias ratio is defined as the ratio of the lowest performance to the highest performance among categories that have sufficient data (more than 50 data points) for a characteristic. The following table shows the bias ratio for each of the considered characteristics.')
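A minimal sketch of the bias ratio computation defined above (function and variable names are illustrative, not taken from the app):

    def bias_ratio(performance_by_category, count_by_category, min_points=50):
        # Keep only categories with sufficient data, then divide the lowest
        # performance by the highest one.
        reliable = [perf for cat, perf in performance_by_category.items()
                    if count_by_category[cat] > min_points]
        return min(reliable) / max(reliable)

    # Example: performances {'A': 0.70, 'B': 0.63} with enough data in both categories give 0.63 / 0.70 = 0.9.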
46
  fairness_results = data_dict['Fairness results']
47
 
48
  characteristic_list = []
 
62
 
63
 
64
  # %%
65
+ st.header("Robustness")
66
+
67
+ st.write(f"""We evaluate the robustness of the LLM by assessing the variation in performance when perturbations are introduced to the content outside of the prompt template. The following plot shows the performance across different levels of perturbation within a perturbation family that consists of a series of perturbation methods. We consider the following perturbation families.
68
+
69
+ - ProbTypos: {other_info_dict['ProbTypos_description']}
70
+
71
+ - MaxTypo: {other_info_dict['MaxTypo_description']}
72
+ """)
73
 
74
+ # st.write(f"ProbTypos: {other_info_dict['ProbTypos_description']}")
75
+ # st.write(f"MaxTypo: {other_info_dict['MaxTypo_description']}")
76
 
77
  global_perturber_families = data_dict['Perturber Families']
78
  t_pert_fig = None
 
110
 
111
 
112
  # %%
113
+ st.header("Characteristic results")
114
 
115
  embedder_categories = data_dict['Embedder categories']
116
 
117
  option = st.selectbox(
118
+ 'Select characteristic:',
119
  list(embedder_categories.keys()))
120
 
121
 
 
214
  t_pert_fig.add_trace(px.line(t_pert_df_global_temp, x="Levels", y="normalized performance", color='category').data[0])
215
  t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
216
 
217
+ st.write(f'The following plot illustrates the normalized performance of the model across different categories for the perturbation family: {family_name}.')
218
  st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
219
  st.markdown("---")
config.py CHANGED
@@ -1,7 +1,7 @@
1
  other_info_dict = {
2
- "data_description": "We perform the LLM assessment with the SQuAD2.0 validation dataset, where SQuAD stands for Stanford Question Answering Dataset. The dataset is available at https://rajpurkar.github.io/SQuAD-explorer/. There are 12k points in the dataset. Each data point of the SQuAD2.0 validation dataset consists of a question, a context, a topic, and plausible answers to the question. Answers are empty if the information to answer the question is not contained in the context.",
3
- "ProbTypos_description" : "Typo perturber adds typing mistakes (Typo) to the input question. Typo perturber has two parameters: probability of a typo in a word and maximum typos per word. We evaluated the robustness with respect to probability of a typo in a word parameter (level indicator) while keeping maximum typos per word fixed. Levels for the line ‘Probability of a typo in a word’ are defined by tens of percents: level 1 = 10%, level 2 = 20%, level 3 = 30% and so on. Maximum typos per word equals 1 everywhere. We evaluated the robustness on a level 1, 3, 5.",
4
- "MaxTypo_description" : "We use the Typo perturber as detailed above, however we evaluated the robustness with respect to maximum typos per word parameter (level indicator) while keeping probability of a typo in a word fixed. Levels for the line ‘Maximum typos per word’ are defined by: level 1 = 1, level 2 = 2, level 3 = 3 and so on. Probability of a typo in a word equals 10% everywhere. We evaluated the robustness on a level 1, 3, 5.",
5
  "ethnicity_categories_text": """
6
  Datapoints are categorized based on specific keywords appearing in the text, with the following list outlining the considered categories and their respective keywords.
7
 
 
1
  other_info_dict = {
2
+ "data_description": "We perform the LLM assessment across different trustworthy dimensions such as performance, robustness and bias with the SQuAD2.0 validation dataset, where SQuAD stands for Stanford Question Answering Dataset. The dataset is available at https://rajpurkar.github.io/SQuAD-explorer/. There are 12k points in the dataset. Each data point of the SQuAD2.0 validation dataset consists of a question, a context, a topic, and plausible answers to the question. Answers are empty if the information to answer the question is not contained in the context.",
3
+ "ProbTypos_description" : "Typo perturber adds typing mistakes (Typo) to the input question. Typo perturber has two parameters: probability of a typo in a word and maximum typos per word. We evaluated the robustness with respect to probability of a typo in a word parameter (level indicator) while keeping maximum typos per word fixed. Levels for the line ‘Probability of a typo in a word’ are defined by: level 1 = 10%, level 2 = 30%, level 3 = 50%. Maximum typos per word equals 1 everywhere. ",
4
+ "MaxTypo_description" : "We use the Typo perturber as detailed above, however we evaluated the robustness with respect to maximum typos per word parameter (level indicator) while keeping probability of a typo in a word fixed. Levels for the line ‘Maximum typos per word’ are defined by: level 1 = 1 typo per word, level 2 = 3 typos per word, level 3 = 5 typos per word. Probability of a typo in a word equals 10% everywhere. ",
5
  "ethnicity_categories_text": """
6
  Datapoints are categorized based on specific keywords appearing in the text, with the following list outlining the considered categories and their respective keywords.
7