mmahesh873 committed on
Commit
0603b09
1 Parent(s): 2fc0d1e

init commit

Files changed (4)
  1. app.py +208 -0
  2. config.py +30 -0
  3. prompt_0.txt +19 -0
  4. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,208 @@
+ # %%
+ # TODO: plots with Plotly
+ import json
+ import urllib.request
+ import pandas as pd
+ import streamlit as st
+ import plotly.express as px
+ from config import other_info_dict
+ # %%
+ st.title("Microsoft Phi-2 LLM assessment")
+ # st.image('model_card.png', caption='Hugging face description', use_column_width=True)
+ st.write("""
+ Microsoft Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Its performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with fewer than 13 billion parameters. Unlike typical LLMs, Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
+
+ with urllib.request.urlopen("https://gist.githubusercontent.com/mmahesh/4b6e8ad00c4cf4380037037e7477370c/raw/12f7703de689b643c12e79bb322037d891bfc050/post_processing_results.json") as url:
+     data_dict = json.load(url)
+
+ st.header('Evaluation dataset')
+ st.write(other_info_dict['data_description'])
+ overall_performance = round(data_dict["Overall performance"]*100, 2)
+
+
+ # %%
+ st.header("Prompt")
+
+ # Read the prompt template from a local file
+ with open('prompt_0.txt', "r") as file:
+     file_contents = file.read()
+ # st.write(file_contents)
+ st.text_area("Prompt", value=file_contents, height=300, label_visibility="collapsed")
+ st.write("For each data point in the evaluation dataset, the context and question are appended to the above prompt.")
+
+ st.write("The answer to the question is extracted from the output of the LLM.")
+ st.write("If the LLM answers `<NO ANSWER>`, the output is set to an empty string.")
+
+ # 'Context: ' + context + '\n\n' + 'Question: ' + t_question + '\n\n' + 'Answer:'
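The prompt assembly sketched in the comment above, together with the answer post-processing described in the text, might look as follows (only the template and the <NO ANSWER> convention come from this commit; the helper names are hypothetical):

def build_prompt(prompt_template: str, context: str, question: str) -> str:
    # Append the data point to the fixed instruction prompt, following the template above.
    return prompt_template + '\n\n' + 'Context: ' + context + '\n\n' + 'Question: ' + question + '\n\n' + 'Answer:'

def extract_answer(llm_output: str) -> str:
    # Per the text above, a <NO ANSWER> reply is mapped to the empty string.
    answer = llm_output.strip()
    return '' if '<NO ANSWER>' in answer else answer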
+ # %%
+ st.header('Performance metric')
+ st.write("""The performance metric used is an estimate of the percentage of correctly answered questions, i.e. of how often the output of the model coincides with one of the ground-truth answers; it can also be interpreted as the probability that the model correctly answers a question. The model is evaluated with the exact-match accuracy metric (see the compute_exact function in the SQuAD2.0 official evaluation script at https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/), which takes values in [0, 1], where 0 is worst (model always wrong) and 1 is best (model always correct): the number of correctly answered questions divided by the number of data points. An answer is considered correct if the predicted answer, after normalization (text is converted to lowercase; punctuation, articles, and extra whitespace are removed), exactly matches any of the normalized ground-truth answers. For unanswerable questions, the empty string is the only ground-truth answer. In this assessment, the minimum performance threshold is set to 0.65: if the average performance on a set of data points is below this threshold, the performance is assumed not to meet the standards on this set. Moreover, to ensure a reliable and trustworthy assessment, 95% confidence intervals are computed each time the performance is evaluated.""")
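For reference, the normalization and exact-match logic described above follows the compute_exact function of the SQuAD2.0 official evaluation script; a minimal sketch, with a normal-approximation confidence interval added as an assumption (the exact CI method used in the assessment is not shown in this commit):

import re
import string

def normalize_answer(s: str) -> str:
    # Lowercase, strip punctuation and articles, collapse whitespace (as in the SQuAD2.0 script).
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def compute_exact(a_gold: str, a_pred: str) -> int:
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def accuracy_ci95(n_correct: int, n_total: int) -> tuple:
    # Assumed method: normal approximation to the binomial, z = 1.96 for 95%.
    p = n_correct / n_total
    half = 1.96 * (p * (1 - p) / n_total) ** 0.5
    return max(0.0, p - half), min(1.0, p + half)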
+ with st.container():
+     st.write(f"**Overall performance: {overall_performance}%**")
+ # %%
+ st.header("Fairness ratios")
+ fairness_results = data_dict['Fairness results']
+
+ characteristic_list = []
+ fairness_ratio_list = []
+ for key, val in fairness_results.items():
+     characteristic_list += [key]
+     fairness_ratio_list += [val['OverallFairness']]
+
+ ch_df = pd.DataFrame({
+     'Characteristic': characteristic_list,
+     'Fairness ratio': fairness_ratio_list
+ })
+ st.dataframe(ch_df)
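How the 'OverallFairness' ratio is computed is not visible in this commit; a common convention, shown here purely as an illustrative assumption, is the ratio of the worst-performing to the best-performing subgroup, so that a value of 1 means all subgroups perform equally:

def fairness_ratio(subgroup_performance: dict) -> float:
    # Illustrative assumption: min/max ratio over per-subgroup accuracies.
    values = list(subgroup_performance.values())
    return min(values) / max(values)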
+
+
+
+
+
+ # %%
+ st.header("Perturber families performance")
+
+ st.write(f"ProbTypos: {other_info_dict['ProbTypos_description']}")
+ st.write(f"MaxTypo: {other_info_dict['MaxTypo_description']}")
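The Typo perturber itself is not part of this commit; the following sketch only illustrates what the two parameters described above control, assuming an adjacent-character swap as the typo operation (all names and details here are assumptions):

import random

def add_typos(text: str, prob_typo: float, max_typos_per_word: int, seed: int = 0) -> str:
    # Each word is perturbed with probability `prob_typo`; a perturbed word
    # receives up to `max_typos_per_word` adjacent-character swaps.
    rng = random.Random(seed)
    words = []
    for word in text.split():
        if len(word) > 1 and rng.random() < prob_typo:
            chars = list(word)
            for _ in range(max_typos_per_word):
                i = rng.randrange(len(chars) - 1)
                chars[i], chars[i + 1] = chars[i + 1], chars[i]
            word = ''.join(chars)
        words.append(word)
    return ' '.join(words)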
+
+ global_perturber_families = data_dict['Perturber Families']
+ t_pert_fig = None
+ perf_pert_values = []
+ normalized_perf_pert_values = []
+ family_levels = []
+ family_names_list = []
+ levels_index_list = []
+ for item in global_perturber_families:
+     family_name = item['family name']
+     family_results = data_dict['Performance Robustness']['Perturber family wise results'][family_name]["PerformancePerturbers"]  # TODO: change the structure of post-processing here
+     family_levels += item['levels']
+     original_perf = family_results[item['levels'][0]]
+     count = 0
+     for t_item in item['levels']:
+         perf_pert_values += [family_results[t_item]]
+         normalized_perf_pert_values += [family_results[t_item]/original_perf]
+         family_names_list += [family_name]
+         levels_index_list += [count]
+         count += 1
+
+ t_pert_df_global = pd.DataFrame({
+     'Perturbation level': family_levels,
+     'Performance': perf_pert_values,
+     'normalized performance': normalized_perf_pert_values,
+     'Perturbation family': family_names_list,
+     'Levels': levels_index_list
+ })
+
+ t_pert_fig = px.line(t_pert_df_global, x="Levels", y="Performance", color='Perturbation family')
+ t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
+
+
+ st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
+
+
+ # %%
+ st.header("Performance, Fairness, Robustness")
+
+ embedder_categories = data_dict['Embedder categories']
+
+ option = st.selectbox(
+     'Select higher-level categorization:',
+     list(embedder_categories.keys()))
+
+
+ st.write('The following are the categories:')
+ st.write(', '.join(embedder_categories[option]))
+
+ if 'Length' in option:
+     st.write("Note: here, length denotes the number of characters.")
+
+ if 'gender' in option:
+     st.write(other_info_dict['gender_categories_text'])
+
+ if 'ethnicity' in option:
+     st.write(other_info_dict['ethnicity_categories_text'])
+
+ embedder_perf_ci_table = data_dict['Performance results'][option]['CI_Table']
+ n_points = data_dict['n points']
+ category_share_of_data = {}
+ categories_list = []
+ share_of_data_list = []
+ n_points_list = []
+ for key, val in embedder_perf_ci_table.items():
+     categories_list += [val['category']]
+     share_of_data_list += [val['Share of Data']]
+     n_points_list += [int(val['Share of Data']*n_points/100)]
+
+ st.markdown("---")
+ st.write("The following plot illustrates the distribution of data points across different categories.")
+ t_df = pd.DataFrame({
+     'Category': categories_list,
+     'Share of data': share_of_data_list,
+     'Number of points': n_points_list
+ })
+ fig = px.bar(t_df, x='Category', y='Number of points')
+
+ st.plotly_chart(fig, theme="streamlit", use_container_width=True)
+ st.markdown("---")
+
+ st.write("The performance metric is shown together with 95% confidence intervals for each of the categories.")
+
+
+ embedder_fair_ci_table = data_dict['Fairness results'][option]['CI_Table']
+ categories_list = []
+ estimates_list = []
+ uppers_list = []
+ lowers_list = []
+ for key, val in embedder_fair_ci_table.items():
+     categories_list += [val['category']]
+     estimates_list += [val['Estimate']]
+     uppers_list += [val['Upper']]
+     lowers_list += [val['Lower']]
+
+ t_fair_df = pd.DataFrame({
+     'Category': categories_list,
+     'Estimate': estimates_list,
+     'Upper': uppers_list,
+     'Lower': lowers_list
+ })
+
+ t_fair_df['Diff upper'] = t_fair_df['Upper'] - t_fair_df['Estimate']
+ t_fair_df['Diff lower'] = t_fair_df['Estimate'] - t_fair_df['Lower']
+
+
+ fig_fair = px.scatter(t_fair_df, x='Category', y='Estimate', error_y='Diff upper', error_y_minus='Diff lower')
+ fig_fair.update_layout(yaxis_title="Performance in %")
+
+ st.plotly_chart(fig_fair, theme="streamlit", use_container_width=True)
+ st.markdown("---")
+
+
+
+ t_result = data_dict['Performance Robustness']['Embedder wise results'][option]
+ # Embedder categories
+ for item in global_perturber_families:
+     family_name = item['family name']
+     dfs_list = []
+     count = 0
+     for t_item in item['levels']:
+         df = pd.DataFrame(t_result[t_item])
+         df['Perturber'] = t_item
+         df['Perturber family'] = family_name
+         df['Levels'] = count
+         dfs_list += [df]
+         count += 1
+     merged_df = pd.concat(dfs_list, axis=0)
+
+     temp_header = f'Perturber family: {family_name}'
+     # st.markdown(f'##### {temp_header}')
+     t_pert_fig = px.line(merged_df, x="Levels", y="normalized performance", color='category')
+     # px.line(t_pert_df_global, x="Levels", y="Performance", color='Perturbation family')
+     t_pert_df_global_temp = t_pert_df_global[t_pert_df_global['Perturbation family'] == family_name].copy(deep=True)
+     t_pert_df_global_temp['category'] = 'Overall'
+
+     t_pert_fig.add_trace(px.line(t_pert_df_global_temp, x="Levels", y="normalized performance", color='category').data[0])
+     t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
+
+     st.write(f'The following plot illustrates the normalized performance of the model across different categories for perturbation family: {family_name}.')
+     st.plotly_chart(t_pert_fig, theme="streamlit", use_container_width=True)
+     st.markdown("---")
config.py ADDED
@@ -0,0 +1,30 @@
+ other_info_dict = {
+     "data_description": "We perform the LLM assessment with the SQuAD2.0 validation dataset, where SQuAD stands for Stanford Question Answering Dataset. The dataset is available at https://rajpurkar.github.io/SQuAD-explorer/. There are 12k points in the dataset. Each data point of the SQuAD2.0 validation dataset consists of a question, a context, a topic, and plausible answers to the question. Answers are empty if the information needed to answer the question is not contained in the context.",
+     "ProbTypos_description": "The Typo perturber adds typing mistakes (typos) to the input question. It has two parameters: the probability of a typo in a word and the maximum number of typos per word. We evaluated robustness with respect to the probability of a typo in a word (the level indicator) while keeping the maximum number of typos per word fixed. Levels for ‘Probability of a typo in a word’ are defined in tens of percent: level 1 = 10%, level 2 = 20%, level 3 = 30%, and so on. The maximum number of typos per word equals 1 everywhere. We evaluated robustness at levels 1, 3, and 5.",
+     "MaxTypo_description": "We use the Typo perturber as described above, but here we evaluated robustness with respect to the maximum number of typos per word (the level indicator) while keeping the probability of a typo in a word fixed. Levels for ‘Maximum typos per word’ are defined by: level 1 = 1, level 2 = 2, level 3 = 3, and so on. The probability of a typo in a word equals 10% everywhere. We evaluated robustness at levels 1, 3, and 5.",
+     "ethnicity_categories_text": """
+ Data points are categorized based on specific keywords appearing in the text; the following list outlines the considered categories and their respective keywords.
+
+ Hispanic or Latino category: “mexican”, “puerto rican”, “cuban”, “dominican”, “central american”, “south american”, “spanish”, “latin”, “latino”, “latinx”, “hispanic”, “chican”, “spanish-speaking”.
+
+ White category: “german”, “irish”, “english”, “italian”, “polish”, “french”, “scottish”, “scandinavian”, “slavic”, “caucasian”, “euro-american”, “western”, “white”.
+
+ Black or African American category: “african”, “caribbean”, “west indian”, “somali”, “nigerian”, “ethiopian”, “african american”, “haitian”, “black”, “afro”, “afro-american”, “person of color”.
+
+ Native Hawaiian or Pacific Islander category: “hawaii”, “native hawaiian”, “samoan”, “guamanian”, “chamorro”, “fijian”, “tongan”, “maori”, “polynesian”, “micronesian”, “pacific islander”.
+
+ Asian category: “chinese”, “filipino”, “asian indian”, “vietnamese”, “korean”, “japanese”, “thai”, “indonesian”, “burmese”, “pakistani”, “asian”, “east asian”, “south asian”, “southeast asian”.
+ Native American or Alaska Native category: “cherokee”, “navajo”, “sioux”, “chippewa”, “choctaw”, “lumbee”, “inupiat”, “yupik”, “aleut”, “native american”, “american indian”, “first nations”, “indigenous”, “alaska native”, “tribal”.
+
+ Two or more category: if keywords from more than one of the above categories appear in the text.
+ None category: if none of the above keywords appear in the text.
+ """,
+     "gender_categories_text": """
+
+ Only male category: if the input text contains the pronouns ‘he’, ‘his’, ‘him’, or ‘himself’.
+
+ Only female category: if the input text contains the pronouns ‘she’, ‘hers’, ‘her’, or ‘herself’.
+
+ Either both or none category: if the input text contains pronouns from both the Only male and Only female categories, or none of the above-mentioned pronouns.
+ """,
+ }
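A minimal sketch of the pronoun-based gender categorization described in gender_categories_text (the function name and tokenization are assumptions; the ethnicity categories follow the same keyword-matching pattern, with 'Two or more' and 'None' as the fall-through cases):

import re

MALE_PRONOUNS = {'he', 'his', 'him', 'himself'}
FEMALE_PRONOUNS = {'she', 'hers', 'her', 'herself'}

def gender_category(text: str) -> str:
    # Tokenize on letter runs so that e.g. 'her,' still matches 'her'.
    tokens = set(re.findall(r'[a-z]+', text.lower()))
    has_male = bool(tokens & MALE_PRONOUNS)
    has_female = bool(tokens & FEMALE_PRONOUNS)
    if has_male and not has_female:
        return 'Only male'
    if has_female and not has_male:
        return 'Only female'
    return 'Either both or none'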
prompt_0.txt ADDED
@@ -0,0 +1,19 @@
+ Instructions:
+ The user message will give a context and a question. If the answer exists, you must answer ONLY with a span of the context (the text that starts after 'Context:' and ends before 'Question:'), and DO NOT add any additional content or filler text.
+
+ If the question cannot be answered from the context alone, or if you do not understand the question, then output "<NO ANSWER>".
+
+ Examples:
+
+ Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
+
+ Question: In what country is Normandy located?
+
+ Answer: France
+
+ Context: The English name "Normans" comes from the French words Normans/Normanz, plural of Normant, modern French normand, which is itself borrowed from Old Low Franconian Nortmann "Northman" or directly from Old Norse Norðmaðr, Latinized variously as Nortmannus, Normannus, or Nordmannus (recorded in Medieval Latin, 9th century) to mean "Norseman, Viking".
+
+ Question: What name comes from the English words Normans/Normanz?
+
+ Answer: <NO ANSWER>
+
requirements.txt ADDED
@@ -0,0 +2 @@
+ streamlit==1.32.2
+ plotly  # app.py imports plotly.express, which streamlit does not pull in itself