mmahesh873 commited on
Commit
be11b4e
1 Parent(s): 86bd91c

first draft of comparison across models and prompts

Browse files
Files changed (1) hide show
  1. app.py +39 -27
app.py CHANGED
@@ -13,24 +13,33 @@ st.title("LLM assessments: Microsoft's Phi-2 and Google's Gemma-7b")
13
  import urllib.request
14
  import os
15
 
16
- model_select = st.selectbox(
17
- 'Select the model:',
18
- [
19
  "Microsoft's Phi-2",
20
  "Google's Gemma-7b"
21
- ])
22
 
23
- if model_select == "Microsoft's Phi-2":
24
- prefix_post_processing = os.environ["POST_PROCESSING_JSON"]
25
- st.write("""
26
- Microsoft's Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical Large Language Models (LLM), Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
27
 
28
- else:
29
- prefix_post_processing = os.environ["POST_PROCESSING_JSON_GEMMA"]
30
- st.write("""
31
- Google Gemma-7b (https://huggingface.co/google/gemma-7b) is a Large Language Models (LLM) with 8.54 billion parameters. As per the https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf, Gemma-7b performs well in language understanding, reasoning, and safety tasks. This model is one of state of the art open models built based on similar technologies that were used to create Google's Gemini models.""")
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  # prefix_post_processing = os.environ["POST_PROCESSING_JSON"]
@@ -76,30 +85,33 @@ for t_opt in options:
76
  result_processor_obj_dict = {}
77
  result_file_dict = {}
78
  data_dicts_dict = {}
79
- for t_result_file in options:
80
- result_file = prefix_post_processing + prompt_options_dict[t_result_file]
 
 
81
 
82
- prompt_option = int(prompt_options_dict[t_result_file].split('_')[-1].split('.')[0])
83
 
84
- with urllib.request.urlopen(result_file) as url:
85
- data_dict = json.load(url)
86
 
87
-
88
 
89
- result_processor_obj_dict[t_result_file] = ResultsProcessor(
90
- prompt_option=prompt_option,
91
- result_file=result_file,
92
- data_dict= data_dict
93
- )
94
- data_dicts_dict[t_result_file] = data_dict
95
- result_file_dict[t_result_file] = result_file
96
 
 
97
 
98
  print(result_processor_obj_dict)
99
 
100
 
101
 
102
-
103
 
104
  # t_result_file = st.selectbox(
105
  # 'Select the prompt:',
@@ -349,7 +361,7 @@ temp_options_2 = st.multiselect(
349
 
350
  st.markdown("---")
351
  for t_opt in temp_options_2:
352
- results_pert_rob_dict = result_processor_obj_dict[t_result_file].get_performance_robustness(option)
353
  merged_dfs_list = results_pert_rob_dict['merged_dfs_list']
354
  t_pert_df_global_temps_list = results_pert_rob_dict['t_pert_df_global_temps_list']
355
  family_names_list = results_pert_rob_dict['family_names_list']
 
13
  import urllib.request
14
  import os
15
 
16
+ llm_options = [
 
 
17
  "Microsoft's Phi-2",
18
  "Google's Gemma-7b"
19
+ ]
20
 
21
+ model_select_options = st.multiselect(
22
+ 'Select one or more models:',
23
+ llm_options,
24
+ [llm_options[0]])
25
 
 
 
 
 
26
 
27
 
28
+ prefix_post_processing_dict = {}
29
+ st.markdown("---")
30
+ for t_opt in model_select_options:
31
+ st.write(t_opt)
32
+ if t_opt == "Microsoft's Phi-2":
33
+ prefix_post_processing_dict[t_opt] = os.environ["POST_PROCESSING_JSON"]
34
+ st.write("""
35
+ Microsoft's Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical Large Language Models (LLMs), Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
36
+
37
+ else:
38
+ prefix_post_processing_dict[t_opt] = os.environ["POST_PROCESSING_JSON_GEMMA"]
39
+ st.write("""
40
+ Google Gemma-7b (https://huggingface.co/google/gemma-7b) is a Large Language Model (LLM) with 8.54 billion parameters. As per the https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf, Gemma-7b performs well in language understanding, reasoning, and safety tasks. This model is one of the state-of-the-art open models, built on technologies similar to those used to create Google's Gemini models.""")
41
+ st.markdown("---")
42
+
43
 
44
 
45
  # prefix_post_processing = os.environ["POST_PROCESSING_JSON"]
 
85
  result_processor_obj_dict = {}
86
  result_file_dict = {}
87
  data_dicts_dict = {}
88
+ main_options = []
89
+ for model_option in model_select_options:
90
+ for t_result_file in options:
91
+ result_file = prefix_post_processing_dict[model_option] + prompt_options_dict[t_result_file]
92
 
93
+ prompt_option = int(prompt_options_dict[t_result_file].split('_')[-1].split('.')[0])
94
 
95
+ with urllib.request.urlopen(result_file) as url:
96
+ data_dict = json.load(url)
97
 
98
+
99
 
100
+ result_processor_obj_dict[model_option + ' with ' + t_result_file] = ResultsProcessor(
101
+ prompt_option=prompt_option,
102
+ result_file=result_file,
103
+ data_dict= data_dict
104
+ )
105
+ data_dicts_dict[model_option + ' with ' + t_result_file] = data_dict
106
+ result_file_dict[model_option + ' with ' + t_result_file] = result_file
107
 
108
+ main_options += [model_option + ' with ' + t_result_file]
109
 
110
  print(result_processor_obj_dict)
111
 
112
 
113
 
114
+ options = main_options
115
 
116
  # t_result_file = st.selectbox(
117
  # 'Select the prompt:',
 
361
 
362
  st.markdown("---")
363
  for t_opt in temp_options_2:
364
+ results_pert_rob_dict = result_processor_obj_dict[t_opt].get_performance_robustness(option)
365
  merged_dfs_list = results_pert_rob_dict['merged_dfs_list']
366
  t_pert_df_global_temps_list = results_pert_rob_dict['t_pert_df_global_temps_list']
367
  family_names_list = results_pert_rob_dict['family_names_list']