mmahesh873 commited on
Commit
be11b4e
1 Parent(s): 86bd91c

first draft of comparison across models and prompts

Browse files
Files changed (1) hide show
  1. app.py +39 -27
app.py CHANGED
@@ -13,24 +13,33 @@ st.title("LLM assessments: Microsoft's Phi-2 and Google's Gemma-7b")
13
  import urllib.request
14
  import os
15
 
16
- model_select = st.selectbox(
17
- 'Select the model:',
18
- [
19
  "Microsoft's Phi-2",
20
  "Google's Gemma-7b"
21
- ])
22
 
23
- if model_select == "Microsoft's Phi-2":
24
- prefix_post_processing = os.environ["POST_PROCESSING_JSON"]
25
- st.write("""
26
- Microsoft's Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical Large Language Models (LLM), Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
27
 
28
- else:
29
- prefix_post_processing = os.environ["POST_PROCESSING_JSON_GEMMA"]
30
- st.write("""
31
- Google Gemma-7b (https://huggingface.co/google/gemma-7b) is a Large Language Models (LLM) with 8.54 billion parameters. As per the https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf, Gemma-7b performs well in language understanding, reasoning, and safety tasks. This model is one of state of the art open models built based on similar technologies that were used to create Google's Gemini models.""")
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  # prefix_post_processing = os.environ["POST_PROCESSING_JSON"]
@@ -76,30 +85,33 @@ for t_opt in options:
76
  result_processor_obj_dict = {}
77
  result_file_dict = {}
78
  data_dicts_dict = {}
79
- for t_result_file in options:
80
- result_file = prefix_post_processing + prompt_options_dict[t_result_file]
 
 
81
 
82
- prompt_option = int(prompt_options_dict[t_result_file].split('_')[-1].split('.')[0])
83
 
84
- with urllib.request.urlopen(result_file) as url:
85
- data_dict = json.load(url)
86
 
87
-
88
 
89
- result_processor_obj_dict[t_result_file] = ResultsProcessor(
90
- prompt_option=prompt_option,
91
- result_file=result_file,
92
- data_dict= data_dict
93
- )
94
- data_dicts_dict[t_result_file] = data_dict
95
- result_file_dict[t_result_file] = result_file
96
 
 
97
 
98
  print(result_processor_obj_dict)
99
 
100
 
101
 
102
-
103
 
104
  # t_result_file = st.selectbox(
105
  # 'Select the prompt:',
@@ -349,7 +361,7 @@ temp_options_2 = st.multiselect(
349
 
350
  st.markdown("---")
351
  for t_opt in temp_options_2:
352
- results_pert_rob_dict = result_processor_obj_dict[t_result_file].get_performance_robustness(option)
353
  merged_dfs_list = results_pert_rob_dict['merged_dfs_list']
354
  t_pert_df_global_temps_list = results_pert_rob_dict['t_pert_df_global_temps_list']
355
  family_names_list = results_pert_rob_dict['family_names_list']
 
13
  import urllib.request
14
  import os
15
 
16
+ llm_options = [
 
 
17
  "Microsoft's Phi-2",
18
  "Google's Gemma-7b"
19
+ ]
20
 
21
+ model_select_options = st.multiselect(
22
+ 'Select one or more models:',
23
+ llm_options,
24
+ [llm_options[0]])
25
 
 
 
 
 
26
 
27
 
28
+ prefix_post_processing_dict = {}
29
+ st.markdown("---")
30
+ for t_opt in model_select_options:
31
+ st.write(t_opt)
32
+ if t_opt == "Microsoft's Phi-2":
33
+ prefix_post_processing_dict[t_opt] = os.environ["POST_PROCESSING_JSON"]
34
+ st.write("""
35
+ Microsoft's Phi-2 (https://huggingface.co/microsoft/phi-2) is a Transformer model with 2.7 billion parameters. Performance on benchmarks for common sense, language understanding, and logical reasoning is nearly state-of-the-art among models with less than 13 billion parameters. Unlike typical Large Language Models (LLMs), Phi-2 has not been fine-tuned through reinforcement learning from human feedback.""")
36
+
37
+ else:
38
+ prefix_post_processing_dict[t_opt] = os.environ["POST_PROCESSING_JSON_GEMMA"]
39
+ st.write("""
40
+ Google Gemma-7b (https://huggingface.co/google/gemma-7b) is a Large Language Model (LLM) with 8.54 billion parameters. As per the https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf, Gemma-7b performs well in language understanding, reasoning, and safety tasks. This model is one of the state-of-the-art open models, built on technologies similar to those used to create Google's Gemini models.""")
41
+ st.markdown("---")
42
+
43
 
44
 
45
  # prefix_post_processing = os.environ["POST_PROCESSING_JSON"]
 
85
  result_processor_obj_dict = {}
86
  result_file_dict = {}
87
  data_dicts_dict = {}
88
+ main_options = []
89
+ for model_option in model_select_options:
90
+ for t_result_file in options:
91
+ result_file = prefix_post_processing_dict[model_option] + prompt_options_dict[t_result_file]
92
 
93
+ prompt_option = int(prompt_options_dict[t_result_file].split('_')[-1].split('.')[0])
94
 
95
+ with urllib.request.urlopen(result_file) as url:
96
+ data_dict = json.load(url)
97
 
98
+
99
 
100
+ result_processor_obj_dict[model_option + ' with ' + t_result_file] = ResultsProcessor(
101
+ prompt_option=prompt_option,
102
+ result_file=result_file,
103
+ data_dict= data_dict
104
+ )
105
+ data_dicts_dict[model_option + ' with ' + t_result_file] = data_dict
106
+ result_file_dict[model_option + ' with ' + t_result_file] = result_file
107
 
108
+ main_options += [model_option + ' with ' + t_result_file]
109
 
110
  print(result_processor_obj_dict)
111
 
112
 
113
 
114
+ options = main_options
115
 
116
  # t_result_file = st.selectbox(
117
  # 'Select the prompt:',
 
361
 
362
  st.markdown("---")
363
  for t_opt in temp_options_2:
364
+ results_pert_rob_dict = result_processor_obj_dict[t_opt].get_performance_robustness(option)
365
  merged_dfs_list = results_pert_rob_dict['merged_dfs_list']
366
  t_pert_df_global_temps_list = results_pert_rob_dict['t_pert_df_global_temps_list']
367
  family_names_list = results_pert_rob_dict['family_names_list']