Spaces:

Nexusflow
/

Nexus_Function_Calling_Leaderboard

Running

App Files Community

zhen-dong-nexusflow committed on Dec 8, 2023

Commit

551935e

1 Parent(s): 3f48470

modify details of UI

Browse files

Files changed (2) hide show

app.py +19 -19
strings.py +1 -1

app.py CHANGED Viewed

@@ -47,15 +47,15 @@ hover_css = """
 # Updated results reflecting the new screenshot
 RESULTS = {
-    'Climate': {"GPT4": 0.6809, "GPT3.5": 0.2553, "NexusRaven-V2": 0.7021, "Gorilla open-function-v1": 0.0213},
-    'Heldout_Combined': {"GPT4": 0.4814, "GPT3.5": 0.4495, "NexusRaven-V2": 0.5990},
-    'Places_API': {"GPT4": 0.4375, "GPT3.5": 0.2500, "NexusRaven-V2": 0.5000, "Gorilla open-function-v1": 0.0208},
-    'OTX': {"GPT4": 0.9022, "GPT3.5": 0.8913, "NexusRaven-V2": 0.9022, "Gorilla open-function-v1": 0.2935},
-    'VirusTotal': {"GPT4": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla open-function-v1": 0.0728},
-    'VT_Multi_Dependency': {"GPT4": 0.3673, "GPT3.5": 0.0204, "NexusRaven-V2": 0.3878, "Gorilla open-function-v1": 0.0000},
-    'VT_Multi_Disconnected': {"GPT4": 0.2857, "GPT3.5": 0.1429, "NexusRaven-V2": 0.4286, "Gorilla open-function-v1": 0.0000},
-    'CVECPE': {"GPT4": 0.7700, "GPT3.5": 0.4800, "NexusRaven-V2": 0.6667, "Gorilla open-function-v1": 0.0897},
-    'CVECPE_Multi_Dependency': {"GPT4": 0.0714, "GPT3.5": 0.0714, "NexusRaven-V2": 0.2500, "Gorilla open-function-v1": 0.0000},
 }
 SAMPLES = {
@@ -206,9 +206,9 @@ def calculate_capability_scores(results, type):
         capability_data = pd.DataFrame({
             'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
-            'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
-            'GPT3.5': [single_calls_avg['GPT3.5'], nested_calls_avg['GPT3.5'], parallel_calls_avg['GPT3.5']],
-            'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
         }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
     elif type == "many apis many args":
         otx_avg = pd.DataFrame({k: results[k] for k in otx}).mean(axis=1)
@@ -223,10 +223,10 @@ def calculate_capability_scores(results, type):
         capability_data = pd.DataFrame({
             'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
-            'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
-            'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']],
             'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
-            places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
             # 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
             # places_avg['Gorilla'], climate_avg['Gorilla'], 0]
         }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
@@ -241,14 +241,14 @@ def display_radar_chart(type):
                         markers=True,  # Adding markers
                         color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
                         template='plotly_dark',
-                        title='Capability Radar Chart on General Abilities')
     elif type == "many apis many args":
         data = calculate_capability_scores(RESULTS, "many apis many args")
         fig = px.line_polar(data, r='Score', theta='Capability', color='Model', line_close=True,
                         markers=True,  # Adding markers
                         color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
                         template='plotly_dark',
-                        title='Capability Radar Chart on All Subtasks')
     # Customize the lines and markers
     fig.update_traces(marker=dict(size=10), line=dict(width=4))
@@ -332,7 +332,7 @@ with gr.Blocks(theme="dark") as demo:  # Set the theme here
         with gr.Tab(tab_name):
             # Create and display DataFrame
-            with gr.Accordion("Details of the " + tab_name + " :", open=False) as accordion:
                 gr.Markdown(api_descriptions[key])
                 if key == "Heldout_Combined":
                     accordion.open = True
@@ -383,7 +383,7 @@ with gr.Blocks(theme="dark") as demo:  # Set the theme here
         params.set('__theme', 'dark');
         window.location.search = params.toString();
       }
-      }""",
         )

 # Updated results reflecting the new screenshot
 RESULTS = {
+    'Climate': {"NexusRaven-V2": 0.7021, "GPT4-1106": 0.6809, "GPT3.5": 0.2553, "Gorilla open-function-v1": 0.0213},
+    'Heldout_Combined': {"NexusRaven-V2": 0.5990, "GPT4-1106": 0.4814, "GPT3.5": 0.4495},
+    'Places_API': {"NexusRaven-V2": 0.5000, "GPT4-1106": 0.4375, "GPT3.5": 0.2500, "Gorilla open-function-v1": 0.0208},
+    'OTX': {"NexusRaven-V2": 0.9022, "GPT4-1106": 0.9022, "GPT3.5": 0.8913, "Gorilla open-function-v1": 0.2935},
+    'VirusTotal': {"GPT4-1106": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla open-function-v1": 0.0728},
+    'VT_Multi_Dependency': {"NexusRaven-V2": 0.3878, "GPT4-1106": 0.3673, "GPT3.5": 0.0204, "Gorilla open-function-v1": 0.0000},
+    'VT_Multi_Disconnected': {"NexusRaven-V2": 0.4286, "GPT4-1106": 0.2857, "GPT3.5": 0.1429, "Gorilla open-function-v1": 0.0000},
+    'CVECPE': {"GPT4-1106": 0.7700, "NexusRaven-V2": 0.6667, "GPT3.5": 0.4800, "Gorilla open-function-v1": 0.0897},
+    'CVECPE_Multi_Dependency': {"NexusRaven-V2": 0.2500, "GPT4-1106": 0.0714, "GPT3.5": 0.0714, "Gorilla open-function-v1": 0.0000},
 }
 SAMPLES = {
         capability_data = pd.DataFrame({
             'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
+            'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']],
+            'GPT4-1106': [single_calls_avg['GPT4-1106'], nested_calls_avg['GPT4-1106'], parallel_calls_avg['GPT4-1106']],
+            'GPT3.5': [single_calls_avg['GPT3.5'], nested_calls_avg['GPT3.5'], parallel_calls_avg['GPT3.5']]
         }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
     elif type == "many apis many args":
         otx_avg = pd.DataFrame({k: results[k] for k in otx}).mean(axis=1)
         capability_data = pd.DataFrame({
             'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
             'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
+            places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']],
+            'GPT4-1106': [otx_avg['GPT4-1106'], virustotal_avg['GPT4-1106'], vt_multi_dependency_avg['GPT4-1106'], vt_multi_disconnected_avg['GPT4-1106'], cvecpe_avg['GPT4-1106'], cvecpe_multi_dependency_avg['GPT4-1106'], places_avg['GPT4-1106'], climate_avg['GPT4-1106'], heldout_avg['GPT4-1106']],
+            'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']]
             # 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
             # places_avg['Gorilla'], climate_avg['Gorilla'], 0]
         }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
                         markers=True,  # Adding markers
                         color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
                         template='plotly_dark',
+                        title='Capability Radar Chart on Different Function Calling Types')
     elif type == "many apis many args":
         data = calculate_capability_scores(RESULTS, "many apis many args")
         fig = px.line_polar(data, r='Score', theta='Capability', color='Model', line_close=True,
                         markers=True,  # Adding markers
                         color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
                         template='plotly_dark',
+                        title='Capability Radar Chart on All Tasks')
     # Customize the lines and markers
     fig.update_traces(marker=dict(size=10), line=dict(width=4))
         with gr.Tab(tab_name):
             # Create and display DataFrame
+            with gr.Accordion("Details of the " + tab_name + ":", open=False) as accordion:
                 gr.Markdown(api_descriptions[key])
                 if key == "Heldout_Combined":
                     accordion.open = True
         params.set('__theme', 'dark');
         window.location.search = params.toString();
       }
+      }"""
         )

strings.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # The natural language descriptions of different APIs
 api_descriptions = {
 "Climate": "The Climate API, provided by the National Climatic Data Center (NCDC), offers access to a comprehensive database of weather and climate data, catering to developers who want to create custom scripts or programs. The API allows up to five requests per second and a maximum of 10,000 requests per day.",
-"Heldout_Combined": "This dataset is obtained from the stack (BigCode). The stack is primarily used as a pre-training dataset for Code LLMs, aiding in tasks like code completion from natural language, documentation generation, and auto-completion of code snippets. \n\n Note that due to specific policies, the stack data are not publicly available yet, so we didn't provide examples here. Thanks for your understanding!\n\n**Due to the complexity in converting the pythonic representation into a JSON representation [As each sample has unique API definition format and hundreds across the dataset, which is unlike other tasks where all samples shares the same function list which we manually converted], we did not have a chance to benchmark Gorilla Open Functions V1 on The Stack API dataset. However, we manually converted a few randomly chosen samples and we observe the relative performance of Gorilla here is similar to Gorilla's relative performance on other tasks.",
 "Places_API": "The Places API by Google, part of the Google Maps Platform, offers detailed information about over 200 million places worldwide, including ratings, reviews, and business data. It enhances user experience by providing features like accessibility information, special and secondary opening hours, editorial summaries, detailed dining and shopping service attributes, and the ability to sort and auto-translate reviews.",
 "VT_Multi_Dependency": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalities. Multi_Dependency means that in order to fulfill the task requested by user's query, the model needs to call multiple apis, where some apis rely on the results of other apis.",
 "VT_Multi_Disconnected": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalites. Disconnected means that in order to fultill the task requested by the user's query, the model needs to call multiple apis, where the apis don't repy on each other.",

 # The natural language descriptions of different APIs
 api_descriptions = {
 "Climate": "The Climate API, provided by the National Climatic Data Center (NCDC), offers access to a comprehensive database of weather and climate data, catering to developers who want to create custom scripts or programs. The API allows up to five requests per second and a maximum of 10,000 requests per day.",
+"Heldout_Combined": "This dataset is obtained from the stack (BigCode). The stack is primarily used as a pre-training dataset for Code LLMs, aiding in tasks like code completion from natural language, documentation generation, and auto-completion of code snippets. \n\n Note that due to specific policies, the stack data are not publicly available yet, so we didn't provide examples here. Thanks for your understanding!\n\n**Due to the complexity in converting the pythonic representation into a JSON representation [As each sample has unique API definition format and hundreds across the dataset, which is unlike other tasks where all samples shares the same function list which we manually converted], we did not have a chance to benchmark Gorilla Open Function V1 on the Stack API dataset. However, we manually converted a few randomly chosen samples and we observe the relative performance of Gorilla here is similar to Gorilla's relative performance on other tasks.",
 "Places_API": "The Places API by Google, part of the Google Maps Platform, offers detailed information about over 200 million places worldwide, including ratings, reviews, and business data. It enhances user experience by providing features like accessibility information, special and secondary opening hours, editorial summaries, detailed dining and shopping service attributes, and the ability to sort and auto-translate reviews.",
 "VT_Multi_Dependency": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalities. Multi_Dependency means that in order to fulfill the task requested by user's query, the model needs to call multiple apis, where some apis rely on the results of other apis.",
 "VT_Multi_Disconnected": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalites. Disconnected means that in order to fultill the task requested by the user's query, the model needs to call multiple apis, where the apis don't repy on each other.",