zhen-dong-nexusflow committed on
Commit
551935e
1 Parent(s): 3f48470

modify details of UI

Browse files
Files changed (2) hide show
  1. app.py +19 -19
  2. strings.py +1 -1
app.py CHANGED
@@ -47,15 +47,15 @@ hover_css = """
47
 
48
  # Updated results reflecting the new screenshot
49
  RESULTS = {
50
- 'Climate': {"GPT4": 0.6809, "GPT3.5": 0.2553, "NexusRaven-V2": 0.7021, "Gorilla open-function-v1": 0.0213},
51
- 'Heldout_Combined': {"GPT4": 0.4814, "GPT3.5": 0.4495, "NexusRaven-V2": 0.5990},
52
- 'Places_API': {"GPT4": 0.4375, "GPT3.5": 0.2500, "NexusRaven-V2": 0.5000, "Gorilla open-function-v1": 0.0208},
53
- 'OTX': {"GPT4": 0.9022, "GPT3.5": 0.8913, "NexusRaven-V2": 0.9022, "Gorilla open-function-v1": 0.2935},
54
- 'VirusTotal': {"GPT4": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla open-function-v1": 0.0728},
55
- 'VT_Multi_Dependency': {"GPT4": 0.3673, "GPT3.5": 0.0204, "NexusRaven-V2": 0.3878, "Gorilla open-function-v1": 0.0000},
56
- 'VT_Multi_Disconnected': {"GPT4": 0.2857, "GPT3.5": 0.1429, "NexusRaven-V2": 0.4286, "Gorilla open-function-v1": 0.0000},
57
- 'CVECPE': {"GPT4": 0.7700, "GPT3.5": 0.4800, "NexusRaven-V2": 0.6667, "Gorilla open-function-v1": 0.0897},
58
- 'CVECPE_Multi_Dependency': {"GPT4": 0.0714, "GPT3.5": 0.0714, "NexusRaven-V2": 0.2500, "Gorilla open-function-v1": 0.0000},
59
  }
60
 
61
  SAMPLES = {
@@ -206,9 +206,9 @@ def calculate_capability_scores(results, type):
206
 
207
  capability_data = pd.DataFrame({
208
  'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
209
- 'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
210
- 'GPT3.5': [single_calls_avg['GPT3.5'], nested_calls_avg['GPT3.5'], parallel_calls_avg['GPT3.5']],
211
- 'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
212
  }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
213
  elif type == "many apis many args":
214
  otx_avg = pd.DataFrame({k: results[k] for k in otx}).mean(axis=1)
@@ -223,10 +223,10 @@ def calculate_capability_scores(results, type):
223
 
224
  capability_data = pd.DataFrame({
225
  'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
226
- 'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
227
- 'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']],
228
  'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
229
- places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
 
 
230
  # 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
231
  # places_avg['Gorilla'], climate_avg['Gorilla'], 0]
232
  }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
@@ -241,14 +241,14 @@ def display_radar_chart(type):
241
  markers=True, # Adding markers
242
  color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
243
  template='plotly_dark',
244
- title='Capability Radar Chart on General Abilities')
245
  elif type == "many apis many args":
246
  data = calculate_capability_scores(RESULTS, "many apis many args")
247
  fig = px.line_polar(data, r='Score', theta='Capability', color='Model', line_close=True,
248
  markers=True, # Adding markers
249
  color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
250
  template='plotly_dark',
251
- title='Capability Radar Chart on All Subtasks')
252
 
253
  # Customize the lines and markers
254
  fig.update_traces(marker=dict(size=10), line=dict(width=4))
@@ -332,7 +332,7 @@ with gr.Blocks(theme="dark") as demo: # Set the theme here
332
 
333
  with gr.Tab(tab_name):
334
  # Create and display DataFrame
335
- with gr.Accordion("Details of the " + tab_name + " :", open=False) as accordion:
336
  gr.Markdown(api_descriptions[key])
337
  if key == "Heldout_Combined":
338
  accordion.open = True
@@ -383,7 +383,7 @@ with gr.Blocks(theme="dark") as demo: # Set the theme here
383
  params.set('__theme', 'dark');
384
  window.location.search = params.toString();
385
  }
386
- }""",
387
  )
388
 
389
 
 
47
 
48
  # Updated results reflecting the new screenshot
49
  RESULTS = {
50
+ 'Climate': {"NexusRaven-V2": 0.7021, "GPT4-1106": 0.6809, "GPT3.5": 0.2553, "Gorilla open-function-v1": 0.0213},
51
+ 'Heldout_Combined': {"NexusRaven-V2": 0.5990, "GPT4-1106": 0.4814, "GPT3.5": 0.4495},
52
+ 'Places_API': {"NexusRaven-V2": 0.5000, "GPT4-1106": 0.4375, "GPT3.5": 0.2500, "Gorilla open-function-v1": 0.0208},
53
+ 'OTX': {"NexusRaven-V2": 0.9022, "GPT4-1106": 0.9022, "GPT3.5": 0.8913, "Gorilla open-function-v1": 0.2935},
54
+ 'VirusTotal': {"GPT4-1106": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla open-function-v1": 0.0728},
55
+ 'VT_Multi_Dependency': {"NexusRaven-V2": 0.3878, "GPT4-1106": 0.3673, "GPT3.5": 0.0204, "Gorilla open-function-v1": 0.0000},
56
+ 'VT_Multi_Disconnected': {"NexusRaven-V2": 0.4286, "GPT4-1106": 0.2857, "GPT3.5": 0.1429, "Gorilla open-function-v1": 0.0000},
57
+ 'CVECPE': {"GPT4-1106": 0.7700, "NexusRaven-V2": 0.6667, "GPT3.5": 0.4800, "Gorilla open-function-v1": 0.0897},
58
+ 'CVECPE_Multi_Dependency': {"NexusRaven-V2": 0.2500, "GPT4-1106": 0.0714, "GPT3.5": 0.0714, "Gorilla open-function-v1": 0.0000},
59
  }
60
 
61
  SAMPLES = {
 
206
 
207
  capability_data = pd.DataFrame({
208
  'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
209
+ 'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']],
210
+ 'GPT4-1106': [single_calls_avg['GPT4-1106'], nested_calls_avg['GPT4-1106'], parallel_calls_avg['GPT4-1106']],
211
+ 'GPT3.5': [single_calls_avg['GPT3.5'], nested_calls_avg['GPT3.5'], parallel_calls_avg['GPT3.5']]
212
  }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
213
  elif type == "many apis many args":
214
  otx_avg = pd.DataFrame({k: results[k] for k in otx}).mean(axis=1)
 
223
 
224
  capability_data = pd.DataFrame({
225
  'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
 
 
226
  'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
227
+ places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']],
228
+ 'GPT4-1106': [otx_avg['GPT4-1106'], virustotal_avg['GPT4-1106'], vt_multi_dependency_avg['GPT4-1106'], vt_multi_disconnected_avg['GPT4-1106'], cvecpe_avg['GPT4-1106'], cvecpe_multi_dependency_avg['GPT4-1106'], places_avg['GPT4-1106'], climate_avg['GPT4-1106'], heldout_avg['GPT4-1106']],
229
+ 'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']]
230
  # 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
231
  # places_avg['Gorilla'], climate_avg['Gorilla'], 0]
232
  }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
 
241
  markers=True, # Adding markers
242
  color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
243
  template='plotly_dark',
244
+ title='Capability Radar Chart on Different Function Calling Types')
245
  elif type == "many apis many args":
246
  data = calculate_capability_scores(RESULTS, "many apis many args")
247
  fig = px.line_polar(data, r='Score', theta='Capability', color='Model', line_close=True,
248
  markers=True, # Adding markers
249
  color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
250
  template='plotly_dark',
251
+ title='Capability Radar Chart on All Tasks')
252
 
253
  # Customize the lines and markers
254
  fig.update_traces(marker=dict(size=10), line=dict(width=4))
 
332
 
333
  with gr.Tab(tab_name):
334
  # Create and display DataFrame
335
+ with gr.Accordion("Details of the " + tab_name + ":", open=False) as accordion:
336
  gr.Markdown(api_descriptions[key])
337
  if key == "Heldout_Combined":
338
  accordion.open = True
 
383
  params.set('__theme', 'dark');
384
  window.location.search = params.toString();
385
  }
386
+ }"""
387
  )
388
 
389
 
strings.py CHANGED
@@ -2,7 +2,7 @@
2
  # The natural language descriptions of different APIs
3
  api_descriptions = {
4
  "Climate": "The Climate API, provided by the National Climatic Data Center (NCDC), offers access to a comprehensive database of weather and climate data, catering to developers who want to create custom scripts or programs. The API allows up to five requests per second and a maximum of 10,000 requests per day.",
5
- "Heldout_Combined": "This dataset is obtained from the stack (BigCode). The stack is primarily used as a pre-training dataset for Code LLMs, aiding in tasks like code completion from natural language, documentation generation, and auto-completion of code snippets. \n\n Note that due to specific policies, the stack data are not publicly available yet, so we didn't provide examples here. Thanks for your understanding!\n\n**Due to the complexity in converting the pythonic representation into a JSON representation [As each sample has unique API definition format and hundreds across the dataset, which is unlike other tasks where all samples shares the same function list which we manually converted], we did not have a chance to benchmark Gorilla Open Functions V1 on The Stack API dataset. However, we manually converted a few randomly chosen samples and we observe the relative performance of Gorilla here is similar to Gorilla's relative performance on other tasks.",
6
  "Places_API": "The Places API by Google, part of the Google Maps Platform, offers detailed information about over 200 million places worldwide, including ratings, reviews, and business data. It enhances user experience by providing features like accessibility information, special and secondary opening hours, editorial summaries, detailed dining and shopping service attributes, and the ability to sort and auto-translate reviews.",
7
  "VT_Multi_Dependency": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalities. Multi_Dependency means that in order to fulfill the task requested by user's query, the model needs to call multiple apis, where some apis rely on the results of other apis.",
8
  "VT_Multi_Disconnected": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalites. Disconnected means that in order to fultill the task requested by the user's query, the model needs to call multiple apis, where the apis don't repy on each other.",
 
2
  # The natural language descriptions of different APIs
3
  api_descriptions = {
4
  "Climate": "The Climate API, provided by the National Climatic Data Center (NCDC), offers access to a comprehensive database of weather and climate data, catering to developers who want to create custom scripts or programs. The API allows up to five requests per second and a maximum of 10,000 requests per day.",
5
+ "Heldout_Combined": "This dataset is obtained from the stack (BigCode). The stack is primarily used as a pre-training dataset for Code LLMs, aiding in tasks like code completion from natural language, documentation generation, and auto-completion of code snippets. \n\n Note that due to specific policies, the stack data are not publicly available yet, so we didn't provide examples here. Thanks for your understanding!\n\n**Due to the complexity in converting the pythonic representation into a JSON representation [As each sample has unique API definition format and hundreds across the dataset, which is unlike other tasks where all samples shares the same function list which we manually converted], we did not have a chance to benchmark Gorilla Open Function V1 on the Stack API dataset. However, we manually converted a few randomly chosen samples and we observe the relative performance of Gorilla here is similar to Gorilla's relative performance on other tasks.",
6
  "Places_API": "The Places API by Google, part of the Google Maps Platform, offers detailed information about over 200 million places worldwide, including ratings, reviews, and business data. It enhances user experience by providing features like accessibility information, special and secondary opening hours, editorial summaries, detailed dining and shopping service attributes, and the ability to sort and auto-translate reviews.",
7
  "VT_Multi_Dependency": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalities. Multi_Dependency means that in order to fulfill the task requested by user's query, the model needs to call multiple apis, where some apis rely on the results of other apis.",
8
  "VT_Multi_Disconnected": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalites. Disconnected means that in order to fultill the task requested by the user's query, the model needs to call multiple apis, where the apis don't repy on each other.",