Commit
·
551935e
1
Parent(s):
3f48470
modify details of UI
Browse files
- app.py +19 -19
- strings.py +1 -1
app.py
CHANGED
@@ -47,15 +47,15 @@ hover_css = """
|
|
47 |
|
48 |
# Updated results reflecting the new screenshot
|
49 |
RESULTS = {
|
50 |
-
'Climate': {"
|
51 |
-
'Heldout_Combined': {"
|
52 |
-
'Places_API': {"
|
53 |
-
'OTX': {"
|
54 |
-
'VirusTotal': {"GPT4": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla open-function-v1": 0.0728},
|
55 |
-
'VT_Multi_Dependency': {"
|
56 |
-
'VT_Multi_Disconnected': {"
|
57 |
-
'CVECPE': {"GPT4": 0.7700, "
|
58 |
-
'CVECPE_Multi_Dependency': {"
|
59 |
}
|
60 |
|
61 |
SAMPLES = {
|
@@ -206,9 +206,9 @@ def calculate_capability_scores(results, type):
|
|
206 |
|
207 |
capability_data = pd.DataFrame({
|
208 |
'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
|
209 |
-
'
|
210 |
-
'
|
211 |
-
'
|
212 |
}).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
213 |
elif type == "many apis many args":
|
214 |
otx_avg = pd.DataFrame({k: results[k] for k in otx}).mean(axis=1)
|
@@ -223,10 +223,10 @@ def calculate_capability_scores(results, type):
|
|
223 |
|
224 |
capability_data = pd.DataFrame({
|
225 |
'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
|
226 |
-
'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
|
227 |
-
'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']],
|
228 |
'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
|
229 |
-
places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
|
|
|
|
|
230 |
# 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
|
231 |
# places_avg['Gorilla'], climate_avg['Gorilla'], 0]
|
232 |
}).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
@@ -241,14 +241,14 @@ def display_radar_chart(type):
|
|
241 |
markers=True, # Adding markers
|
242 |
color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
|
243 |
template='plotly_dark',
|
244 |
-
title='Capability Radar Chart on
|
245 |
elif type == "many apis many args":
|
246 |
data = calculate_capability_scores(RESULTS, "many apis many args")
|
247 |
fig = px.line_polar(data, r='Score', theta='Capability', color='Model', line_close=True,
|
248 |
markers=True, # Adding markers
|
249 |
color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
|
250 |
template='plotly_dark',
|
251 |
-
title='Capability Radar Chart on All
|
252 |
|
253 |
# Customize the lines and markers
|
254 |
fig.update_traces(marker=dict(size=10), line=dict(width=4))
|
@@ -332,7 +332,7 @@ with gr.Blocks(theme="dark") as demo: # Set the theme here
|
|
332 |
|
333 |
with gr.Tab(tab_name):
|
334 |
# Create and display DataFrame
|
335 |
-
with gr.Accordion("Details of the " + tab_name + "
|
336 |
gr.Markdown(api_descriptions[key])
|
337 |
if key == "Heldout_Combined":
|
338 |
accordion.open = True
|
@@ -383,7 +383,7 @@ with gr.Blocks(theme="dark") as demo: # Set the theme here
|
|
383 |
params.set('__theme', 'dark');
|
384 |
window.location.search = params.toString();
|
385 |
}
|
386 |
-
}"""
|
387 |
)
|
388 |
|
389 |
|
|
|
47 |
|
48 |
# Updated results reflecting the new screenshot
|
49 |
RESULTS = {
|
50 |
+
'Climate': {"NexusRaven-V2": 0.7021, "GPT4-1106": 0.6809, "GPT3.5": 0.2553, "Gorilla open-function-v1": 0.0213},
|
51 |
+
'Heldout_Combined': {"NexusRaven-V2": 0.5990, "GPT4-1106": 0.4814, "GPT3.5": 0.4495},
|
52 |
+
'Places_API': {"NexusRaven-V2": 0.5000, "GPT4-1106": 0.4375, "GPT3.5": 0.2500, "Gorilla open-function-v1": 0.0208},
|
53 |
+
'OTX': {"NexusRaven-V2": 0.9022, "GPT4-1106": 0.9022, "GPT3.5": 0.8913, "Gorilla open-function-v1": 0.2935},
|
54 |
+
'VirusTotal': {"GPT4-1106": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla open-function-v1": 0.0728},
|
55 |
+
'VT_Multi_Dependency': {"NexusRaven-V2": 0.3878, "GPT4-1106": 0.3673, "GPT3.5": 0.0204, "Gorilla open-function-v1": 0.0000},
|
56 |
+
'VT_Multi_Disconnected': {"NexusRaven-V2": 0.4286, "GPT4-1106": 0.2857, "GPT3.5": 0.1429, "Gorilla open-function-v1": 0.0000},
|
57 |
+
'CVECPE': {"GPT4-1106": 0.7700, "NexusRaven-V2": 0.6667, "GPT3.5": 0.4800, "Gorilla open-function-v1": 0.0897},
|
58 |
+
'CVECPE_Multi_Dependency': {"NexusRaven-V2": 0.2500, "GPT4-1106": 0.0714, "GPT3.5": 0.0714, "Gorilla open-function-v1": 0.0000},
|
59 |
}
|
60 |
|
61 |
SAMPLES = {
|
|
|
206 |
|
207 |
capability_data = pd.DataFrame({
|
208 |
'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
|
209 |
+
'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']],
|
210 |
+
'GPT4-1106': [single_calls_avg['GPT4-1106'], nested_calls_avg['GPT4-1106'], parallel_calls_avg['GPT4-1106']],
|
211 |
+
'GPT3.5': [single_calls_avg['GPT3.5'], nested_calls_avg['GPT3.5'], parallel_calls_avg['GPT3.5']]
|
212 |
}).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
213 |
elif type == "many apis many args":
|
214 |
otx_avg = pd.DataFrame({k: results[k] for k in otx}).mean(axis=1)
|
|
|
223 |
|
224 |
capability_data = pd.DataFrame({
|
225 |
'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
|
|
|
|
|
226 |
'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
|
227 |
+
places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']],
|
228 |
+
'GPT4-1106': [otx_avg['GPT4-1106'], virustotal_avg['GPT4-1106'], vt_multi_dependency_avg['GPT4-1106'], vt_multi_disconnected_avg['GPT4-1106'], cvecpe_avg['GPT4-1106'], cvecpe_multi_dependency_avg['GPT4-1106'], places_avg['GPT4-1106'], climate_avg['GPT4-1106'], heldout_avg['GPT4-1106']],
|
229 |
+
'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']]
|
230 |
# 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
|
231 |
# places_avg['Gorilla'], climate_avg['Gorilla'], 0]
|
232 |
}).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
|
|
241 |
markers=True, # Adding markers
|
242 |
color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
|
243 |
template='plotly_dark',
|
244 |
+
title='Capability Radar Chart on Different Function Calling Types')
|
245 |
elif type == "many apis many args":
|
246 |
data = calculate_capability_scores(RESULTS, "many apis many args")
|
247 |
fig = px.line_polar(data, r='Score', theta='Capability', color='Model', line_close=True,
|
248 |
markers=True, # Adding markers
|
249 |
color_discrete_sequence=px.colors.qualitative.Pastel, # Using Pastel color scheme
|
250 |
template='plotly_dark',
|
251 |
+
title='Capability Radar Chart on All Tasks')
|
252 |
|
253 |
# Customize the lines and markers
|
254 |
fig.update_traces(marker=dict(size=10), line=dict(width=4))
|
|
|
332 |
|
333 |
with gr.Tab(tab_name):
|
334 |
# Create and display DataFrame
|
335 |
+
with gr.Accordion("Details of the " + tab_name + ":", open=False) as accordion:
|
336 |
gr.Markdown(api_descriptions[key])
|
337 |
if key == "Heldout_Combined":
|
338 |
accordion.open = True
|
|
|
383 |
params.set('__theme', 'dark');
|
384 |
window.location.search = params.toString();
|
385 |
}
|
386 |
+
}"""
|
387 |
)
|
388 |
|
389 |
|
strings.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
# The natural language descriptions of different APIs
|
3 |
api_descriptions = {
|
4 |
"Climate": "The Climate API, provided by the National Climatic Data Center (NCDC), offers access to a comprehensive database of weather and climate data, catering to developers who want to create custom scripts or programs. The API allows up to five requests per second and a maximum of 10,000 requests per day.",
|
5 |
-
"Heldout_Combined": "This dataset is obtained from the stack (BigCode). The stack is primarily used as a pre-training dataset for Code LLMs, aiding in tasks like code completion from natural language, documentation generation, and auto-completion of code snippets. \n\n Note that due to specific policies, the stack data are not publicly available yet, so we didn't provide examples here. Thanks for your understanding!\n\n**Due to the complexity in converting the pythonic representation into a JSON representation [As each sample has unique API definition format and hundreds across the dataset, which is unlike other tasks where all samples shares the same function list which we manually converted], we did not have a chance to benchmark Gorilla Open
|
6 |
"Places_API": "The Places API by Google, part of the Google Maps Platform, offers detailed information about over 200 million places worldwide, including ratings, reviews, and business data. It enhances user experience by providing features like accessibility information, special and secondary opening hours, editorial summaries, detailed dining and shopping service attributes, and the ability to sort and auto-translate reviews.",
|
7 |
"VT_Multi_Dependency": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalities. Multi_Dependency means that in order to fulfill the task requested by user's query, the model needs to call multiple apis, where some apis rely on the results of other apis.",
|
8 |
"VT_Multi_Disconnected": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalites. Disconnected means that in order to fultill the task requested by the user's query, the model needs to call multiple apis, where the apis don't repy on each other.",
|
|
|
2 |
# The natural language descriptions of different APIs
|
3 |
api_descriptions = {
|
4 |
"Climate": "The Climate API, provided by the National Climatic Data Center (NCDC), offers access to a comprehensive database of weather and climate data, catering to developers who want to create custom scripts or programs. The API allows up to five requests per second and a maximum of 10,000 requests per day.",
|
5 |
+
"Heldout_Combined": "This dataset is obtained from the stack (BigCode). The stack is primarily used as a pre-training dataset for Code LLMs, aiding in tasks like code completion from natural language, documentation generation, and auto-completion of code snippets. \n\n Note that due to specific policies, the stack data are not publicly available yet, so we didn't provide examples here. Thanks for your understanding!\n\n**Due to the complexity in converting the pythonic representation into a JSON representation [As each sample has unique API definition format and hundreds across the dataset, which is unlike other tasks where all samples shares the same function list which we manually converted], we did not have a chance to benchmark Gorilla Open Function V1 on the Stack API dataset. However, we manually converted a few randomly chosen samples and we observe the relative performance of Gorilla here is similar to Gorilla's relative performance on other tasks.",
|
6 |
"Places_API": "The Places API by Google, part of the Google Maps Platform, offers detailed information about over 200 million places worldwide, including ratings, reviews, and business data. It enhances user experience by providing features like accessibility information, special and secondary opening hours, editorial summaries, detailed dining and shopping service attributes, and the ability to sort and auto-translate reviews.",
|
7 |
"VT_Multi_Dependency": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalities. Multi_Dependency means that in order to fulfill the task requested by user's query, the model needs to call multiple apis, where some apis rely on the results of other apis.",
|
8 |
"VT_Multi_Disconnected": "Built on top of the VirusTotal (VT) dataset, we added 17 supplementary APIs to achieve more advanced functionalites. Disconnected means that in order to fultill the task requested by the user's query, the model needs to call multiple apis, where the apis don't repy on each other.",
|