import json
import random

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from black import Mode, format_str

from strings import api_descriptions, func_definitions


bubble_html = """
<div style="{style}" class="bubble">
{text}
</div>
"""

bubble_style = """
padding: 10px;
margin: 5px;
background: linear-gradient(to bottom right, #FFFFFF, #E8E8E8); /* Lighter background for contrast */
border-radius: 15px;
border: 1px solid #a1a1a1; /* Lighter border for subtle definition */
box-shadow: 2px 2px 10px rgba(255,255,255,0.1); /* Softer shadow with a hint of white for depth */
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
font-size: calc((4vw + 4vh) / 2); /* Scales dynamically with the viewport */
text-align: center;
display: flex;
align-items: center; /* Centers text vertically */
justify-content: center; /* Centers text horizontally */
min-height: 50px; /* Adjust as needed */
max-height: 140px; /* Adjust as needed */
max-width: 100%;
color: #333333; /* Dark text for contrast against light background */
overflow-wrap: break-word; /* Allows long words to be broken and wrap onto the next line */
"""

hover_css = """
<style>
.bubble:hover {
    transform: scale(1.05); /* Scales up the bubble */
    z-index: 10; /* Ensures the scaled bubble is above others */
}
</style>
"""

RESULTS = {
    'Climate': {"GPT4": 0.6808, "NexusRaven-V2": 0.7234},
    'Heldout_Combined': {"GPT4": 0.4814, "NexusRaven-V2": 0.5990},
    'Places_API': {"GPT4": 0.3541, "NexusRaven-V2": 0.5000},
    'OTX': {"GPT4": 0.9130, "NexusRaven-V2": 0.9021},
    'VirusTotal': {"GPT4": 0.8940, "NexusRaven-V2": 0.7815},
    'VT_Multi_Dependency': {"GPT4": 0.3469, "NexusRaven-V2": 0.3673},
    'VT_Multi_Disconnected': {"GPT4": 0.2380, "NexusRaven-V2": 0.3809},
    'CVECPE': {"GPT4": 0.5769, "NexusRaven-V2": 0.4480},
    'CVECPE_Multi_Dependency': {"GPT4": 0.1071, "NexusRaven-V2": 0.1607},
}
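# The scores above are assumed to be accuracy fractions in [0, 1]; format_scores below rescales them to percentages for display.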

SAMPLES = {
    'OTX': "data/OTX.json",
    'CVECPE': "data/CVECPE.json",
    'CVECPE_Multi_Dependency': "data/CVECPE_MultiAPIs.json",
    'VirusTotal': 'data/VirusTotal.json',
    'VT_Multi_Dependency': 'data/VT_MultiAPIs_Nested.json',
    'VT_Multi_Disconnected': 'data/VT_MultiAPIs_Disconnected.json',
    'Climate': 'data/Climate.jsonl',
    'Places_API': 'data/Places_API.jsonl',
}
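# Each sample record is assumed to contain an "Input"/"input" prompt string and an "Output" field
# holding the reference call(s) as a Python call string or a list of such strings;
# see read_json_or_jsonl and the per-task sample loop in the UI below.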


def read_json_or_jsonl(file_path):
    """
    Read a file and determine if it's JSON or JSONL.
    Return the data as a list of items.
    """
    try:
        with open(file_path, 'r') as file:
            if file_path.endswith('.jsonl'):
                # JSONL: one JSON object per line.
                data = [json.loads(line) for line in file]
            else:
                # JSON: a single array of objects.
                data = json.load(file)
        # Normalize the lowercase "input" key to "Input" so the UI can rely on one name.
        for item in data:
            if "input" in item:
                item["Input"] = item["input"]
        return data
    except Exception as e:
        print(f"Error reading file: {e}")
        return []
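# Hypothetical usage: read_json_or_jsonl("data/OTX.json") -> [{"Input": "...", "Output": "..."}, ...]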


def sample_data(data, sample_size=5):
    """
    Randomly sample items from the data.
    """
    if not data:
        return []
    sample_size = min(sample_size, len(data))
    return random.sample(data, sample_size)
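# e.g. sample_data(list(range(100))) returns 5 random items; if the data has fewer items, all of them are returned (in random order).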


def highlight_row(s, column, value, color='yellow'):
    """
    Highlight a row where the column has a specified value.

    Args:
        s (pd.Series): Row of the DataFrame.
        column (str): Column name to check the value.
        value (any): Value to check against.
        color (str): Background color for highlighting. Default is yellow.

    Returns:
        [str]: A list of CSS strings for each cell in the row.
    """
    # Highlight every cell in the row when the row's value in `column` equals `value`.
    return [f'background-color: {color}' if s[column] == value else '' for _ in s]
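# Hypothetical usage with a pandas Styler:
# df.style.apply(highlight_row, column='Model', value='NexusRaven-V2', axis=1)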


def create_bar_chart(data, title, theme):
    df = pd.DataFrame.from_dict(data, orient='index', columns=['Score']).reset_index()
    df.rename(columns={'index': 'Model'}, inplace=True)

    colors = ['#636EFA', '#EF553B'] if theme == 'dark' else ['#00CC96', '#AB63FA']

    fig = px.bar(
        df, x='Model', y='Score', title=title,
        color='Model', color_discrete_sequence=colors,
        text='Score', barmode='group'
    )

    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)' if theme == 'dark' else 'rgba(255,255,255,1)',
        paper_bgcolor='rgba(0,0,0,0)' if theme == 'dark' else 'rgba(255,255,255,1)',
        font_color='white' if theme == 'dark' else 'black'
    )

    fig.update_traces(
        hoverinfo='all', hovertemplate='Model: %{x}<br>Score: %{y:.2f}'
    )

    max_score = df['Score'].max()
    fig.update_yaxes(range=[0, max_score + max_score * 0.1])

    return fig
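# e.g. create_bar_chart(RESULTS['Climate'], 'Climate', 'dark') returns a Plotly bar figure
# comparing the two models on one task (this helper appears unused in the layout below).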


simple_tasks = ['OTX', 'CVECPE', 'VirusTotal', 'VT_Multi_Disconnected', 'Heldout_Combined']
difficult_tasks = ['VT_Multi_Dependency', 'Climate', 'Places_API', 'CVECPE_Multi_Dependency']


def format_scores(val):
    if isinstance(val, float):
        val = val * 100
        return f"{val:.4g}"
    return val
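# e.g. format_scores(0.6808) -> "68.08"; non-float values (such as model names) pass through unchanged.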


def calculate_averages(results):
    all_tasks_avg = pd.DataFrame(results).mean(axis=1)
    simple_tasks_avg = pd.DataFrame({k: results[k] for k in simple_tasks}).mean(axis=1)
    difficult_tasks_avg = pd.DataFrame({k: results[k] for k in difficult_tasks}).mean(axis=1)

    avg_data = pd.DataFrame({
        'All Tasks': all_tasks_avg,
        'Tasks with Single Call (simple)': simple_tasks_avg,
        'Tasks with Nested/Parallel Calls (challenging)': difficult_tasks_avg
    }).reset_index().rename(columns={'index': 'Model'})

    return avg_data


def display_averages():
    avg_data = calculate_averages(RESULTS)
    return avg_data


single_calls = ['OTX', 'CVECPE', 'VirusTotal', 'Heldout_Combined']
nested_calls = ['VT_Multi_Dependency', 'Places_API', 'CVECPE_Multi_Dependency', 'Heldout_Combined']
parallel_calls = ['Climate', 'VT_Multi_Disconnected']

otx = ["OTX"]
cvecpe = ['CVECPE']
virustotal = ['VirusTotal']
vt_multi_dependency = ['VT_Multi_Dependency']
places = ['Places_API']
cvecpe_multi_dependency = ['CVECPE_Multi_Dependency']
heldout = ['Heldout_Combined']
climate = ['Climate']
vt_multi_disconnected = ['VT_Multi_Disconnected']


def calculate_capability_scores(results, task_type):
    if task_type == "general ability":
        single_calls_avg = pd.DataFrame({k: results[k] for k in single_calls}).mean(axis=1)
        nested_calls_avg = pd.DataFrame({k: results[k] for k in nested_calls}).mean(axis=1)
        parallel_calls_avg = pd.DataFrame({k: results[k] for k in parallel_calls}).mean(axis=1)

        capability_data = pd.DataFrame({
            'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
            'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
            'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
        }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
    elif task_type == "many apis many args":
        otx_avg = pd.DataFrame({k: results[k] for k in otx}).mean(axis=1)
        cvecpe_avg = pd.DataFrame({k: results[k] for k in cvecpe}).mean(axis=1)
        virustotal_avg = pd.DataFrame({k: results[k] for k in virustotal}).mean(axis=1)
        vt_multi_dependency_avg = pd.DataFrame({k: results[k] for k in vt_multi_dependency}).mean(axis=1)
        places_avg = pd.DataFrame({k: results[k] for k in places}).mean(axis=1)
        cvecpe_multi_dependency_avg = pd.DataFrame({k: results[k] for k in cvecpe_multi_dependency}).mean(axis=1)
        heldout_avg = pd.DataFrame({k: results[k] for k in heldout}).mean(axis=1)
        climate_avg = pd.DataFrame({k: results[k] for k in climate}).mean(axis=1)
        vt_multi_disconnected_avg = pd.DataFrame({k: results[k] for k in vt_multi_disconnected}).mean(axis=1)

        capability_data = pd.DataFrame({
            'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'CVECPE (Single)', 'CVECPE_Multi (Nested)', 'Places (Nested)', 'Climate (Parallel)', 'Stack (Nested)'],
            'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
            'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
                              places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
        }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')

    return capability_data
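# The melted frame has one row per (Capability, Model) pair, e.g. Capability='Single Calls',
# Model='GPT4', Score=<mean over the single-call tasks>, which is the long format px.line_polar expects.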


def display_radar_chart(task_type):
    if task_type == "general ability":
        data = calculate_capability_scores(RESULTS, "general ability")
        fig = px.line_polar(data, r='Score', theta='Capability', color='Model', line_close=True,
                            markers=True,
                            color_discrete_sequence=px.colors.qualitative.Pastel,
                            template='plotly_dark',
                            title='Capability Radar Chart on General Abilities')
    elif task_type == "many apis many args":
        data = calculate_capability_scores(RESULTS, "many apis many args")
        fig = px.line_polar(data, r='Score', theta='Capability', color='Model', line_close=True,
                            markers=True,
                            color_discrete_sequence=px.colors.qualitative.Pastel,
                            template='plotly_dark',
                            title='Capability Radar Chart on All Subtasks')

    fig.update_traces(marker=dict(size=10), line=dict(width=4))

    return fig


INTRO_TEXT = """
# Nexus Function Calling Leaderboard

Welcome to the Nexus Function Calling Leaderboard! We provide a focused benchmarking platform that evaluates a range of models on their ability to perform zero-shot function calling and API usage. Our leaderboard features the following highlights:

- **Nine Varied Tasks**: We cover a broad spectrum, from cybersecurity and climate APIs to recommendation systems, along with some pure Python functions.
- **Zero-Shot Challenges**: Models are tested on their innate ability to handle tasks they haven't seen before, relying ONLY on the function definitions and the user queries to showcase their versatility and comprehension.
- **Diverse Model Participation**: We include a mix of open-source and closed-source models. We initially benchmarked three models, and we are more than happy to work with the community to involve more.

This leaderboard is an exciting step towards understanding and improving the capabilities of large language models in diverse, real-world applications by building semantic interfaces around APIs!
"""


CSS = """
.intro-text {
    font-size: 26px;
}
footer {
    visibility: hidden;
}
"""


custom_css = """
<style>
.markdown-class {
    font-size: 16px !important; /* Adjust the font size as needed */
}
</style>
"""


with gr.Blocks(theme='dark') as demo:
    gr.HTML(
        """<img width="50" height="50" style="float:left; margin: 0px;" src="/file=logo.png">
        <h1 style="overflow: hidden; padding-top: 17px; margin: 0px;">Nexusflow</h1>
        """
    )
    with gr.Row():
        gr.Image(
            "raven.png",
            show_label=False,
            show_share_button=True,
            min_width=40,
            scale=1,
        )
        with gr.Column(scale=4):
            gr.HTML(custom_css)
            gr.Markdown(INTRO_TEXT, elem_classes="markdown-class")

    with gr.Tab("Overall"):
        with gr.Accordion("Task Averages:"):
            gr.Dataframe(display_averages().map(format_scores))

        with gr.Accordion("Model Capabilities:"):
            with gr.Row():
                gr.Plot(display_radar_chart("general ability"))
                gr.Plot(display_radar_chart("many apis many args"))

    # Friendly display names for each task tab.
    tab_names = {
        'OTX': 'OTX (Single)',
        'CVECPE': 'CVECPE (Single)',
        'VirusTotal': 'VirusTotal (Single)',
        'VT_Multi_Dependency': 'VT_Multi (Nested)',
        'Places_API': 'Places (Nested)',
        'CVECPE_Multi_Dependency': 'CVECPE_Multi (Nested)',
        'Heldout_Combined': 'Stack (Nested)',
        'Climate': 'Climate (Parallel)',
        'VT_Multi_Disconnected': 'VT_Multi (Parallel)'
    }

    for key, value in RESULTS.items():
        tab_name = tab_names.get(key, key)

        with gr.Tab(tab_name):
            with gr.Accordion("Details of the " + tab_name + " :", open=False) as accordion:
                gr.Markdown(api_descriptions[key])
                if key == "Heldout_Combined":
                    accordion.open = True
                else:
                    func_definition_list = func_definitions[key]

                    with gr.Group():
                        for entry in func_definition_list:
                            with gr.Accordion(entry[0], open=False):
                                gr.Markdown(entry[1])

            df = pd.DataFrame.from_dict(value, orient='index', columns=['Score']).reset_index()
            df.rename(columns={'index': 'Model'}, inplace=True)
            gr.Dataframe(df.map(format_scores))

            if key in SAMPLES:
                file_path = SAMPLES[key]
                data = read_json_or_jsonl(file_path)
                samples = sample_data(data)

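                # Pretty-print each reference call with black so the samples render nicely;
                # e.g. format_str("f(x=1,y=2)", mode=Mode()) returns "f(x=1, y=2)\n".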
                for sample in samples:
                    s = sample["Output"]

                    if isinstance(s, list):
                        sample['Output'] = ''.join([format_str(item, mode=Mode()) for item in s])
                    else:
                        sample['Output'] = format_str(s, mode=Mode())

                samples = [
                    [
                        hover_css + bubble_html.format(style=bubble_style, text=sample['Input']),
                        f"```python\n{sample['Output']}\n```".replace("; ", ";\n"),
                    ]
                    for sample in samples
                ]
                gr.Dataset(
                    components=[gr.HTML(), gr.Markdown()],
                    headers=["Prompt", "API Use"],
                    label=f"{key} Samples",
                    samples=samples
                )

    # Default the page to Gradio's dark theme via the __theme query parameter.
    demo.load(
        None,
        None,
        js="""
        () => {
            const params = new URLSearchParams(window.location.search);
            if (!params.has('__theme')) {
                params.set('__theme', 'dark');
                window.location.search = params.toString();
            }
        }""",
    )


demo.launch(share=True, allowed_paths=["logo.png", "raven.png"])