Spaces:

gorilla-llm
/

berkeley-function-calling-leaderboard

Running

berkeley-function-calling-leaderboard / app.py

Huanzhi Mao

add 95th percentile latency stats

73e032d 7 months ago

55.6 kB

	import gradio as gr
	import json
	import webbrowser
	import os
	import re
	import pandas as pd
	import csv

	# from anthropic import Anthropic
	from openai import OpenAI
	from mistralai.client import MistralClient
	from mistralai.models.chat_completion import ChatMessage
	import requests

	# Service credentials are read from the process environment at import time.
	# os.environ.get returns None when a variable is not set, so each of these may be None.
	mongoDBPassword = os.environ.get("MONGODB_PASSWORD")  # used for the feedback-database login
	openaiKey = os.environ.get("OPENAI_API_KEY")  # passed to OpenAI(api_key=...)
	mistralKey = os.environ.get("MISTRAL_API_KEY")  # passed to MistralClient(api_key=...)
	# anthropicKey = os.environ.get("ANTHROPIC_API_KEY")

	# Function descriptions for the demo, written with Python-style type names
	# ("dict", "tuple", "list") and a per-parameter "required" flag.
	# NOTE(review): EXAMPLES is assigned a second time further down in this file, and the
	# only visible reader (fill_example) runs after import, so this first list looks
	# unused at runtime — confirm before removing it.
	EXAMPLES = [
	# Entry 0: reverse geocoding (geocode.maps.co).
	{
	"name": "requests.get",
	"description": "Sends a GET request to the specified URL.",
	"parameters": {
	"type": "dict",
	"properties": {
	"url": {
	"type": "string",
	"description": "Geocoding API converting a a pair of latitude and longitude coordinates to human readable addresses",
	"default": "https://geocode.maps.co/reverse",
	},
	"headers": {},
	"timeout": {
	"type": ["number", "tuple"],
	"description": "How many seconds to wait for the server to send data before giving up.",
	"required": False,
	},
	"params": {
	"lat": {
	"type": "number",
	"description": "Latitude of the location to reverse geocode.",
	"required": True,
	},
	"lon": {
	"type": "number",
	"description": "Longitude of the location to reverse geocode.",
	"required": True,
	},
	"format": {
	"type": "string",
	"description": "The desired response format. Options include 'xml', 'json', 'jsonv2', 'geojson', 'geocodejson'. Default is 'json'.",
	"required": False,
	},
	},
	"allow_redirects": {
	"type": "boolean",
	"description": "A Boolean to enable/disable redirection.",
	"default": True,
	"required": False,
	},
	"auth": {
	"type": "tuple",
	"description": "A tuple to enable a certain HTTP authentication.",
	"default": "None",
	"required": False,
	},
	"cert": {
	"type": ["string", "tuple"],
	"description": "A String or Tuple specifying a cert file or key.",
	"default": "None",
	"required": False,
	},
	"cookies": {
	"type": "dict",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of cookies to send with the request.",
	"required": False,
	},
	"proxies": {
	"type": "dict",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of the protocol to the proxy url.",
	"required": False,
	},
	"stream": {
	"type": "boolean",
	"description": "A Boolean indication if the response should be immediately downloaded (False) or streamed (True).",
	"default": False,
	"required": False,
	},
	"verify": {
	"type": ["boolean", "string"],
	"description": "A Boolean or a String indication to verify the servers TLS certificate or not.",
	"default": True,
	"required": False,
	},
	},
	},
	},
	# Entry 1: long-weekend lookup (date.nager.at); the URL default contains {year}/{countryCode} placeholders.
	{
	"name": "requests.get",
	"description": "Sends a GET request to the specified URL.",
	"parameters": {
	"type": "dict",
	"properties": {
	"url": {
	"type": "string",
	"description": "The Date Nager API provides access holiday information for over 100 countries, including the ability to query for long weekends. It leverages ISO 3166-1 alpha-2 country codes to tailor the search to your specific region of interest. More information can be found in https://date.nager.at/Api",
	"default": "https://date.nager.at/api/v3/LongWeekend/{year}/{countryCode}",
	},
	"headers": {},
	"timeout": {
	"type": ["number", "tuple"],
	"description": "How many seconds to wait for the server to send data before giving up.",
	"required": False,
	},
	"params": {},
	"auth": {
	"type": "tuple",
	"description": "A tuple to enable a certain HTTP authentication.",
	"default": "None",
	"required": False,
	},
	"cert": {
	"type": ["string", "tuple"],
	"description": "A String or Tuple specifying a cert file or key.",
	"default": "None",
	"required": False,
	},
	"cookies": {
	"type": "dict",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of cookies to send with the request.",
	"required": False,
	},
	"proxies": {
	"type": "dict",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of the protocol to the proxy url.",
	"required": False,
	},
	"stream": {
	"type": "boolean",
	"description": "A Boolean indication if the response should be immediately downloaded (False) or streamed (True).",
	"default": False,
	"required": False,
	},
	"verify": {
	"type": ["boolean", "string"],
	"description": "A Boolean or a String indication to verify the servers TLS certificate or not.",
	"default": True,
	"required": False,
	},
	},
	},
	},
	# Entry 2: weather forecast (api.open-meteo.com).
	{
	"name": "requests.get",
	"description": "Sends a GET request to the specified URL.",
	"parameters": {
	"type": "dict",
	"properties": {
	"url": {
	"type": "string",
	"description": "The Open-Meteo API provides detailed weather forecasts for any location worldwide. It offers forecasts up to 16 days in advance and also provide past data. The API's response gives weather variables on an hourly basis, such as temperature, humidity, precipitation, wind speed and direction, etc. More information can be found in https://open-meteo.com/en/docs/",
	"default": "https://api.open-meteo.com/v1/forecast",
	},
	"headers": {},
	"timeout": {
	"type": ["number", "tuple"],
	"description": "How many seconds to wait for the server to send data before giving up.",
	"required": False,
	},
	"params": {
	"latitude": {
	"type": "string",
	"description": "Geographical WGS84 coordinates of the location. Multiple coordinates can be comma separated. E.g., &latitude=52.52,48.85&longitude=13.41,2.35. To return data for multiple locations the JSON output changes to a list of structures. CSV and XLSX formats add a column location_id. N is positive, S is negative",
	"required": True,
	},
	"longitude": {
	"type": "string",
	"description": "Geographical WGS84 coordinates of the location. Multiple coordinates can be comma separated. E is positive, W is negative",
	"required": True,
	},
	"elevation": {
	"type": "string",
	"description": "The elevation used for statistical downscaling. Per default, a 90 meter digital elevation model is used. You can manually set the elevation to correctly match mountain peaks. If &elevation=nan is specified, downscaling will be disabled and the API uses the average grid-cell height. For multiple locations, elevation can also be comma separated.",
	"required": False,
	},
	"hourly": {
	"type": "string",
	"description": "A list of weather variables which should be returned. Values can be comma separated, or multiple &hourly= parameters in the URL can be used. Support parameters: temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,pressure_msl,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_speed_180m,wind_direction_10m,wind_direction_80m,wind_direction_120m,wind_direction_180m,wind_gusts_10m,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,global_tilted_irradiance,vapour_pressure_deficit,cape,evapotranspiration,et0_fao_evapotranspiration,precipitation,snowfall,precipitation_probability,rain,showers,weather_code,snow_depth,freezing_level_height,visibility,soil_temperature_0cm,soil_temperature_6cm,soil_temperature_18cm,soil_temperature_54cm,soil_moisture_0_to_1cm,soil_moisture_1_to_3cm,soil_moisture_3_to_9cm,soil_moisture_9_to_27cm,soil_moisture_27_to_81cm",
	"required": False,
	},
	"daily": {
	"type": "string",
	"description": "A list of daily weather variable aggregations which should be returned. Values can be comma separated, or multiple &daily= parameters in the URL can be used. If daily weather variables are specified, parameter timezone is required. Possible values supported temperature_2m_max, temperature_2m_min, apparent_temperature_max, apparent_temperature_min, precipitation_sum, rain_sum, showers_sum, snowfall_sum, precipitation_hours, ,precipitation_probability_max, precipitation_probability_min, precipitation_probability_mean, weather_code,sunrise,sunset,sunshine_duration, daylight_duration, wind_speed_10m_max, wind_gusts_10m_max, wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,uv_index_maxuv_index_clear_sky_max",
	"required": False,
	},
	"temperature_unit": {
	"type": "string",
	"description": "If fahrenheit is set, all temperature values are converted to Fahrenheit.",
	"required": False,
	"default": "celsius",
	},
	"wind_speed_unit": {
	"type": "string",
	"description": "Other wind speed units: ms, mph, and kn.",
	"required": False,
	"default": "kmh",
	},
	"precipitation_unit": {
	"type": "string",
	"description": "Other precipitation amount units: inch.",
	"required": False,
	"default": "mm",
	},
	"timeformat": {
	"type": "string",
	"description": "If format unixtime is selected, all time values are returned in UNIX epoch time in seconds. Please note that all timestamps are in GMT+0! For daily values with unix timestamps, please apply utc_offset_seconds again to get the correct date.",
	"required": False,
	"default": "iso8601",
	},
	"timezone": {
	"type": "string",
	"description": "If timezone is set, all timestamps are returned as local-time and data is returned starting at 00:00 local-time. Any time zone name from the time zone database is supported. If auto is set as a time zone, the coordinates will be automatically resolved to the local time zone. For multiple coordinates, a comma separated list of timezones can be specified.",
	"required": False,
	"default": "GMT",
	},
	"past_days": {
	"type": "integer",
	"description": "If past_days is set, yesterday or the day before yesterday data are also returned.",
	"required": False,
	"default": 0,
	},
	"forecast_days": {
	"type": "integer",
	"description": "Per default, only 7 days are returned. Up to 16 days of forecast are possible.",
	"required": False,
	"default": 7,
	},
	"forecast_hours": {
	"type": "integer",
	"description": "Similar to forecast_days, the number of timesteps of hourly data can be controlled.",
	"required": False,
	},
	"forecast_minutely_15": {
	"type": "integer",
	"description": "The number of timesteps of 15-minutely data can be controlled.",
	"required": False,
	},
	"past_hours": {
	"type": "integer",
	"description": "the number of timesteps of hourly data controlled",
	"required": False,
	},
	"past_minutely_15": {
	"type": "integer",
	"description": "the number of timesteps of 15 minute data controlled",
	"required": False,
	},
	"start_date": {
	"type": "string",
	"description": "The time interval to get weather data. A day must be specified as an ISO8601 date (e.g. 2022-06-30).",
	"required": False,
	},
	"end_date": {
	"type": "string",
	"description": "",
	"required": False,
	},
	"start_hour": {
	"type": "string",
	"description": "The time interval to get weather data for hourly data. Time must be specified as an ISO8601 date and time (e.g. 2022-06-30T12:00).",
	"required": False,
	},
	"end_hour": {
	"type": "string",
	"description": "",
	"required": False,
	},
	"start_minutely_15": {
	"type": "string",
	"description": "",
	"required": False,
	},
	"end_minutely_15": {
	"type": "string",
	"description": "",
	"required": False,
	},
	"models": {
	"type": "list",
	"items": {"type": "string"},
	"description": "A list of string, manually select one or more weather models. Per default, the best suitable weather models will be combined.",
	"required": False,
	},
	"cell_selection": {
	"type": "string",
	"description": "Set a preference how grid-cells are selected. The default land finds a suitable grid-cell on land with similar elevation to the requested coordinates using a 90-meter digital elevation model. sea prefers grid-cells on sea. nearest selects the nearest possible grid-cell.",
	"required": False,
	},
	"apikey": {
	"type": "string",
	"description": "Only required to commercial use to access reserved API resources for customers. The server URL requires the prefix customer-. See pricing for more information.",
	"required": False,
	},
	},
	"allow_redirects": {
	"type": "boolean",
	"description": "A Boolean to enable/disable redirection.",
	"default": True,
	"required": False,
	},
	"auth": {
	"type": "tuple",
	"description": "A tuple to enable a certain HTTP authentication.",
	"default": "None",
	"required": False,
	},
	"cert": {
	"type": ["string", "tuple"],
	"description": "A String or Tuple specifying a cert file or key.",
	"default": "None",
	"required": False,
	},
	"cookies": {
	"type": "dict",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of cookies to send with the request.",
	"required": False,
	},
	"proxies": {
	"type": "dict",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of the protocol to the proxy url.",
	"required": False,
	},
	"stream": {
	"type": "boolean",
	"description": "A Boolean indication if the response should be immediately downloaded (False) or streamed (True).",
	"default": False,
	"required": False,
	},
	"verify": {
	"type": ["boolean", "string"],
	"description": "A Boolean or a String indication to verify the servers TLS certificate or not.",
	"default": True,
	"required": False,
	},
	},
	},
	},
	]

	# Natural-language requests, one per entry of the EXAMPLES list defined just above
	# (same order: reverse geocoding, long weekends, weather forecast).
	# NOTE(review): PROMPTS is assigned again further down in this file, so this first
	# list looks unused at runtime — confirm before removing it.
	PROMPTS = [
	"Can you provide address for latitude 37.4224764 and longitude -122.0842499 using the Geocoding API?",
	"I'm planning a series of long weekend getaways for the upcoming year and I need to know when they'll occur in my country. Could you fetch me the list of long weekends for Canada in the year 2023? I'd like to integrate this information into my holiday planning app.",
	"I'm planning a camping trip and I need to know the weather forecast. Can you fetch me the weather data for the campsite located at latitude 35.68 and longitude -121.34 for the next 10 days including daily temperature and precipitation forecasts? Also, I prefer the temperature 2 minute max in Fahrenheit and sum of precipitation in inches.",
	]

	# Function descriptions offered as one-click examples; fill_example serialises
	# EXAMPLES[index] with json.dumps and pairs it with PROMPTS[index].
	# These entries use JSON-Schema style types ("object") and "required" lists.
	# NOTE(review): in every entry the final "required": ["url"] sits next to
	# "parameters" rather than inside it — confirm that placement is intended.
	EXAMPLES = [
	# Entry 0: COVID-19 statistics via RapidAPI (needs the two X-RapidAPI-* headers).
	{
	"name": "requests.get",
	"description": "Sends a GET request to the specified URL.",
	"parameters": {
	"type": "object",
	"properties": {
	"url": {
	"type": "string",
	"description": "Get statistics for all countries about COVID-19",
	"default": "https://covid-193.p.rapidapi.com/statistics",
	},
	"headers": {
	"properties": {
	"X-RapidAPI-Key": {
	"type": "string",
	"description": "The API key for authenticating requests to RapidAPI.",
	},
	"X-RapidAPI-Host": {
	"type": "string",
	"description": "The host domain for the RapidAPI service being accessed.",
	},
	},
	"type": "object",
	"required": ["X-RapidAPI-Key", "X-RapidAPI-Host"],
	},
	"timeout": {
	"type": "string",
	"description": "How many seconds to wait for the server to send data before giving up.",
	},
	"params": {
	"properties": {
	"country": {
	"type": "string",
	"description": "Name of the country to retrieve data for. Use '[All]' to indicate a global history request.",
	}
	},
	"type": "object",
	"required": [],
	},
	"allow_redirects": {
	"type": "boolean",
	"description": "A Boolean to enable/disable redirection.",
	"default": True,
	},
	"auth": {
	"type": "tuple",
	"description": "A tuple to enable a certain HTTP authentication.",
	"default": "None",
	},
	"cert": {
	"type": "string",
	"description": "A String or Tuple specifying a cert file or key.",
	"default": "None",
	},
	"cookies": {
	"type": "object",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of cookies to send with the request.",
	},
	"proxies": {
	"type": "object",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of the protocol to the proxy url.",
	},
	"stream": {
	"type": "boolean",
	"description": "A Boolean indication if the response should be immediately downloaded (False) or streamed (True).",
	"default": False,
	},
	"verify": {
	"type": "string",
	"description": "A Boolean or a String indication to verify the servers TLS certificate or not.",
	"default": True,
	},
	},
	},
	"required": ["url"],
	},
	# Entry 1: geocoding search (geocode.maps.co).
	# NOTE(review): the url description talks about address -> coordinates, while the
	# params below describe lat/lon inputs for reverse geocoding — confirm which is meant.
	{
	"name": "requests.get",
	"description": "Sends a GET request to the specified URL.",
	"parameters": {
	"type": "object",
	"properties": {
	"url": {
	"type": "string",
	"description": "Geocoding API converting a human-readable address into a pair of latitude and longitude coordinates",
	"default": "https://geocode.maps.co/search",
	},
	"headers": {"properties": {}, "type": "object", "required": []},
	"timeout": {
	"type": "string",
	"description": "How many seconds to wait for the server to send data before giving up.",
	},
	"params": {
	"properties": {
	"lat": {
	"type": "number",
	"description": "Latitude of the location to reverse geocode.",
	},
	"lon": {
	"type": "number",
	"description": "Longitude of the location to reverse geocode.",
	},
	"api_key": {
	"type": "string",
	"description": "Your API key for authentication.",
	},
	"format": {
	"type": "string",
	"description": "The desired response format. Options include 'xml', 'json', 'jsonv2', 'geojson', 'geocodejson'. Default is 'json'.",
	},
	},
	"type": "object",
	"required": ["lat", "lon", "api_key"],
	},
	"allow_redirects": {
	"type": "boolean",
	"description": "A Boolean to enable/disable redirection.",
	"default": True,
	},
	"auth": {
	"type": "tuple",
	"description": "A tuple to enable a certain HTTP authentication.",
	"default": "None",
	},
	"cert": {
	"type": "string",
	"description": "A String or Tuple specifying a cert file or key.",
	"default": "None",
	},
	"cookies": {
	"type": "object",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of cookies to send with the request.",
	},
	"proxies": {
	"type": "object",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of the protocol to the proxy url.",
	},
	"stream": {
	"type": "boolean",
	"description": "A Boolean indication if the response should be immediately downloaded (False) or streamed (True).",
	"default": False,
	},
	"verify": {
	"type": "string",
	"description": "A Boolean or a String indication to verify the servers TLS certificate or not.",
	"default": True,
	},
	},
	},
	"required": ["url"],
	},
	# Entry 2: movie lookup (OMDB API).
	{
	"name": "requests.get",
	"description": "Sends a GET request to the specified URL.",
	"parameters": {
	"type": "object",
	"properties": {
	"url": {
	"type": "string",
	"description": "Fetches the age rating of a movie from the OMDB API.",
	"default": "http://www.omdbapi.com/",
	},
	"headers": {"properties": {}, "type": "object", "required": []},
	"timeout": {
	"type": "string",
	"description": "How many seconds to wait for the server to send data before giving up.",
	},
	"params": {
	"properties": {
	"i": {
	"type": "string",
	"description": "A valid IMDb ID (e.g., tt1285016).",
	},
	"t": {
	"type": "string",
	"description": "Movie title to search for.",
	},
	"type": {
	"type": "string",
	"description": "Type of result to return. Valid options are 'movie', 'series', and 'episode'.",
	},
	"y": {"type": "string", "description": "Year of release."},
	"plot": {
	"type": "string",
	"description": "Return short or full plot. Default is 'short'.",
	},
	"r": {
	"type": "string",
	"description": "The data type to return. Default is 'json'.",
	},
	"callback": {
	"type": "string",
	"description": "JSONP callback name.",
	},
	"v": {
	"type": "integer",
	"description": "API version (reserved for future use). Default is 1.",
	},
	},
	"type": "object",
	"required": [],
	},
	"allow_redirects": {
	"type": "boolean",
	"description": "A Boolean to enable/disable redirection.",
	"default": True,
	},
	"auth": {
	"type": "tuple",
	"description": "A tuple to enable a certain HTTP authentication.",
	"default": "None",
	},
	"cert": {
	"type": "string",
	"description": "A String or Tuple specifying a cert file or key.",
	"default": "None",
	},
	"cookies": {
	"type": "object",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of cookies to send with the request.",
	},
	"proxies": {
	"type": "object",
	"additionalProperties": {"type": "string"},
	"description": "Dictionary of the protocol to the proxy url.",
	},
	"stream": {
	"type": "boolean",
	"description": "A Boolean indication if the response should be immediately downloaded (False) or streamed (True).",
	"default": False,
	},
	"verify": {
	"type": "string",
	"description": "A Boolean or a String indication to verify the servers TLS certificate or not.",
	"default": True,
	},
	},
	},
	"required": ["url"],
	},
	]

	# Natural-language requests shown with the examples: fill_example returns
	# PROMPTS[index] together with EXAMPLES[index], so both lists must stay in the
	# same order (COVID-19 statistics, geocoding, OMDB movie lookup).
	PROMPTS = [
	"While I'm working on a dashboard to display real-time COVID-19 statistics for Uganda, including total cases, recoveries, and deaths, I realized I need to use the API Sports COVID-19 API for accurate data. Given that I have my API key as '123456' and the host as 'covid-193.p.rapidapi.com', how can I fetch the latest statistics ensuring the request times out if it takes longer than 10 seconds? Also, how can I make sure the response is not streamed?",
	"Use my API key '123456', can you convert the address 'Soda Hall, Berkeley, CA' to latitude and longitude coordinates using our Geocoding API, and also make sure to return the results in GeoJSON format?",
	"I'm looking to fetch the full plot details for the movie 'Gorilla' from the OMDB API. Can you provide me with the Python requests.get code to retrieve the information in JSON format? I can provide the API key, it's '123456'.",
	]

	# Column headers of the detailed leaderboard table; get_leaderboard passes this
	# list as `columns=` when wrapping DATA in a DataFrame, so the order here has to
	# match the per-row order produced when DATA is built.
	COLUMNS = [
	"Rank",
	"Overall Acc",
	"Model",
	"Simple Function AST",
	"Multiple Functions AST",
	"Parallel Functions AST",
	"Parallel Multiple AST",
	"Simple Function Exec",
	"Multiple Functions Exec",
	"Parallel Functions Exec",
	"Parallel Multiple Exec",
	"Relevance Detection",
	"Cost ($ Per 1k Function Calls)",
	"Latency Mean (s)",
	"Latency Standard Deviation (s)",
	"Latency 95th Percentile (s)",
	"Organization",
	"License",
	]

	# Column headers of the condensed leaderboard table; get_summary passes this list
	# as `columns=` when wrapping DATA_SUMMARY in a DataFrame. "AST Summary" and
	# "Exec Summary" hold the rounded 4-way averages computed when DATA_SUMMARY is built.
	COLUMNS_SUMMARY = [
	"Rank",
	"Overall Acc",
	"Model",
	"AST Summary",
	"Exec Summary",
	"Relevance Detection",
	"Cost ($ Per 1k Function Calls)",
	"Latency Mean (s)",
	"Organization",
	"License",
	]


	def parse_csv(text):
	    """Parse leaderboard CSV text into a list of rows.

	    The first line is treated as a header and discarded. Every remaining
	    non-blank line is split on "," (no quoting support), each field is
	    converted with ``parse_value``, and eight columns are removed.

	    Blank lines (for example the empty string left by a trailing newline) are
	    skipped; previously they produced a one-field row and ``row.pop(3)`` raised
	    ``IndexError``. ``splitlines`` is used so "\\r\\n" files do not leave a
	    stray "\\r" on the last field.

	    Args:
	        text: Full contents of the CSV file as a string.

	    Returns:
	        A list of lists; numeric fields are floats, everything else is str.

	    Raises:
	        IndexError: If a data line has too few columns for the removals.
	        ValueError: If a field ends with "%" but is not numeric.
	    """
	    # Positions are popped one after another, so each index refers to the row
	    # as it looks after the earlier pops. Net effect: original columns
	    # 3, 6, 7, 9, 10, 11, 16 and 17 are dropped.
	    drop_sequence = (3, 5, 5, 6, 6, 6, 10, 10)

	    result = []
	    for line in text.splitlines()[1:]:
	        if not line.strip():
	            continue
	        row = [parse_value(value) for value in line.split(",")]
	        for position in drop_sequence:
	            row.pop(position)
	        result.append(row)
	    return result


	def parse_value(value):
	    """Convert one CSV field to a float when possible.

	    A trailing "%" is stripped before conversion (``"12.5%"`` -> ``12.5``).
	    Fields that are not numeric are returned unchanged.

	    Args:
	        value: Raw field text.

	    Returns:
	        A float for numeric or percentage fields, otherwise the original string.

	    Raises:
	        ValueError: If the field ends with "%" but the rest is not a number.
	    """
	    if value.endswith("%"):
	        return float(value[:-1])
	    try:
	        return float(value)
	    except ValueError:
	        # Not a number (model name, licence, ...): keep the text as is.
	        return value


	# Read the leaderboard CSV once, at import time.
	with open("./data.csv", "r") as file:
	    csv_text = file.read()

	# Parsed rows: three leading columns, two trailing-metadata columns (moved to
	# the end below), then the per-category scores and cost/latency figures.
	DATA = parse_csv(csv_text)

	# Condensed rows: positions 5-8 and 9-12 are each averaged and rounded to two
	# decimals; of the remaining figures only positions 13-15 are kept.
	DATA_SUMMARY = [
	    [
	        *row[:5],
	        round((row[5] + row[6] + row[7] + row[8]) / 4, 2),
	        round((row[9] + row[10] + row[11] + row[12]) / 4, 2),
	        *row[13:16],
	    ]
	    for row in DATA
	]

	# In both tables, move the two columns at positions 3-4 to the end of each row.
	DATA = [[*row[:3], *row[5:], *row[3:5]] for row in DATA]
	DATA_SUMMARY = [[*entry[:3], *entry[5:], *entry[3:5]] for entry in DATA_SUMMARY]

	# Choices offered in the model dropdown. distribute_task routes names containing
	# "gpt" to OpenAI and names containing "mistral" to Mistral; everything else goes
	# to the Gorilla endpoint. get_openai_response strips the last three characters
	# (the "-fc" suffix) before calling OpenAI.
	MODELS = [
	"gorilla-openfunctions-v2",
	"gpt-4-1106-preview-fc",
	"gpt-4-0125-preview-fc",
	"gpt-3.5-turbo-0125-fc",
	"mistral-large-fc",
	]


	def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
	    """Store one user vote in the "vote" collection of the feedback database.

	    Logs in with the "website" account to obtain a bearer token, then inserts
	    a document holding the prompt, function definition, model, temperature,
	    both outputs and the vote. Failures are printed, not raised, so the
	    calling UI handler still completes.

	    Args:
	        prompt: Prompt text the user submitted; nothing is sent when empty.
	        function: Function description text; nothing is sent when empty.
	        model: Name of the model that produced the outputs.
	        temperature: Sampling temperature used for the request.
	        codeOutput: Direct code output shown to the user.
	        jsonOutput: JSON output shown to the user.
	        vote: Value stored under "result" (the wrappers pass "positive"/"negative").

	    Returns:
	        None.
	    """
	    # Validate before any network traffic: an empty submission should not cost
	    # a login round-trip.
	    if not prompt or not function:
	        return

	    # Without a timeout `requests` waits indefinitely on an unresponsive server.
	    request_timeout = 30  # seconds

	    login_url = "https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login"
	    url = "https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/insertOne"
	    body = {
	        "collection": "vote",
	        "database": "gorilla-feedback",
	        "dataSource": "gorilla",
	        "document": {
	            "prompt": prompt,
	            "funcDef": function,
	            "temperature": temperature,
	            "model": model,
	            "codeOutput": codeOutput,
	            "jsonOutput": jsonOutput,
	            "result": vote,
	        },
	    }

	    try:
	        # Login and get access token
	        login_response = requests.post(
	            login_url,
	            headers={"Content-Type": "application/json"},
	            json={"username": "website", "password": mongoDBPassword},
	            timeout=request_timeout,
	        )
	        if not login_response.ok:
	            print("Error:", login_response.text)
	            return
	        access_token = login_response.json()["access_token"]

	        # Send feedback
	        response = requests.post(
	            url,
	            headers={
	                "Content-Type": "application/json",
	                "Access-Control-Request-Headers": "*",
	                "Authorization": f"Bearer {access_token}",
	            },
	            json=body,
	            timeout=request_timeout,
	        )
	    except (requests.RequestException, KeyError, ValueError) as error:
	        # Network failure, timeout, non-JSON login reply or missing token.
	        print("Error:", error)
	        return

	    if response.ok:
	        print("Document inserted:", response.json())
	    else:
	        print("Error:", response.text)


	def get_voting_result():
	    """Fetch all stored votes and summarise them per model.

	    Logs in with the "website" account, queries the "vote" collection and
	    counts the documents whose "result" is "positive" or "negative" for each
	    model. Accuracy is positive / (positive + negative).

	    Returns:
	        A pandas DataFrame with columns "Model", "Accuracy", "Positive" and
	        "Negative", sorted by accuracy in descending order. When the login or
	        the query fails, the error is printed and an empty list is returned.
	    """
	    # Without a timeout `requests` waits indefinitely on an unresponsive server.
	    request_timeout = 30  # seconds

	    login_url = "https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login"
	    url = "https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/find"
	    body = {
	        "collection": "vote",
	        "database": "gorilla-feedback",
	        "dataSource": "gorilla",
	    }

	    try:
	        login_response = requests.post(
	            login_url,
	            headers={"Content-Type": "application/json"},
	            json={"username": "website", "password": mongoDBPassword},
	            timeout=request_timeout,
	        )
	        if not login_response.ok:
	            print("Error:", login_response.text)
	            return []
	        access_token = login_response.json()["access_token"]

	        # Scanning the database
	        # NOTE(review): no "limit" is sent, so the number of documents returned
	        # depends on the service's default — confirm it covers all votes.
	        response = requests.post(
	            url,
	            headers={
	                "Content-Type": "application/json",
	                "Access-Control-Request-Headers": "*",
	                "Authorization": f"Bearer {access_token}",
	            },
	            json=body,
	            timeout=request_timeout,
	        )
	    except (requests.RequestException, KeyError, ValueError) as error:
	        # Network failure, timeout, non-JSON login reply or missing token.
	        print("Error:", error)
	        return []

	    if not response.ok:
	        print("Error:", response.text)
	        return []

	    # Tally votes per model; documents without a usable "result" are ignored.
	    model_votes = {}
	    for vote in response.json()["documents"]:
	        outcome = vote.get("result")
	        if outcome not in ("positive", "negative"):
	            continue
	        tally = model_votes.setdefault(vote["model"], {"positive": 0, "negative": 0})
	        tally[outcome] += 1

	    # Every tallied model has at least one vote, so the denominator is never 0.
	    result = [
	        [
	            model,
	            tally["positive"] / (tally["positive"] + tally["negative"]),
	            tally["positive"],
	            tally["negative"],
	        ]
	        for model, tally in model_votes.items()
	    ]
	    result.sort(key=lambda entry: entry[1], reverse=True)
	    return pd.DataFrame(result, columns=["Model", "Accuracy", "Positive", "Negative"])


	def send_feedback_negative(
	    prompt, function, model, temperature, codeOutput, jsonOutput
	):
	    """Record a "negative" vote for the given interaction.

	    Forwards all arguments to ``send_feedback`` with the vote set to
	    "negative" and returns the acknowledgement text for the UI.
	    """
	    vote = "negative"
	    send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote)
	    acknowledgement = (
	        "Thank you for your feedback. We will use this to improve our service."
	    )
	    return acknowledgement


	def send_feedback_positive(
	    prompt, function, model, temperature, codeOutput, jsonOutput
	):
	    """Record a "positive" vote for the given interaction.

	    Forwards all arguments to ``send_feedback`` with the vote set to
	    "positive" and returns the acknowledgement text for the UI.
	    """
	    vote = "positive"
	    send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote)
	    acknowledgement = (
	        "Thank you for your feedback. We will use this to improve our service."
	    )
	    return acknowledgement


	def report_issue(prompt, model, temperature, codeOutput, jsonOutput):
	    """Open a pre-filled GitHub "new issue" page for the Gorilla repository.

	    The issue body contains the prompt, model, temperature and both outputs.
	    Title and body are URL-encoded; the previous version interpolated them
	    raw (breaking the query on newlines, spaces and "&") and used
	    JavaScript-style "${...}" placeholders, which put a stray "$" in front of
	    both values.

	    Args:
	        prompt: Prompt text the user submitted.
	        model: Name of the model that produced the outputs.
	        temperature: Sampling temperature used for the request.
	        codeOutput: Direct code output (or error text) shown to the user.
	        jsonOutput: JSON output shown to the user.

	    Returns:
	        None.
	    """
	    from urllib.parse import urlencode

	    print("Reporting issue")
	    issueTitle = "[bug] OpenFunctions-v2: "
	    issueBody = f"Issue Description\n\nPrompt: {prompt}\n\nModel: {model}\n\nTemperature: {temperature}\n\nOutput (or Error if request failed): {codeOutput} \n\n {jsonOutput}\n\nAdditional Information\n"
	    query = urlencode(
	        {
	            "assignees": "",
	            "labels": "hosted-openfunctions-v2",
	            "projects": "",
	            "template": "hosted-openfunctions-v2.md",
	            "title": issueTitle,
	            "body": issueBody,
	        }
	    )
	    # NOTE(review): webbrowser opens the tab on the machine running this
	    # process, which for a hosted app is the server, not the visitor's browser.
	    webbrowser.open_new_tab(
	        f"https://github.com/ShishirPatil/gorilla/issues/new?{query}"
	    )


	def fill_example(index):
	    """Return the canned (prompt, function JSON, model) triple for one example.

	    ``index`` selects the matching entries of PROMPTS and EXAMPLES; the
	    function description is serialised with a two-space indent and the model
	    is always "gorilla-openfunctions-v2".
	    """
	    return (
	        PROMPTS[index],
	        json.dumps(EXAMPLES[index], indent=2),
	        "gorilla-openfunctions-v2",
	    )


	def cast_multi_param_type(properties):
	    """Coerce parameter types to ones OpenAI-style tool schemas accept.

	    OpenAI rejects parameter types other than JSON-serializable types. Since
	    our evaluation contains Python-specific types, we need some casting.

	    Rules applied to every property, in place:
	      * a missing "type" becomes "string";
	      * a string type is lower-cased and replaced by "string" when unsupported;
	      * a list of types (e.g. ["number", "tuple"]) collapses to its first
	        supported member, or "string" when none is supported — previously a
	        list raised AttributeError on ``.lower()``;
	      * an "array" always ends up with an "items" dict carrying a supported
	        "type" ("object" when items or its type is missing).

	    Args:
	        properties: Mapping of parameter name to its schema dict.

	    Returns:
	        The same ``properties`` mapping, modified in place.
	    """
	    supported = ("object", "string", "number", "boolean", "array", "integer")

	    def normalize(raw_type):
	        # Single type name: keep it if supported, otherwise fall back to "string".
	        if isinstance(raw_type, str):
	            lowered = raw_type.lower()
	            return lowered if lowered in supported else "string"
	        # Several alternatives: take the first one the target schema understands.
	        if isinstance(raw_type, (list, tuple)):
	            for candidate in raw_type:
	                if isinstance(candidate, str) and candidate.lower() in supported:
	                    return candidate.lower()
	        return "string"

	    for value in properties.values():
	        if "type" not in value:
	            value["type"] = "string"
	            continue

	        value["type"] = normalize(value["type"])
	        if value["type"] != "array":
	            continue

	        items = value.get("items")
	        if not isinstance(items, dict):
	            value["items"] = {"type": "object"}
	        elif "type" not in items:
	            items["type"] = "object"
	        else:
	            items["type"] = normalize(items["type"])
	    return properties


	def get_gorilla_response(prompt, function, model, temperature):
	    """Query the hosted Gorilla OpenFunctions endpoint.

	    Sends the prompt and the function description as a chat-completions style
	    request and reads the first choice of the reply.

	    Args:
	        prompt: User request in natural language.
	        function: Function description, forwarded unchanged inside "functions".
	        model: Model name sent to the endpoint.
	        temperature: Sampling temperature.

	    Returns:
	        A ``(direct_code, json_output)`` tuple: the message content and the
	        "function_call" entry pretty-printed as JSON. When the reply carries
	        no "function_call", the second element is the same failure message
	        the OpenAI and Mistral helpers return.

	    Raises:
	        requests.RequestException: On network errors, timeouts or HTTP error
	            statuses.
	    """
	    requestData = {
	        "model": model,
	        "messages": [{"role": "user", "content": prompt}],
	        "functions": [function],
	        "temperature": temperature,
	    }
	    url = "https://luigi.millennium.berkeley.edu:443/v1/chat/completions"
	    response = requests.post(
	        url,
	        headers={
	            "Content-Type": "application/json",
	            "Authorization": "EMPTY",  # Hosted for free with ❤️ from UC Berkeley
	        },
	        data=json.dumps(requestData),
	        # Without a timeout an unresponsive server blocks the UI handler forever.
	        timeout=120,
	    )
	    # Surface HTTP errors directly instead of a KeyError on the error payload.
	    response.raise_for_status()

	    message = response.json()["choices"][0]["message"]
	    directCode = message["content"]
	    jsonCode = message.get("function_call")
	    if jsonCode is None:
	        return directCode, "The model failed to return a JSON output."
	    return directCode, json.dumps(jsonCode, indent=2)


	def _build_tool_spec(function):
	    """Turn the JSON function description from the UI into an OpenAI-style tool entry."""
	    item = json.loads(function)

	    # OAI does not support "." in the function name so we replace it with "_".
	    # ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name.
	    item["name"] = item["name"].replace(".", "_")

	    parameters = item["parameters"]
	    # If typing is missing, we assume it is an object since OAI requires a type.
	    parameters["type"] = "object"

	    if "properties" not in parameters:
	        # Flat form: every key except "type"/"required" is itself a parameter,
	        # so move those keys under a new "properties" mapping.
	        # NOTE(review): the nesting of this step under the `if` was
	        # reconstructed from context — confirm against the original layout.
	        properties = {
	            key: spec
	            for key, spec in parameters.items()
	            if key not in ("type", "required")
	        }
	        for key in list(parameters):
	            if key not in ("type", "required"):
	                del parameters[key]
	        parameters["properties"] = properties

	    parameters["properties"] = cast_multi_param_type(parameters["properties"])
	    return {"type": "function", "function": item}


	def _format_tool_response(message):
	    """Map a chat message to the (direct_code, json_output) pair shown in the UI."""
	    try:
	        func_call = message.tool_calls[0]
	        name = func_call.function.name
	        arguments = func_call.function.arguments
	    except (AttributeError, IndexError, TypeError):
	        # No tool call in the reply: show the plain text answer instead.
	        return message.content, "The model failed to return a JSON output."

	    try:
	        arguments = json.loads(arguments)
	    except (TypeError, ValueError):
	        # Arguments are not valid JSON: keep the raw value so it is still shown.
	        pass
	    return "No direct code output for this model.", json.dumps(
	        {name: arguments}, indent=2
	    )


	def get_openai_response(prompt, function, model, temperature):
	    """Ask an OpenAI chat model to call the described function.

	    Args:
	        prompt: User request in natural language.
	        function: Function description as a JSON string.
	        model: Model name from the dropdown; a trailing "-fc" is removed
	            before the name is sent to OpenAI.
	        temperature: Sampling temperature.

	    Returns:
	        A ``(direct_code, json_output)`` tuple of strings. With a tool call,
	        the first element is a fixed notice and the second is
	        ``{function_name: arguments}`` as indented JSON. Without one, the
	        first element is the message content and the second a failure notice.
	    """
	    # Only strip the suffix when it is present, so other names pass through intact.
	    if model.endswith("-fc"):
	        model = model[: -len("-fc")]

	    client = OpenAI(api_key=openaiKey)
	    response = client.chat.completions.create(
	        messages=[{"role": "user", "content": "Questions: " + prompt}],
	        model=model,
	        temperature=temperature,
	        tools=[_build_tool_spec(function)],
	    )
	    return _format_tool_response(response.choices[0].message)


	def get_mistral_response(prompt, function, model, temperature):
	    """Ask Mistral's "mistral-large-latest" model to call the described function.

	    Args:
	        prompt: User request in natural language.
	        function: Function description as a JSON string.
	        model: Model name from the dropdown; not used, the request always
	            targets "mistral-large-latest".
	        temperature: Sampling temperature.

	    Returns:
	        A ``(direct_code, json_output)`` tuple of strings, in the same format
	        as ``get_openai_response``.
	    """
	    client = MistralClient(api_key=mistralKey)
	    chat_response = client.chat(
	        model="mistral-large-latest",
	        messages=[ChatMessage(role="user", content=prompt)],
	        tools=[_build_tool_spec(function)],
	        temperature=temperature,
	    )
	    return _format_tool_response(chat_response.choices[0].message)


	def distribute_task(prompt, function, model, temperature):
	    """Send the request to the backend selected by the model name.

	    A model name containing "gpt" goes to ``get_openai_response``, one
	    containing "mistral" goes to ``get_mistral_response``, and anything
	    else goes to ``get_gorilla_response``. All four arguments are passed
	    through unchanged and the backend's return value is returned as is.
	    """
	    if "gpt" in model:
	        backend = get_openai_response
	    elif "mistral" in model:
	        backend = get_mistral_response
	    else:
	        backend = get_gorilla_response
	    return backend(prompt, function, model, temperature)


	def get_leaderboard():
	    """Build the full leaderboard table, sorted by its "Rank" column.

	    Returns:
	        A pandas DataFrame made from ``DATA`` with ``COLUMNS`` as headers.
	    """
	    return pd.DataFrame(DATA, columns=COLUMNS).sort_values(by="Rank")


	def get_summary():
	    """Build the summary leaderboard table, sorted by its "Rank" column.

	    Returns:
	        A pandas DataFrame made from ``DATA_SUMMARY`` with
	        ``COLUMNS_SUMMARY`` as headers.
	    """
	    return pd.DataFrame(DATA_SUMMARY, columns=COLUMNS_SUMMARY).sort_values(
	        by="Rank"
	    )


	# Input widgets for the interactive demo. They are created here at module
	# level and placed into the layout later through their .render() calls.
	prompt = gr.Textbox(
	    label="Prompt",
	    placeholder="Type your prompt here...",
	    lines=4,
	)
	funcDescription = gr.Textbox(
	    label="Function Description",
	    placeholder="Describe the function...",
	    lines=20,
	)
	model = gr.Dropdown(
	    label="Model",
	    choices=MODELS,
	)


	# Assemble the Gradio UI: two leaderboard tabs, a glossary of evaluation
	# categories, and an interactive "Try It Out" tab.
	with gr.Blocks() as demo:
	with gr.Tabs():
	with gr.TabItem("Summary Leaderboard"):
	gr.Markdown(
	"This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html) and [code](https://github.com/ShishirPatil/gorilla)."
	)
	gr.Markdown(
	"""AST means evaluation through Abstract Syntax Tree and Exec means evaluation by executing all the API calls the LLM generates.

	FC = native support for function/tool calling.

	Cost is calculated as an estimate of the cost per 1000 function calls, in USD. Latency is measured in seconds.

	AST Summary is the unweighted average of the four test categories under AST Evaluation. Exec Summary is the unweighted average of the four test categories under Exec Evaluation.

	Click on column header to sort. If you would like to add your model or contribute test-cases, please contact us via [discord](https://discord.gg/SwTyuTAxX3).
	"""
	)
	# Summary table, populated from get_summary().
	leaderboard_data = gr.Dataframe(value=get_summary(), wrap=True)
	with gr.TabItem("Full Leaderboard"):
	gr.Markdown(
	"This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html) and [code](https://github.com/ShishirPatil/gorilla)."
	)
	gr.Markdown(
	"""AST means evaluation through Abstract Syntax Tree and Exec means evaluation by executing all the API calls the LLM generates.

	FC = native support for function/tool calling.

	Cost is calculated as an estimate of the cost per 1000 function calls, in USD. Latency is measured in seconds.

	Click on column header to sort. If you would like to add your model or contribute test-cases, please contact us via [discord](https://discord.gg/SwTyuTAxX3).
	"""
	)
	# Full table, populated from get_leaderboard(). This rebinds the
	# leaderboard_data name that was assigned for the summary table.
	leaderboard_data = gr.Dataframe(value=get_leaderboard(), wrap=True)

	with gr.TabItem("Evaluation Categories"):
	gr.Markdown(
	"""
	### What are the different columns representing in the leaderboard?

	We provide a short summary here. For more details, please refer to our release [blog](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html):

	AST means evaluation through Abstract Syntax Tree, and Exec means evaluation by executing all the API calls the LLM generates.

	Cost is calculated as an estimate of the cost per 1000 function calls, in USD.

	Latency is measured in seconds.

	Simple Function evaluation contains the simplest but most commonly seen format, where the user supplies a single JSON function document, with one and only one function call will be invoked.

	Multiple Function contains a user question that only invokes one function call out of 2 to 4 JSON function documentations. The model needs to be capable of selecting the best function to invoke according to user provided context. For example, if the prompt is `what is 2 + 3?` and the options are `add()` and `mult()`, the model should select `add()`.

	Parallel Function is defined as invoking multiple function calls in parallel with one user query. The model needs to digest how many function calls need to be made and the question to model can be a single sentence or multiple sentence. For example, if the prompt is `What's the weather in San Francisco and New York` and the function provided is `get_weather()`, the model should return both `get_weather('San Francisco')` and `get_weather('New York')`.

	Parallel Multiple Function is the combination of parallel function and multiple function. In another word, the model is provided with multiple function documentations, each of the corresponding function calls will be invoked zero or more times.

	In relevance detection, we design scenarios where none of the provided functions are relevant and supposed to be invoked. We expect the model's output to be no function call. This scenario provides insight to whether a model will hallucinate on its function and parameter to generate function code despite lacking the function information or instructions from the users to do so.
	"""
	)

	with gr.TabItem("Try It Out"):
	with gr.Row():
	with gr.Column(scale=1):
	with gr.Row():
	# Each example button calls fill_example(i) and writes the result into
	# the prompt, function-description and model inputs. Note that
	# example1..example3 are bound to the value returned by .click(), not
	# to the Button objects themselves.
	example1 = gr.Button("Example 1").click(
	fn=lambda: fill_example(0),
	outputs=[prompt, funcDescription, model],
	)
	example2 = gr.Button("Example 2").click(
	fn=lambda: fill_example(1),
	outputs=[prompt, funcDescription, model],
	)
	example3 = gr.Button("Example 3").click(
	fn=lambda: fill_example(2),
	outputs=[prompt, funcDescription, model],
	)

	with gr.Row():
	# model, prompt and funcDescription were created at module level
	# before this Blocks context; .render() places them in the layout.
	model.render()
	temperature = gr.Slider(
	label="Temperature",
	minimum=0.1,
	maximum=1.0,
	value=0.7,
	step=0.1,
	)
	prompt.render()
	funcDescription.render()
	submit_button = gr.Button("Submit")
	with gr.Column(scale=1):
	# Output boxes: these receive the two values returned by distribute_task.
	codeOutput = gr.Textbox(
	label="Code Output",
	placeholder="Code output will be displayed here...",
	lines=7,
	)
	jsonOutput = gr.Textbox(
	label="JSON Format (OpenAI compatible)",
	placeholder="JSON format will be displayed here...",
	lines=20,
	)
	with gr.Row():
	thumbs_up = gr.Button("👍")
	thumbs_down = gr.Button("👎")
	regenerate_button = gr.Button("Regenerate")
	report_issue_button = gr.Button("Report Issue")
	# Receives the return value of the feedback handlers wired up below.
	feedbackMsg = gr.Markdown()

	# Actions
	# Submit and Regenerate are wired identically: both run distribute_task
	# on the current inputs and fill the two output boxes.
	submit_button.click(
	fn=distribute_task,
	inputs=[prompt, funcDescription, model, temperature],
	outputs=[codeOutput, jsonOutput],
	)

	regenerate_button.click(
	fn=distribute_task,
	inputs=[prompt, funcDescription, model, temperature],
	outputs=[codeOutput, jsonOutput],
	)

	# fn=None: no Python handler is attached; only the js snippet is given,
	# which opens a pre-filled GitHub issue form in a new browser tab.
	# NOTE(review): the values are interpolated into the URL without
	# encodeURIComponent -- confirm that prompts or outputs containing
	# characters such as "&" or "#" still produce a complete issue body.
	report_issue_button.click(
	fn=None,
	inputs=[prompt, model, temperature, codeOutput, jsonOutput],
	outputs=[],
	js='(prompt, model, temperature, codeOutput, jsonOutput) => window.open(`https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-openfunctions-v2&projects=&template=hosted-openfunctions-v2.md&title=[bug] OpenFunctions-v2: &body=Issue Description%0A%0APrompt: ${prompt}%0A%0AModel: ${model}%0A%0ATemperature: ${temperature}%0A%0AOutput (or Error if request failed): ${codeOutput} %0A%0A ${jsonOutput}%0A%0AAdditional Information\n`, "_blank")',
	)

	# Both feedback buttons pass the four inputs and the two outputs to their
	# handler and display whatever it returns in feedbackMsg.
	thumbs_up.click(
	fn=send_feedback_positive,
	inputs=[
	prompt,
	funcDescription,
	model,
	temperature,
	codeOutput,
	jsonOutput,
	],
	outputs=[feedbackMsg],
	)

	thumbs_down.click(
	fn=send_feedback_negative,
	inputs=[
	prompt,
	funcDescription,
	model,
	temperature,
	codeOutput,
	jsonOutput,
	],
	outputs=[feedbackMsg],
	)

	# NOTE: the "Voting Leaderboard" tab below is currently disabled.
	# with gr.TabItem("Voting Leaderboard"):
	# gr.Markdown("## This is a live leaderboard where you can see user's voting result on the agent's response.")
	# leaderboard_data = gr.Dataframe(
	# value=get_voting_result(), wrap=True
	# )

	# Launch the app.
	demo.launch()