|
|
|
import pandas as pd |
|
import json |
|
|
|
|
|
# Load the raw model records that the rest of the script flattens.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly rather
# than left to the platform's locale default (which differs across systems).
with open('src/combined_data.json', encoding='utf-8') as f:
    data = json.load(f)
|
|
|
|
|
def _flatten_entry(entry):
    """Return one flat row (a dict) built from a nested model record.

    Required sub-dicts ("pricing", "multimodality", "license", "parameters")
    are indexed directly, so a record missing any of them raises KeyError.
    "additional_prices" inside "pricing" is optional; each of its missing
    fields becomes None.
    """
    pricing = entry["pricing"]
    modalities = entry["multimodality"]
    license_info = entry["license"]
    parameters = entry["parameters"]
    extra_prices = pricing.get("additional_prices", {})

    row = {
        "model_name": entry["model_name"],
        "input_price": pricing["input_price"],
        "output_price": pricing["output_price"],
        "multimodality_image": modalities["image"],
        "multimodality_multiple_image": modalities["multiple_image"],
        "multimodality_audio": modalities["audio"],
        "multimodality_video": modalities["video"],
        "source": pricing["source"],
        "license_name": license_info["name"],
        "license_url": license_info["url"],
        # The list of languages is stored as one ", "-separated string.
        "languages": ", ".join(entry["languages"]),
        "release_date": entry["release_date"],
        "parameter_size": parameters["size"],
        "estimated": parameters["estimated"],
        "open_weight": entry["open_weight"],
        "context_size": entry["context_size"],
    }

    # Optional per-modality prices, appended in a fixed column order.
    for price_key in (
        "context_caching",
        "context_storage",
        "image_input",
        "image_output",
        "video_input",
        "video_output",
        "audio_input",
        "audio_output",
    ):
        row[f"additional_prices_{price_key}"] = extra_prices.get(price_key, None)

    return row


# One flat row per record in `data`, then one DataFrame column per field.
flattened_data = [_flatten_entry(record) for record in data]

df = pd.DataFrame(flattened_data)
|
|
|
|
|
# Pattern for the temperature suffix found in the results files' model names.
# pandas treats a multi-character `str.split` pattern as a regular expression,
# so the dot is escaped to match a literal "." instead of any character.
_TEMPERATURE_SUFFIX = r'-t0\.0'


def _load_clemscores(csv_path):
    """Read a header-less results CSV and return ``(frame, lookup)``.

    Column 0 is truncated at the first temperature suffix so it can be
    matched against ``df['model_name']``; ``lookup`` maps that truncated
    column 0 to column 1.
    """
    results = pd.read_csv(csv_path, header=None)
    results[0] = results[0].str.split(_TEMPERATURE_SUFFIX).str[0]
    return results, dict(zip(results[0], results[1]))


results_1_6_5_multimodal, clemscore_map_1_6_5_multimodal = _load_clemscores(
    'src/results_1.6.5_multimodal.csv'
)
results_1_6_5_ascii, clemscore_map_1_6_5_ascii = _load_clemscores(
    'src/results_1.6.5_ascii.csv'
)
results_1_6, clemscore_map_1_6 = _load_clemscores('src/results_1.6.csv')

# Attach one clemscore column per benchmark version. Models that have no entry
# in a given results file get 0 in that column.
df['clemscore_v1.6.5_multimodal'] = (
    df['model_name'].map(clemscore_map_1_6_5_multimodal).fillna(0).astype(float)
)
df['clemscore_v1.6.5_ascii'] = (
    df['model_name'].map(clemscore_map_1_6_5_ascii).fillna(0).astype(float)
)
df['clemscore_v1.6'] = (
    df['model_name'].map(clemscore_map_1_6).fillna(0).astype(float)
)
|
|
|
|
|
def _read_latency_table(csv_path):
    """Read a header-less latency CSV and return ``(frame, lookup)``.

    ``lookup`` maps column 0 to column 1.
    NOTE(review): unlike the clemscore files, column 0 is used as-is here
    (no temperature-suffix stripping) - confirm that matches the file format.
    """
    table = pd.read_csv(csv_path, header=None)
    return table, dict(zip(table[0], table[1]))


latency_1_6, latency_map_1_6 = _read_latency_table('src/v1.6_latency.csv')
latency_1_6_5_ascii, latency_map_1_6_5_ascii = _read_latency_table(
    'src/v1.6.5_ascii_latency.csv'
)
latency_1_6_5_multimodal, latency_map_1_6_5_multimodal = _read_latency_table(
    'src/v1.6.5_multimodal_latency.csv'
)

# One latency column per benchmark version; models absent from a file get 0.
for latency_column, latency_lookup in (
    ('latency_v1.6', latency_map_1_6),
    ('latency_v1.6.5_multimodal', latency_map_1_6_5_multimodal),
    ('latency_v1.6.5_ascii', latency_map_1_6_5_ascii),
):
    df[latency_column] = (
        df['model_name'].map(latency_lookup).fillna(0).astype(float)
    )
|
|
|
|
|
|
|
# Row-wise means over the three benchmark versions, rounded to 3 decimals.
# The per-version columns hold 0 where a model had no result, and those zeros
# are included in the mean.
_clemscore_columns = [
    'clemscore_v1.6.5_multimodal',
    'clemscore_v1.6.5_ascii',
    'clemscore_v1.6',
]
_latency_columns = [
    'latency_v1.6',
    'latency_v1.6.5_multimodal',
    'latency_v1.6.5_ascii',
]

_mean_clemscore = df[_clemscore_columns].mean(axis=1)
df['average_clemscore'] = _mean_clemscore.round(3)

_mean_latency = df[_latency_columns].mean(axis=1)
df['average_latency'] = _mean_latency.round(3)
|
|
|
|
|
|
|
|
|
additional_price_columns = [
    'additional_prices_context_caching',
    'additional_prices_context_storage',
    'additional_prices_image_input',
    'additional_prices_image_output',
    'additional_prices_video_input',
    'additional_prices_video_output',
    'additional_prices_audio_input',
    'additional_prices_audio_output',
]

# Regex replacements applied to every price column before the float cast.
# The pattern is a raw string: a plain '\$' is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions.
# NOTE(review): the '' -> None entry is presumably meant to turn empty strings
# into missing values - confirm against the installed pandas version.
_PRICE_REPLACEMENTS = {r'\$': '', '': None}

# Drop "$" characters, cast to float, and round to 3 decimals.
for col in ['input_price', 'output_price', *additional_price_columns]:
    df[col] = df[col].replace(_PRICE_REPLACEMENTS, regex=True).astype(float).round(3)

# Remove every "k" so the value can be cast to an integer; the column is later
# exported under the header 'Context Size (k)'.
df['context_size'] = df['context_size'].replace({'k': ''}, regex=True).astype(int)

# Remove every "B" and cast to float; the column is later exported under the
# header 'Parameter Size (B)'.
df['parameter_size'] = df['parameter_size'].replace({'B': '', '': None}, regex=True).astype(float)
|
|
|
# Two-letter language code -> English language name.
# Codes that are not listed here are left unchanged by the lookup that uses
# this table (it falls back to the code itself).
LANG_MAPPING = {
    "el": "Greek", "id": "Indonesian", "ko": "Korean", "sv": "Swedish",
    "de": "German", "lv": "Latvian", "am": "Amharic", "fi": "Finnish",
    "da": "Danish", "pt": "Portuguese", "sw": "Swahili", "es": "Spanish",
    "it": "Italian", "bn": "Bengali", "nl": "Dutch", "lt": "Lithuanian",
    "ro": "Romanian", "sl": "Slovenian", "hu": "Hungarian", "hr": "Croatian",
    "vi": "Vietnamese", "hi": "Hindi", "zh": "Chinese", "pl": "Polish",
    "ar": "Arabic", "cs": "Czech", "sk": "Slovak", "ja": "Japanese",
    "no": "Norwegian", "uk": "Ukrainian", "fr": "French", "et": "Estonian",
    "ru": "Russian", "th": "Thai", "bg": "Bulgarian", "tr": "Turkish",
    "ms": "Malay", "he": "Hebrew", "tl": "Tagalog", "sr": "Serbian",
    "en": "English",
}
|
|
|
def _spell_out_languages(codes):
    """Replace each ', '-separated code in `codes` with its LANG_MAPPING name.

    Codes without an entry in LANG_MAPPING are kept as they are.
    """
    return ', '.join(LANG_MAPPING.get(code, code) for code in codes.split(', '))


df['languages'] = df['languages'].apply(_spell_out_languages)
|
|
|
|
|
# Ordered mapping from internal column name to exported header.
# The keys select (and order) the columns that are kept; every other column
# of `df` is dropped. The values are the headers used from here on.
_DISPLAY_COLUMNS = {
    'model_name': 'Model Name',
    'input_price': 'Input $/1M',
    'output_price': 'Output $/1M',
    'multimodality_image': 'Multimodality Image',
    'multimodality_multiple_image': 'Multimodality Multiple Image',
    'multimodality_audio': 'Multimodality Audio',
    'multimodality_video': 'Multimodality Video',
    'source': 'Source',
    'license_name': 'License Name',
    'license_url': 'License',
    'languages': 'Languages',
    'release_date': 'Release Date',
    'open_weight': 'Open Weight',
    'context_size': 'Context Size (k)',
    'average_clemscore': 'Average Clemscore',
    'average_latency': 'Average Latency (s)',
    'parameter_size': 'Parameter Size (B)',
    'estimated': 'Estimated',
}

_kept_columns = list(_DISPLAY_COLUMNS)
df = df[_kept_columns].rename(columns=_DISPLAY_COLUMNS)
|
|
|
def _license_link(row):
    """Return an HTML anchor: the license name linking to the license URL."""
    return f'<a href="{row["License"]}" style="color: blue;">{row["License Name"]}</a>'


def _model_link(row):
    """Return an HTML anchor: the model name linking to the row's Source."""
    return f'<a href="{row["Source"]}" style="color: blue;">{row["Model Name"]}</a>'


# Replace the plain-text cells with HTML anchors. 'License' is rebuilt first
# and still reads the untouched 'License Name' column.
df['License'] = df.apply(_license_link, axis=1)
df['Model Name'] = df.apply(_model_link, axis=1)

# Duplicate of the release date under a second header.
# NOTE(review): presumably consumed by whatever reads the CSV - confirm.
df['Temp Date'] = df['Release Date']

print(df)

# Write the final table without the index column.
df.to_csv('src/main_df.csv', index=False)
|
|