# ... existing code ...
import json

import pandas as pd

# Sub-keys of entry["pricing"]["additional_prices"]; each one becomes an
# "additional_prices_<key>" column in the flattened table.
ADDITIONAL_PRICE_KEYS = [
    'context_caching',
    'context_storage',
    'image_input',
    'image_output',
    'video_input',
    'video_output',
    'audio_input',
    'audio_output',
]
additional_price_columns = [
    f'additional_prices_{key}' for key in ADDITIONAL_PRICE_KEYS
]

# Replacement table applied to price strings before casting to float: strips
# the "$" sign; the '' -> None entry is presumably there so blank prices end
# up as NaN instead of failing the cast.
PRICE_CLEANUP = {r'\$': '', '': None}

# Model names in the results CSVs are cut at this suffix. The dot is escaped
# because pandas treats a multi-character split pattern as a regular
# expression, where a bare "." would match any character.
TEMPERATURE_SUFFIX_PATTERN = r'-t0\.0'

LANG_MAPPING = {
    'el': 'Greek',
    'id': 'Indonesian',
    'ko': 'Korean',
    'sv': 'Swedish',
    'de': 'German',
    'lv': 'Latvian',
    'am': 'Amharic',
    'fi': 'Finnish',
    'da': 'Danish',
    'pt': 'Portuguese',
    'sw': 'Swahili',
    'es': 'Spanish',
    'it': 'Italian',
    'bn': 'Bengali',
    'nl': 'Dutch',
    'lt': 'Lithuanian',
    'ro': 'Romanian',
    'sl': 'Slovenian',
    'hu': 'Hungarian',
    'hr': 'Croatian',
    'vi': 'Vietnamese',
    'hi': 'Hindi',
    'zh': 'Chinese',
    'pl': 'Polish',
    'ar': 'Arabic',
    'cs': 'Czech',
    'sk': 'Slovak',
    'ja': 'Japanese',
    'no': 'Norwegian',
    'uk': 'Ukrainian',
    'fr': 'French',
    'et': 'Estonian',
    'ru': 'Russian',
    'th': 'Thai',
    'bg': 'Bulgarian',
    'tr': 'Turkish',
    'ms': 'Malay',
    'he': 'Hebrew',
    'tl': 'Tagalog',
    'sr': 'Serbian',
    'en': 'English',
}


def _flatten_entry(entry: dict) -> dict:
    """Return one flat row for a single model record of the JSON file.

    Nested objects ("pricing", "multimodality", "license", "parameters") are
    spread into prefixed top-level keys and the "languages" list is joined
    into a single comma-separated string.

    Raises:
        KeyError: if a required key is missing from the record.
    """
    pricing = entry["pricing"]
    multimodality = entry["multimodality"]
    # Tolerate a record whose "additional_prices" is absent or null.
    extra_prices = pricing.get("additional_prices") or {}

    row = {
        "model_name": entry["model_name"],
        "input_price": pricing["input_price"],
        "output_price": pricing["output_price"],
        "multimodality_image": multimodality["image"],
        "multimodality_multiple_image": multimodality["multiple_image"],
        "multimodality_audio": multimodality["audio"],
        "multimodality_video": multimodality["video"],
        "source": pricing["source"],
        "license_name": entry["license"]["name"],
        "license_url": entry["license"]["url"],
        "languages": ", ".join(entry["languages"]),
        "release_date": entry["release_date"],
        "parameter_size": entry["parameters"]["size"],
        "estimated": entry["parameters"]["estimated"],
        "open_weight": entry["open_weight"],
        "context_size": entry["context_size"],
    }
    # Additional prices are optional; absent ones are stored as None.
    for key in ADDITIONAL_PRICE_KEYS:
        row[f"additional_prices_{key}"] = extra_prices.get(key)
    return row


def _read_model_table(path: str, strip_suffix_pattern=None) -> pd.DataFrame:
    """Read a header-less CSV and optionally truncate the names in column 0.

    When ``strip_suffix_pattern`` is given, every value in column 0 is cut at
    the first match of that pattern and only the leading part is kept.
    """
    table = pd.read_csv(path, header=None)
    if strip_suffix_pattern is not None:
        table[0] = table[0].str.split(strip_suffix_pattern).str[0]
    return table


def _first_two_columns_as_map(table: pd.DataFrame) -> dict:
    """Return a dict mapping column 0 values to column 1 values."""
    return dict(zip(table[0], table[1]))


def _lookup_or_zero(model_names: pd.Series, mapping: dict) -> pd.Series:
    """Map model names through ``mapping``; unmatched names become 0.0."""
    return model_names.map(mapping).fillna(0).astype(float)


def _translate_languages(codes: str) -> str:
    """Replace each ', '-separated code with its LANG_MAPPING name.

    Codes without an entry in LANG_MAPPING are kept unchanged.
    """
    return ', '.join(LANG_MAPPING.get(code, code) for code in codes.split(', '))


# Load the JSON data
with open('src/combined_data.json', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the data
flattened_data = [_flatten_entry(entry) for entry in data]

# Create a DataFrame
df = pd.DataFrame(flattened_data)

# Load the results CSV files; model names are cut at the '-t0.0' suffix so
# they can be matched against 'model_name'.
results_1_6_5_multimodal = _read_model_table(
    'src/results_1.6.5_multimodal.csv', TEMPERATURE_SUFFIX_PATTERN
)
results_1_6_5_ascii = _read_model_table(
    'src/results_1.6.5_ascii.csv', TEMPERATURE_SUFFIX_PATTERN
)
results_1_6 = _read_model_table('src/results_1.6.csv', TEMPERATURE_SUFFIX_PATTERN)

# Create a mapping for clemscore values
clemscore_map_1_6_5_multimodal = _first_two_columns_as_map(results_1_6_5_multimodal)
clemscore_map_1_6_5_ascii = _first_two_columns_as_map(results_1_6_5_ascii)
clemscore_map_1_6 = _first_two_columns_as_map(results_1_6)

# Add clemscore columns to the main DataFrame
df['clemscore_v1.6.5_multimodal'] = _lookup_or_zero(
    df['model_name'], clemscore_map_1_6_5_multimodal
)
df['clemscore_v1.6.5_ascii'] = _lookup_or_zero(
    df['model_name'], clemscore_map_1_6_5_ascii
)
df['clemscore_v1.6'] = _lookup_or_zero(df['model_name'], clemscore_map_1_6)

# Load the latency CSV files (names in these files are used as-is)
latency_1_6 = _read_model_table('src/v1.6_latency.csv')
latency_1_6_5_ascii = _read_model_table('src/v1.6.5_ascii_latency.csv')
latency_1_6_5_multimodal = _read_model_table('src/v1.6.5_multimodal_latency.csv')

# Create a mapping for latency values
latency_map_1_6 = _first_two_columns_as_map(latency_1_6)
latency_map_1_6_5_ascii = _first_two_columns_as_map(latency_1_6_5_ascii)
latency_map_1_6_5_multimodal = _first_two_columns_as_map(latency_1_6_5_multimodal)

# Add latency columns to the main DataFrame
df['latency_v1.6'] = _lookup_or_zero(df['model_name'], latency_map_1_6)
df['latency_v1.6.5_multimodal'] = _lookup_or_zero(
    df['model_name'], latency_map_1_6_5_multimodal
)
df['latency_v1.6.5_ascii'] = _lookup_or_zero(
    df['model_name'], latency_map_1_6_5_ascii
)

# Calculate average latency and clemscore
# NOTE(review): models missing from a results/latency file contribute 0 to
# these means (see the fillna(0) in _lookup_or_zero) -- confirm that this is
# the intended treatment of missing benchmarks.
df['average_clemscore'] = df[
    ['clemscore_v1.6.5_multimodal', 'clemscore_v1.6.5_ascii', 'clemscore_v1.6']
].mean(axis=1).round(3)
df['average_latency'] = df[
    ['latency_v1.6', 'latency_v1.6.5_multimodal', 'latency_v1.6.5_ascii']
].mean(axis=1).round(3)

# More clean up
# Clean and convert prices (base and additional) to float
for col in ['input_price', 'output_price', *additional_price_columns]:
    df[col] = df[col].replace(PRICE_CLEANUP, regex=True).astype(float).round(3)

# Clean and convert context to integer (drops the 'k' unit marker)
df['context_size'] = df['context_size'].replace({'k': ''}, regex=True).astype(int)

# Clean and convert parameter size to float (drops the 'B' unit marker)
df['parameter_size'] = (
    df['parameter_size'].replace({'B': '', '': None}, regex=True).astype(float)
)

# Expand language codes to display names
df['languages'] = df['languages'].apply(_translate_languages)

# Keep only the specified columns
df = df[[
    'model_name',
    'input_price',
    'output_price',
    'multimodality_image',
    'multimodality_multiple_image',
    'multimodality_audio',
    'multimodality_video',
    'source',
    'license_name',
    'license_url',
    'languages',
    'release_date',
    'open_weight',
    'context_size',
    'average_clemscore',
    'average_latency',
    'parameter_size',
    'estimated',
]]

df = df.rename(columns={
    'model_name': 'Model Name',
    'input_price': 'Input $/1M',
    'output_price': 'Output $/1M',
    'multimodality_image': 'Multimodality Image',
    'multimodality_multiple_image': 'Multimodality Multiple Image',
    'multimodality_audio': 'Multimodality Audio',
    'multimodality_video': 'Multimodality Video',
    'source': 'Source',
    'license_name': 'License Name',
    'license_url': 'License',
    'languages': 'Languages',
    'release_date': 'Release Date',
    'open_weight': 'Open Weight',
    'context_size': 'Context Size (k)',
    'average_clemscore': 'Average Clemscore',
    'average_latency': 'Average Latency (s)',
    'parameter_size': 'Parameter Size (B)',
    'estimated': 'Estimated',
})

# NOTE(review): 'License' held the license URL up to this point; this line
# overwrites it with the license name text, so the URL is not written to the
# output -- confirm this is intended.
df['License'] = df.apply(lambda row: f'{row["License Name"]}', axis=1)
df['Model Name'] = df.apply(lambda row: f'{row["Model Name"]}', axis=1)
df['Temp Date'] = df['Release Date']

print(df)

# Save to CSV
df.to_csv('src/main_df.csv', index=False)