# app.py
"""Streamlit page that shows leaderboard charts stored under ./data.

Every sub-directory of ./data is named after its generation timestamp
(``%Y%m%d_%H%M``) and holds ``*.png`` charts; each chart may have a sibling
``*.json`` file containing the list of model ids shown in that chart.
"""
import glob
import html
import json
import os
from datetime import datetime

import streamlit as st

DATA_ROOT = "./data"
DIR_DATE_FORMAT = "%Y%m%d_%H%M"
MAX_MODEL_ID_LEN = 35

# (file-name keyword, section title) for each leaderboard, in display order.
SECTIONS = [
    ("hf_llm_diagram", "HuggingFace Open LLM leaderboard by Model Size"),
    ("bigcode", "Big Code Models Leaderboard"),
    ("lmsys_leaderboard_mt_bench", "MT-Bench Models Leaderboard"),
    ("lmsys_leaderboard_arena", "LMSYS Arena Elo Models Leaderboard"),
    ("opencompass_leaderboard", "OpenCompass Models Leaderboard"),
]

captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare",
}


def format_dir_date(data_dir):
    """Turn a ``%Y%m%d_%H%M`` directory name into e.g. ``Nov 05, 2023 14:30``.

    Raises:
        ValueError: if ``data_dir`` does not match the expected format.
    """
    parsed_date = datetime.strptime(data_dir, DIR_DATE_FORMAT)
    return parsed_date.strftime("%b %d, %Y %H:%M")


def list_data_dirs(root):
    """Return the timestamp-named sub-directories of ``root``, oldest first.

    ``os.listdir`` returns entries in arbitrary order and may include stray
    files, so entries are filtered to directories whose name parses as a
    timestamp and then sorted (lexicographic order is chronological for
    this zero-padded format).
    """
    valid = []
    for name in os.listdir(root):
        if not os.path.isdir(os.path.join(root, name)):
            continue
        try:
            datetime.strptime(name, DIR_DATE_FORMAT)
        except ValueError:
            continue
        valid.append(name)
    return sorted(valid)


def _truncate_model_id(model_id):
    """Keep the last MAX_MODEL_ID_LEN characters of over-long ids."""
    if len(model_id) <= MAX_MODEL_ID_LEN:
        return model_id
    return '...' + model_id[-MAX_MODEL_ID_LEN:]


def _as_html_list(model_ids):
    """Render model ids as an HTML bullet list, escaping each id."""
    # NOTE(review): the <ul>/<li> markup is reconstructed; confirm it matches
    # the intended page styling.
    items = "".join(
        f"<li>{html.escape(_truncate_model_id(model_id))}</li>"
        for model_id in model_ids
    )
    return f"<ul>{items}</ul>"


def print_model_list(file_name, st, split_into_two=False):
    """Write the model list that accompanies a chart image.

    Args:
        file_name: path of the chart image; the list is read from the file
            with the same name and a ``.json`` extension.
        st: the Streamlit module or a container/column to write into
            (intentionally shadows the module name).
        split_into_two: when true, the list is split at its midpoint and
            written into two new columns created on ``st``.

    Raises:
        OSError: if the JSON file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    file_path = os.path.splitext(file_name)[0] + '.json'
    with open(file_path, 'r', encoding='utf-8') as file:
        model_ids = json.load(file)

    if not split_into_two:
        st.write(_as_html_list(model_ids), unsafe_allow_html=True)
        return

    # The left column receives the extra element when the count is odd.
    midpoint = len(model_ids) // 2 + (len(model_ids) % 2)
    cols = st.columns(2)
    cols[0].write(_as_html_list(model_ids[:midpoint]), unsafe_allow_html=True)
    cols[1].write(_as_html_list(model_ids[midpoint:]), unsafe_allow_html=True)


def extract_images(keyword, img_list):
    """Return the images whose file name contains ``keyword``."""
    return [img for img in img_list if keyword in os.path.basename(img)]


def load_images(data_path):
    """Collect the charts of one data directory.

    Returns:
        ``(groups, remaining)`` where ``groups`` maps each SECTIONS keyword
        to its sorted image paths and ``remaining`` is the sorted list of
        images that belong to no section.
    """
    # glob order is arbitrary; sort so positional indexing and pairing between
    # two directories are deterministic.
    imgs = sorted(glob.glob(os.path.join(data_path, '*.png')))
    groups = {keyword: extract_images(keyword, imgs) for keyword, _ in SECTIONS}
    categorized = {img for group in groups.values() for img in group}
    remaining = [img for img in imgs if img not in categorized]
    return groups, remaining


def display_side_by_side(diagrams1, diagrams2, title):
    """Show two chart lists pairwise in two columns under ``title``."""
    st.subheader(title, divider=True)
    for d1, d2 in zip(diagrams1, diagrams2):
        cols = st.columns(2)
        cols[0].image(d1, use_column_width="auto")
        cols[1].image(d2, use_column_width="auto")


def show_chart(diagrams, index, image_target, list_target,
               caption=None, split_into_two=False):
    """Show ``diagrams[index]`` and its model list; skip if it is missing."""
    if index >= len(diagrams):
        return
    chart = diagrams[index]
    image_target.image(chart, caption=caption, use_column_width="auto")
    print_model_list(chart, list_target, split_into_two)


st.set_page_config(layout="wide")
st.title('Meta Open LLM leaderboard')
st.write("Combine data from various open LLM leaderboards into one useful visualization page")
# NOTE(review): this and the later empty writes emit an empty string;
# presumably spacer markup is missing here - confirm.
st.write("", unsafe_allow_html=True)

directories = list_data_dirs(DATA_ROOT)
if not directories:
    st.error("No data directories found.")
    st.stop()

col1, col2 = st.columns(2)
with col1:
    data_dir = st.selectbox(
        'Select different data generation date',
        directories,
        format_func=format_dir_date,
        index=len(directories) - 1,  # newest, since directories is sorted
    )
with col2:
    compare_mode = st.checkbox('Enable compare to different date')
    if compare_mode:
        compare_data_dir = st.selectbox(
            'Select date for comparison',
            directories,
            format_func=format_dir_date,
            index=len(directories) - 1,
        )

with col1:
    st.write("Generated on: " + format_dir_date(data_dir), unsafe_allow_html=True)

data_path = './data/' + data_dir
groups, remaining_imgs = load_images(data_path)

if compare_mode:
    # Side by side compare of the two selected dates.
    compare_data_path = './data/' + compare_data_dir
    compare_groups, compare_remaining_imgs = load_images(compare_data_path)

    for keyword, title in SECTIONS:
        display_side_by_side(groups[keyword], compare_groups[keyword], title)

    st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
    st.caption("Only models evaluated on both leaderboards are included.")

    for img, compare_img in zip(remaining_imgs, compare_remaining_imgs):
        cols = st.columns(2)
        caption = captions_map.get(os.path.basename(img), "")
        compare_caption = captions_map.get(os.path.basename(compare_img), "")
        cols[0].image(img, caption=caption, width=None)
        cols[1].image(compare_img, caption=compare_caption, width=None)
else:
    # NOTE(review): the captions below assume the sorted file-name order of the
    # hf_llm_diagram images matches this sequence - confirm against the data.
    hf_llm_diagrams = groups["hf_llm_diagram"]

    st.subheader("HuggingFace Open LLM leaderboard by Model Size", divider=True)
    cols = st.columns(2)
    show_chart(hf_llm_diagrams, 0, cols[0], st,
               caption="Main chart using all the models", split_into_two=True)

    st.write("", unsafe_allow_html=True)
    cols = st.columns(2)
    show_chart(hf_llm_diagrams, 1, cols[0], cols[0],
               caption="Other or commercially permissive licenses only")
    show_chart(hf_llm_diagrams, 2, cols[1], cols[1],
               caption="Commercially permissive license only")

    st.write("", unsafe_allow_html=True)
    cols = st.columns(2)
    show_chart(hf_llm_diagrams, 3, cols[0], cols[0],
               caption="TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size")
    show_chart(hf_llm_diagrams, 4, cols[1], cols[1],
               caption="ARC at 50% and MMLU at 50% for HuggingFace Open LLM leaderboard by Model Size")

    # The other leaderboards each show one chart with a two-column model list.
    for keyword, title in SECTIONS[1:]:
        st.subheader(title, divider=True)
        cols = st.columns(2)
        show_chart(groups[keyword], 0, cols[0], st, split_into_two=True)

    st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
    st.caption("Only models evaluated on both leaderboards are included.")
    cols = st.columns(2)
    for i, img in enumerate(remaining_imgs):
        # Unknown file names fall back to an empty caption.
        caption = captions_map.get(os.path.basename(img), "")
        cols[i % 2].image(img, caption=caption, width=None)

# NOTE(review): nothing follows "Leaderboards tracked:" in this string;
# presumably a list of leaderboards belongs here - confirm.
st.write(
    """

    Leaderboards tracked:

    HuggingFace models that have been flagged as contaminated or do not provide any model card information are excluded. """,
    unsafe_allow_html=True
)
st.subheader('About', divider=True)
st.write('This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at @FZaslavskiy or here via community discussions.', unsafe_allow_html=True)