"""Gradio app that ranks Hugging Face organizations by models, datasets and spaces.

At import time the script reads organization names from ``org_names.txt``,
queries the Hugging Face Hub API for each organization, scrapes the Open LLM
Leaderboard page, builds one pandas DataFrame per item kind and launches a
Gradio UI that shows the three tables.
"""

import json
import logging
import re

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi, list_datasets, list_models, list_spaces
from tqdm import tqdm

logger = logging.getLogger(__name__)

api = HfApi()

OPEN_LLM_LEADERBOARD_URL = "https://huggingfaceh4-open-llm-leaderboard.hf.space/"
REQUEST_TIMEOUT_SECONDS = 30

# Extracts the target of the first href="..." attribute in a table cell.
_HREF_PATTERN = re.compile(r'href="([^"]*)"')

# Singular noun used in the column names of each leaderboard kind.
_SINGULAR_NOUN = {"models": "Model", "datasets": "Dataset", "spaces": "Space"}


def get_models(org_name, which_one):
    """Return a DataFrame with one row per item owned by ``org_name``.

    Args:
        org_name: Organization (author) name passed to the Hub API.
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``.

    Returns:
        DataFrame with columns ``id``, ``downloads`` and ``likes``. Spaces
        always get ``downloads == 0``. The frame is empty (but still has the
        three columns) when the organization owns nothing of that kind.

    Raises:
        ValueError: If ``which_one`` is not one of the three supported kinds.
    """
    listers = {
        "models": api.list_models,
        "datasets": api.list_datasets,
        "spaces": api.list_spaces,
    }
    try:
        lister = listers[which_one]
    except KeyError:
        raise ValueError(
            f"which_one must be one of {sorted(listers)}, got {which_one!r}"
        ) from None

    records = [
        {
            "id": item.id,
            # Missing or None counts are treated as 0 so the sums and sorts
            # downstream never see None.
            "downloads": (
                0 if which_one == "spaces"
                else (getattr(item, "downloads", None) or 0)
            ),
            "likes": getattr(item, "likes", None) or 0,
        }
        for item in lister(author=org_name)
    ]
    return pd.DataFrame(records, columns=["id", "downloads", "likes"])


def _top_row(df, column):
    """Return the row of ``df`` with the largest ``column`` (first one on ties)."""
    return df.sort_values(by=[column], ascending=False, kind="stable").iloc[0]


def get_most(df_for_most_function):
    """Return the most downloaded and the most liked item of a DataFrame.

    Args:
        df_for_most_function: DataFrame with ``id``, ``downloads`` and
            ``likes`` columns and at least one row.

    Returns:
        ``{"Most Download": {...}, "Most Likes": {...}}`` where each inner
        dict has the keys ``id``, ``downloads`` and ``likes``.

    Raises:
        IndexError: If the DataFrame has no rows.
    """
    most_downloaded = _top_row(df_for_most_function, "downloads")
    most_liked = _top_row(df_for_most_function, "likes")
    fields = ("id", "downloads", "likes")
    return {
        "Most Download": {field: most_downloaded[field] for field in fields},
        "Most Likes": {field: most_liked[field] for field in fields},
    }


def get_sum(df_for_sum_function):
    """Return total downloads and likes as ``{"Downloads": int, "Likes": int}``."""
    return {
        "Downloads": int(df_for_sum_function["downloads"].sum()),
        "Likes": int(df_for_sum_function["likes"].sum()),
    }


def get_openllm_leaderboard():
    """Scrape the Open LLM Leaderboard and return model ids in table order.

    The page embeds its data as JSON inside the second ``<script>`` element;
    the model link is read from column 1 of the table held by component 11.
    These positions are hard-coded and depend on the page layout.

    Returns:
        List of ``"org/model"`` strings in the order they appear in the
        table. An empty list is returned (and a warning is logged) when the
        page cannot be fetched or no longer has the expected structure, so
        the rest of the app can still start.
    """
    try:
        response = requests.get(
            OPEN_LLM_LEADERBOARD_URL, timeout=REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        script_elements = soup.find_all("script")
        # Slice off the wrapper around the JSON payload inside the script tag.
        data = json.loads(str(script_elements[1])[31:-10])
        rows = data["components"][11]["props"]["value"]["data"]
    except (requests.RequestException, ValueError, LookupError, TypeError) as err:
        logger.warning("Could not read the Open LLM Leaderboard: %s", err)
        return []

    result_list = []
    for row in rows:
        try:
            cell = row[1].rstrip("\n")
            link = _HREF_PATTERN.search(cell).group(1)
        except (IndexError, AttributeError):
            # Same as the original loop: the first row that has no link
            # ends the scan.
            break
        # Keep only the trailing "org/model" part of the URL.
        result_list.append("/".join(link.split("/")[-2:]))
    return result_list


def get_ranking(model_list, target_org):
    """Return the best-ranked model of ``target_org`` in ``model_list``.

    Args:
        model_list: Model ids (``"org/model"``) ordered from best to worst.
        target_org: Organization name, compared case-insensitively.

    Returns:
        ``[rank, model_id]`` with a 1-based rank for the first match, or the
        string ``"Not Found"`` when the organization has no model in the list.
    """
    wanted = target_org.lower()
    for rank, model in enumerate(model_list, start=1):
        if model.split("/")[0].lower() == wanted:
            return [rank, model]
    return "Not Found"


def _leaderboard_columns(which_one):
    """Return the ordered column names of the leaderboard for ``which_one``."""
    noun = _SINGULAR_NOUN[which_one]
    has_downloads = which_one != "spaces"

    columns = ["Organization Name"]
    if has_downloads:
        columns.append("Total Downloads")
    columns += ["Total Likes", f"Number of {noun}s"]
    if which_one == "models":
        columns += [
            "Best Model On Open LLM Leaderboard",
            "Best Rank On Open LLM Leaderboard",
        ]
    if has_downloads:
        columns.append(f"Average Downloads per {noun}")
    columns.append(f"Average Likes per {noun}")
    if has_downloads:
        columns += [f"Most Downloaded {noun}", "Most Download Count"]
    columns += [f"Most Liked {noun}", "Most Like Count"]
    return columns


def make_leaderboard(orgs, which_one):
    """Build the leaderboard DataFrame for one kind of Hub item.

    Args:
        orgs: Iterable of organization names.
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``.

    Returns:
        DataFrame with one row per organization that owns at least one item,
        sorted by "Total Downloads" (or by "Total Likes" for spaces) in
        descending order, with a 1-based "Serial Number" as first column.
        When no organization qualifies the frame is empty but still has all
        of its columns.

    Raises:
        ValueError: If ``which_one`` is not a supported kind.
    """
    if which_one not in _SINGULAR_NOUN:
        raise ValueError(
            f"which_one must be one of {sorted(_SINGULAR_NOUN)}, got {which_one!r}"
        )

    noun = _SINGULAR_NOUN[which_one]
    columns = _leaderboard_columns(which_one)
    open_llm_leaderboard = (
        get_openllm_leaderboard() if which_one == "models" else None
    )

    data_rows = []
    for org in tqdm(orgs, desc=f"Scraping Organizations ({which_one})", position=0, leave=True):
        df = get_models(org, which_one)
        num_things = len(df)
        if num_things == 0:
            continue

        sum_info = get_sum(df)
        most_info = get_most(df)

        best_model = best_rank = None
        if which_one == "models":
            ranking = get_ranking(open_llm_leaderboard, org)
            if ranking == "Not Found":
                best_model = best_rank = ranking
            else:
                best_rank, best_model = ranking

        # Every value any leaderboard kind may need; only the columns of the
        # current kind are copied into the row.
        values = {
            "Organization Name": org,
            "Total Downloads": sum_info["Downloads"],
            "Total Likes": sum_info["Likes"],
            f"Number of {noun}s": num_things,
            "Best Model On Open LLM Leaderboard": best_model,
            "Best Rank On Open LLM Leaderboard": best_rank,
            f"Average Downloads per {noun}": sum_info["Downloads"] // num_things,
            f"Average Likes per {noun}": sum_info["Likes"] // num_things,
            f"Most Downloaded {noun}": most_info["Most Download"]["id"],
            "Most Download Count": most_info["Most Download"]["downloads"],
            f"Most Liked {noun}": most_info["Most Likes"]["id"],
            "Most Like Count": most_info["Most Likes"]["likes"],
        }
        data_rows.append({column: values[column] for column in columns})

    # Passing the columns explicitly keeps the sort below (and the callers'
    # column lookups) working even when data_rows is empty.
    leaderboard = pd.DataFrame(data_rows, columns=columns)
    sort_key = ["Total Downloads"] if which_one != "spaces" else ["Total Likes"]
    leaderboard = leaderboard.sort_values(by=sort_key, ascending=False)
    leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
    return leaderboard


# One organization per line; blank lines are skipped so that an empty string
# is never used as an author filter.
with open("org_names.txt", "r", encoding="utf-8") as f:
    org_names_in_list = [line.strip() for line in f if line.strip()]


INTRODUCTION_TEXT = """
🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).

## Available Dataframes:

- 🏛️ Models
- 📊 Datasets
- 🚀 Spaces

## Backend

🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).

🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).

**🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
"""


def clickable(x, which_one):
    """Return ``x`` as a markdown link to its Hugging Face Hub page.

    Args:
        x: Organization name or repository id; may be ``"Not Found"``.
        which_one: ``"models"`` links to ``huggingface.co/<x>``; any other
            value is used as a path prefix (``huggingface.co/<which_one>/<x>``).

    Returns:
        The markdown link, or ``"Not Found"`` unchanged for the models kind.
    """
    if which_one == "models":
        if x == "Not Found":
            return "Not Found"
        return f"[{x}](https://huggingface.co/{x})"
    return f"[{x}](https://huggingface.co/{which_one}/{x})"


def models_df_to_clickable(df, columns, which_one):
    """Turn the given columns of ``df`` into links, in place, and return ``df``.

    The "Organization Name" column is always converted with the ``"models"``
    kind; all other columns use ``which_one``.
    """
    for column in columns:
        kind = "models" if column == "Organization Name" else which_one
        df[column] = df[column].apply(clickable, args=(kind,))
    return df


with gr.Blocks() as demo:
    gr.Markdown("""

🤗 Organization Leaderboard

""")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.TabItem("🏛️ Models", id=1):
        columns_to_convert = [
            "Organization Name",
            "Best Model On Open LLM Leaderboard",
            "Most Downloaded Model",
            "Most Liked Model",
        ]
        models_df = make_leaderboard(org_names_in_list, "models")
        models_df = models_df_to_clickable(models_df, columns_to_convert, "models")

        headers = [
            "🔢 Serial Number",
            "🏢 Organization Name",
            "📥 Total Downloads",
            "👍 Total Likes",
            "🤖 Number of Models",
            "🏆 Best Model On Open LLM Leaderboard",
            "🥇 Best Rank On Open LLM Leaderboard",
            "📊 Average Downloads per Model",
            "📈 Average Likes per Model",
            "🚀 Most Downloaded Model",
            "📈 Most Download Count",
            "❤️ Most Liked Model",
            "👍 Most Like Count",
        ]
        gr.Dataframe(
            models_df.head(400),
            headers=headers,
            interactive=True,
            datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"],
        )

    with gr.TabItem("📊 Datasets", id=2):
        columns_to_convert = [
            "Organization Name",
            "Most Downloaded Dataset",
            "Most Liked Dataset",
        ]
        dataset_df = make_leaderboard(org_names_in_list, "datasets")
        dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")

        headers = [
            "🔢 Serial Number",
            "🏢 Organization Name",
            "📥 Total Downloads",
            "👍 Total Likes",
            "📊 Number of Datasets",
            "📊 Average Downloads per Dataset",
            "📈 Average Likes per Dataset",
            "🚀 Most Downloaded Dataset",
            "📈 Most Download Count",
            "❤️ Most Liked Dataset",
            "👍 Most Like Count",
        ]
        gr.Dataframe(
            dataset_df.head(250),
            headers=headers,
            interactive=False,
            datatype=["str", "markdown", "str", "str", "str", "str", "str", "markdown", "str", "markdown", "str"],
        )

    with gr.TabItem("🚀 Spaces", id=3):
        columns_to_convert = ["Organization Name", "Most Liked Space"]
        spaces_df = make_leaderboard(org_names_in_list, "spaces")
        spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")

        headers = [
            "🔢 Serial Number",
            "🏢 Organization Name",
            "👍 Total Likes",
            "🚀 Number of Spaces",
            "📈 Average Likes per Space",
            "❤️ Most Liked Space",
            "👍 Most Like Count",
        ]
        gr.Dataframe(
            spaces_df.head(150),
            headers=headers,
            interactive=False,
            datatype=["str", "markdown", "str", "str", "str", "markdown", "str"],
        )

demo.launch()