"""Gradio app that ranks Hugging Face Hub repositories and orgs/users by likes."""
import gradio as gr
from huggingface_hub import list_spaces, list_models, list_datasets
from cachetools import TTLCache, cached
from toolz import groupby, valmap
import platform
from enum import Enum

is_macos = platform.system() == "Darwin"
LIMIT = 1_000_000 if is_macos else None
NONE_AUTHOR = "HuggingFace Team"  # TODO deal with this


class HubRepoType(Enum):
    MODEL = "model"
    DATASET = "dataset"
    SPACE = "space"


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_spaces():
    return list(list_spaces(full=True, limit=LIMIT))


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_models():
    return list(list_models(full=True, limit=LIMIT))


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_datasets():
    return list(list_datasets(full=True, limit=LIMIT))


get_spaces()  # to warm up the cache
get_models()  # to warm up the cache
get_datasets()  # to warm up the cache


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def valid_dataset_ids():
    return {dataset.id for dataset in get_datasets()}


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def valid_model_ids():
    return {model.id for model in get_models()}


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def valid_space_ids():
    return {space.id for space in get_spaces()}


VALID_DATASET_IDS = valid_dataset_ids()
VALID_MODEL_IDS = valid_model_ids()
VALID_SPACE_IDS = valid_space_ids()


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_space_to_like_dict():
    spaces = get_spaces()
    return {space.id: space.likes for space in spaces}


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_org_to_space_like_dict():
    spaces = get_spaces()
    grouped = groupby(lambda x: x.author, spaces)
    return valmap(lambda x: sum(s.likes for s in x), grouped)


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_model_to_like_dict(metric_kind):
    models = get_models()
    if metric_kind == "likes":
        return {model.id: model.likes for model in models}
    if metric_kind == "downloads":
        return {model.id: model.downloads for model in models}
    raise ValueError(f"Unsupported metric_kind: {metric_kind}")


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_org_to_model_metrics(metric_kind="likes"):
    models = get_models()
    # remove authors who are None
    models = [model for model in models if model.author is not None]
    grouped = groupby(lambda x: x.author, models)
    if metric_kind == "likes":
        return valmap(lambda x: sum(s.likes for s in x), grouped)
    else:
        return valmap(lambda x: sum(s.downloads for s in x), grouped)


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_dataset_to_like_dict(metric_kind="likes"):
    datasets = get_datasets()
    if metric_kind == "likes":
        return {dataset.id: dataset.likes for dataset in datasets}
    if metric_kind == "downloads":
        return {dataset.id: dataset.downloads for dataset in datasets}
    raise ValueError(f"Unsupported metric_kind: {metric_kind}")


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_org_to_dataset_metrics(metric_kind="likes"):
    datasets = get_datasets()
    # remove authors who are None
    datasets = [dataset for dataset in datasets if dataset.author is not None]
    grouped = groupby(lambda x: x.author, datasets)
    if metric_kind == "likes":
        return valmap(lambda x: sum(s.likes for s in x), grouped)
    else:
        return valmap(lambda x: sum(s.downloads for s in x), grouped)


def relative_rank(my_dict, target_key, filter_zero=False):
    if filter_zero:
        my_dict = {k: v for k, v in my_dict.items() if v != 0}

    if target_key not in my_dict:
        raise gr.Error(f"'{target_key}' not found. Please check the ID and try again.")

    sorted_items = sorted(my_dict.items(), key=lambda item: item[1], reverse=True)
    position = [key for key, _ in sorted_items].index(target_key)

    num_lower = len(sorted_items) - position - 1
    num_higher = position

    return {
        "rank": (num_higher + 1) / len(my_dict) * 100,
        "num_higher": num_higher,
        "num_lower": num_lower,
        "value": my_dict[target_key],
        "position": num_higher + 1,
    }


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_rank_for_space(space_id, filter_zero=False):
    space_to_like_dict = create_space_to_like_dict()
    return relative_rank(space_to_like_dict, space_id, filter_zero=filter_zero)


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_rank_for_model(model_id, metric_kind="likes", filter_zero=False):
    model_to_like_dict = create_model_to_like_dict(metric_kind)
    return relative_rank(model_to_like_dict, model_id, filter_zero=filter_zero)


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_rank_for_dataset(dataset_id, metric_kind="likes", filter_zero=False):
    dataset_to_like_dict = create_dataset_to_like_dict(metric_kind)
    return relative_rank(dataset_to_like_dict, dataset_id, filter_zero=filter_zero)


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_space_rank_for_org(org_id, filter_zero=False):
    org_to_like_dict = create_org_to_space_like_dict()
    return relative_rank(org_to_like_dict, org_id, filter_zero=filter_zero)


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_model_rank_for_org(org_id, metric_kind="likes", filter_zero=False):
    org_to_like_dict = create_org_to_model_metrics(metric_kind)
    return relative_rank(org_to_like_dict, org_id, filter_zero=filter_zero)


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def relative_dataset_rank_for_org(org_id, metric_kind="likes", filter_zero=False):
    org_to_like_dict = create_org_to_dataset_metrics(metric_kind)
    return relative_rank(org_to_like_dict, org_id, filter_zero=filter_zero)


# @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
# def rank_space(space_id):
#     return relative_rank_for_space(space_id)


def rank_space_and_org(space_or_org_id, kind, filter_zero):
    filter_zero = filter_zero == "yes"
    split_length = len(space_or_org_id.split("/"))

    # Logic for split_length == 2: a full repo ID, e.g. "org/name"
    if split_length == 2:
        return _rank_single_repo(space_or_org_id, kind, filter_zero)

    # Handle kind-specific logic for split_length == 1
    if split_length == 1:
        valid_ids = {"model": VALID_MODEL_IDS, "dataset": VALID_DATASET_IDS}
        if kind in valid_ids and space_or_org_id in valid_ids[kind]:
            return _rank_single_repo(space_or_org_id, kind, filter_zero)
        else:
            return _rank_by_org(space_or_org_id, kind, filter_zero)

    # If no conditions match, handle unexpected cases (optional)
    raise ValueError(
        f"Unexpected combination of space_or_org_id '{space_or_org_id}' and kind"
        f" '{kind}'"
    )


def _rank_by_org(space_or_org_id, kind, filter_zero):
    if kind == "space":
        org_rank = relative_space_rank_for_org(space_or_org_id, filter_zero=filter_zero)
    elif kind == "model":
        org_rank = relative_model_rank_for_org(space_or_org_id, filter_zero=filter_zero)
    elif kind == "dataset":
        org_rank = relative_dataset_rank_for_org(
            space_or_org_id, filter_zero=filter_zero
        )
    result = (
        f"## ⭐️ Org/User {kind.title()} Likes Rankings ⭐️\n"
        + f"Here are the rankings for the org/user across all of their {kind}s \n"
    )
    result += f"""- You have {org_rank['value']:,} likes for this org/user.\n"""
    result += f"""- Your org/user is ranked {org_rank['position']:,}\n"""
    result += f"""- You have {org_rank['num_higher']:,} orgs/users above and {org_rank['num_lower']:,} orgs/users below in the ranking of {kind} likes \n\n"""
    result += f"""- Organization or user [{space_or_org_id}](https://huggingface.co/{space_or_org_id}) is ranked in the top {org_rank['rank']:.2f}% \n\n"""
    if kind == "space":
        result += f"""You can find all your Spaces sorted by likes [here](https://huggingface.co/{space_or_org_id}?sort_spaces=likes#spaces)\n"""
    if kind == "model":
        result += f"""You can find all your Models sorted by likes [here](https://huggingface.co/{space_or_org_id}?sort_models=likes#models)\n"""
    if kind == "dataset":
        result += f"""You can find all your Datasets sorted by likes [here](https://huggingface.co/{space_or_org_id}?sort_datasets=likes#datasets)\n"""
    return _create_footer_message(result, kind)


def _rank_single_repo(space_or_org_id, kind, filter_zero):
    if kind == "space":
        repo_rank = relative_rank_for_space(space_or_org_id, filter_zero=filter_zero)
    elif kind == "model":
        repo_rank = relative_rank_for_model(space_or_org_id, filter_zero=filter_zero)
    elif kind == "dataset":
        repo_rank = relative_rank_for_dataset(space_or_org_id, filter_zero=filter_zero)
    result = f"## ⭐️ {kind.title()} Likes Rankings ⭐️\n"
    result += f"""Here are the rankings by likes for [`{space_or_org_id}`](https://huggingface.co/spaces/{space_or_org_id}) across all {kind}s \n"""
    result += f"""- You have {repo_rank['value']:,} likes for this {kind}.\n"""
    result += f"""- Your {kind} is ranked {repo_rank['position']:,}.\n"""
    if kind == "space":
        result += f"""- Space [{space_or_org_id}](https://huggingface.co/spaces/{space_or_org_id}) is ranked {repo_rank['rank']:.2f}%\n"""
    if kind == "model":
        result += f"""- Model [{space_or_org_id}](https://huggingface.co/{space_or_org_id}) is ranked {repo_rank['rank']:.2f}%\n"""
    if kind == "dataset":
        result += f"""- Dataset [{space_or_org_id}](https://huggingface.co/datasets/{space_or_org_id}) is ranked {repo_rank['rank']:.2f}%\n"""
    result += f"""- You have {repo_rank['num_higher']:,} {kind}s above and {repo_rank['num_lower']:,} {kind}s below in the ranking of {kind} likes\n\n"""
    return _create_footer_message(result, kind)


def _create_footer_message(result, kind):
    result += """### ✨ Remember likes aren't everything! ✨\n"""
    if kind == "space":
        result += """Some Spaces go very viral whilst other Spaces may be very useful for a smaller audience.
If you think your Space is useful, please add it to this [thread](https://huggingface.co/spaces/librarian-bots/ranker/discussions/3) of awesome Spaces.
We'll look out for awesome Spaces added to this thread to promote more widely!"""
    return result


def get_top_n_orgs_and_users_spaces(top_n=100):
    # gr.Info("Updating leaderboard, this may take a few seconds...")
    orgs_to_likes = create_org_to_space_like_dict()
    sorted_items = sorted(orgs_to_likes.items(), key=lambda item: item[1], reverse=True)
    sorted_items = sorted_items[:top_n]
    return sorted_items


def get_top_n_orgs_and_users_models(metric, top_n=100):
    # gr.Info("Updating leaderboard, this may take a few seconds...")
    orgs_to_likes = create_org_to_model_metrics(metric)
    sorted_items = sorted(orgs_to_likes.items(), key=lambda item: item[1], reverse=True)
    sorted_items = sorted_items[:top_n]
    return sorted_items


def get_top_n_orgs_and_users_datasets(metric, top_n=100):
    # gr.Info("Updating leaderboard, this may take a few seconds...")
    orgs_to_likes = create_org_to_dataset_metrics(metric)
    sorted_items = sorted(orgs_to_likes.items(), key=lambda item: item[1], reverse=True)
    sorted_items = sorted_items[:top_n]
    return sorted_items


def plot_top_n_orgs_and_users(kind, metric="likes", top_n=100):
    if kind == "space":
        top_n = get_top_n_orgs_and_users_spaces(top_n)
        header = """## 🏅 Top 100 Orgs and Users by Space Likes 🏅"""
        body = "".join(
            f"\n{i+1}. [{org}](https://huggingface.co/{org}) with {likes:,} likes"
            for i, (org, likes) in enumerate(top_n)
        )
        return header + body
    elif kind == "model":
        top_n = get_top_n_orgs_and_users_models(metric, top_n=top_n)
        header = """## 🏅 Top 100 Orgs and Users by Model Likes 🏅"""
        body = "".join(
            f"\n{i+1}. [{org}](https://huggingface.co/{org}) with {likes:,} likes"
            for i, (org, likes) in enumerate(top_n)
        )
        return header + body
    elif kind == "dataset":
        top_n = get_top_n_orgs_and_users_datasets(metric, top_n=top_n)
        header = """## 🏅 Top 100 Orgs and Users by Dataset Likes 🏅"""
        body = "".join(
            f"\n{i+1}. [{org}](https://huggingface.co/{org}) with {likes:,} likes"
            for i, (org, likes) in enumerate(top_n)
        )
        return header + body


def get_top_n_spaces(top_n=100):
    # gr.Info("Updating leaderboard, this may take a few seconds...")
    space_to_likes = create_space_to_like_dict()
    sorted_items = sorted(
        space_to_likes.items(), key=lambda item: item[1], reverse=True
    )
    sorted_items = sorted_items[:top_n]
    return sorted_items


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_top_n_models(metric_kind, top_n=100):
    # gr.Info("Updating leaderboard, this may take a few seconds...")
    model_to_likes = create_model_to_like_dict(metric_kind)
    sorted_items = sorted(
        model_to_likes.items(), key=lambda item: item[1], reverse=True
    )
    sorted_items = sorted_items[:top_n]
    return sorted_items


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_top_n_datasets(metric, top_n=100):
    # gr.Info("Updating leaderboard, this may take a few seconds...")
    dataset_to_likes = create_dataset_to_like_dict(metric)
    sorted_items = sorted(
        dataset_to_likes.items(), key=lambda item: item[1], reverse=True
    )
    sorted_items = sorted_items[:top_n]
    return sorted_items


def _plot_top_n_hub_repos(kind: HubRepoType, metric="likes", top_n=100):
    if kind == HubRepoType.SPACE:
        top_n = get_top_n_spaces(top_n)
        header = """## 🏅 Top 100 Space repositories by Likes 🏅"""
        body = "".join(
            f"\n{i+1}. [{space}](https://huggingface.co/spaces/{space}) with"
            f" {likes:,} likes"
            for i, (space, likes) in enumerate(top_n)
        )
        return header + body
    elif kind == HubRepoType.MODEL:
        top_n = get_top_n_models(metric, top_n)
        header = """## 🏅 Top 100 Model repositories by Likes 🏅"""
        body = "".join(
            f"\n{i+1}. "
            f"[{model}](https://huggingface.co/{model}) with"
            f" {likes:,} likes"
            for i, (model, likes) in enumerate(top_n)
        )
        return header + body
    elif kind == HubRepoType.DATASET:
        top_n = get_top_n_datasets(metric, top_n)
        header = """## 🏅 Top 100 Dataset repositories by Likes 🏅"""
        body = "".join(
            f"\n{i+1}. [{dataset}](https://huggingface.co/datasets/{dataset}) with"
            f" {likes:,} likes"
            for i, (dataset, likes) in enumerate(top_n)
        )
        return header + body


def plot_top_n_hub_repos(kind, metric_kind="likes", top_n=100):
    if kind == "space":
        return _plot_top_n_hub_repos(HubRepoType.SPACE, top_n=top_n)
    elif kind == "model":
        return _plot_top_n_hub_repos(HubRepoType.MODEL, metric=metric_kind, top_n=top_n)
    elif kind == "dataset":
        return _plot_top_n_hub_repos(
            HubRepoType.DATASET, metric=metric_kind, top_n=top_n
        )


with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>🏆 HuggyRanker 🏆</h1>")
    gr.HTML(
        """<p style="text-align: center;">Rank a single repository or all of the repositories created by an organization or user by likes</p>"""
    )
    gr.HTML(
        """<p style="text-align: center;"><i>Remember likes aren't everything!</i></p>"""
    )
    gr.Markdown(
        """## Rank Specific Hub repositories or rank an organization or user by likes

Provide this app with a Hub ID e.g. `librarian-bots/ranker` or a Username/Organization
name e.g. `librarian-bots` to rank by likes."""
    )
    with gr.Row():
        space_id = gr.Textbox(
            "librarian-bots", max_lines=1, label="Space or user/organization ID"
        )
        filter_zero_likes = gr.Radio(
            choices=["no", "yes"],
            label="Filter out repositories with 0 likes in the ranking?",
            value="yes",
        )
        repo_type = gr.Radio(
            choices=["space", "model", "dataset"],
            label="Type of repo",
            value="space",
            interactive=True,
        )
    run_btn = gr.Button("Show ranking for this Space or org/user!", label="Rank Space")
    result = gr.Markdown()
    run_btn.click(
        rank_space_and_org,
        inputs=[space_id, repo_type, filter_zero_likes],
        outputs=result,
    )
    gr.Markdown("## Leaderboard of Top 100 Spaces and Orgs/Users by Likes")
    gr.Markdown(
        """The leaderboard is updated every 30 minutes.
Choose the type of repo to rank by likes and click the button to show the leaderboard."""
    )
    show_refresh_btn = gr.Button("Show/refresh Leaderboard", label="Refresh")
    with gr.Row():
        with gr.Accordion("Show rankings for Orgs and Users", open=False):
            org_user_ranking = gr.Markdown()
            show_refresh_btn.click(
                plot_top_n_orgs_and_users, inputs=[repo_type], outputs=org_user_ranking
            )
        with gr.Accordion("Show rankings for individual repositories", open=False):
            repo_level_ranking = gr.Markdown()
            show_refresh_btn.click(
                plot_top_n_hub_repos, inputs=[repo_type], outputs=repo_level_ranking
            )

demo.queue(concurrency_count=4).launch()