data-leaderboard

Running

App Files Files Community

data-leaderboard / app.py

Weyaxi

applymap to map in pandas function

c38f61a verified over 1 year ago

raw

history blame contribute delete

3.02 kB

	import os
	import gradio as gr
	import pandas as pd
	import time
	import threading
	from huggingface_hub import HfApi
	from humanize import naturalsize

	# Hub API client, created once when the module is loaded.
	# NOTE(review): `api` is not referenced anywhere in the visible code —
	# confirm whether it is still needed.
	api = HfApi()

	# Access token taken from the environment; None when HF_TOKEN is unset.
	# NOTE(review): not referenced in the visible code either — confirm.
	HF_TOKEN = os.environ.get('HF_TOKEN')


	def clickable(x):
	    """Wrap a Hub path in an HTML link that opens in a new tab.

	    Args:
	        x: Path appended to ``https://huggingface.co/``; it is also used
	            as the visible link text.

	    Returns:
	        The ``<a>`` element as a string.
	    """
	    link_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
	    profile_url = f"https://huggingface.co/{x}"
	    return f'<a target="_blank" href="{profile_url}" style="{link_style}">{x}</a>'


	def apply_headers(df, headers):
	    """Return a relabelled copy of ``df`` with ``headers`` as its columns.

	    The frame passed in keeps its original column labels.

	    Args:
	        df: Frame to relabel.
	        headers: New column labels, one per existing column, in order.

	    Returns:
	        A new frame holding the same data under the new labels.

	    Raises:
	        ValueError: If the number of labels differs from the number of
	            columns.
	    """
	    # set_axis hands back a new frame rather than relabelling in place.
	    return df.set_axis(headers, axis="columns")


	def search(search_text):
	    """Filter the leaderboard to authors whose name contains ``search_text``.

	    Matching is case-insensitive and treats the query as literal text.

	    Args:
	        search_text: Text typed into the search box. An empty or missing
	            value means "no filter".

	    Returns:
	        The module-level ``df`` itself when there is no query, otherwise the
	        rows of ``df`` whose author name contains the query.
	    """
	    if not search_text:
	        return df

	    # The author column is filled with HTML anchors (see clickable()), so
	    # drop the tags first; otherwise queries such as "href" or "huggingface"
	    # match the markup of every row instead of the visible author name.
	    visible_names = df['👤 Author'].str.replace(r'<[^>]*>', '', regex=True)

	    # regex=False: the query is user input, and characters such as "(" or
	    # "[" would otherwise be compiled as a pattern and raise re.error.
	    matches = visible_names.str.contains(search_text, case=False, na=False, regex=False)
	    return df[matches]

	# Load the per-author figures. The code below relies on the columns
	# "author", "models", "datasets" and "spaces" being present.
	df = pd.read_csv("author_data_hf_merged.csv")

	# Unmodified snapshot taken before any display formatting is applied.
	df_author_copy = df.copy()

	size_columns = ['models', 'datasets', 'spaces']

	# Turn each author into a clickable link, total the three size columns per
	# row, and rank authors by that total (largest first).
	df["author"] = df["author"].apply(clickable)
	df['Total Usage'] = df[size_columns].sum(axis=1)
	df = df.sort_values(by='Total Usage', ascending=False)

	# Grand total across all authors: the sum of the per-author totals, which
	# avoids building three full Python lists just to add them up.
	sum_all_author = naturalsize(int(df['Total Usage'].sum()))

	# Replace the raw numbers with human-readable sizes. This must happen after
	# sorting and totalling, because the cells are strings from here on.
	naturalsize_columns = ['Total Usage', 'models', 'datasets', 'spaces']
	df[naturalsize_columns] = df[naturalsize_columns].map(naturalsize)

	# 1-based position in the sorted order.
	df['Serial Number'] = range(1, len(df) + 1)
	df = df[['Serial Number', "author", "Total Usage", "models", "datasets", "spaces"]]

	# Display headers; search() looks rows up by the "👤 Author" label.
	df = apply_headers(df, ["🔢 Serial Number", "👤 Author", "⚡️ Total Usage", "🏛️ Models", "📊 Datasets", "🚀 Spaces"])

	# Markdown body passed to gr.Markdown in the interface below.
	# This is an f-string: {sum_all_author} is substituted once, when this
	# statement runs, so the total shown is the one computed at start-up.
	desc = f"""
	🎯 The Leaderboard aims to track authors data usage in 🤗 Huggingface.

	## 📄 Information

	🛠️ This leaderboard consists of 125k authors scraped from [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard).

	These 125k authors have been selected based on their [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard) positions:

	- 🤖 Top 60k authors in the models category

	- 📊 Top 60k authors in the datasets category

	- 🚀 Top 50k authors in the spaces category

	## 📒 Notes

	Note that these numbers may not be entirely accurate due to the following reasons:

	- I only calculated the data usage from the main branch and did not include deleted files that cannot be directly seen.

	- There may be large datasets/models to which I don't have access (either private or gated).

	# 📶 Total Data Usage From All Authors

	According to this leaderboard, there is a total of {sum_all_author} of data on this platform.
	"""
	# TODO: consider adding a further note to the description above.


	# Centered HTML heading for the page.
	# NOTE(review): `title` is not referenced by the visible code; the interface
	# below passes an inline <h1> literal to gr.Markdown instead — confirm
	# whether this constant should be used there or removed.
	title = """
	<div style="text-align:center">
	<h1 id="space-title">💾 Data Leaderboard 💾</h1>
	</div>
	"""

	# Page layout, top to bottom: heading, description, search box, table.
	# Components are created in display order, so the statement order matters.
	with gr.Blocks() as demo:
	    gr.Markdown(value="""<h1 align="center" id="space-title">💾 Data Leaderboard 💾</h1>""")
	    gr.Markdown(value=desc)
	    with gr.Column(min_width=320):
	        search_bar = gr.Textbox(placeholder="🔍 Search for a author", show_label=False)

	    # NOTE(review): `datatype` lists three entries while the frame has six
	    # columns — confirm how Gradio treats the remaining columns.
	    leaderboard_table = gr.Dataframe(value=df, datatype=["number", 'markdown', 'number'], interactive=False)

	    # Submitting the search box re-renders the table with the filtered rows.
	    search_bar.submit(fn=search, inputs=[search_bar], outputs=[leaderboard_table])


	demo.launch()