{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8edbb15c",
   "metadata": {},
   "source": [
    "Author: Julien Simon "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f8ff20d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import huggingface_hub\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "555233b4",
   "metadata": {},
   "source": [
    "### Retrieve metadata on all public models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87037e3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialize the result: recent huggingface_hub releases return a lazy iterator,\n",
    "# which can be neither indexed nor iterated more than once.\n",
    "models = list(huggingface_hub.list_models(full=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aadd5d5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "models[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e0fe1db",
   "metadata": {},
   "outputs": [],
   "source": [
    "huggingface_hub.model_info('distilgpt2', securityStatus=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a06997e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect one plain record per model. A download count that is missing\n",
    "# (no attribute, or None) is counted as 0.\n",
    "model_records = [\n",
    "    {\n",
    "        'model_name': m.modelId,\n",
    "        'task_type': m.pipeline_tag,\n",
    "        'downloads': getattr(m, 'downloads', 0) or 0,\n",
    "    }\n",
    "    for m in models\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91225693",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and was\n",
    "# quadratic when called once per row.\n",
    "# Rank by downloads (highest first) so that the first rows of any per-task\n",
    "# subset are that task's most downloaded models.\n",
    "models_df = (\n",
    "    pd.DataFrame(model_records, columns=['model_name', 'task_type', 'downloads'])\n",
    "    .astype({'downloads': 'int64'})\n",
    "    .sort_values('downloads', ascending=False)\n",
    "    .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eaa0b6e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "models_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a38785c",
   "metadata": {},
   "source": [
    "### List task types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f690e417",
   "metadata": {},
   "outputs": [],
   "source": [
    "task_types = models_df['task_type'].unique()\n",
    "print(task_types)\n",
    "print(len(task_types))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "865346cf",
   "metadata": {},
   "source": [
    "### For each task type, find out the percentage of downloads that the top 'n' models represent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8edf413",
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bcbcc8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# groupby skips models without a task type (None) and keeps the row order of\n",
    "# models_df inside each group, so head(n) is the task's n most downloaded models.\n",
    "for task, task_models_df in models_df.groupby('task_type', sort=False):\n",
    "    all_downloads = task_models_df['downloads'].sum()\n",
    "    if all_downloads == 0:\n",
    "        # No downloads at all for this task: a percentage is undefined.\n",
    "        continue\n",
    "    topn_downloads = task_models_df['downloads'].head(n).sum()\n",
    "    print('{} ({} models): {:.1%}'.format(task, len(task_models_df), topn_downloads/all_downloads))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c44c3ef6",
   "metadata": {},
   "source": [
    "### For each task type, list the repository of the top 'n' models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d77e65fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE_URL = 'https://huggingface.co'\n",
    "\n",
    "# models_df is ranked by downloads, so head(n) yields each task's top models\n",
    "# (or all of them when the task has fewer than n).\n",
    "for task, task_models_df in models_df.groupby('task_type', sort=False):\n",
    "    print('[{}]'.format(task))\n",
    "    for model_name in task_models_df['model_name'].head(n):\n",
    "        print('{}/{}'.format(BASE_URL, model_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f893cd03",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}