{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8edbb15c",
   "metadata": {},
   "source": [
    "Author: Julien Simon <julsimon@huggingface.co>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f8ff20d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import huggingface_hub\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "555233b4",
   "metadata": {},
   "source": [
    "### Retrieve metadata on all public models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87037e3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialize the listing: recent huggingface_hub releases return a lazy\n",
    "# iterator, which cannot be indexed and is exhausted after a single pass.\n",
    "models = list(huggingface_hub.list_models(full=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aadd5d5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "models[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e0fe1db",
   "metadata": {},
   "outputs": [],
   "source": [
    "huggingface_hub.model_info('distilgpt2', securityStatus=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a06997e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "models_df = pd.DataFrame(columns=['model_name', 'task_type', 'downloads'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91225693",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect one plain record per model and build the frame in a single call.\n",
    "# DataFrame.append was removed in pandas 2.0, and growing a frame row by row\n",
    "# copies it on every iteration.\n",
    "records = [\n",
    "    {\n",
    "        'model_name': m.modelId,\n",
    "        'task_type': m.pipeline_tag,\n",
    "        # A missing or null download count is treated as zero.\n",
    "        'downloads': getattr(m, 'downloads', 0) or 0,\n",
    "    }\n",
    "    for m in models\n",
    "]\n",
    "models_df = pd.DataFrame(records, columns=['model_name', 'task_type', 'downloads'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eaa0b6e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "models_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a38785c",
   "metadata": {},
   "source": [
    "### List task types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f690e417",
   "metadata": {},
   "outputs": [],
   "source": [
    "task_types = models_df['task_type'].unique()\n",
    "print(task_types)\n",
    "print(len(task_types))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "865346cf",
   "metadata": {},
   "source": [
    "### For each task type, find out the percentage of downloads that the top 'n' models represent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8edf413",
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bcbcc8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "for t in task_types:\n",
    "    # Skip models that declare no task (None / NaN).\n",
    "    if pd.isna(t):\n",
    "        continue\n",
    "    task_models_df = models_df[models_df['task_type'] == t]\n",
    "    # Rank by downloads first: the listing order is not guaranteed to be sorted\n",
    "    # by downloads, so slicing the first n rows may not give the top n models.\n",
    "    topn_downloads = task_models_df['downloads'].sort_values(ascending=False).head(n).sum()\n",
    "    all_downloads = task_models_df['downloads'].sum()\n",
    "    # Tasks with no downloads at all are skipped to avoid dividing by zero.\n",
    "    if all_downloads != 0:\n",
    "        print('{} ({} models): {:.1%}'.format(t, len(task_models_df), topn_downloads / all_downloads))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c44c3ef6",
   "metadata": {},
   "source": [
    "### For each task type, list the repository of the top 'n' models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d77e65fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE_URL = 'https://huggingface.co'\n",
    "\n",
    "for t in task_types:\n",
    "    # Skip models that declare no task (None / NaN).\n",
    "    if pd.isna(t):\n",
    "        continue\n",
    "    task_models_df = models_df[models_df['task_type'] == t]\n",
    "    # Sort by downloads so the first n rows really are the most downloaded\n",
    "    # models; head(n) also copes with tasks that have fewer than n models.\n",
    "    topn_models = task_models_df.sort_values('downloads', ascending=False).head(n)\n",
    "    print('[{}]'.format(t))\n",
    "    for model_name in topn_models['model_name']:\n",
    "        print('{}/{}'.format(BASE_URL, model_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f893cd03",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}