{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/codespace/.python/current/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data: 100%|██████████| 1.74G/1.74G [00:27<00:00, 62.8MB/s]\n",
      "Generating train split: 100%|██████████| 192363/192363 [00:31<00:00, 6170.02 examples/s]\n"
     ]
    }
   ],
   "source": [
    "# Fetch the Mohamed-BC/Articles dataset from the Hugging Face Hub (about 1.7 GB on first download).\n",
    "data = load_dataset(path=\"Mohamed-BC/Articles\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "app.py\tdemo.ipynb  recommend.py  requirements.txt  user.py\n"
     ]
    }
   ],
   "source": [
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset URL: https://www.kaggle.com/datasets/fabiochiusano/medium-articles\n",
      "License(s): CC0-1.0\n",
      "Downloading medium-articles.zip to /workspaces/codespaces-blank\n",
      " 99%|███████████████████████████████████████▊| 367M/369M [00:14<00:00, 42.9MB/s]\n",
      "100%|████████████████████████████████████████| 369M/369M [00:14<00:00, 27.5MB/s]\n"
     ]
    }
   ],
   "source": [
    "!kaggle datasets download -d fabiochiusano/medium-articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Archive:  medium-articles.zip\n",
      "  inflating: data/medium_articles.csv  \n"
     ]
    }
   ],
   "source": [
    "# -o: overwrite without prompting, so re-running this cell cannot block on unzip's interactive \"replace?\" question.\n",
    "!unzip -o medium-articles.zip -d data\n",
    "# -f: do not fail when the archive has already been removed by a previous run.\n",
    "!rm -f medium-articles.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'articles_embeddings'...\n",
      "remote: Enumerating objects: 6, done.\u001b[K\n",
      "remote: Counting objects: 100% (3/3), done.\u001b[K\n",
      "remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
      "remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
      "Unpacking objects: 100% (6/6), 2.11 KiB | 1.06 MiB/s, done.\n"
     ]
    }
   ],
   "source": [
    "# Clone the embeddings repository from the Hugging Face Hub into ./articles_embeddings.\n",
    "!git clone https://huggingface.co/Mohamed-BC/articles_embeddings articles_embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The trailing slash makes mv fail if data/ is missing instead of renaming the pickle to a file called `data`.\n",
    "# `&&` deletes the clone only after the move succeeded, so a failed move never loses the pickle.\n",
    "!mv articles_embeddings/articles_embeddings.pkl data/ && rm -rf articles_embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file. This pickle was\n",
    "# cloned from a remote Hugging Face repository, so only load it if that source is trusted.\n",
    "emb = pd.read_pickle('data/articles_embeddings.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(192363,)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "emb.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from recommend import recommend"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/codespace/.python/current/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
     ]
    }
   ],
   "source": [
    "# Define the query once and pass the variable, so the text cannot drift between the two lines.\n",
    "query = \"How to train a model in PyTorch?\"\n",
    "recommend(query=query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|\n",
      "    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|\n",
      "    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|\n",
      "    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|\n",
      "    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|\n",
      "\n",
      "    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n",
      "Enter your token (input will not be visible): Traceback (most recent call last):\n",
      "  File \"/home/codespace/.python/current/bin/huggingface-cli\", line 8, in <module>\n",
      "    sys.exit(main())\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py\", line 51, in main\n",
      "    service.run()\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/user.py\", line 98, in run\n",
      "    login(token=self.args.token, add_to_git_credential=self.args.add_to_git_credential)\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/_login.py\", line 115, in login\n",
      "    interpreter_login(new_session=new_session, write_permission=write_permission)\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/_login.py\", line 191, in interpreter_login\n",
      "    token = getpass(\"Enter your token (input will not be visible): \")\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/getpass.py\", line 77, in unix_getpass\n",
      "    passwd = _raw_input(prompt, stream, input=input)\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/getpass.py\", line 146, in _raw_input\n",
      "    line = input.readline()\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/codecs.py\", line 319, in decode\n",
      "    def decode(self, input, final=False):\n",
      "KeyboardInterrupt\n"
     ]
    }
   ],
   "source": [
    "# `!` shell commands cannot answer interactive prompts, so the token is passed explicitly.\n",
    "# Export HF_TOKEN in the environment first; `$$` makes IPython hand a literal `$HF_TOKEN` to the shell.\n",
    "!huggingface-cli login --token $$HF_TOKEN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[90mgit version 2.44.0\u001b[0m\n",
      "\u001b[90mgit-lfs/3.5.1 (GitHub; linux amd64; go 1.21.8)\u001b[0m\n",
      "\n",
      "You are about to create \u001b[1mspaces/Mohamed-BC/articles_recommender_system\u001b[0m\n",
      "Proceed? [Y/n] ^C\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/codespace/.python/current/bin/huggingface-cli\", line 8, in <module>\n",
      "    sys.exit(main())\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py\", line 51, in main\n",
      "    service.run()\n",
      "  File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/user.py\", line 169, in run\n",
      "    choice = input(\"Proceed? [Y/n] \").lower()\n",
      "KeyboardInterrupt\n"
     ]
    }
   ],
   "source": [
    "# -y answers the \"Proceed? [Y/n]\" confirmation, which cannot be typed into a `!` shell command.\n",
    "# NOTE(review): creating a Space may also need `--space_sdk <sdk>`; confirm which SDK app.py targets.\n",
    "!huggingface-cli repo create articles_recommender_system --type space -y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.\n",
      "articles_embeddings.pkl:   0%|                       | 0.00/666M [00:00<?, ?B/s]\n",
      "medium_articles.csv:   0%|                          | 0.00/1.04G [00:00<?, ?B/s]\u001b[A\n",
      "\n",
      "articles_embeddings.pkl:   1%|              | 4.33M/666M [00:00<00:16, 40.2MB/s]\u001b[A\u001b[A\n",
      "articles_embeddings.pkl:   1%|▏             | 9.91M/666M [00:00<00:14, 46.6MB/s]\u001b[A\n",
      "articles_embeddings.pkl:   2%|▎             | 15.6M/666M [00:00<00:12, 51.2MB/s]\u001b[A\n",
      "articles_embeddings.pkl:   3%|▍             | 20.8M/666M [00:00<00:22, 28.7MB/s]\u001b[A\n",
      "articles_embeddings.pkl:   4%|▌             | 29.7M/666M [00:00<00:19, 32.4MB/s]\u001b[A\n",
      "medium_articles.csv:   2%|▍                | 23.0M/1.04G [00:00<00:40, 24.9MB/s]\u001b[A\n",
      "articles_embeddings.pkl:   5%|▋             | 33.4M/666M [00:01<00:33, 19.0MB/s]\u001b[A\n",
      "articles_embeddings.pkl:   9%|█▎            | 60.0M/666M [00:02<00:23, 25.6MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  10%|█▎            | 63.5M/666M [00:02<00:22, 26.6MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  10%|█▍            | 66.7M/666M [00:02<00:31, 19.1MB/s]\u001b[A\n",
      "medium_articles.csv:   7%|█▏               | 72.4M/1.04G [00:02<00:29, 32.9MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  14%|██            | 95.3M/666M [00:03<00:14, 38.7MB/s]\u001b[A\n",
      "medium_articles.csv:   8%|█▎               | 80.8M/1.04G [00:03<00:41, 23.4MB/s]\u001b[A\n",
      "medium_articles.csv:   8%|█▍               | 88.1M/1.04G [00:03<00:33, 28.5MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  15%|██            | 99.8M/666M [00:03<00:27, 20.3MB/s]\u001b[A\n",
      "medium_articles.csv:   9%|█▌               | 96.0M/1.04G [00:03<00:48, 19.4MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  17%|██▌            | 112M/666M [00:04<00:26, 21.0MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  19%|██▉            | 128M/666M [00:04<00:18, 29.5MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  24%|███▌           | 160M/666M [00:05<00:13, 37.5MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  26%|███▉           | 176M/666M [00:05<00:11, 42.2MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  29%|████▎          | 192M/666M [00:05<00:10, 44.5MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  30%|████▍          | 200M/666M [00:06<00:09, 48.2MB/s]\u001b[A\n",
      "medium_articles.csv:  18%|███▏              | 185M/1.04G [00:06<00:19, 43.7MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  31%|████▋          | 205M/666M [00:06<00:11, 39.0MB/s]\u001b[A\n",
      "medium_articles.csv:  19%|███▎              | 195M/1.04G [00:06<00:27, 31.0MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  32%|████▋          | 210M/666M [00:06<00:17, 26.4MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  36%|█████▍         | 240M/666M [00:07<00:11, 37.8MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  38%|█████▊         | 256M/666M [00:07<00:09, 42.4MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  41%|██████▏        | 272M/666M [00:08<00:09, 42.8MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  44%|██████▌        | 292M/666M [00:08<00:07, 47.1MB/s]\u001b[A\n",
      "medium_articles.csv:  25%|████▍             | 256M/1.04G [00:08<00:19, 40.3MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  46%|██████▊        | 304M/666M [00:08<00:08, 41.9MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  50%|███████▌       | 336M/666M [00:09<00:06, 47.9MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  53%|███████▉       | 352M/666M [00:09<00:06, 50.1MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  55%|████████▎      | 368M/666M [00:10<00:06, 47.3MB/s]\u001b[A\n",
      "medium_articles.csv:  32%|█████▊            | 336M/1.04G [00:10<00:15, 46.1MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  60%|█████████      | 400M/666M [00:10<00:05, 47.6MB/s]\u001b[A\n",
      "medium_articles.csv:  35%|██████▎           | 368M/1.04G [00:10<00:14, 47.8MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  62%|█████████▎     | 416M/666M [00:11<00:05, 44.7MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  65%|█████████▋     | 432M/666M [00:11<00:04, 47.4MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  70%|██████████▍    | 464M/666M [00:12<00:04, 49.1MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  72%|██████████▊    | 480M/666M [00:12<00:03, 48.2MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  75%|███████████▏   | 496M/666M [00:12<00:03, 47.4MB/s]\u001b[A\n",
      "medium_articles.csv:  45%|████████          | 464M/1.04G [00:12<00:13, 44.2MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  77%|███████████▌   | 512M/666M [00:13<00:03, 44.4MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  79%|███████████▉   | 528M/666M [00:13<00:03, 38.0MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  82%|████████████▎  | 544M/666M [00:14<00:03, 40.4MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  84%|████████████▌  | 560M/666M [00:14<00:02, 41.1MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  87%|████████████▉  | 576M/666M [00:14<00:01, 46.1MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  89%|█████████████▎ | 592M/666M [00:15<00:01, 44.4MB/s]\u001b[A\n",
      "medium_articles.csv:  55%|█████████▉        | 576M/1.04G [00:15<00:09, 48.7MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  91%|█████████████▋ | 608M/666M [00:15<00:01, 38.4MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  94%|██████████████ | 624M/666M [00:16<00:01, 38.8MB/s]\u001b[A\n",
      "medium_articles.csv:  60%|██████████▊       | 624M/1.04G [00:16<00:08, 50.4MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  96%|██████████████▍| 640M/666M [00:16<00:00, 39.6MB/s]\u001b[A\n",
      "articles_embeddings.pkl:  99%|██████████████▊| 656M/666M [00:16<00:00, 42.1MB/s]\u001b[A\n",
      "articles_embeddings.pkl: 100%|███████████████| 666M/666M [00:17<00:00, 39.0MB/s]\u001b[A\n",
      "\n",
      "medium_articles.csv:  66%|███████████▉      | 688M/1.04G [00:17<00:06, 52.4MB/s]\u001b[A\n",
      "\n",
      "Upload 2 LFS files:  50%|████████████▌            | 1/2 [00:17<00:17, 17.43s/it]\u001b[A\u001b[A\n",
      "medium_articles.csv:  68%|████████████▏     | 704M/1.04G [00:17<00:07, 43.1MB/s]\u001b[A\n",
      "medium_articles.csv:  69%|████████████▍     | 720M/1.04G [00:18<00:07, 41.5MB/s]\u001b[A\n",
      "medium_articles.csv:  71%|████████████▋     | 736M/1.04G [00:18<00:06, 46.2MB/s]\u001b[A\n",
      "medium_articles.csv:  72%|████████████▉     | 752M/1.04G [00:18<00:06, 48.3MB/s]\u001b[A\n",
      "medium_articles.csv:  74%|█████████████▎    | 768M/1.04G [00:19<00:05, 48.9MB/s]\u001b[A\n",
      "medium_articles.csv:  75%|█████████████▌    | 784M/1.04G [00:19<00:05, 49.2MB/s]\u001b[A\n",
      "medium_articles.csv:  77%|█████████████▊    | 800M/1.04G [00:19<00:05, 47.3MB/s]\u001b[A\n",
      "medium_articles.csv:  78%|██████████████    | 816M/1.04G [00:20<00:04, 48.0MB/s]\u001b[A\n",
      "medium_articles.csv:  80%|██████████████▎   | 832M/1.04G [00:20<00:04, 47.6MB/s]\u001b[A\n",
      "medium_articles.csv:  81%|██████████████▋   | 848M/1.04G [00:20<00:03, 51.5MB/s]\u001b[A\n",
      "medium_articles.csv:  83%|██████████████▉   | 864M/1.04G [00:21<00:03, 48.1MB/s]\u001b[A\n",
      "medium_articles.csv:  84%|███████████████▏  | 880M/1.04G [00:21<00:03, 47.9MB/s]\u001b[A\n",
      "medium_articles.csv:  86%|███████████████▍  | 896M/1.04G [00:21<00:03, 46.6MB/s]\u001b[A\n",
      "medium_articles.csv:  87%|███████████████▋  | 912M/1.04G [00:22<00:02, 48.3MB/s]\u001b[A\n",
      "medium_articles.csv:  89%|████████████████  | 928M/1.04G [00:22<00:02, 49.1MB/s]\u001b[A\n",
      "medium_articles.csv:  91%|████████████████▎ | 944M/1.04G [00:22<00:02, 45.7MB/s]\u001b[A\n",
      "medium_articles.csv:  92%|████████████████▌ | 960M/1.04G [00:23<00:01, 45.0MB/s]\u001b[A\n",
      "medium_articles.csv:  94%|████████████████▊ | 976M/1.04G [00:23<00:01, 46.9MB/s]\u001b[A\n",
      "medium_articles.csv:  95%|█████████████████▏| 992M/1.04G [00:23<00:01, 47.4MB/s]\u001b[A\n",
      "medium_articles.csv:  97%|████████████████▍| 1.01G/1.04G [00:24<00:00, 47.9MB/s]\u001b[A\n",
      "medium_articles.csv:  98%|████████████████▋| 1.02G/1.04G [00:24<00:00, 49.3MB/s]\u001b[A\n",
      "medium_articles.csv: 100%|█████████████████| 1.04G/1.04G [00:24<00:00, 41.8MB/s]\u001b[A\n",
      "\n",
      "\n",
      "Upload 2 LFS files: 100%|█████████████████████████| 2/2 [00:25<00:00, 12.59s/it]\u001b[A\u001b[A\n",
      "https://huggingface.co/Mohamed-BC/articles_recommender_system/tree/main/.\n"
     ]
    }
   ],
   "source": [
    "# Upload the whole working directory to the repo root; --repo-type model is the CLI default, spelled out here.\n",
    "# NOTE(review): the repo-create cell targets a Space, but this uploads to the model repo of the same name; confirm which is intended.\n",
    "!huggingface-cli upload Mohamed-BC/articles_recommender_system . --repo-type model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}