{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/codespace/.python/current/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from datasets import load_dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading data: 100%|██████████| 1.74G/1.74G [00:27<00:00, 62.8MB/s]\n",
"Generating train split: 100%|██████████| 192363/192363 [00:31<00:00, 6170.02 examples/s]\n"
]
}
],
"source": [
"data = load_dataset(\"Mohamed-BC/Articles\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"app.py\tdemo.ipynb recommend.py requirements.txt user.py\n"
]
}
],
"source": [
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset URL: https://www.kaggle.com/datasets/fabiochiusano/medium-articles\n",
"License(s): CC0-1.0\n",
"Downloading medium-articles.zip to /workspaces/codespaces-blank\n",
" 99%|███████████████████████████████████████▊| 367M/369M [00:14<00:00, 42.9MB/s]\n",
"100%|████████████████████████████████████████| 369M/369M [00:14<00:00, 27.5MB/s]\n"
]
}
],
"source": [
"!kaggle datasets download -d fabiochiusano/medium-articles"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: medium-articles.zip\n",
" inflating: data/medium_articles.csv \n"
]
}
],
"source": [
"# Extract the Kaggle archive into data/ and remove the zip afterwards.\n",
"# -o overwrites existing files without asking: a `!` command cannot answer unzip's\n",
"# interactive \"replace?\" question, so re-running this cell would otherwise stall.\n",
"!unzip -o medium-articles.zip -d data\n",
"# -f keeps the cleanup quiet if the archive has already been removed.\n",
"!rm -f medium-articles.zip"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cloning into 'articles_embeddings'...\n",
"remote: Enumerating objects: 6, done.\u001b[K\n",
"remote: Counting objects: 100% (3/3), done.\u001b[K\n",
"remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
"remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
"Unpacking objects: 100% (6/6), 2.11 KiB | 1.06 MiB/s, done.\n"
]
}
],
"source": [
"!git clone https://huggingface.co/Mohamed-BC/articles_embeddings "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"!mv articles_embeddings/articles_embeddings.pkl data\n",
"!rm -rf articles_embeddings"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"emb = pd.read_pickle('data/articles_embeddings.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(192363,)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"emb.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from recommend import recommend"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/codespace/.python/current/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick here for more info. \n",
"\u001b[1;31mView Jupyter log for further details."
]
}
],
"source": [
"# Smoke-test the recommender with a single free-text query.\n",
"# The query is bound once and passed by name so the text lives in one place.\n",
"# NOTE(review): the saved output shows the kernel crashing while this cell ran —\n",
"# presumably memory pressure from the dataset/embeddings already loaded; confirm.\n",
"query = \"How to train a model in PyTorch?\"\n",
"recommend(query=query)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|\n",
" _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n",
" _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|\n",
" _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n",
" _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|\n",
"\n",
" To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n",
"Enter your token (input will not be visible): Traceback (most recent call last):\n",
" File \"/home/codespace/.python/current/bin/huggingface-cli\", line 8, in \n",
" sys.exit(main())\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py\", line 51, in main\n",
" service.run()\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/user.py\", line 98, in run\n",
" login(token=self.args.token, add_to_git_credential=self.args.add_to_git_credential)\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/_login.py\", line 115, in login\n",
" interpreter_login(new_session=new_session, write_permission=write_permission)\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/_login.py\", line 191, in interpreter_login\n",
" token = getpass(\"Enter your token (input will not be visible): \")\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/getpass.py\", line 77, in unix_getpass\n",
" passwd = _raw_input(prompt, stream, input=input)\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/getpass.py\", line 146, in _raw_input\n",
" line = input.readline()\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/codecs.py\", line 319, in decode\n",
" def decode(self, input, final=False):\n",
"KeyboardInterrupt\n"
]
}
],
"source": [
"# Authenticate with the Hugging Face Hub.\n",
"# A `!` command cannot receive typed input, so the hidden token prompt of a bare\n",
"# `huggingface-cli login` never completes. Pass the token non-interactively instead.\n",
"# Expects HF_TOKEN to be exported in the kernel's environment; never paste the token here.\n",
"!huggingface-cli login --token \"$HF_TOKEN\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[90mgit version 2.44.0\u001b[0m\n",
"\u001b[90mgit-lfs/3.5.1 (GitHub; linux amd64; go 1.21.8)\u001b[0m\n",
"\n",
"You are about to create \u001b[1mspaces/Mohamed-BC/articles_recommender_system\u001b[0m\n",
"Proceed? [Y/n] ^C\n",
"Traceback (most recent call last):\n",
" File \"/home/codespace/.python/current/bin/huggingface-cli\", line 8, in \n",
" sys.exit(main())\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py\", line 51, in main\n",
" service.run()\n",
" File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/user.py\", line 169, in run\n",
" choice = input(\"Proceed? [Y/n] \").lower()\n",
"KeyboardInterrupt\n"
]
}
],
"source": [
"# Create the Space that will host the recommender app.\n",
"# -y answers the \"Proceed? [Y/n]\" confirmation, which cannot be typed into a `!` command.\n",
"# NOTE(review): creating a Space may also require --space_sdk (gradio/streamlit/docker/static);\n",
"# confirm which SDK app.py targets and add it if the command rejects the request.\n",
"!huggingface-cli repo create articles_recommender_system --type space -y"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.\n",
"articles_embeddings.pkl: 0%| | 0.00/666M [00:00, ?B/s]\n",
"medium_articles.csv: 0%| | 0.00/1.04G [00:00, ?B/s]\u001b[A\n",
"\n",
"articles_embeddings.pkl: 1%| | 4.33M/666M [00:00<00:16, 40.2MB/s]\u001b[A\u001b[A\n",
"articles_embeddings.pkl: 1%|▏ | 9.91M/666M [00:00<00:14, 46.6MB/s]\u001b[A\n",
"articles_embeddings.pkl: 2%|▎ | 15.6M/666M [00:00<00:12, 51.2MB/s]\u001b[A\n",
"articles_embeddings.pkl: 3%|▍ | 20.8M/666M [00:00<00:22, 28.7MB/s]\u001b[A\n",
"articles_embeddings.pkl: 4%|▌ | 29.7M/666M [00:00<00:19, 32.4MB/s]\u001b[A\n",
"medium_articles.csv: 2%|▍ | 23.0M/1.04G [00:00<00:40, 24.9MB/s]\u001b[A\n",
"articles_embeddings.pkl: 5%|▋ | 33.4M/666M [00:01<00:33, 19.0MB/s]\u001b[A\n",
"articles_embeddings.pkl: 9%|█▎ | 60.0M/666M [00:02<00:23, 25.6MB/s]\u001b[A\n",
"articles_embeddings.pkl: 10%|█▎ | 63.5M/666M [00:02<00:22, 26.6MB/s]\u001b[A\n",
"articles_embeddings.pkl: 10%|█▍ | 66.7M/666M [00:02<00:31, 19.1MB/s]\u001b[A\n",
"medium_articles.csv: 7%|█▏ | 72.4M/1.04G [00:02<00:29, 32.9MB/s]\u001b[A\n",
"articles_embeddings.pkl: 14%|██ | 95.3M/666M [00:03<00:14, 38.7MB/s]\u001b[A\n",
"medium_articles.csv: 8%|█▎ | 80.8M/1.04G [00:03<00:41, 23.4MB/s]\u001b[A\n",
"medium_articles.csv: 8%|█▍ | 88.1M/1.04G [00:03<00:33, 28.5MB/s]\u001b[A\n",
"articles_embeddings.pkl: 15%|██ | 99.8M/666M [00:03<00:27, 20.3MB/s]\u001b[A\n",
"medium_articles.csv: 9%|█▌ | 96.0M/1.04G [00:03<00:48, 19.4MB/s]\u001b[A\n",
"articles_embeddings.pkl: 17%|██▌ | 112M/666M [00:04<00:26, 21.0MB/s]\u001b[A\n",
"articles_embeddings.pkl: 19%|██▉ | 128M/666M [00:04<00:18, 29.5MB/s]\u001b[A\n",
"articles_embeddings.pkl: 24%|███▌ | 160M/666M [00:05<00:13, 37.5MB/s]\u001b[A\n",
"articles_embeddings.pkl: 26%|███▉ | 176M/666M [00:05<00:11, 42.2MB/s]\u001b[A\n",
"articles_embeddings.pkl: 29%|████▎ | 192M/666M [00:05<00:10, 44.5MB/s]\u001b[A\n",
"articles_embeddings.pkl: 30%|████▍ | 200M/666M [00:06<00:09, 48.2MB/s]\u001b[A\n",
"medium_articles.csv: 18%|███▏ | 185M/1.04G [00:06<00:19, 43.7MB/s]\u001b[A\n",
"articles_embeddings.pkl: 31%|████▋ | 205M/666M [00:06<00:11, 39.0MB/s]\u001b[A\n",
"medium_articles.csv: 19%|███▎ | 195M/1.04G [00:06<00:27, 31.0MB/s]\u001b[A\n",
"articles_embeddings.pkl: 32%|████▋ | 210M/666M [00:06<00:17, 26.4MB/s]\u001b[A\n",
"articles_embeddings.pkl: 36%|█████▍ | 240M/666M [00:07<00:11, 37.8MB/s]\u001b[A\n",
"articles_embeddings.pkl: 38%|█████▊ | 256M/666M [00:07<00:09, 42.4MB/s]\u001b[A\n",
"articles_embeddings.pkl: 41%|██████▏ | 272M/666M [00:08<00:09, 42.8MB/s]\u001b[A\n",
"articles_embeddings.pkl: 44%|██████▌ | 292M/666M [00:08<00:07, 47.1MB/s]\u001b[A\n",
"medium_articles.csv: 25%|████▍ | 256M/1.04G [00:08<00:19, 40.3MB/s]\u001b[A\n",
"articles_embeddings.pkl: 46%|██████▊ | 304M/666M [00:08<00:08, 41.9MB/s]\u001b[A\n",
"articles_embeddings.pkl: 50%|███████▌ | 336M/666M [00:09<00:06, 47.9MB/s]\u001b[A\n",
"articles_embeddings.pkl: 53%|███████▉ | 352M/666M [00:09<00:06, 50.1MB/s]\u001b[A\n",
"articles_embeddings.pkl: 55%|████████▎ | 368M/666M [00:10<00:06, 47.3MB/s]\u001b[A\n",
"medium_articles.csv: 32%|█████▊ | 336M/1.04G [00:10<00:15, 46.1MB/s]\u001b[A\n",
"articles_embeddings.pkl: 60%|█████████ | 400M/666M [00:10<00:05, 47.6MB/s]\u001b[A\n",
"medium_articles.csv: 35%|██████▎ | 368M/1.04G [00:10<00:14, 47.8MB/s]\u001b[A\n",
"articles_embeddings.pkl: 62%|█████████▎ | 416M/666M [00:11<00:05, 44.7MB/s]\u001b[A\n",
"articles_embeddings.pkl: 65%|█████████▋ | 432M/666M [00:11<00:04, 47.4MB/s]\u001b[A\n",
"articles_embeddings.pkl: 70%|██████████▍ | 464M/666M [00:12<00:04, 49.1MB/s]\u001b[A\n",
"articles_embeddings.pkl: 72%|██████████▊ | 480M/666M [00:12<00:03, 48.2MB/s]\u001b[A\n",
"articles_embeddings.pkl: 75%|███████████▏ | 496M/666M [00:12<00:03, 47.4MB/s]\u001b[A\n",
"medium_articles.csv: 45%|████████ | 464M/1.04G [00:12<00:13, 44.2MB/s]\u001b[A\n",
"articles_embeddings.pkl: 77%|███████████▌ | 512M/666M [00:13<00:03, 44.4MB/s]\u001b[A\n",
"articles_embeddings.pkl: 79%|███████████▉ | 528M/666M [00:13<00:03, 38.0MB/s]\u001b[A\n",
"articles_embeddings.pkl: 82%|████████████▎ | 544M/666M [00:14<00:03, 40.4MB/s]\u001b[A\n",
"articles_embeddings.pkl: 84%|████████████▌ | 560M/666M [00:14<00:02, 41.1MB/s]\u001b[A\n",
"articles_embeddings.pkl: 87%|████████████▉ | 576M/666M [00:14<00:01, 46.1MB/s]\u001b[A\n",
"articles_embeddings.pkl: 89%|█████████████▎ | 592M/666M [00:15<00:01, 44.4MB/s]\u001b[A\n",
"medium_articles.csv: 55%|█████████▉ | 576M/1.04G [00:15<00:09, 48.7MB/s]\u001b[A\n",
"articles_embeddings.pkl: 91%|█████████████▋ | 608M/666M [00:15<00:01, 38.4MB/s]\u001b[A\n",
"articles_embeddings.pkl: 94%|██████████████ | 624M/666M [00:16<00:01, 38.8MB/s]\u001b[A\n",
"medium_articles.csv: 60%|██████████▊ | 624M/1.04G [00:16<00:08, 50.4MB/s]\u001b[A\n",
"articles_embeddings.pkl: 96%|██████████████▍| 640M/666M [00:16<00:00, 39.6MB/s]\u001b[A\n",
"articles_embeddings.pkl: 99%|██████████████▊| 656M/666M [00:16<00:00, 42.1MB/s]\u001b[A\n",
"articles_embeddings.pkl: 100%|███████████████| 666M/666M [00:17<00:00, 39.0MB/s]\u001b[A\n",
"\n",
"medium_articles.csv: 66%|███████████▉ | 688M/1.04G [00:17<00:06, 52.4MB/s]\u001b[A\n",
"\n",
"Upload 2 LFS files: 50%|████████████▌ | 1/2 [00:17<00:17, 17.43s/it]\u001b[A\u001b[A\n",
"medium_articles.csv: 68%|████████████▏ | 704M/1.04G [00:17<00:07, 43.1MB/s]\u001b[A\n",
"medium_articles.csv: 69%|████████████▍ | 720M/1.04G [00:18<00:07, 41.5MB/s]\u001b[A\n",
"medium_articles.csv: 71%|████████████▋ | 736M/1.04G [00:18<00:06, 46.2MB/s]\u001b[A\n",
"medium_articles.csv: 72%|████████████▉ | 752M/1.04G [00:18<00:06, 48.3MB/s]\u001b[A\n",
"medium_articles.csv: 74%|█████████████▎ | 768M/1.04G [00:19<00:05, 48.9MB/s]\u001b[A\n",
"medium_articles.csv: 75%|█████████████▌ | 784M/1.04G [00:19<00:05, 49.2MB/s]\u001b[A\n",
"medium_articles.csv: 77%|█████████████▊ | 800M/1.04G [00:19<00:05, 47.3MB/s]\u001b[A\n",
"medium_articles.csv: 78%|██████████████ | 816M/1.04G [00:20<00:04, 48.0MB/s]\u001b[A\n",
"medium_articles.csv: 80%|██████████████▎ | 832M/1.04G [00:20<00:04, 47.6MB/s]\u001b[A\n",
"medium_articles.csv: 81%|██████████████▋ | 848M/1.04G [00:20<00:03, 51.5MB/s]\u001b[A\n",
"medium_articles.csv: 83%|██████████████▉ | 864M/1.04G [00:21<00:03, 48.1MB/s]\u001b[A\n",
"medium_articles.csv: 84%|███████████████▏ | 880M/1.04G [00:21<00:03, 47.9MB/s]\u001b[A\n",
"medium_articles.csv: 86%|███████████████▍ | 896M/1.04G [00:21<00:03, 46.6MB/s]\u001b[A\n",
"medium_articles.csv: 87%|███████████████▋ | 912M/1.04G [00:22<00:02, 48.3MB/s]\u001b[A\n",
"medium_articles.csv: 89%|████████████████ | 928M/1.04G [00:22<00:02, 49.1MB/s]\u001b[A\n",
"medium_articles.csv: 91%|████████████████▎ | 944M/1.04G [00:22<00:02, 45.7MB/s]\u001b[A\n",
"medium_articles.csv: 92%|████████████████▌ | 960M/1.04G [00:23<00:01, 45.0MB/s]\u001b[A\n",
"medium_articles.csv: 94%|████████████████▊ | 976M/1.04G [00:23<00:01, 46.9MB/s]\u001b[A\n",
"medium_articles.csv: 95%|█████████████████▏| 992M/1.04G [00:23<00:01, 47.4MB/s]\u001b[A\n",
"medium_articles.csv: 97%|████████████████▍| 1.01G/1.04G [00:24<00:00, 47.9MB/s]\u001b[A\n",
"medium_articles.csv: 98%|████████████████▋| 1.02G/1.04G [00:24<00:00, 49.3MB/s]\u001b[A\n",
"medium_articles.csv: 100%|█████████████████| 1.04G/1.04G [00:24<00:00, 41.8MB/s]\u001b[A\n",
"\n",
"\n",
"Upload 2 LFS files: 100%|█████████████████████████| 2/2 [00:25<00:00, 12.59s/it]\u001b[A\u001b[A\n",
"https://huggingface.co/Mohamed-BC/articles_recommender_system/tree/main/.\n"
]
}
],
"source": [
"# Upload the working directory to the Space.\n",
"# --repo-type must be given explicitly: without it the CLI targets a *model* repo of the\n",
"# same name, which is where the previous run ended up (its result URL has no /spaces/ part).\n",
"!huggingface-cli upload Mohamed-BC/articles_recommender_system . --repo-type space"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}