Mohamed-BC committed on
Commit
66f5c36
1 Parent(s): 7ad11fc

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/medium_articles.csv filter=lfs diff=lfs merge=lfs -text
__pycache__/recommend.cpython-310.pyc ADDED
Binary file (974 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Streamlit app script
2
+ import streamlit as st
3
+ from recommend import recommend
4
# A simple function to check login credentials (for demonstration purposes)
def check_login(username, password):
    """Return True iff the supplied credentials match the demo account.

    NOTE(review): the account is hard-coded for demonstration only —
    real authentication must never ship like this.
    """
    expected = ("admin", "pass123")
    return (username, password) == expected
10
+
11
# Main application code
def _login_page():
    """Render the credential form and flip the session flag on success."""
    st.title("Login Page")
    username = st.text_input("Username")
    password = st.text_input("Password", type="password")
    if st.button("Login"):
        if check_login(username, password):
            # Mark the session as authenticated, then rerun so the
            # search page renders immediately.
            st.session_state.logged_in = True
            st.rerun()
        else:
            st.error("Invalid credentials. Please try again.")


def _search_page():
    """Render the search box and stream recommendations for a query."""
    st.title("Welcome :)!")
    search_col, button_col = st.columns([3, 1])
    with search_col:
        query = st.text_input(
            'Search here',
            placeholder="Describe what you're looking for",
            label_visibility="collapsed",
        )
    with button_col:
        clicked = st.button('Search')
    if clicked and query:
        with st.spinner('Searching...'):
            st.write_stream(recommend(query))
    # Logout button lives in the sidebar so it is always reachable.
    if st.sidebar.button("Logout"):
        st.session_state.logged_in = False
        st.rerun()


def main():
    """Streamlit entry point: show the login form or the search page.

    Login status must survive Streamlit's top-to-bottom reruns, so it
    is kept in session state rather than a local variable.
    """
    if "logged_in" not in st.session_state:
        st.session_state.logged_in = False

    if st.session_state.logged_in:
        _search_page()
    else:
        _login_page()


if __name__ == "__main__":
    main()
data/articles_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb9b2d170c8857dfb76178505ea4b1232d1a7c5fdd904d4d2cc5465879d96d0f
3
+ size 665668376
data/medium_articles.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba7b1022b2450cfcad0cdccae82ad29714e1fa8812f786fd01b302a7cb12a5c
3
+ size 1042340506
demo.ipynb ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/home/codespace/.python/current/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "from datasets import load_dataset"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "name": "stderr",
28
+ "output_type": "stream",
29
+ "text": [
30
+ "Downloading data: 100%|██████████| 1.74G/1.74G [00:27<00:00, 62.8MB/s]\n",
31
+ "Generating train split: 100%|██████████| 192363/192363 [00:31<00:00, 6170.02 examples/s]\n"
32
+ ]
33
+ }
34
+ ],
35
+ "source": [
36
+ "data = load_dataset(\"Mohamed-BC/Articles\")"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 4,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "app.py\tdemo.ipynb recommend.py requirements.txt user.py\n"
49
+ ]
50
+ }
51
+ ],
52
+ "source": [
53
+ "!ls"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 5,
59
+ "metadata": {},
60
+ "outputs": [],
61
+ "source": [
62
+ "!mkdir -p data"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 8,
68
+ "metadata": {},
69
+ "outputs": [
70
+ {
71
+ "name": "stdout",
72
+ "output_type": "stream",
73
+ "text": [
74
+ "Dataset URL: https://www.kaggle.com/datasets/fabiochiusano/medium-articles\n",
75
+ "License(s): CC0-1.0\n",
76
+ "Downloading medium-articles.zip to /workspaces/codespaces-blank\n",
77
+ " 99%|███████████████████████████████████████▊| 367M/369M [00:14<00:00, 42.9MB/s]\n",
78
+ "100%|████████████████████████████████████████| 369M/369M [00:14<00:00, 27.5MB/s]\n"
79
+ ]
80
+ }
81
+ ],
82
+ "source": [
83
+ "!kaggle datasets download -d fabiochiusano/medium-articles"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 9,
89
+ "metadata": {},
90
+ "outputs": [
91
+ {
92
+ "name": "stdout",
93
+ "output_type": "stream",
94
+ "text": [
95
+ "Archive: medium-articles.zip\n",
96
+ " inflating: data/medium_articles.csv \n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "!unzip medium-articles.zip -d data\n",
102
+ "!rm medium-articles.zip"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": 10,
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "name": "stdout",
112
+ "output_type": "stream",
113
+ "text": [
114
+ "Cloning into 'articles_embeddings'...\n",
115
+ "remote: Enumerating objects: 6, done.\u001b[K\n",
116
+ "remote: Counting objects: 100% (3/3), done.\u001b[K\n",
117
+ "remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
118
+ "remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
119
+ "Unpacking objects: 100% (6/6), 2.11 KiB | 1.06 MiB/s, done.\n"
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "!git clone https://huggingface.co/Mohamed-BC/articles_embeddings "
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 11,
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "!mv articles_embeddings/articles_embeddings.pkl data\n",
134
+ "!rm -rf articles_embeddings"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 12,
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "import pandas as pd\n",
144
+ "emb = pd.read_pickle('data/articles_embeddings.pkl')"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 14,
150
+ "metadata": {},
151
+ "outputs": [
152
+ {
153
+ "data": {
154
+ "text/plain": [
155
+ "(192363,)"
156
+ ]
157
+ },
158
+ "execution_count": 14,
159
+ "metadata": {},
160
+ "output_type": "execute_result"
161
+ }
162
+ ],
163
+ "source": [
164
+ "emb.shape"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 15,
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "from recommend import recommend"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 16,
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "name": "stderr",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "/home/codespace/.python/current/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
186
+ " warnings.warn(\n"
187
+ ]
188
+ },
189
+ {
190
+ "ename": "",
191
+ "evalue": "",
192
+ "output_type": "error",
193
+ "traceback": [
194
+ "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
195
+ "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
196
+ "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
197
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
198
+ ]
199
+ }
200
+ ],
201
+ "source": [
202
+ "query = \"How to train a model in PyTorch?\"\n",
203
+ "recommend(query=\"How to train a model in PyTorch?\")"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": 2,
209
+ "metadata": {},
210
+ "outputs": [
211
+ {
212
+ "name": "stdout",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "\n",
216
+ " _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|\n",
217
+ " _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n",
218
+ " _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|\n",
219
+ " _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n",
220
+ " _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|\n",
221
+ "\n",
222
+ " To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n",
223
+ "Enter your token (input will not be visible): Traceback (most recent call last):\n",
224
+ " File \"/home/codespace/.python/current/bin/huggingface-cli\", line 8, in <module>\n",
225
+ " sys.exit(main())\n",
226
+ " File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py\", line 51, in main\n",
227
+ " service.run()\n",
228
+ " File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/user.py\", line 98, in run\n",
229
+ " login(token=self.args.token, add_to_git_credential=self.args.add_to_git_credential)\n",
230
+ " File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/_login.py\", line 115, in login\n",
231
+ " interpreter_login(new_session=new_session, write_permission=write_permission)\n",
232
+ " File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/_login.py\", line 191, in interpreter_login\n",
233
+ " token = getpass(\"Enter your token (input will not be visible): \")\n",
234
+ " File \"/usr/local/python/3.10.13/lib/python3.10/getpass.py\", line 77, in unix_getpass\n",
235
+ " passwd = _raw_input(prompt, stream, input=input)\n",
236
+ " File \"/usr/local/python/3.10.13/lib/python3.10/getpass.py\", line 146, in _raw_input\n",
237
+ " line = input.readline()\n",
238
+ " File \"/usr/local/python/3.10.13/lib/python3.10/codecs.py\", line 319, in decode\n",
239
+ " def decode(self, input, final=False):\n",
240
+ "KeyboardInterrupt\n"
241
+ ]
242
+ }
243
+ ],
244
+ "source": [
245
+ "!huggingface-cli login"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 4,
251
+ "metadata": {},
252
+ "outputs": [
253
+ {
254
+ "name": "stdout",
255
+ "output_type": "stream",
256
+ "text": [
257
+ "\u001b[90mgit version 2.44.0\u001b[0m\n",
258
+ "\u001b[90mgit-lfs/3.5.1 (GitHub; linux amd64; go 1.21.8)\u001b[0m\n",
259
+ "\n",
260
+ "You are about to create \u001b[1mspaces/Mohamed-BC/articles_recommender_system\u001b[0m\n",
261
+ "Proceed? [Y/n] ^C\n",
262
+ "Traceback (most recent call last):\n",
263
+ " File \"/home/codespace/.python/current/bin/huggingface-cli\", line 8, in <module>\n",
264
+ " sys.exit(main())\n",
265
+ " File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py\", line 51, in main\n",
266
+ " service.run()\n",
267
+ " File \"/usr/local/python/3.10.13/lib/python3.10/site-packages/huggingface_hub/commands/user.py\", line 169, in run\n",
268
+ " choice = input(\"Proceed? [Y/n] \").lower()\n",
269
+ "KeyboardInterrupt\n"
270
+ ]
271
+ }
272
+ ],
273
+ "source": [
274
+ "!huggingface-cli repo create articles_recommender_system --type space"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 6,
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "name": "stdout",
284
+ "output_type": "stream",
285
+ "text": [
286
+ "Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.\n"
287
+ ]
288
+ }
289
+ ],
290
+ "source": [
291
+ "!huggingface-cli upload Mohamed-BC/articles_recommender_system ."
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "metadata": {},
298
+ "outputs": [],
299
+ "source": []
300
+ }
301
+ ],
302
+ "metadata": {
303
+ "kernelspec": {
304
+ "display_name": "Python 3",
305
+ "language": "python",
306
+ "name": "python3"
307
+ },
308
+ "language_info": {
309
+ "codemirror_mode": {
310
+ "name": "ipython",
311
+ "version": 3
312
+ },
313
+ "file_extension": ".py",
314
+ "mimetype": "text/x-python",
315
+ "name": "python",
316
+ "nbconvert_exporter": "python",
317
+ "pygments_lexer": "ipython3",
318
+ "version": "3.10.13"
319
+ }
320
+ },
321
+ "nbformat": 4,
322
+ "nbformat_minor": 2
323
+ }
recommend.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from scipy.spatial.distance import cosine
3
+ import numpy as np
4
+ import pandas as pd
5
+ from datasets import load_dataset
6
+ import pickle as pkl
7
def recommend(query, n=5, limit=1000):
    """Return the `n` articles most similar to a free-text query.

    Parameters
    ----------
    query : str
        Free-text description of what the user is looking for.
    n : int, optional
        Number of recommendations to return (default 5).
    limit : int, optional
        Only the first `limit` precomputed embeddings are searched
        (default 1000, preserving the previous hard-coded behaviour).

    Returns
    -------
    pandas.DataFrame
        The `n` best-matching rows of the articles dataset, ordered
        from most to least similar.
    """
    # Load the sentence-embedding model on CPU (no GPU assumed when serving).
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    # Load the articles and their precomputed embeddings.
    data = pd.read_csv('data/medium_articles.csv')
    # Close the pickle file deterministically instead of leaking the handle.
    with open('data/articles_embeddings.pkl', 'rb') as f:
        a_embeddings = pkl.load(f)
    # Encode the query into the same embedding space.
    q_embedding = model.encode(query)
    # cosine() is a distance; 1 - distance is cosine similarity.
    cos_sim = np.array([1 - cosine(q_embedding, emb) for emb in a_embeddings[:limit]])
    # argsort is ascending, so take the last n indices and reverse them
    # to return recommendations best-first.
    top_n = np.argsort(cos_sim)[-n:][::-1]
    return data.iloc[top_n]
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ sentence-transformers
5
+ datasets
6
+ huggingface-hub