Roshanik commited on
Commit
ae40f24
·
verified ·
1 Parent(s): cc5672b

Upload 8 files

Browse files
Files changed (8) hide show
  1. .env +1 -0
  2. Procfile.txt +1 -0
  3. app.py +50 -0
  4. data_preprocess.ipynb +869 -0
  5. final2.csv +0 -0
  6. functions.py +275 -0
  7. requirements.txt +9 -0
  8. resource.pkl +3 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GOOGLE_API_KEY="REPLACE_ME"  # SECURITY: a live API key was committed here — revoke/rotate it immediately and supply secrets via the environment, never version control
Procfile.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ web: sh setup.sh && streamlit run app.py
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Micro-Learning Dashboard — Streamlit entry point (setup section)."""
import pickle

import streamlit as st

from functions import recommend_from_dataset, summarize_and_generate

# Load the precomputed resource bundle (topic list etc.) once at startup.
# Fixed: use a context manager so the file handle is closed deterministically
# (the original `pickle.load(open(...))` leaked the handle).
with open("resource.pkl", "rb") as fh:
    resource = pickle.load(fh)


st.title("Micro-Learning Dashboard")
st.subheader('Your Ultimate Study Partner', divider="rainbow")
st.subheader('_AI is the_ :blue[Solution] :sunglasses:')


# Course picker fed from the dataset topics; index=None starts unselected so
# the placeholder text is shown until the user picks something.
option = st.selectbox(
    "",
    resource['topic'],
    index=None,
    placeholder="Select a Course...",  # fixed typo: was "Select contact Course..."
)

# Free-text alternative to the dropdown.
user_query = st.text_input("Or Manually Type Here...")
21
def display_output(text_summary, userinput):
    """Render the echoed query, the generated answer, and related links.

    ``text_summary`` is the dict returned by ``summarize_and_generate``;
    ``userinput`` is the raw query string the user submitted.
    """
    # Echo the user's query, then the model's generated answer.
    st.write("Your Query:", userinput)
    st.write("Micro AI: ")
    st.write(text_summary['generated_text'])

    # One section per recommendation, with each comma-separated URL bulleted.
    st.write("Related Links:")
    for entry in text_summary["related_links"]:
        heading = entry.get('topic', 'No topic available')
        urls = entry.get('link', 'No link available')
        if not isinstance(urls, str):
            # Non-string link payloads (e.g. NaN from pandas) get a fallback.
            st.write(heading + ": Link not available")
            continue
        st.write(heading + ":")
        for url in urls.split(", "):
            st.write("- " + url)
39
# Define the behavior when the user clicks the Submit button.
# A manually typed query takes priority over the dropdown selection
# (same precedence as the original if/elif chain, without the duplicated
# recommend/summarize/display body in each branch).
if st.button("Submit"):
    query = user_query or option
    if query:
        recommendations = recommend_from_dataset(query)
        text_summary = summarize_and_generate(query, recommendations)
        display_output(text_summary, query)
    else:
        st.write("Please select a course or enter a query.")
data_preprocess.ipynb ADDED
@@ -0,0 +1,869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "ecbb6eac",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from transformers import pipeline\n",
11
+ "from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "a4bac354",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import numpy as np\n",
23
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
24
+ "import nltk\n",
25
+ "from nltk.stem.porter import PorterStemmer\n",
26
+ "from nltk.stem import WordNetLemmatizer\n",
27
+ "import re\n",
28
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
29
+ "from fuzzywuzzy import fuzz\n",
30
+ "from sklearn.feature_extraction.text import TfidfVectorizer"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 47,
36
+ "id": "bfe7183c",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "\n",
41
+ "data3 = pd.read_csv('final2.csv')"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 5,
47
+ "id": "22f9643b",
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "name": "stdout",
52
+ "output_type": "stream",
53
+ "text": [
54
+ "<class 'pandas.core.frame.DataFrame'>\n",
55
+ "RangeIndex: 3720 entries, 0 to 3719\n",
56
+ "Data columns (total 6 columns):\n",
57
+ " # Column Non-Null Count Dtype \n",
58
+ "--- ------ -------------- ----- \n",
59
+ " 0 Unnamed: 0 3720 non-null int64 \n",
60
+ " 1 topic 3720 non-null object\n",
61
+ " 2 discription 1748 non-null object\n",
62
+ " 3 keyword 3204 non-null object\n",
63
+ " 4 Links 3720 non-null object\n",
64
+ " 5 level 3720 non-null object\n",
65
+ "dtypes: int64(1), object(5)\n",
66
+ "memory usage: 174.5+ KB\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "data3.info()"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 6,
77
+ "id": "6ef84197",
78
+ "metadata": {},
79
+ "outputs": [
80
+ {
81
+ "data": {
82
+ "text/html": [
83
+ "<div>\n",
84
+ "<style scoped>\n",
85
+ " .dataframe tbody tr th:only-of-type {\n",
86
+ " vertical-align: middle;\n",
87
+ " }\n",
88
+ "\n",
89
+ " .dataframe tbody tr th {\n",
90
+ " vertical-align: top;\n",
91
+ " }\n",
92
+ "\n",
93
+ " .dataframe thead th {\n",
94
+ " text-align: right;\n",
95
+ " }\n",
96
+ "</style>\n",
97
+ "<table border=\"1\" class=\"dataframe\">\n",
98
+ " <thead>\n",
99
+ " <tr style=\"text-align: right;\">\n",
100
+ " <th></th>\n",
101
+ " <th>Unnamed: 0</th>\n",
102
+ " <th>topic</th>\n",
103
+ " <th>discription</th>\n",
104
+ " <th>keyword</th>\n",
105
+ " <th>Links</th>\n",
106
+ " <th>level</th>\n",
107
+ " </tr>\n",
108
+ " </thead>\n",
109
+ " <tbody>\n",
110
+ " <tr>\n",
111
+ " <th>0</th>\n",
112
+ " <td>0</td>\n",
113
+ " <td>Java</td>\n",
114
+ " <td>Java is a general-purpose computer programming...</td>\n",
115
+ " <td>Java, James Gosling, website, wikipedia, docum...</td>\n",
116
+ " <td>website: https://oracle.com/java/, documentati...</td>\n",
117
+ " <td>beginner to advance</td>\n",
118
+ " </tr>\n",
119
+ " <tr>\n",
120
+ " <th>1</th>\n",
121
+ " <td>1</td>\n",
122
+ " <td>JavaScript</td>\n",
123
+ " <td>JavaScript (), often abbreviated as JS, is a h...</td>\n",
124
+ " <td>JavaScript, Brendan Eich, reference, wikipedia...</td>\n",
125
+ " <td>reference: https://www.w3schools.com/js/js_res...</td>\n",
126
+ " <td>beginner to advance</td>\n",
127
+ " </tr>\n",
128
+ " <tr>\n",
129
+ " <th>2</th>\n",
130
+ " <td>2</td>\n",
131
+ " <td>C</td>\n",
132
+ " <td>C (, as in the letter c) is a general-purpose,...</td>\n",
133
+ " <td>C, Dennis Ritchie, reference, wikipedia, docum...</td>\n",
134
+ " <td>reference: http://www.c4learn.com/c-programmin...</td>\n",
135
+ " <td>beginner to advance</td>\n",
136
+ " </tr>\n",
137
+ " <tr>\n",
138
+ " <th>3</th>\n",
139
+ " <td>3</td>\n",
140
+ " <td>Python</td>\n",
141
+ " <td>Python is a widely used high-level programming...</td>\n",
142
+ " <td>Python, Guido van Rossum, website, reference, ...</td>\n",
143
+ " <td>website: https://www.python.org/, reference: h...</td>\n",
144
+ " <td>beginner to advance</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>4</th>\n",
148
+ " <td>4</td>\n",
149
+ " <td>SQL</td>\n",
150
+ " <td>SQL ( ( listen) ESS-kew-EL or ( listen) SEE-k...</td>\n",
151
+ " <td>SQL, Donald D. Chamberlin and Raymond F. Boyce...</td>\n",
152
+ " <td>documentation: https://docs.data.world/documen...</td>\n",
153
+ " <td>beginner to advance</td>\n",
154
+ " </tr>\n",
155
+ " </tbody>\n",
156
+ "</table>\n",
157
+ "</div>"
158
+ ],
159
+ "text/plain": [
160
+ " Unnamed: 0 topic discription \\\n",
161
+ "0 0 Java Java is a general-purpose computer programming... \n",
162
+ "1 1 JavaScript JavaScript (), often abbreviated as JS, is a h... \n",
163
+ "2 2 C C (, as in the letter c) is a general-purpose,... \n",
164
+ "3 3 Python Python is a widely used high-level programming... \n",
165
+ "4 4 SQL SQL ( ( listen) ESS-kew-EL or ( listen) SEE-k... \n",
166
+ "\n",
167
+ " keyword \\\n",
168
+ "0 Java, James Gosling, website, wikipedia, docum... \n",
169
+ "1 JavaScript, Brendan Eich, reference, wikipedia... \n",
170
+ "2 C, Dennis Ritchie, reference, wikipedia, docum... \n",
171
+ "3 Python, Guido van Rossum, website, reference, ... \n",
172
+ "4 SQL, Donald D. Chamberlin and Raymond F. Boyce... \n",
173
+ "\n",
174
+ " Links level \n",
175
+ "0 website: https://oracle.com/java/, documentati... beginner to advance \n",
176
+ "1 reference: https://www.w3schools.com/js/js_res... beginner to advance \n",
177
+ "2 reference: http://www.c4learn.com/c-programmin... beginner to advance \n",
178
+ "3 website: https://www.python.org/, reference: h... beginner to advance \n",
179
+ "4 documentation: https://docs.data.world/documen... beginner to advance "
180
+ ]
181
+ },
182
+ "execution_count": 6,
183
+ "metadata": {},
184
+ "output_type": "execute_result"
185
+ }
186
+ ],
187
+ "source": [
188
+ "data3.head()"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": 9,
194
+ "id": "acf74e04",
195
+ "metadata": {},
196
+ "outputs": [
197
+ {
198
+ "name": "stdout",
199
+ "output_type": "stream",
200
+ "text": [
201
+ "<class 'pandas.core.frame.DataFrame'>\n",
202
+ "RangeIndex: 3720 entries, 0 to 3719\n",
203
+ "Data columns (total 6 columns):\n",
204
+ " # Column Non-Null Count Dtype \n",
205
+ "--- ------ -------------- ----- \n",
206
+ " 0 Unnamed: 0 3720 non-null int64 \n",
207
+ " 1 topic 3720 non-null string\n",
208
+ " 2 discription 1748 non-null string\n",
209
+ " 3 keyword 3720 non-null string\n",
210
+ " 4 Links 3720 non-null object\n",
211
+ " 5 level 3720 non-null string\n",
212
+ "dtypes: int64(1), object(1), string(4)\n",
213
+ "memory usage: 174.5+ KB\n"
214
+ ]
215
+ }
216
+ ],
217
+ "source": [
218
+ "data3['topic'] = data3.topic.astype(\"string\")\n",
219
+ "data3['discription'] = data3.discription.astype(\"string\")\n",
220
+ "data3['keyword'] = data3.keyword.astype(\"string\")\n",
221
+ "data3['level'] = data3.level.astype(\"string\")\n",
222
+ "data3.info()"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "markdown",
227
+ "id": "64f90df1",
228
+ "metadata": {},
229
+ "source": [
230
+ "# Data Cleaning Process\n",
231
+ "'\n",
232
+ "'\n"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 10,
238
+ "id": "b16989a3",
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "data3['tag'] = data3['discription'] + \" \" + data3['keyword'] +\" \" + data3['level']"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 11,
248
+ "id": "caa02729",
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": [
252
+ "def remove_symbols(text):\n",
253
+ " # Create a regular expression pattern to match unwanted symbols\n",
254
+ " pattern = r'[^\\w\\s]' # Matches characters that are not alphanumeric or whitespace\n",
255
+ " # Substitute matched symbols with an empty string\n",
256
+ " return re.sub(pattern, '', text.lower()) "
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 12,
262
+ "id": "a97fa574",
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "data": {
267
+ "text/html": [
268
+ "<div>\n",
269
+ "<style scoped>\n",
270
+ " .dataframe tbody tr th:only-of-type {\n",
271
+ " vertical-align: middle;\n",
272
+ " }\n",
273
+ "\n",
274
+ " .dataframe tbody tr th {\n",
275
+ " vertical-align: top;\n",
276
+ " }\n",
277
+ "\n",
278
+ " .dataframe thead th {\n",
279
+ " text-align: right;\n",
280
+ " }\n",
281
+ "</style>\n",
282
+ "<table border=\"1\" class=\"dataframe\">\n",
283
+ " <thead>\n",
284
+ " <tr style=\"text-align: right;\">\n",
285
+ " <th></th>\n",
286
+ " <th>Unnamed: 0</th>\n",
287
+ " <th>topic</th>\n",
288
+ " <th>discription</th>\n",
289
+ " <th>keyword</th>\n",
290
+ " <th>Links</th>\n",
291
+ " <th>level</th>\n",
292
+ " <th>tag</th>\n",
293
+ " </tr>\n",
294
+ " </thead>\n",
295
+ " <tbody>\n",
296
+ " <tr>\n",
297
+ " <th>0</th>\n",
298
+ " <td>0</td>\n",
299
+ " <td>Java</td>\n",
300
+ " <td>Java is a general-purpose computer programming...</td>\n",
301
+ " <td>Java, James Gosling, website, wikipedia, docum...</td>\n",
302
+ " <td>website: https://oracle.com/java/, documentati...</td>\n",
303
+ " <td>beginnertoadvance</td>\n",
304
+ " <td>java is a generalpurpose computer programming ...</td>\n",
305
+ " </tr>\n",
306
+ " <tr>\n",
307
+ " <th>1</th>\n",
308
+ " <td>1</td>\n",
309
+ " <td>JavaScript</td>\n",
310
+ " <td>JavaScript (), often abbreviated as JS, is a h...</td>\n",
311
+ " <td>JavaScript, Brendan Eich, reference, wikipedia...</td>\n",
312
+ " <td>reference: https://www.w3schools.com/js/js_res...</td>\n",
313
+ " <td>beginnertoadvance</td>\n",
314
+ " <td>javascript often abbreviated as js is a highl...</td>\n",
315
+ " </tr>\n",
316
+ " <tr>\n",
317
+ " <th>2</th>\n",
318
+ " <td>2</td>\n",
319
+ " <td>C</td>\n",
320
+ " <td>C (, as in the letter c) is a general-purpose,...</td>\n",
321
+ " <td>C, Dennis Ritchie, reference, wikipedia, docum...</td>\n",
322
+ " <td>reference: http://www.c4learn.com/c-programmin...</td>\n",
323
+ " <td>beginnertoadvance</td>\n",
324
+ " <td>c as in the letter c is a generalpurpose impe...</td>\n",
325
+ " </tr>\n",
326
+ " <tr>\n",
327
+ " <th>3</th>\n",
328
+ " <td>3</td>\n",
329
+ " <td>Python</td>\n",
330
+ " <td>Python is a widely used high-level programming...</td>\n",
331
+ " <td>Python, Guido van Rossum, website, reference, ...</td>\n",
332
+ " <td>website: https://www.python.org/, reference: h...</td>\n",
333
+ " <td>beginnertoadvance</td>\n",
334
+ " <td>python is a widely used highlevel programming ...</td>\n",
335
+ " </tr>\n",
336
+ " <tr>\n",
337
+ " <th>4</th>\n",
338
+ " <td>4</td>\n",
339
+ " <td>SQL</td>\n",
340
+ " <td>SQL ( ( listen) ESS-kew-EL or ( listen) SEE-k...</td>\n",
341
+ " <td>SQL, Donald D. Chamberlin and Raymond F. Boyce...</td>\n",
342
+ " <td>documentation: https://docs.data.world/documen...</td>\n",
343
+ " <td>beginnertoadvance</td>\n",
344
+ " <td>sql listen esskewel or listen seekwəl or ...</td>\n",
345
+ " </tr>\n",
346
+ " </tbody>\n",
347
+ "</table>\n",
348
+ "</div>"
349
+ ],
350
+ "text/plain": [
351
+ " Unnamed: 0 topic discription \\\n",
352
+ "0 0 Java Java is a general-purpose computer programming... \n",
353
+ "1 1 JavaScript JavaScript (), often abbreviated as JS, is a h... \n",
354
+ "2 2 C C (, as in the letter c) is a general-purpose,... \n",
355
+ "3 3 Python Python is a widely used high-level programming... \n",
356
+ "4 4 SQL SQL ( ( listen) ESS-kew-EL or ( listen) SEE-k... \n",
357
+ "\n",
358
+ " keyword \\\n",
359
+ "0 Java, James Gosling, website, wikipedia, docum... \n",
360
+ "1 JavaScript, Brendan Eich, reference, wikipedia... \n",
361
+ "2 C, Dennis Ritchie, reference, wikipedia, docum... \n",
362
+ "3 Python, Guido van Rossum, website, reference, ... \n",
363
+ "4 SQL, Donald D. Chamberlin and Raymond F. Boyce... \n",
364
+ "\n",
365
+ " Links level \\\n",
366
+ "0 website: https://oracle.com/java/, documentati... beginnertoadvance \n",
367
+ "1 reference: https://www.w3schools.com/js/js_res... beginnertoadvance \n",
368
+ "2 reference: http://www.c4learn.com/c-programmin... beginnertoadvance \n",
369
+ "3 website: https://www.python.org/, reference: h... beginnertoadvance \n",
370
+ "4 documentation: https://docs.data.world/documen... beginnertoadvance \n",
371
+ "\n",
372
+ " tag \n",
373
+ "0 java is a generalpurpose computer programming ... \n",
374
+ "1 javascript often abbreviated as js is a highl... \n",
375
+ "2 c as in the letter c is a generalpurpose impe... \n",
376
+ "3 python is a widely used highlevel programming ... \n",
377
+ "4 sql listen esskewel or listen seekwəl or ... "
378
+ ]
379
+ },
380
+ "execution_count": 12,
381
+ "metadata": {},
382
+ "output_type": "execute_result"
383
+ }
384
+ ],
385
+ "source": [
386
+ "data3['tag'] = data3['tag'].fillna('')\n",
387
+ "data3['tag'] = data3['tag'].apply(remove_symbols)\n",
388
+ "data3['level'] = data3['level'].apply(lambda x: x.replace(\" \",\"\"))\n",
389
+ "data3['keyword'] = data3['keyword'].fillna('')\n",
390
+ "data3.head()"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 13,
396
+ "id": "a5a4f1ba",
397
+ "metadata": {},
398
+ "outputs": [
399
+ {
400
+ "data": {
401
+ "text/plain": [
402
+ "'java is a generalpurpose computer programming language that is concurrent classbased objectoriented and specifically designed to have as few implementation dependencies as possible it is intended to let application developers write once run anywhere wora meaning that compiled java code can run on all platforms that support java without the need for recompilation java applications are typically compiled to bytecode that can run on any java virtual machine jvm regardless of computer architecture as of 2016 java is one of the most popular programming languages in use particularly for clientserver web applications with a reported 9 million developers java was originally developed by james gosling at sun microsystems which has since been acquired by oracle corporation and released in 1995 as a core component of sun microsystems java platform the language derives much of its syntax from c and c but it has fewer lowlevel facilities than either of them the original and reference implementation java compilers virtual machines and class libraries were originally released by sun under proprietary licenses as of may 2007 in compliance with the specifications of the java community process sun relicensed most of its java technologies under the gnu general public license others have also developed alternative implementations of these sun technologies such as the gnu compiler for java bytecode compiler gnu classpath standard libraries and icedteaweb browser plugin for applets the latest version is java 9 released on september 21 2017 and is one of the two versions currently supported for free by oracle versions earlier than java 8 are supported by companies on a commercial basis eg by oracle back to java 6 as of october 2017 while they still highly recommend that you uninstall prejava 8 from at least windows computers java james gosling website wikipedia document united states beginnertoadvance'"
403
+ ]
404
+ },
405
+ "execution_count": 13,
406
+ "metadata": {},
407
+ "output_type": "execute_result"
408
+ }
409
+ ],
410
+ "source": [
411
+ "data3['tag'][0]"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "markdown",
416
+ "id": "efb5aaba",
417
+ "metadata": {},
418
+ "source": [
419
+ "# Convert tag columns into vector "
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 14,
425
+ "id": "86f2a927",
426
+ "metadata": {},
427
+ "outputs": [],
428
+ "source": [
429
+ "cv = CountVectorizer( max_features = 5000, stop_words = 'english')\n",
430
+ "vector = cv.fit_transform(data3['tag']).toarray()"
431
+ ]
432
+ },
433
+ {
434
+ "cell_type": "code",
435
+ "execution_count": 15,
436
+ "id": "b99539f9",
437
+ "metadata": {},
438
+ "outputs": [
439
+ {
440
+ "data": {
441
+ "text/plain": [
442
+ "array([0, 0, 0, ..., 0, 0, 0], dtype=int64)"
443
+ ]
444
+ },
445
+ "execution_count": 15,
446
+ "metadata": {},
447
+ "output_type": "execute_result"
448
+ }
449
+ ],
450
+ "source": [
451
+ "vector[0]"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 16,
457
+ "id": "6be0d7ec",
458
+ "metadata": {},
459
+ "outputs": [
460
+ {
461
+ "data": {
462
+ "text/plain": [
463
+ "array(['10', '100', '1000', ..., 'λprolog', 'λx', 'μc'], dtype=object)"
464
+ ]
465
+ },
466
+ "execution_count": 16,
467
+ "metadata": {},
468
+ "output_type": "execute_result"
469
+ }
470
+ ],
471
+ "source": [
472
+ "cv.get_feature_names_out()"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "markdown",
477
+ "id": "019ce68a",
478
+ "metadata": {},
479
+ "source": [
480
+ "# Stemming And Lemmatization Process"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 18,
486
+ "id": "be45a6b8",
487
+ "metadata": {},
488
+ "outputs": [],
489
+ "source": [
490
+ "ps = PorterStemmer()"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": 30,
496
+ "id": "3635f58c",
497
+ "metadata": {},
498
+ "outputs": [],
499
+ "source": [
500
+ "def preprocess_query(query):\n",
501
+ " \n",
502
+ " # Lowercase the query\n",
503
+ " cleaned_query = query.lower()\n",
504
+ "\n",
505
+ " # Remove punctuation (adjust as needed)\n",
506
+ " import string\n",
507
+ " punctuation = string.punctuation\n",
508
+ " cleaned_query = ''.join([char for char in cleaned_query if char not in punctuation])\n",
509
+ "\n",
510
+ " # Remove stop words (optional, replace with your stop word list)\n",
511
+ " stop_words = [\"the\", \"a\", \"is\", \"in\", \"of\"]\n",
512
+ " cleaned_query = ' '.join([word for word in cleaned_query.split() if word not in stop_words])\n",
513
+ "\n",
514
+ " # Stemming\n",
515
+ " ps = PorterStemmer()\n",
516
+ " cleaned_query = ' '.join([ps.stem(word) for word in cleaned_query.split()])\n",
517
+ "\n",
518
+ " # Lemmatization\n",
519
+ " wnl = WordNetLemmatizer()\n",
520
+ " cleaned_query = ' '.join([wnl.lemmatize(word) for word in cleaned_query.split()])\n",
521
+ "\n",
522
+ " return cleaned_query"
523
+ ]
524
+ },
525
+ {
526
+ "cell_type": "code",
527
+ "execution_count": 32,
528
+ "id": "2787d4d3",
529
+ "metadata": {},
530
+ "outputs": [
531
+ {
532
+ "data": {
533
+ "text/plain": [
534
+ "'talk'"
535
+ ]
536
+ },
537
+ "execution_count": 32,
538
+ "metadata": {},
539
+ "output_type": "execute_result"
540
+ }
541
+ ],
542
+ "source": [
543
+ "preprocess_query('talked')"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": 31,
549
+ "id": "6b8326d6",
550
+ "metadata": {},
551
+ "outputs": [
552
+ {
553
+ "data": {
554
+ "text/plain": [
555
+ "'java jame gosl websit wikipedia document unit state beginnertoadv'"
556
+ ]
557
+ },
558
+ "execution_count": 31,
559
+ "metadata": {},
560
+ "output_type": "execute_result"
561
+ }
562
+ ],
563
+ "source": [
564
+ "preprocess_query('java james gosling website wikipedia document united states beginnertoadvance')"
565
+ ]
566
+ },
567
+ {
568
+ "cell_type": "code",
569
+ "execution_count": 23,
570
+ "id": "02ff3f52",
571
+ "metadata": {},
572
+ "outputs": [
573
+ {
574
+ "data": {
575
+ "text/plain": [
576
+ "0 java is a generalpurpos comput program languag...\n",
577
+ "1 javascript often abbrevi as js is a highlevel ...\n",
578
+ "2 c as in the letter c is a generalpurpos imper ...\n",
579
+ "3 python is a wide use highlevel program languag...\n",
580
+ "4 sql listen esskewel or listen seekwəl or skwee...\n",
581
+ " ... \n",
582
+ "3715 understandingtheprofessionaldataengineercertif...\n",
583
+ "3716 atourofgooglecloudhandsonlab machinelearningen...\n",
584
+ "3717 introductiontoaiandmachinelearningongoogleclou...\n",
585
+ "3718 introductiontoaiandmachinelearningongoogleclou...\n",
586
+ "3719 aifound machinelearningengineerlearningpathweb...\n",
587
+ "Name: tag, Length: 3720, dtype: object"
588
+ ]
589
+ },
590
+ "execution_count": 23,
591
+ "metadata": {},
592
+ "output_type": "execute_result"
593
+ }
594
+ ],
595
+ "source": [
596
+ "data3['tag'].apply(preprocess_query) # apply on tag columns ('stem' was undefined; preprocess_query is the helper defined above) "
597
+ ]
598
+ },
599
+ {
600
+ "cell_type": "markdown",
601
+ "id": "66adf3fd",
602
+ "metadata": {},
603
+ "source": [
604
+ "# Find Similarity score for finding most related topic from dataset"
605
+ ]
606
+ },
607
+ {
608
+ "cell_type": "code",
609
+ "execution_count": 24,
610
+ "id": "33126518",
611
+ "metadata": {},
612
+ "outputs": [],
613
+ "source": [
614
+ "similar = cosine_similarity(vector)"
615
+ ]
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": 27,
620
+ "id": "e1f7379a",
621
+ "metadata": {},
622
+ "outputs": [
623
+ {
624
+ "data": {
625
+ "text/plain": [
626
+ "[(1, 0.9999999999999998),\n",
627
+ " (40, 0.4543441112511213),\n",
628
+ " (350, 0.445852828483904),\n",
629
+ " (134, 0.4049985302736412),\n",
630
+ " (1485, 0.3754717312648463)]"
631
+ ]
632
+ },
633
+ "execution_count": 27,
634
+ "metadata": {},
635
+ "output_type": "execute_result"
636
+ }
637
+ ],
638
+ "source": [
639
+ "sorted(list(enumerate(similar[1])),reverse = True, key = lambda x: x[1])[0:5]"
640
+ ]
641
+ },
642
+ {
643
+ "cell_type": "code",
644
+ "execution_count": 29,
645
+ "id": "084d898b",
646
+ "metadata": {},
647
+ "outputs": [],
648
+ "source": [
649
+ "summarizer = pipeline(\"summarization\", model=\"facebook/bart-base\")\n",
650
+ "text_generator = pipeline(\"text-generation\", model=\"gpt2\")"
651
+ ]
652
+ },
653
+ {
654
+ "cell_type": "code",
655
+ "execution_count": 34,
656
+ "id": "0197db1f",
657
+ "metadata": {},
658
+ "outputs": [],
659
+ "source": [
660
+ "documents = []\n",
661
+ "for index, row in data3.iterrows():\n",
662
+ " topic_description = preprocess_query(row[\"topic\"]) \n",
663
+ " keywords = preprocess_query(row[\"keyword\"]) \n",
664
+ " combined_text = f\"{topic_description} {keywords}\" # Combine for TF-IDF\n",
665
+ " documents.append(combined_text)\n"
666
+ ]
667
+ },
668
+ {
669
+ "cell_type": "code",
670
+ "execution_count": 35,
671
+ "id": "d80d5e6f",
672
+ "metadata": {},
673
+ "outputs": [],
674
+ "source": [
675
+ "# Create TF-IDF vectorizer\n",
676
+ "vectorizer = TfidfVectorizer()\n",
677
+ "\n",
678
+ "# Fit the vectorizer on the documents\n",
679
+ "document_vectors = vectorizer.fit_transform(documents)\n",
680
+ "\n",
681
+ "def recommend_from_dataset(query):\n",
682
+ " \n",
683
+ " cleaned_query = preprocess_query(query)\n",
684
+ " query_vector = vectorizer.transform([cleaned_query])\n",
685
+ "\n",
686
+ " # Calculate cosine similarity between query and documents\n",
687
+ " cosine_similarities = cosine_similarity(query_vector, document_vectors)\n",
688
+ " similarity_scores = cosine_similarities.flatten()\n",
689
+ "\n",
690
+ " # Sort documents based on similarity scores\n",
691
+ " sorted_results = sorted(zip(similarity_scores, data3.index, range(len(documents))), reverse=True)\n",
692
+ "\n",
693
+ " # Return top N recommendations with scores, topic names, and links (if available)\n",
694
+ " top_n_results = sorted_results[:5] \n",
695
+ " recommendations = []\n",
696
+ " for result in top_n_results:\n",
697
+ " score = result[0]\n",
698
+ " document_id = result[1]\n",
699
+ " topic_name = data3.loc[document_id, \"topic\"] \n",
700
+ " link = data3.loc[document_id, \"Links\"] if \"Links\" in data3.columns else \"No link available\" \n",
701
+ " if score >= 0.3:\n",
702
+ " recommendations.append({\"topic_name\": topic_name, \"link\": link, \"score\": score})\n",
703
+ " return recommendations\n"
704
+ ]
705
+ },
706
+ {
707
+ "cell_type": "code",
708
+ "execution_count": 36,
709
+ "id": "e56ccfc2",
710
+ "metadata": {},
711
+ "outputs": [],
712
+ "source": [
713
+ "def fine_tune_model(model_name, train_dataset, validation_dataset, epochs=3):\n",
714
+ " # Load model and tokenizer\n",
715
+ " model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
716
+ " tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
717
+ "\n",
718
+ " # Define training arguments (adjust parameters as needed)\n",
719
+ " training_args = TrainingArguments(\n",
720
+ " output_dir=\"./results\", # Adjust output directory\n",
721
+ " per_device_train_batch_size=8,\n",
722
+ " per_device_eval_batch_size=8,\n",
723
+ " num_train_epochs=epochs,\n",
724
+ " save_steps=10_000,\n",
725
+ " )\n",
726
+ "\n",
727
+ " # Create a Trainer instance for fine-tuning\n",
728
+ " trainer = Trainer(\n",
729
+ " model=model,\n",
730
+ " args=training_args,\n",
731
+ " train_dataset=train_dataset,\n",
732
+ " eval_dataset=validation_dataset,\n",
733
+ " tokenizer=tokenizer,\n",
734
+ " )\n",
735
+ "\n",
736
+ " # Train the model\n",
737
+ " trainer.train()\n",
738
+ "\n",
739
+ " return model"
740
+ ]
741
+ },
742
+ {
743
+ "cell_type": "code",
744
+ "execution_count": 39,
745
+ "id": "9c1c02c9",
746
+ "metadata": {},
747
+ "outputs": [],
748
+ "source": [
749
+ "train_dataset = ... # Prepare your training dataset\n",
750
+ "validation_dataset = ... # Prepare your validation dataset\n",
751
+ "\n",
752
+ "# Fine-tune the model (replace model name if needed)\n",
753
+ "fine_tuned_model = fine_tune_model(\"facebook/bart-base\", train_dataset, validation_dataset)\n",
754
+ "\n",
755
+ "# Update summarization pipeline with the fine-tuned model\n",
756
+ "summarizer1 = pipeline(\"text-generation\", model=fine_tuned_model, tokenizer=fine_tuned_model.tokenizer)\n"
757
+ ]
758
+ },
759
+ {
760
+ "cell_type": "code",
761
+ "execution_count": 45,
762
+ "id": "49baeaf5",
763
+ "metadata": {},
764
+ "outputs": [],
765
+ "source": [
766
+ "def summarize_and_generate(user_query, recommendations):\n",
767
+ " \n",
768
+ " # Summarize the user query\n",
769
+ " query_summary = summarizer(user_query, max_length=100, truncation=True)[0][\"summary_text\"]\n",
770
+ "\n",
771
+ " # Generate creative text related to the query\n",
772
+ " generated_text = text_generator(f\"Exploring the concept of {user_query}\", max_length=100, num_return_sequences=1)[0][\"generated_text\"]\n",
773
+ "\n",
774
+ " # Extract related links with scores\n",
775
+ " related_links = []\n",
776
+ " for recommendation in recommendations:\n",
777
+ " related_links.append({\"topic\": recommendation[\"topic_name\"], \"link\": recommendation[\"link\"], \"score\": recommendation[\"score\"]})\n",
778
+ "\n",
779
+ " return {\n",
780
+ " \"query_summary\": query_summary.strip(),\n",
781
+ " \"generated_text\": generated_text.strip(),\n",
782
+ " \"related_links\": related_links\n",
783
+ " }"
784
+ ]
785
+ },
786
+ {
787
+ "cell_type": "code",
788
+ "execution_count": 46,
789
+ "id": "fb9e58cc",
790
+ "metadata": {},
791
+ "outputs": [
792
+ {
793
+ "name": "stderr",
794
+ "output_type": "stream",
795
+ "text": [
796
+ "Your max_length is set to 100, but you input_length is only 9. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)\n",
797
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
798
+ ]
799
+ },
800
+ {
801
+ "name": "stdout",
802
+ "output_type": "stream",
803
+ "text": [
804
+ "Query Summary: java by james goslinjames groslin\n",
805
+ "Creative Text: Exploring the concept of java by james goslin is an impressive effort at the best of times and I'm very impressed by how well this was done. The code looks quite simple for simple purposes — there are only two basic methods, call() and destroy(). These two methods are used by most of the java libraries, so any Java that relies on call() or destroy() should use a proper method of your choice as well. Also, the code uses a single method, so that\n",
806
+ "Some Related Links for your query:\n",
807
+ "- Java:\n",
808
+ " website: https://oracle.com/java/, documentation: https://docs.oracle.com/en/java/, wikipedia: https://en.wikipedia.org/wiki/Java_(programming_language) : \n",
809
+ " Score: 0.625462748622542\n",
810
+ "- Java Properties:\n",
811
+ " wikipedia: https://en.wikipedia.org/wiki/.properties : \n",
812
+ " Score: 0.3952596829701199\n",
813
+ "- Java Bytecode:\n",
814
+ " documentation: https://docs.oracle.com/javase/specs/jvms/se7/html/, wikipedia: https://en.wikipedia.org/wiki/Java_bytecode : \n",
815
+ " Score: 0.38255306128391625\n",
816
+ "- Query by Example:\n",
817
+ " reference: https://semanticscholar.org/paper/f320e453ae65ddf0a3789f4383fa164481c7a8b3, wikipedia: https://en.wikipedia.org/wiki/Query_by_Example : \n",
818
+ " Score: 0.3726562653850712\n",
819
+ "- Join Java:\n",
820
+ " wikipedia: https://en.wikipedia.org/wiki/Join_Java : \n",
821
+ " Score: 0.3143513411797295\n"
822
+ ]
823
+ }
824
+ ],
825
+ "source": [
826
+ "user_query = \"java by james goslin\"\n",
827
+ "recommendations = recommend_from_dataset(user_query)\n",
828
+ "\n",
829
+ "# Get the summary, generated text, and related links\n",
830
+ "results = summarize_and_generate(user_query, recommendations)\n",
831
+ "\n",
832
+ "print(f\"Query Summary: {results['query_summary']}\")\n",
833
+ "print(f\"Creative Text: {results['generated_text']}\")\n",
834
+ "print(\"Some Related Links for your query:\")\n",
835
+ "for link in results[\"related_links\"]:\n",
836
+ " print(f\"- {link['topic']}:\\n {link['link']} : \\n Score: {link['score']}\") #(Score: {link['score']})"
837
+ ]
838
+ },
839
+ {
840
+ "cell_type": "code",
841
+ "execution_count": null,
842
+ "id": "46535752",
843
+ "metadata": {},
844
+ "outputs": [],
845
+ "source": []
846
+ }
847
+ ],
848
+ "metadata": {
849
+ "kernelspec": {
850
+ "display_name": "Python 3 (ipykernel)",
851
+ "language": "python",
852
+ "name": "python3"
853
+ },
854
+ "language_info": {
855
+ "codemirror_mode": {
856
+ "name": "ipython",
857
+ "version": 3
858
+ },
859
+ "file_extension": ".py",
860
+ "mimetype": "text/x-python",
861
+ "name": "python",
862
+ "nbconvert_exporter": "python",
863
+ "pygments_lexer": "ipython3",
864
+ "version": "3.10.9"
865
+ }
866
+ },
867
+ "nbformat": 4,
868
+ "nbformat_minor": 5
869
+ }
final2.csv ADDED
The diff for this file is too large to render. See raw diff
 
functions.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM
3
+
4
+
5
+ # In[2]:
6
+
7
+ import pandas as pd
8
+ import pickle
9
+ import streamlit as st
10
+ import numpy as np
11
+ from sklearn.feature_extraction.text import CountVectorizer
12
+ import nltk
13
+ from nltk.stem.porter import PorterStemmer
14
+ from nltk.stem import WordNetLemmatizer
15
+ import re
16
+ from sklearn.metrics.pairwise import cosine_similarity
17
+ from sklearn.feature_extraction.text import TfidfVectorizer
18
+
19
+
20
+ # In[47]:
21
+
22
+
23
# Load the course/topic dataset that drives the recommender.
data3 = pd.read_csv('final2.csv')


# NOTE(review): .info()/.head() run (and print) on import — these look like
# notebook leftovers; confirm they are wanted in a module imported by app.py.
data3.info()


data3.head()


# Force pandas' string dtype on the text columns (they may load as object).
# NOTE: 'discription' is the actual (misspelled) column name in final2.csv.
data3['topic'] = data3.topic.astype("string")
data3['discription'] = data3.discription.astype("string")
data3['keyword'] = data3.keyword.astype("string")
data3['level'] = data3.level.astype("string")
data3.info()


# Combine description, keywords and level into one free-text "tag" field;
# this is the text that gets vectorised for topic-to-topic similarity.
data3['tag'] = data3['discription'] + " " + data3['keyword'] +" " + data3['level']
55
+
56
+
57
+ # In[11]:
58
+
59
+
60
def remove_symbols(text):
    """Lowercase *text* and strip every character that is neither
    alphanumeric/underscore nor whitespace."""
    lowered = text.lower()
    # \w matches word characters, \s whitespace; anything else is dropped.
    return re.sub(r'[^\w\s]', '', lowered)
65
+
66
+
67
+ # In[12]:
68
+
69
+
70
# Vectorisers cannot handle NaN, so blank out missing text first.
data3['tag'] = data3['tag'].fillna('')
# Strip punctuation / symbols and lowercase the combined tag text.
data3['tag'] = data3['tag'].apply(remove_symbols)
# Collapse multi-word levels (e.g. "Beginner Level") into a single token.
data3['level'] = data3['level'].apply(lambda x: x.replace(" ",""))
data3['keyword'] = data3['keyword'].fillna('')
data3.head()
75
+
76
+
77
+
78
+ # # Convert tag columns into vector
79
+
80
+ # In[14]:
81
+
82
+
83
# Bag-of-words representation of the 'tag' column (top 5000 terms, English
# stop words removed); feeds the topic-to-topic cosine-similarity matrix below.
cv = CountVectorizer( max_features = 5000, stop_words = 'english')
vector = cv.fit_transform(data3['tag']).toarray()


# Module-level stemmer (note: preprocess_query creates its own instance).
ps = PorterStemmer()
91
+
92
+
93
+ # In[30]:
94
+
95
+
96
def preprocess_query(query):
    """Normalise a free-text query for TF-IDF matching.

    Pipeline: lowercase -> strip punctuation -> drop a small ad-hoc
    stop-word list -> stem -> lemmatise each remaining token.

    Parameters
    ----------
    query : str
        Raw user text.

    Returns
    -------
    str
        Space-joined, normalised tokens.
    """
    import string

    # Remove punctuation in one C-level translate pass instead of a
    # per-character filter.
    cleaned = query.lower().translate(str.maketrans('', '', string.punctuation))

    # Small ad-hoc stop-word list; a set gives O(1) membership tests
    # (the original used a list).
    stop_words = {"the", "a", "is", "in", "of"}

    ps = PorterStemmer()
    wnl = WordNetLemmatizer()

    # Stem then lemmatise each token in a single pass. The original
    # re-joined and re-split between the two steps, which is equivalent
    # because stems never contain whitespace.
    tokens = [
        wnl.lemmatize(ps.stem(word))
        for word in cleaned.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)
119
+
120
+
121
+
122
+ # In[31]:
123
+
124
+
125
+ # # Find Similarity score for finding most related topic from dataset
126
+
127
+ # In[24]:
128
+
129
+
130
# Pairwise topic-to-topic similarity over the count vectors.
# NOTE(review): 'similar' appears unused by the functions below (the
# recommender uses the TF-IDF path instead) — likely a notebook leftover.
similar = cosine_similarity(vector)
131
+
132
+
133
+ # In[27]:
134
+
135
+
136
+ # sorted(list(enumerate(similar[1])),reverse = True, key = lambda x: x[1])[0:5]
137
+
138
+
139
+ # In[29]:
140
+
141
+
142
# Hugging Face pipelines used by summarize_and_generate (downloaded on import).
# NOTE(review): facebook/bart-base is a base checkpoint, not one fine-tuned
# for summarisation — confirm whether a summarisation checkpoint was intended.
summarizer = pipeline("summarization", model="facebook/bart-base")
text_generator = pipeline("text-generation", model="gpt2")
144
+
145
+
146
+ # In[34]:
147
+
148
+
149
# One TF-IDF "document" per dataset row: the preprocessed topic text
# followed by the preprocessed keywords.
documents = [
    preprocess_query(row["topic"]) + " " + preprocess_query(row["keyword"])
    for _, row in data3.iterrows()
]
155
+
156
+
157
+ # In[35]:
158
+
159
+
160
# TF-IDF space over the preprocessed topic+keyword documents; user queries
# are projected into this same space by recommend_from_dataset.
vectorizer = TfidfVectorizer()

# Fit once at import time so every query reuses the same vocabulary/IDF.
document_vectors = vectorizer.fit_transform(documents)
165
+
166
def recommend_from_dataset(query):
    """Return up to 5 dataset topics most similar to *query*.

    The query is normalised with ``preprocess_query`` and compared against
    the precomputed TF-IDF ``document_vectors`` via cosine similarity.

    Parameters
    ----------
    query : str
        Raw user text.

    Returns
    -------
    list[dict]
        Each dict has ``topic_name``, ``link`` and ``score`` keys; only
        matches with a similarity of at least 0.3 are kept.
    """
    cleaned_query = preprocess_query(query)
    query_vector = vectorizer.transform([cleaned_query])

    # Cosine similarity of the query against every document (1 x n_docs).
    similarity_scores = cosine_similarity(query_vector, document_vectors).flatten()

    # Rank documents by score only. (The original sorted raw tuples, which
    # also compared dataframe indices on ties — unnecessary and fragile.)
    ranked = sorted(
        zip(similarity_scores, data3.index),
        key=lambda pair: pair[0],
        reverse=True,
    )

    has_links = "Links" in data3.columns
    recommendations = []
    for score, document_id in ranked[:5]:
        if score < 0.3:  # drop weak matches below the relevance threshold
            continue
        recommendations.append({
            "topic_name": data3.loc[document_id, "topic"],
            "link": data3.loc[document_id, "Links"] if has_links else "No link available",
            # Expose the similarity so downstream consumers (e.g. the
            # notebook demo, which prints link['score']) can display it.
            # Extra key is backward-compatible for existing callers.
            "score": float(score),
        })
    return recommendations
189
+
190
+
191
+ # In[36]:
192
+
193
+
194
+ # def fine_tune_model(model_name, train_dataset, validation_dataset, epochs=3):
195
+ # # Load model and tokenizer
196
+ # model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
197
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
198
+
199
+ # # Define training arguments (adjust parameters as needed)
200
+ # training_args = TrainingArguments(
201
+ # output_dir="./results", # Adjust output directory
202
+ # per_device_train_batch_size=8,
203
+ # per_device_eval_batch_size=8,
204
+ # num_train_epochs=epochs,
205
+ # save_steps=10_000,
206
+ # )
207
+
208
+ # # Create a Trainer instance for fine-tuning
209
+ # trainer = Trainer(
210
+ # model=model,
211
+ # args=training_args,
212
+ # train_dataset=train_dataset,
213
+ # eval_dataset=validation_dataset,
214
+ # tokenizer=tokenizer,
215
+ # )
216
+
217
+ # # Train the model
218
+ # trainer.train()
219
+
220
+ # return model
221
+
222
+
223
+ # In[39]:
224
+
225
+
226
+ # train_dataset = ... # Prepare your training dataset
227
+ # validation_dataset = ... # Prepare your validation dataset
228
+
229
+ # # Fine-tune the model (replace model name if needed)
230
+ # fine_tuned_model = fine_tune_model("facebook/bart-base", train_dataset, validation_dataset)
231
+
232
+ # # Update summarization pipeline with the fine-tuned model
233
+ # summarizer1 = pipeline("text-generation", model=fine_tuned_model, tokenizer=fine_tuned_model.tokenizer)
234
+
235
+
236
+ # In[45]:
237
+
238
+
239
def summarize_and_generate(user_query, recommendations):
    """Summarise the user query, generate related text, and collect links.

    Parameters
    ----------
    user_query : str
        Raw user text.
    recommendations : list[dict]
        Output of ``recommend_from_dataset`` (dicts with ``topic_name``,
        ``link`` and optionally ``score``).

    Returns
    -------
    dict
        Keys ``query_summary``, ``generated_text`` and ``related_links``
        (each related link carries ``topic``, ``link`` and ``score``).
    """
    # Summarise the user query (truncation guards against over-long input).
    query_summary = summarizer(user_query, max_length=200, truncation=True)[0]["summary_text"]

    # Generate creative text related to the query; only the first of the
    # three returned sequences is used.
    generated_text = text_generator(f"Exploring the concept of {user_query}", max_length=200, num_return_sequences=3)[0]["generated_text"]

    # Pass the similarity score through so callers that print it (the
    # notebook demo reads link['score']) don't hit a KeyError. Use .get so
    # older recommendation dicts without a score still work.
    related_links = [
        {
            "topic": recommendation["topic_name"],
            "link": recommendation["link"],
            "score": recommendation.get("score"),
        }
        for recommendation in recommendations
    ]

    return {
        "query_summary": query_summary.strip(),
        "generated_text": generated_text.strip(),
        "related_links": related_links,
    }
257
+
258
+
259
+
260
+ # In[46]:
261
+
262
+ # user_query = "java "
263
+ # recommendations = recommend_from_dataset(user_query)
264
+
265
+ # # Get the summary, generated text, and related links
266
+ # results = summarize_and_generate(user_query, recommendations)
267
+
268
+ # print(f"Query Summary: {results['query_summary']}")
269
+ # print(f"Creative Text: {results['generated_text']}")
270
+ # print("Related Links:")
271
+ # for link in results["related_links"]:
272
+ # print(f"- {link['topic']}: {link['link']}")
273
+
274
+ # In[ ]:
275
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ huggingface-hub
2
+ numpy==1.24.3
3
+ pandas==2.0.1
4
+ requests==2.30.0
5
+ scikit-learn==1.2.2
6
+ scipy==1.10.1
7
+ streamlit==1.34.0
8
+ tokenizers==0.13.3
9
+ transformers==4.29.2
+ nltk
+ torch
resource.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abbeb1a6614b0d85cbdfad41009adbb32ea485ec996b19c2dbc6e68fa5b195e8
3
+ size 2969362