Ubuntu commited on
Commit
3e6d815
1 Parent(s): dbb9b6d

added stuffs

Browse files
.gitignore CHANGED
@@ -6,4 +6,5 @@ intent_classification_model_with_metatitle_with_local2/
6
  intent_classification_model_with_metatitle_with_local1/
7
  intent_classification_model_with_metatitle_with_local/
8
  intent_classification_model_with_metatitle/
9
- intent_classification_model_with_metatitle_with_local2/
 
 
6
  intent_classification_model_with_metatitle_with_local1/
7
  intent_classification_model_with_metatitle_with_local/
8
  intent_classification_model_with_metatitle/
9
+ intent_classification_model_with_metatitle_with_local2/
10
+ intent_classification_model_without_metatitle_with_local23/
__pycache__/keys.cpython-310.pyc CHANGED
Binary files a/__pycache__/keys.cpython-310.pyc and b/__pycache__/keys.cpython-310.pyc differ
 
data/data_for_seo_new_intent.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da30bf15a41177fee836996b499bc5a0d59fd45853f8d12f995d146f99339210
3
+ size 2357733
data/intent_data_dataforseo.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -12,4 +12,5 @@ evaluate
12
  openpyxl
13
  summa
14
  git+https://github.com/LIAAD/yake
15
- multi_rake
 
 
12
  openpyxl
13
  summa
14
  git+https://github.com/LIAAD/yake
15
+ multi_rake
16
+ accelerate
research/11_intent_classification_using_distilbert.ipynb CHANGED
@@ -46,64 +46,45 @@
46
  " <th></th>\n",
47
  " <th>keyword</th>\n",
48
  " <th>intent</th>\n",
49
- " <th>id</th>\n",
50
- " <th>metatitle</th>\n",
51
  " </tr>\n",
52
  " </thead>\n",
53
  " <tbody>\n",
54
  " <tr>\n",
55
  " <th>0</th>\n",
56
- " <td>citalopram vs prozac</td>\n",
57
- " <td>Commercial</td>\n",
58
- " <td>0</td>\n",
59
- " <td>Celexa vs Prozac - ClarityX clarityxdna.com ht...</td>\n",
60
  " </tr>\n",
61
  " <tr>\n",
62
  " <th>1</th>\n",
63
- " <td>who is the oldest football player</td>\n",
64
- " <td>Informational</td>\n",
65
- " <td>1</td>\n",
66
- " <td>Oldest active NFL players and in league histor...</td>\n",
67
  " </tr>\n",
68
  " <tr>\n",
69
  " <th>2</th>\n",
70
- " <td>t mobile town east</td>\n",
71
- " <td>Navigational</td>\n",
72
- " <td>2</td>\n",
73
- " <td>T-Mobile Town East Blvd &amp; Pavillion Ct | Mesqu...</td>\n",
74
  " </tr>\n",
75
  " <tr>\n",
76
  " <th>3</th>\n",
77
- " <td>starbucks</td>\n",
78
- " <td>Navigational</td>\n",
79
- " <td>2</td>\n",
80
- " <td>Starbucks Coffee Company www.starbucks.com htt...</td>\n",
81
  " </tr>\n",
82
  " <tr>\n",
83
  " <th>4</th>\n",
84
- " <td>tech crunch</td>\n",
85
- " <td>Navigational</td>\n",
86
- " <td>2</td>\n",
87
- " <td>TechCrunch | Startup and Technology News techc...</td>\n",
88
  " </tr>\n",
89
  " </tbody>\n",
90
  "</table>\n",
91
  "</div>"
92
  ],
93
  "text/plain": [
94
- " keyword intent id \\\n",
95
- "0 citalopram vs prozac Commercial 0 \n",
96
- "1 who is the oldest football player Informational 1 \n",
97
- "2 t mobile town east Navigational 2 \n",
98
- "3 starbucks Navigational 2 \n",
99
- "4 tech crunch Navigational 2 \n",
100
- "\n",
101
- " metatitle \n",
102
- "0 Celexa vs Prozac - ClarityX clarityxdna.com ht... \n",
103
- "1 Oldest active NFL players and in league histor... \n",
104
- "2 T-Mobile Town East Blvd & Pavillion Ct | Mesqu... \n",
105
- "3 Starbucks Coffee Company www.starbucks.com htt... \n",
106
- "4 TechCrunch | Startup and Technology News techc... "
107
  ]
108
  },
109
  "execution_count": 3,
@@ -112,10 +93,20 @@
112
  }
113
  ],
114
  "source": [
115
- "original_df= pd.read_csv(\"data_intent/intent_with_metatitle.csv\")\n",
116
  "original_df.head()"
117
  ]
118
  },
 
 
 
 
 
 
 
 
 
 
119
  {
120
  "cell_type": "code",
121
  "execution_count": 4,
@@ -123,10 +114,66 @@
123
  "outputs": [
124
  {
125
  "data": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  "text/plain": [
127
- "False 1659\n",
128
- "True 343\n",
129
- "Name: count, dtype: int64"
 
 
 
130
  ]
131
  },
132
  "execution_count": 4,
@@ -135,7 +182,10 @@
135
  }
136
  ],
137
  "source": [
138
- "original_df.duplicated().value_counts()"
 
 
 
139
  ]
140
  },
141
  {
@@ -144,7 +194,16 @@
144
  "metadata": {},
145
  "outputs": [],
146
  "source": [
147
- "# original_df.drop_duplicates(inplace=True)"
 
 
 
 
 
 
 
 
 
148
  ]
149
  },
150
  {
@@ -155,8 +214,8 @@
155
  {
156
  "data": {
157
  "text/plain": [
158
- "False 1659\n",
159
- "True 343\n",
160
  "Name: count, dtype: int64"
161
  ]
162
  },
@@ -175,16 +234,29 @@
175
  "metadata": {},
176
  "outputs": [],
177
  "source": [
178
- "original_df= original_df[original_df.intent!='Local']"
179
  ]
180
  },
181
  {
182
  "cell_type": "code",
183
  "execution_count": 8,
184
  "metadata": {},
185
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  "source": [
187
- "intents= original_df.intent.unique().tolist()"
188
  ]
189
  },
190
  {
@@ -192,6 +264,36 @@
192
  "execution_count": 9,
193
  "metadata": {},
194
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  "source": [
196
  "id2label= {}\n",
197
  "label2id= {}\n",
@@ -202,16 +304,16 @@
202
  },
203
  {
204
  "cell_type": "code",
205
- "execution_count": 10,
206
  "metadata": {},
207
  "outputs": [
208
  {
209
  "data": {
210
  "text/plain": [
211
- "{0: 'Commercial', 1: 'Informational', 2: 'Navigational', 3: 'Transactional'}"
212
  ]
213
  },
214
- "execution_count": 10,
215
  "metadata": {},
216
  "output_type": "execute_result"
217
  }
@@ -222,16 +324,16 @@
222
  },
223
  {
224
  "cell_type": "code",
225
- "execution_count": 11,
226
  "metadata": {},
227
  "outputs": [
228
  {
229
  "data": {
230
  "text/plain": [
231
- "{'Commercial': 0, 'Informational': 1, 'Navigational': 2, 'Transactional': 3}"
232
  ]
233
  },
234
- "execution_count": 11,
235
  "metadata": {},
236
  "output_type": "execute_result"
237
  }
@@ -242,7 +344,7 @@
242
  },
243
  {
244
  "cell_type": "code",
245
- "execution_count": 12,
246
  "metadata": {},
247
  "outputs": [],
248
  "source": [
@@ -252,7 +354,7 @@
252
  },
253
  {
254
  "cell_type": "code",
255
- "execution_count": 13,
256
  "metadata": {},
257
  "outputs": [
258
  {
@@ -279,123 +381,98 @@
279
  " <th>keyword</th>\n",
280
  " <th>intent</th>\n",
281
  " <th>id</th>\n",
282
- " <th>metatitle</th>\n",
283
  " </tr>\n",
284
  " </thead>\n",
285
  " <tbody>\n",
286
  " <tr>\n",
287
  " <th>0</th>\n",
288
  " <td>citalopram vs prozac</td>\n",
289
- " <td>Commercial</td>\n",
290
  " <td>0</td>\n",
291
- " <td>Celexa vs Prozac - ClarityX clarityxdna.com ht...</td>\n",
292
  " </tr>\n",
293
  " <tr>\n",
294
  " <th>1</th>\n",
295
  " <td>who is the oldest football player</td>\n",
296
- " <td>Informational</td>\n",
297
  " <td>1</td>\n",
298
- " <td>Oldest active NFL players and in league histor...</td>\n",
299
  " </tr>\n",
300
  " <tr>\n",
301
  " <th>2</th>\n",
302
  " <td>t mobile town east</td>\n",
303
- " <td>Navigational</td>\n",
304
  " <td>2</td>\n",
305
- " <td>T-Mobile Town East Blvd &amp; Pavillion Ct | Mesqu...</td>\n",
306
  " </tr>\n",
307
  " <tr>\n",
308
  " <th>3</th>\n",
309
  " <td>starbucks</td>\n",
310
- " <td>Navigational</td>\n",
311
  " <td>2</td>\n",
312
- " <td>Starbucks Coffee Company www.starbucks.com htt...</td>\n",
313
  " </tr>\n",
314
  " <tr>\n",
315
  " <th>4</th>\n",
316
  " <td>tech crunch</td>\n",
317
- " <td>Navigational</td>\n",
318
  " <td>2</td>\n",
319
- " <td>TechCrunch | Startup and Technology News techc...</td>\n",
320
  " </tr>\n",
321
  " <tr>\n",
322
  " <th>...</th>\n",
323
  " <td>...</td>\n",
324
  " <td>...</td>\n",
325
  " <td>...</td>\n",
326
- " <td>...</td>\n",
327
  " </tr>\n",
328
  " <tr>\n",
329
- " <th>1997</th>\n",
330
  " <td>How to make homemade pet accessories from recy...</td>\n",
331
- " <td>Informational</td>\n",
332
  " <td>1</td>\n",
333
- " <td>Try These Dog Products Made From Recycled Mate...</td>\n",
334
  " </tr>\n",
335
  " <tr>\n",
336
- " <th>1998</th>\n",
337
  " <td>Top 10 science fiction book series that take r...</td>\n",
338
- " <td>Informational</td>\n",
339
  " <td>1</td>\n",
340
- " <td>10 Sci-Fi and Fantasy Books About Fantastical ...</td>\n",
341
  " </tr>\n",
342
  " <tr>\n",
343
- " <th>1999</th>\n",
344
  " <td>How to start a car restoration and customizati...</td>\n",
345
- " <td>Informational</td>\n",
346
  " <td>1</td>\n",
347
- " <td>What to Consider When Starting an Auto Restora...</td>\n",
348
  " </tr>\n",
349
  " <tr>\n",
350
- " <th>2000</th>\n",
351
  " <td>Ancient Mesopotamian architecture and its infl...</td>\n",
352
- " <td>Informational</td>\n",
353
  " <td>1</td>\n",
354
- " <td>Mesopotamian art and architecture | Characteri...</td>\n",
355
  " </tr>\n",
356
  " <tr>\n",
357
- " <th>2001</th>\n",
358
  " <td>Benefits of a flexitarian diet for those seeki...</td>\n",
359
- " <td>Informational</td>\n",
360
  " <td>1</td>\n",
361
- " <td>The Flexitarian Diet: A Detailed Beginner's Gu...</td>\n",
362
  " </tr>\n",
363
  " </tbody>\n",
364
  "</table>\n",
365
- "<p>1786 rows × 4 columns</p>\n",
366
  "</div>"
367
  ],
368
  "text/plain": [
369
- " keyword intent id \\\n",
370
- "0 citalopram vs prozac Commercial 0 \n",
371
- "1 who is the oldest football player Informational 1 \n",
372
- "2 t mobile town east Navigational 2 \n",
373
- "3 starbucks Navigational 2 \n",
374
- "4 tech crunch Navigational 2 \n",
375
- "... ... ... .. \n",
376
- "1997 How to make homemade pet accessories from recy... Informational 1 \n",
377
- "1998 Top 10 science fiction book series that take r... Informational 1 \n",
378
- "1999 How to start a car restoration and customizati... Informational 1 \n",
379
- "2000 Ancient Mesopotamian architecture and its infl... Informational 1 \n",
380
- "2001 Benefits of a flexitarian diet for those seeki... Informational 1 \n",
381
- "\n",
382
- " metatitle \n",
383
- "0 Celexa vs Prozac - ClarityX clarityxdna.com ht... \n",
384
- "1 Oldest active NFL players and in league histor... \n",
385
- "2 T-Mobile Town East Blvd & Pavillion Ct | Mesqu... \n",
386
- "3 Starbucks Coffee Company www.starbucks.com htt... \n",
387
- "4 TechCrunch | Startup and Technology News techc... \n",
388
- "... ... \n",
389
- "1997 Try These Dog Products Made From Recycled Mate... \n",
390
- "1998 10 Sci-Fi and Fantasy Books About Fantastical ... \n",
391
- "1999 What to Consider When Starting an Auto Restora... \n",
392
- "2000 Mesopotamian art and architecture | Characteri... \n",
393
- "2001 The Flexitarian Diet: A Detailed Beginner's Gu... \n",
394
  "\n",
395
- "[1786 rows x 4 columns]"
396
  ]
397
  },
398
- "execution_count": 13,
399
  "metadata": {},
400
  "output_type": "execute_result"
401
  }
@@ -407,7 +484,7 @@
407
  },
408
  {
409
  "cell_type": "code",
410
- "execution_count": 14,
411
  "metadata": {},
412
  "outputs": [
413
  {
@@ -431,34 +508,34 @@
431
  " <thead>\n",
432
  " <tr style=\"text-align: right;\">\n",
433
  " <th></th>\n",
434
- " <th>metatitle</th>\n",
435
  " <th>id</th>\n",
436
  " </tr>\n",
437
  " </thead>\n",
438
  " <tbody>\n",
439
  " <tr>\n",
440
  " <th>0</th>\n",
441
- " <td>Celexa vs Prozac - ClarityX clarityxdna.com ht...</td>\n",
442
  " <td>0</td>\n",
443
  " </tr>\n",
444
  " <tr>\n",
445
  " <th>1</th>\n",
446
- " <td>Oldest active NFL players and in league histor...</td>\n",
447
  " <td>1</td>\n",
448
  " </tr>\n",
449
  " <tr>\n",
450
  " <th>2</th>\n",
451
- " <td>T-Mobile Town East Blvd &amp; Pavillion Ct | Mesqu...</td>\n",
452
  " <td>2</td>\n",
453
  " </tr>\n",
454
  " <tr>\n",
455
  " <th>3</th>\n",
456
- " <td>Starbucks Coffee Company www.starbucks.com htt...</td>\n",
457
  " <td>2</td>\n",
458
  " </tr>\n",
459
  " <tr>\n",
460
  " <th>4</th>\n",
461
- " <td>TechCrunch | Startup and Technology News techc...</td>\n",
462
  " <td>2</td>\n",
463
  " </tr>\n",
464
  " <tr>\n",
@@ -467,72 +544,192 @@
467
  " <td>...</td>\n",
468
  " </tr>\n",
469
  " <tr>\n",
470
- " <th>1997</th>\n",
471
- " <td>Try These Dog Products Made From Recycled Mate...</td>\n",
472
  " <td>1</td>\n",
473
  " </tr>\n",
474
  " <tr>\n",
475
- " <th>1998</th>\n",
476
- " <td>10 Sci-Fi and Fantasy Books About Fantastical ...</td>\n",
477
  " <td>1</td>\n",
478
  " </tr>\n",
479
  " <tr>\n",
480
- " <th>1999</th>\n",
481
- " <td>What to Consider When Starting an Auto Restora...</td>\n",
482
  " <td>1</td>\n",
483
  " </tr>\n",
484
  " <tr>\n",
485
- " <th>2000</th>\n",
486
- " <td>Mesopotamian art and architecture | Characteri...</td>\n",
487
  " <td>1</td>\n",
488
  " </tr>\n",
489
  " <tr>\n",
490
- " <th>2001</th>\n",
491
- " <td>The Flexitarian Diet: A Detailed Beginner's Gu...</td>\n",
492
  " <td>1</td>\n",
493
  " </tr>\n",
494
  " </tbody>\n",
495
  "</table>\n",
496
- "<p>1786 rows × 2 columns</p>\n",
497
  "</div>"
498
  ],
499
  "text/plain": [
500
- " metatitle id\n",
501
- "0 Celexa vs Prozac - ClarityX clarityxdna.com ht... 0\n",
502
- "1 Oldest active NFL players and in league histor... 1\n",
503
- "2 T-Mobile Town East Blvd & Pavillion Ct | Mesqu... 2\n",
504
- "3 Starbucks Coffee Company www.starbucks.com htt... 2\n",
505
- "4 TechCrunch | Startup and Technology News techc... 2\n",
506
  "... ... ..\n",
507
- "1997 Try These Dog Products Made From Recycled Mate... 1\n",
508
- "1998 10 Sci-Fi and Fantasy Books About Fantastical ... 1\n",
509
- "1999 What to Consider When Starting an Auto Restora... 1\n",
510
- "2000 Mesopotamian art and architecture | Characteri... 1\n",
511
- "2001 The Flexitarian Diet: A Detailed Beginner's Gu... 1\n",
512
  "\n",
513
- "[1786 rows x 2 columns]"
514
  ]
515
  },
516
- "execution_count": 14,
517
  "metadata": {},
518
  "output_type": "execute_result"
519
  }
520
  ],
521
  "source": [
522
- "df= original_df[['metatitle', 'id']]\n",
 
523
  "df"
524
  ]
525
  },
526
  {
527
  "cell_type": "code",
528
- "execution_count": 15,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
  "metadata": {},
530
  "outputs": [
531
  {
532
  "name": "stderr",
533
  "output_type": "stream",
534
  "text": [
535
- "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
536
  " from .autonotebook import tqdm as notebook_tqdm\n"
537
  ]
538
  }
@@ -543,20 +740,9 @@
543
  },
544
  {
545
  "cell_type": "code",
546
- "execution_count": 16,
547
  "metadata": {},
548
  "outputs": [
549
- {
550
- "name": "stderr",
551
- "output_type": "stream",
552
- "text": [
553
- "/tmp/ipykernel_401416/1659657905.py:1: SettingWithCopyWarning: \n",
554
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
555
- "\n",
556
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
557
- " df.rename(columns={\n"
558
- ]
559
- },
560
  {
561
  "data": {
562
  "text/html": [
@@ -584,54 +770,54 @@
584
  " </thead>\n",
585
  " <tbody>\n",
586
  " <tr>\n",
587
- " <th>287</th>\n",
588
- " <td>Vanilla Pudding Recipe www.bettycrocker.com ht...</td>\n",
589
  " <td>1</td>\n",
590
  " </tr>\n",
591
  " <tr>\n",
592
- " <th>730</th>\n",
593
- " <td>Garden Outlet - Garden Sale + Free Shipping ww...</td>\n",
594
- " <td>0</td>\n",
595
  " </tr>\n",
596
  " <tr>\n",
597
- " <th>897</th>\n",
598
- " <td>Party Supplies on Sale | Oriental Trading Comp...</td>\n",
599
- " <td>3</td>\n",
600
  " </tr>\n",
601
  " <tr>\n",
602
- " <th>95</th>\n",
603
- " <td>My Chemical Romance www.mychemicalromance.com ...</td>\n",
604
- " <td>2</td>\n",
605
  " </tr>\n",
606
  " <tr>\n",
607
- " <th>1832</th>\n",
608
- " <td>Einstein's Special Theory of Relativity | PBS ...</td>\n",
609
- " <td>1</td>\n",
610
  " </tr>\n",
611
  " <tr>\n",
612
- " <th>1751</th>\n",
613
- " <td>12 Breathtaking Places to Go Kayaking in the U...</td>\n",
614
  " <td>1</td>\n",
615
  " </tr>\n",
616
  " <tr>\n",
617
- " <th>1870</th>\n",
618
- " <td>Nuclear “Power Balls” May Make Meltdowns a Thi...</td>\n",
619
- " <td>1</td>\n",
620
  " </tr>\n",
621
  " <tr>\n",
622
- " <th>795</th>\n",
623
- " <td>Natural &amp; Organic Makeup and Skincare – INIKA ...</td>\n",
624
  " <td>0</td>\n",
625
  " </tr>\n",
626
  " <tr>\n",
627
- " <th>1707</th>\n",
628
- " <td>10 Best Ski Resorts in the US www.travelandlei...</td>\n",
629
- " <td>1</td>\n",
630
  " </tr>\n",
631
  " <tr>\n",
632
- " <th>92</th>\n",
633
- " <td>Hozier | Unreal Unearth THE NEW ALBUM OUT NOW ...</td>\n",
634
- " <td>2</td>\n",
635
  " </tr>\n",
636
  " </tbody>\n",
637
  "</table>\n",
@@ -639,27 +825,27 @@
639
  ],
640
  "text/plain": [
641
  " text label\n",
642
- "287 Vanilla Pudding Recipe www.bettycrocker.com ht... 1\n",
643
- "730 Garden Outlet - Garden Sale + Free Shipping ww... 0\n",
644
- "897 Party Supplies on Sale | Oriental Trading Comp... 3\n",
645
- "95 My Chemical Romance www.mychemicalromance.com ... 2\n",
646
- "1832 Einstein's Special Theory of Relativity | PBS ... 1\n",
647
- "1751 12 Breathtaking Places to Go Kayaking in the U... 1\n",
648
- "1870 Nuclear “Power Balls” May Make Meltdowns a Thi... 1\n",
649
- "795 Natural & Organic Makeup and Skincare – INIKA ... 0\n",
650
- "1707 10 Best Ski Resorts in the US www.travelandlei... 1\n",
651
- "92 Hozier | Unreal Unearth THE NEW ALBUM OUT NOW ... 2"
652
  ]
653
  },
654
- "execution_count": 16,
655
  "metadata": {},
656
  "output_type": "execute_result"
657
  }
658
  ],
659
  "source": [
660
  "df.rename(columns={\n",
661
- " # \"keyword\": \"text\", \n",
662
- " \"metatitle\": \"text\", \n",
663
  " \"id\": \"label\"\n",
664
  "}, \n",
665
  " inplace=True\n",
@@ -670,27 +856,19 @@
670
  },
671
  {
672
  "cell_type": "code",
673
- "execution_count": 17,
674
  "metadata": {},
675
  "outputs": [
676
- {
677
- "name": "stderr",
678
- "output_type": "stream",
679
- "text": [
680
- "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:373: FutureWarning: is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.\n",
681
- " if _pandas_api.is_sparse(col):\n"
682
- ]
683
- },
684
  {
685
  "data": {
686
  "text/plain": [
687
  "Dataset({\n",
688
- " features: ['text', 'label', '__index_level_0__'],\n",
689
- " num_rows: 1786\n",
690
  "})"
691
  ]
692
  },
693
- "execution_count": 17,
694
  "metadata": {},
695
  "output_type": "execute_result"
696
  }
@@ -702,7 +880,7 @@
702
  },
703
  {
704
  "cell_type": "code",
705
- "execution_count": 18,
706
  "metadata": {},
707
  "outputs": [
708
  {
@@ -710,29 +888,29 @@
710
  "text/plain": [
711
  "DatasetDict({\n",
712
  " train: Dataset({\n",
713
- " features: ['text', 'label', '__index_level_0__'],\n",
714
- " num_rows: 1428\n",
715
  " })\n",
716
  " test: Dataset({\n",
717
- " features: ['text', 'label', '__index_level_0__'],\n",
718
- " num_rows: 358\n",
719
  " })\n",
720
  "})"
721
  ]
722
  },
723
- "execution_count": 18,
724
  "metadata": {},
725
  "output_type": "execute_result"
726
  }
727
  ],
728
  "source": [
729
- "new_data= dataset_df.train_test_split(test_size=0.2)\n",
730
  "new_data"
731
  ]
732
  },
733
  {
734
  "cell_type": "code",
735
- "execution_count": 19,
736
  "metadata": {},
737
  "outputs": [],
738
  "source": [
@@ -743,7 +921,7 @@
743
  },
744
  {
745
  "cell_type": "code",
746
- "execution_count": 20,
747
  "metadata": {},
748
  "outputs": [],
749
  "source": [
@@ -753,22 +931,15 @@
753
  },
754
  {
755
  "cell_type": "code",
756
- "execution_count": 21,
757
  "metadata": {},
758
  "outputs": [
759
  {
760
  "name": "stderr",
761
  "output_type": "stream",
762
  "text": [
763
- "Map: 100%|██████████| 1428/1428 [00:01<00:00, 1417.99 examples/s]\n",
764
- "Map: 0%| | 0/358 [00:00<?, ? examples/s]"
765
- ]
766
- },
767
- {
768
- "name": "stderr",
769
- "output_type": "stream",
770
- "text": [
771
- "Map: 100%|██████████| 358/358 [00:00<00:00, 1509.84 examples/s]\n"
772
  ]
773
  }
774
  ],
@@ -778,16 +949,20 @@
778
  },
779
  {
780
  "cell_type": "code",
781
- "execution_count": 22,
782
  "metadata": {},
783
  "outputs": [
784
  {
785
  "name": "stderr",
786
  "output_type": "stream",
787
  "text": [
788
- "2023-10-18 11:22:35.000690: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
789
- "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
790
- "2023-10-18 11:22:36.451442: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
 
 
 
 
791
  ]
792
  }
793
  ],
@@ -806,7 +981,7 @@
806
  },
807
  {
808
  "cell_type": "code",
809
- "execution_count": 23,
810
  "metadata": {},
811
  "outputs": [],
812
  "source": [
@@ -817,7 +992,7 @@
817
  },
818
  {
819
  "cell_type": "code",
820
- "execution_count": 24,
821
  "metadata": {},
822
  "outputs": [],
823
  "source": [
@@ -832,14 +1007,14 @@
832
  },
833
  {
834
  "cell_type": "code",
835
- "execution_count": 25,
836
  "metadata": {},
837
  "outputs": [
838
  {
839
  "name": "stderr",
840
  "output_type": "stream",
841
  "text": [
842
- "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']\n",
843
  "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
844
  ]
845
  }
@@ -849,13 +1024,14 @@
849
  "\n",
850
  "model = AutoModelForSequenceClassification.from_pretrained(\n",
851
  " # \"distilbert-base-uncased\", num_labels=5, id2label=id2label, label2id=label2id\n",
852
- " \"distilbert-base-uncased\", num_labels=4, id2label=id2label, label2id=label2id # removed local\n",
 
853
  ")"
854
  ]
855
  },
856
  {
857
  "cell_type": "code",
858
- "execution_count": 26,
859
  "metadata": {},
860
  "outputs": [
861
  {
@@ -871,8 +1047,8 @@
871
  "\n",
872
  " <div>\n",
873
  " \n",
874
- " <progress value='2700' max='2700' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
875
- " [2700/2700 12:13, Epoch 30/30]\n",
876
  " </div>\n",
877
  " <table border=\"1\" class=\"dataframe\">\n",
878
  " <thead>\n",
@@ -887,182 +1063,38 @@
887
  " <tr>\n",
888
  " <td>1</td>\n",
889
  " <td>No log</td>\n",
890
- " <td>0.386599</td>\n",
891
- " <td>0.927374</td>\n",
892
  " </tr>\n",
893
  " <tr>\n",
894
  " <td>2</td>\n",
895
  " <td>No log</td>\n",
896
- " <td>0.187701</td>\n",
897
- " <td>0.944134</td>\n",
898
  " </tr>\n",
899
  " <tr>\n",
900
  " <td>3</td>\n",
901
  " <td>No log</td>\n",
902
- " <td>0.219236</td>\n",
903
- " <td>0.938547</td>\n",
904
  " </tr>\n",
905
  " <tr>\n",
906
  " <td>4</td>\n",
907
  " <td>No log</td>\n",
908
- " <td>0.212073</td>\n",
909
- " <td>0.935754</td>\n",
910
  " </tr>\n",
911
  " <tr>\n",
912
  " <td>5</td>\n",
913
  " <td>No log</td>\n",
914
- " <td>0.157072</td>\n",
915
- " <td>0.958101</td>\n",
916
  " </tr>\n",
917
  " <tr>\n",
918
  " <td>6</td>\n",
919
- " <td>0.244800</td>\n",
920
- " <td>0.149268</td>\n",
921
- " <td>0.960894</td>\n",
922
- " </tr>\n",
923
- " <tr>\n",
924
- " <td>7</td>\n",
925
- " <td>0.244800</td>\n",
926
- " <td>0.138416</td>\n",
927
- " <td>0.963687</td>\n",
928
- " </tr>\n",
929
- " <tr>\n",
930
- " <td>8</td>\n",
931
- " <td>0.244800</td>\n",
932
- " <td>0.129277</td>\n",
933
- " <td>0.969274</td>\n",
934
- " </tr>\n",
935
- " <tr>\n",
936
- " <td>9</td>\n",
937
- " <td>0.244800</td>\n",
938
- " <td>0.155066</td>\n",
939
- " <td>0.960894</td>\n",
940
- " </tr>\n",
941
- " <tr>\n",
942
- " <td>10</td>\n",
943
- " <td>0.244800</td>\n",
944
- " <td>0.132079</td>\n",
945
- " <td>0.966480</td>\n",
946
- " </tr>\n",
947
- " <tr>\n",
948
- " <td>11</td>\n",
949
- " <td>0.244800</td>\n",
950
- " <td>0.138543</td>\n",
951
- " <td>0.969274</td>\n",
952
- " </tr>\n",
953
- " <tr>\n",
954
- " <td>12</td>\n",
955
- " <td>0.040300</td>\n",
956
- " <td>0.162308</td>\n",
957
- " <td>0.966480</td>\n",
958
- " </tr>\n",
959
- " <tr>\n",
960
- " <td>13</td>\n",
961
- " <td>0.040300</td>\n",
962
- " <td>0.132775</td>\n",
963
- " <td>0.969274</td>\n",
964
- " </tr>\n",
965
- " <tr>\n",
966
- " <td>14</td>\n",
967
- " <td>0.040300</td>\n",
968
- " <td>0.169590</td>\n",
969
- " <td>0.966480</td>\n",
970
- " </tr>\n",
971
- " <tr>\n",
972
- " <td>15</td>\n",
973
- " <td>0.040300</td>\n",
974
- " <td>0.151754</td>\n",
975
- " <td>0.960894</td>\n",
976
- " </tr>\n",
977
- " <tr>\n",
978
- " <td>16</td>\n",
979
- " <td>0.040300</td>\n",
980
- " <td>0.150127</td>\n",
981
- " <td>0.972067</td>\n",
982
- " </tr>\n",
983
- " <tr>\n",
984
- " <td>17</td>\n",
985
- " <td>0.024200</td>\n",
986
- " <td>0.159291</td>\n",
987
- " <td>0.963687</td>\n",
988
- " </tr>\n",
989
- " <tr>\n",
990
- " <td>18</td>\n",
991
- " <td>0.024200</td>\n",
992
- " <td>0.162419</td>\n",
993
- " <td>0.963687</td>\n",
994
- " </tr>\n",
995
- " <tr>\n",
996
- " <td>19</td>\n",
997
- " <td>0.024200</td>\n",
998
- " <td>0.172608</td>\n",
999
- " <td>0.963687</td>\n",
1000
- " </tr>\n",
1001
- " <tr>\n",
1002
- " <td>20</td>\n",
1003
- " <td>0.024200</td>\n",
1004
- " <td>0.176368</td>\n",
1005
- " <td>0.963687</td>\n",
1006
- " </tr>\n",
1007
- " <tr>\n",
1008
- " <td>21</td>\n",
1009
- " <td>0.024200</td>\n",
1010
- " <td>0.179977</td>\n",
1011
- " <td>0.960894</td>\n",
1012
- " </tr>\n",
1013
- " <tr>\n",
1014
- " <td>22</td>\n",
1015
- " <td>0.024200</td>\n",
1016
- " <td>0.175084</td>\n",
1017
- " <td>0.960894</td>\n",
1018
- " </tr>\n",
1019
- " <tr>\n",
1020
- " <td>23</td>\n",
1021
- " <td>0.016700</td>\n",
1022
- " <td>0.186994</td>\n",
1023
- " <td>0.960894</td>\n",
1024
- " </tr>\n",
1025
- " <tr>\n",
1026
- " <td>24</td>\n",
1027
- " <td>0.016700</td>\n",
1028
- " <td>0.177934</td>\n",
1029
- " <td>0.960894</td>\n",
1030
- " </tr>\n",
1031
- " <tr>\n",
1032
- " <td>25</td>\n",
1033
- " <td>0.016700</td>\n",
1034
- " <td>0.183129</td>\n",
1035
- " <td>0.960894</td>\n",
1036
- " </tr>\n",
1037
- " <tr>\n",
1038
- " <td>26</td>\n",
1039
- " <td>0.016700</td>\n",
1040
- " <td>0.180832</td>\n",
1041
- " <td>0.960894</td>\n",
1042
- " </tr>\n",
1043
- " <tr>\n",
1044
- " <td>27</td>\n",
1045
- " <td>0.016700</td>\n",
1046
- " <td>0.179173</td>\n",
1047
- " <td>0.960894</td>\n",
1048
- " </tr>\n",
1049
- " <tr>\n",
1050
- " <td>28</td>\n",
1051
- " <td>0.016300</td>\n",
1052
- " <td>0.182724</td>\n",
1053
- " <td>0.960894</td>\n",
1054
- " </tr>\n",
1055
- " <tr>\n",
1056
- " <td>29</td>\n",
1057
- " <td>0.016300</td>\n",
1058
- " <td>0.181777</td>\n",
1059
- " <td>0.960894</td>\n",
1060
- " </tr>\n",
1061
- " <tr>\n",
1062
- " <td>30</td>\n",
1063
- " <td>0.016300</td>\n",
1064
- " <td>0.182771</td>\n",
1065
- " <td>0.960894</td>\n",
1066
  " </tr>\n",
1067
  " </tbody>\n",
1068
  "</table><p>"
@@ -1077,21 +1109,21 @@
1077
  {
1078
  "data": {
1079
  "text/plain": [
1080
- "TrainOutput(global_step=2700, training_loss=0.0646134274094193, metrics={'train_runtime': 734.5773, 'train_samples_per_second': 58.319, 'train_steps_per_second': 3.676, 'total_flos': 5675105766113280.0, 'train_loss': 0.0646134274094193, 'epoch': 30.0})"
1081
  ]
1082
  },
1083
- "execution_count": 26,
1084
  "metadata": {},
1085
  "output_type": "execute_result"
1086
  }
1087
  ],
1088
  "source": [
1089
  "training_args = TrainingArguments(\n",
1090
- " output_dir=\"intent_classification_model_with_metatitle_with_local2\",\n",
1091
  " learning_rate=2e-5,\n",
1092
  " per_device_train_batch_size=16,\n",
1093
  " per_device_eval_batch_size=16,\n",
1094
- " num_train_epochs=30,\n",
1095
  " weight_decay=0.01,\n",
1096
  " evaluation_strategy=\"epoch\",\n",
1097
  " save_strategy=\"epoch\",\n",
 
46
  " <th></th>\n",
47
  " <th>keyword</th>\n",
48
  " <th>intent</th>\n",
 
 
49
  " </tr>\n",
50
  " </thead>\n",
51
  " <tbody>\n",
52
  " <tr>\n",
53
  " <th>0</th>\n",
54
+ " <td>social media groups</td>\n",
55
+ " <td>informational</td>\n",
 
 
56
  " </tr>\n",
57
  " <tr>\n",
58
  " <th>1</th>\n",
59
+ " <td>social media groups</td>\n",
60
+ " <td>navigational</td>\n",
 
 
61
  " </tr>\n",
62
  " <tr>\n",
63
  " <th>2</th>\n",
64
+ " <td>internet forums</td>\n",
65
+ " <td>navigational</td>\n",
 
 
66
  " </tr>\n",
67
  " <tr>\n",
68
  " <th>3</th>\n",
69
+ " <td>virtual communities</td>\n",
70
+ " <td>navigational</td>\n",
 
 
71
  " </tr>\n",
72
  " <tr>\n",
73
  " <th>4</th>\n",
74
+ " <td>online discussion boards</td>\n",
75
+ " <td>commercial</td>\n",
 
 
76
  " </tr>\n",
77
  " </tbody>\n",
78
  "</table>\n",
79
  "</div>"
80
  ],
81
  "text/plain": [
82
+ " keyword intent\n",
83
+ "0 social media groups informational\n",
84
+ "1 social media groups navigational\n",
85
+ "2 internet forums navigational\n",
86
+ "3 virtual communities navigational\n",
87
+ "4 online discussion boards commercial"
 
 
 
 
 
 
 
88
  ]
89
  },
90
  "execution_count": 3,
 
93
  }
94
  ],
95
  "source": [
96
+ "original_df= pd.read_csv(\"data/data_for_seo_new_intent.csv\")\n",
97
  "original_df.head()"
98
  ]
99
  },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 3,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "def map_intent(intent:str):\n",
107
+ " return intent.lower()"
108
+ ]
109
+ },
110
  {
111
  "cell_type": "code",
112
  "execution_count": 4,
 
114
  "outputs": [
115
  {
116
  "data": {
117
+ "text/html": [
118
+ "<div>\n",
119
+ "<style scoped>\n",
120
+ " .dataframe tbody tr th:only-of-type {\n",
121
+ " vertical-align: middle;\n",
122
+ " }\n",
123
+ "\n",
124
+ " .dataframe tbody tr th {\n",
125
+ " vertical-align: top;\n",
126
+ " }\n",
127
+ "\n",
128
+ " .dataframe thead th {\n",
129
+ " text-align: right;\n",
130
+ " }\n",
131
+ "</style>\n",
132
+ "<table border=\"1\" class=\"dataframe\">\n",
133
+ " <thead>\n",
134
+ " <tr style=\"text-align: right;\">\n",
135
+ " <th></th>\n",
136
+ " <th>keyword</th>\n",
137
+ " <th>intent</th>\n",
138
+ " </tr>\n",
139
+ " </thead>\n",
140
+ " <tbody>\n",
141
+ " <tr>\n",
142
+ " <th>0</th>\n",
143
+ " <td>citalopram vs prozac</td>\n",
144
+ " <td>commercial</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>1</th>\n",
148
+ " <td>who is the oldest football player</td>\n",
149
+ " <td>informational</td>\n",
150
+ " </tr>\n",
151
+ " <tr>\n",
152
+ " <th>2</th>\n",
153
+ " <td>t mobile town east</td>\n",
154
+ " <td>navigational</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>3</th>\n",
158
+ " <td>starbucks</td>\n",
159
+ " <td>navigational</td>\n",
160
+ " </tr>\n",
161
+ " <tr>\n",
162
+ " <th>4</th>\n",
163
+ " <td>tech crunch</td>\n",
164
+ " <td>navigational</td>\n",
165
+ " </tr>\n",
166
+ " </tbody>\n",
167
+ "</table>\n",
168
+ "</div>"
169
+ ],
170
  "text/plain": [
171
+ " keyword intent\n",
172
+ "0 citalopram vs prozac commercial\n",
173
+ "1 who is the oldest football player informational\n",
174
+ "2 t mobile town east navigational\n",
175
+ "3 starbucks navigational\n",
176
+ "4 tech crunch navigational"
177
  ]
178
  },
179
  "execution_count": 4,
 
182
  }
183
  ],
184
  "source": [
185
+ "temp_df= pd.read_csv(\"data_intent/intent_data.csv\")\n",
186
+ "temp_df.intent= temp_df.intent.map(map_intent)\n",
187
+ "temp_df= temp_df[temp_df.intent!=\"local\"]\n",
188
+ "temp_df.head()"
189
  ]
190
  },
191
  {
 
194
  "metadata": {},
195
  "outputs": [],
196
  "source": [
197
+ "# original_df= temp_df.copy()"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 6,
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "original_df= pd.concat([original_df, temp_df])"
207
  ]
208
  },
209
  {
 
214
  {
215
  "data": {
216
  "text/plain": [
217
+ "False 1304\n",
218
+ "True 196\n",
219
  "Name: count, dtype: int64"
220
  ]
221
  },
 
234
  "metadata": {},
235
  "outputs": [],
236
  "source": [
237
+ "# original_df.drop_duplicates(inplace=True)"
238
  ]
239
  },
240
  {
241
  "cell_type": "code",
242
  "execution_count": 8,
243
  "metadata": {},
244
+ "outputs": [
245
+ {
246
+ "data": {
247
+ "text/plain": [
248
+ "False 1304\n",
249
+ "True 196\n",
250
+ "Name: count, dtype: int64"
251
+ ]
252
+ },
253
+ "execution_count": 8,
254
+ "metadata": {},
255
+ "output_type": "execute_result"
256
+ }
257
+ ],
258
  "source": [
259
+ "original_df.duplicated().value_counts()"
260
  ]
261
  },
262
  {
 
264
  "execution_count": 9,
265
  "metadata": {},
266
  "outputs": [],
267
+ "source": [
268
+ "original_df= original_df[original_df.intent!='Local']"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 10,
274
+ "metadata": {},
275
+ "outputs": [
276
+ {
277
+ "data": {
278
+ "text/plain": [
279
+ "['commercial', 'informational', 'navigational', 'transactional']"
280
+ ]
281
+ },
282
+ "execution_count": 10,
283
+ "metadata": {},
284
+ "output_type": "execute_result"
285
+ }
286
+ ],
287
+ "source": [
288
+ "intents= original_df.intent.unique().tolist()\n",
289
+ "intents"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": 11,
295
+ "metadata": {},
296
+ "outputs": [],
297
  "source": [
298
  "id2label= {}\n",
299
  "label2id= {}\n",
 
304
  },
305
  {
306
  "cell_type": "code",
307
+ "execution_count": 12,
308
  "metadata": {},
309
  "outputs": [
310
  {
311
  "data": {
312
  "text/plain": [
313
+ "{0: 'commercial', 1: 'informational', 2: 'navigational', 3: 'transactional'}"
314
  ]
315
  },
316
+ "execution_count": 12,
317
  "metadata": {},
318
  "output_type": "execute_result"
319
  }
 
324
  },
325
  {
326
  "cell_type": "code",
327
+ "execution_count": 13,
328
  "metadata": {},
329
  "outputs": [
330
  {
331
  "data": {
332
  "text/plain": [
333
+ "{'commercial': 0, 'informational': 1, 'navigational': 2, 'transactional': 3}"
334
  ]
335
  },
336
+ "execution_count": 13,
337
  "metadata": {},
338
  "output_type": "execute_result"
339
  }
 
344
  },
345
  {
346
  "cell_type": "code",
347
+ "execution_count": 14,
348
  "metadata": {},
349
  "outputs": [],
350
  "source": [
 
354
  },
355
  {
356
  "cell_type": "code",
357
+ "execution_count": 15,
358
  "metadata": {},
359
  "outputs": [
360
  {
 
381
  " <th>keyword</th>\n",
382
  " <th>intent</th>\n",
383
  " <th>id</th>\n",
 
384
  " </tr>\n",
385
  " </thead>\n",
386
  " <tbody>\n",
387
  " <tr>\n",
388
  " <th>0</th>\n",
389
  " <td>citalopram vs prozac</td>\n",
390
+ " <td>commercial</td>\n",
391
  " <td>0</td>\n",
 
392
  " </tr>\n",
393
  " <tr>\n",
394
  " <th>1</th>\n",
395
  " <td>who is the oldest football player</td>\n",
396
+ " <td>informational</td>\n",
397
  " <td>1</td>\n",
 
398
  " </tr>\n",
399
  " <tr>\n",
400
  " <th>2</th>\n",
401
  " <td>t mobile town east</td>\n",
402
+ " <td>navigational</td>\n",
403
  " <td>2</td>\n",
 
404
  " </tr>\n",
405
  " <tr>\n",
406
  " <th>3</th>\n",
407
  " <td>starbucks</td>\n",
408
+ " <td>navigational</td>\n",
409
  " <td>2</td>\n",
 
410
  " </tr>\n",
411
  " <tr>\n",
412
  " <th>4</th>\n",
413
  " <td>tech crunch</td>\n",
414
+ " <td>navigational</td>\n",
415
  " <td>2</td>\n",
 
416
  " </tr>\n",
417
  " <tr>\n",
418
  " <th>...</th>\n",
419
  " <td>...</td>\n",
420
  " <td>...</td>\n",
421
  " <td>...</td>\n",
 
422
  " </tr>\n",
423
  " <tr>\n",
424
+ " <th>1703</th>\n",
425
  " <td>How to make homemade pet accessories from recy...</td>\n",
426
+ " <td>informational</td>\n",
427
  " <td>1</td>\n",
 
428
  " </tr>\n",
429
  " <tr>\n",
430
+ " <th>1704</th>\n",
431
  " <td>Top 10 science fiction book series that take r...</td>\n",
432
+ " <td>informational</td>\n",
433
  " <td>1</td>\n",
 
434
  " </tr>\n",
435
  " <tr>\n",
436
+ " <th>1705</th>\n",
437
  " <td>How to start a car restoration and customizati...</td>\n",
438
+ " <td>informational</td>\n",
439
  " <td>1</td>\n",
 
440
  " </tr>\n",
441
  " <tr>\n",
442
+ " <th>1706</th>\n",
443
  " <td>Ancient Mesopotamian architecture and its infl...</td>\n",
444
+ " <td>informational</td>\n",
445
  " <td>1</td>\n",
 
446
  " </tr>\n",
447
  " <tr>\n",
448
+ " <th>1707</th>\n",
449
  " <td>Benefits of a flexitarian diet for those seeki...</td>\n",
450
+ " <td>informational</td>\n",
451
  " <td>1</td>\n",
 
452
  " </tr>\n",
453
  " </tbody>\n",
454
  "</table>\n",
455
+ "<p>1500 rows × 3 columns</p>\n",
456
  "</div>"
457
  ],
458
  "text/plain": [
459
+ " keyword intent id\n",
460
+ "0 citalopram vs prozac commercial 0\n",
461
+ "1 who is the oldest football player informational 1\n",
462
+ "2 t mobile town east navigational 2\n",
463
+ "3 starbucks navigational 2\n",
464
+ "4 tech crunch navigational 2\n",
465
+ "... ... ... ..\n",
466
+ "1703 How to make homemade pet accessories from recy... informational 1\n",
467
+ "1704 Top 10 science fiction book series that take r... informational 1\n",
468
+ "1705 How to start a car restoration and customizati... informational 1\n",
469
+ "1706 Ancient Mesopotamian architecture and its infl... informational 1\n",
470
+ "1707 Benefits of a flexitarian diet for those seeki... informational 1\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
471
  "\n",
472
+ "[1500 rows x 3 columns]"
473
  ]
474
  },
475
+ "execution_count": 15,
476
  "metadata": {},
477
  "output_type": "execute_result"
478
  }
 
484
  },
485
  {
486
  "cell_type": "code",
487
+ "execution_count": 16,
488
  "metadata": {},
489
  "outputs": [
490
  {
 
508
  " <thead>\n",
509
  " <tr style=\"text-align: right;\">\n",
510
  " <th></th>\n",
511
+ " <th>keyword</th>\n",
512
  " <th>id</th>\n",
513
  " </tr>\n",
514
  " </thead>\n",
515
  " <tbody>\n",
516
  " <tr>\n",
517
  " <th>0</th>\n",
518
+ " <td>citalopram vs prozac</td>\n",
519
  " <td>0</td>\n",
520
  " </tr>\n",
521
  " <tr>\n",
522
  " <th>1</th>\n",
523
+ " <td>who is the oldest football player</td>\n",
524
  " <td>1</td>\n",
525
  " </tr>\n",
526
  " <tr>\n",
527
  " <th>2</th>\n",
528
+ " <td>t mobile town east</td>\n",
529
  " <td>2</td>\n",
530
  " </tr>\n",
531
  " <tr>\n",
532
  " <th>3</th>\n",
533
+ " <td>starbucks</td>\n",
534
  " <td>2</td>\n",
535
  " </tr>\n",
536
  " <tr>\n",
537
  " <th>4</th>\n",
538
+ " <td>tech crunch</td>\n",
539
  " <td>2</td>\n",
540
  " </tr>\n",
541
  " <tr>\n",
 
544
  " <td>...</td>\n",
545
  " </tr>\n",
546
  " <tr>\n",
547
+ " <th>1703</th>\n",
548
+ " <td>How to make homemade pet accessories from recy...</td>\n",
549
  " <td>1</td>\n",
550
  " </tr>\n",
551
  " <tr>\n",
552
+ " <th>1704</th>\n",
553
+ " <td>Top 10 science fiction book series that take r...</td>\n",
554
  " <td>1</td>\n",
555
  " </tr>\n",
556
  " <tr>\n",
557
+ " <th>1705</th>\n",
558
+ " <td>How to start a car restoration and customizati...</td>\n",
559
  " <td>1</td>\n",
560
  " </tr>\n",
561
  " <tr>\n",
562
+ " <th>1706</th>\n",
563
+ " <td>Ancient Mesopotamian architecture and its infl...</td>\n",
564
  " <td>1</td>\n",
565
  " </tr>\n",
566
  " <tr>\n",
567
+ " <th>1707</th>\n",
568
+ " <td>Benefits of a flexitarian diet for those seeki...</td>\n",
569
  " <td>1</td>\n",
570
  " </tr>\n",
571
  " </tbody>\n",
572
  "</table>\n",
573
+ "<p>1500 rows × 2 columns</p>\n",
574
  "</div>"
575
  ],
576
  "text/plain": [
577
+ " keyword id\n",
578
+ "0 citalopram vs prozac 0\n",
579
+ "1 who is the oldest football player 1\n",
580
+ "2 t mobile town east 2\n",
581
+ "3 starbucks 2\n",
582
+ "4 tech crunch 2\n",
583
  "... ... ..\n",
584
+ "1703 How to make homemade pet accessories from recy... 1\n",
585
+ "1704 Top 10 science fiction book series that take r... 1\n",
586
+ "1705 How to start a car restoration and customizati... 1\n",
587
+ "1706 Ancient Mesopotamian architecture and its infl... 1\n",
588
+ "1707 Benefits of a flexitarian diet for those seeki... 1\n",
589
  "\n",
590
+ "[1500 rows x 2 columns]"
591
  ]
592
  },
593
+ "execution_count": 16,
594
  "metadata": {},
595
  "output_type": "execute_result"
596
  }
597
  ],
598
  "source": [
599
+ "# df= original_df[['metatitle', 'id']]\n",
600
+ "df= original_df[['keyword', 'id']]\n",
601
  "df"
602
  ]
603
  },
604
  {
605
  "cell_type": "code",
606
+ "execution_count": 17,
607
+ "metadata": {},
608
+ "outputs": [
609
+ {
610
+ "data": {
611
+ "text/html": [
612
+ "<div>\n",
613
+ "<style scoped>\n",
614
+ " .dataframe tbody tr th:only-of-type {\n",
615
+ " vertical-align: middle;\n",
616
+ " }\n",
617
+ "\n",
618
+ " .dataframe tbody tr th {\n",
619
+ " vertical-align: top;\n",
620
+ " }\n",
621
+ "\n",
622
+ " .dataframe thead th {\n",
623
+ " text-align: right;\n",
624
+ " }\n",
625
+ "</style>\n",
626
+ "<table border=\"1\" class=\"dataframe\">\n",
627
+ " <thead>\n",
628
+ " <tr style=\"text-align: right;\">\n",
629
+ " <th></th>\n",
630
+ " <th>keyword</th>\n",
631
+ " <th>id</th>\n",
632
+ " </tr>\n",
633
+ " </thead>\n",
634
+ " <tbody>\n",
635
+ " <tr>\n",
636
+ " <th>0</th>\n",
637
+ " <td>Buy baby stroller</td>\n",
638
+ " <td>3</td>\n",
639
+ " </tr>\n",
640
+ " <tr>\n",
641
+ " <th>1</th>\n",
642
+ " <td>Why do leaves change color in the fall?</td>\n",
643
+ " <td>1</td>\n",
644
+ " </tr>\n",
645
+ " <tr>\n",
646
+ " <th>2</th>\n",
647
+ " <td>How to improve your leadership skills</td>\n",
648
+ " <td>1</td>\n",
649
+ " </tr>\n",
650
+ " <tr>\n",
651
+ " <th>3</th>\n",
652
+ " <td>sneakers amazon</td>\n",
653
+ " <td>3</td>\n",
654
+ " </tr>\n",
655
+ " <tr>\n",
656
+ " <th>4</th>\n",
657
+ " <td>Shop for photography equipment</td>\n",
658
+ " <td>3</td>\n",
659
+ " </tr>\n",
660
+ " <tr>\n",
661
+ " <th>...</th>\n",
662
+ " <td>...</td>\n",
663
+ " <td>...</td>\n",
664
+ " </tr>\n",
665
+ " <tr>\n",
666
+ " <th>1495</th>\n",
667
+ " <td>Why do stars twinkle?</td>\n",
668
+ " <td>1</td>\n",
669
+ " </tr>\n",
670
+ " <tr>\n",
671
+ " <th>1496</th>\n",
672
+ " <td>Buy eco-friendly beauty products</td>\n",
673
+ " <td>0</td>\n",
674
+ " </tr>\n",
675
+ " <tr>\n",
676
+ " <th>1497</th>\n",
677
+ " <td>Order makeup kit</td>\n",
678
+ " <td>3</td>\n",
679
+ " </tr>\n",
680
+ " <tr>\n",
681
+ " <th>1498</th>\n",
682
+ " <td>Lowe's</td>\n",
683
+ " <td>2</td>\n",
684
+ " </tr>\n",
685
+ " <tr>\n",
686
+ " <th>1499</th>\n",
687
+ " <td>Get photography equipment</td>\n",
688
+ " <td>3</td>\n",
689
+ " </tr>\n",
690
+ " </tbody>\n",
691
+ "</table>\n",
692
+ "<p>1500 rows × 2 columns</p>\n",
693
+ "</div>"
694
+ ],
695
+ "text/plain": [
696
+ " keyword id\n",
697
+ "0 Buy baby stroller 3\n",
698
+ "1 Why do leaves change color in the fall? 1\n",
699
+ "2 How to improve your leadership skills 1\n",
700
+ "3 sneakers amazon 3\n",
701
+ "4 Shop for photography equipment 3\n",
702
+ "... ... ..\n",
703
+ "1495 Why do stars twinkle? 1\n",
704
+ "1496 Buy eco-friendly beauty products 0\n",
705
+ "1497 Order makeup kit 3\n",
706
+ "1498 Lowe's 2\n",
707
+ "1499 Get photography equipment 3\n",
708
+ "\n",
709
+ "[1500 rows x 2 columns]"
710
+ ]
711
+ },
712
+ "execution_count": 17,
713
+ "metadata": {},
714
+ "output_type": "execute_result"
715
+ }
716
+ ],
717
+ "source": [
718
+ "df= df.sample(frac=1).reset_index(drop=True)\n",
719
+ "\n",
720
+ "df"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "code",
725
+ "execution_count": 18,
726
  "metadata": {},
727
  "outputs": [
728
  {
729
  "name": "stderr",
730
  "output_type": "stream",
731
  "text": [
732
+ "/home/ubuntu/FineTunedDistilledBertAIChecker/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
733
  " from .autonotebook import tqdm as notebook_tqdm\n"
734
  ]
735
  }
 
740
  },
741
  {
742
  "cell_type": "code",
743
+ "execution_count": 19,
744
  "metadata": {},
745
  "outputs": [
 
 
 
 
 
 
 
 
 
 
 
746
  {
747
  "data": {
748
  "text/html": [
 
770
  " </thead>\n",
771
  " <tbody>\n",
772
  " <tr>\n",
773
+ " <th>870</th>\n",
774
+ " <td>Important space missions to the New Horizons s...</td>\n",
775
  " <td>1</td>\n",
776
  " </tr>\n",
777
  " <tr>\n",
778
+ " <th>947</th>\n",
779
+ " <td>How to start a travel and adventure blog</td>\n",
780
+ " <td>1</td>\n",
781
  " </tr>\n",
782
  " <tr>\n",
783
+ " <th>477</th>\n",
784
+ " <td>How to improve your critical thinking skills</td>\n",
785
+ " <td>1</td>\n",
786
  " </tr>\n",
787
  " <tr>\n",
788
+ " <th>174</th>\n",
789
+ " <td>How to make homemade baby food</td>\n",
790
+ " <td>1</td>\n",
791
  " </tr>\n",
792
  " <tr>\n",
793
+ " <th>1369</th>\n",
794
+ " <td>Cheap sustainable clothing brands</td>\n",
795
+ " <td>0</td>\n",
796
  " </tr>\n",
797
  " <tr>\n",
798
+ " <th>396</th>\n",
799
+ " <td>Exploring the mysteries of the deep ocean</td>\n",
800
  " <td>1</td>\n",
801
  " </tr>\n",
802
  " <tr>\n",
803
+ " <th>206</th>\n",
804
+ " <td>Discounted eco-friendly patio decor</td>\n",
805
+ " <td>0</td>\n",
806
  " </tr>\n",
807
  " <tr>\n",
808
+ " <th>191</th>\n",
809
+ " <td>Cheap eco-friendly office products</td>\n",
810
  " <td>0</td>\n",
811
  " </tr>\n",
812
  " <tr>\n",
813
+ " <th>533</th>\n",
814
+ " <td>Affordable pet supplies</td>\n",
815
+ " <td>0</td>\n",
816
  " </tr>\n",
817
  " <tr>\n",
818
+ " <th>1398</th>\n",
819
+ " <td>Travel tips for Japan</td>\n",
820
+ " <td>1</td>\n",
821
  " </tr>\n",
822
  " </tbody>\n",
823
  "</table>\n",
 
825
  ],
826
  "text/plain": [
827
  " text label\n",
828
+ "870 Important space missions to the New Horizons s... 1\n",
829
+ "947 How to start a travel and adventure blog 1\n",
830
+ "477 How to improve your critical thinking skills 1\n",
831
+ "174 How to make homemade baby food 1\n",
832
+ "1369 Cheap sustainable clothing brands 0\n",
833
+ "396 Exploring the mysteries of the deep ocean 1\n",
834
+ "206 Discounted eco-friendly patio decor 0\n",
835
+ "191 Cheap eco-friendly office products 0\n",
836
+ "533 Affordable pet supplies 0\n",
837
+ "1398 Travel tips for Japan 1"
838
  ]
839
  },
840
+ "execution_count": 19,
841
  "metadata": {},
842
  "output_type": "execute_result"
843
  }
844
  ],
845
  "source": [
846
  "df.rename(columns={\n",
847
+ " \"keyword\": \"text\", \n",
848
+ " # \"metatitle\": \"text\", \n",
849
  " \"id\": \"label\"\n",
850
  "}, \n",
851
  " inplace=True\n",
 
856
  },
857
  {
858
  "cell_type": "code",
859
+ "execution_count": 20,
860
  "metadata": {},
861
  "outputs": [
 
 
 
 
 
 
 
 
862
  {
863
  "data": {
864
  "text/plain": [
865
  "Dataset({\n",
866
+ " features: ['text', 'label'],\n",
867
+ " num_rows: 1500\n",
868
  "})"
869
  ]
870
  },
871
+ "execution_count": 20,
872
  "metadata": {},
873
  "output_type": "execute_result"
874
  }
 
880
  },
881
  {
882
  "cell_type": "code",
883
+ "execution_count": 21,
884
  "metadata": {},
885
  "outputs": [
886
  {
 
888
  "text/plain": [
889
  "DatasetDict({\n",
890
  " train: Dataset({\n",
891
+ " features: ['text', 'label'],\n",
892
+ " num_rows: 1125\n",
893
  " })\n",
894
  " test: Dataset({\n",
895
+ " features: ['text', 'label'],\n",
896
+ " num_rows: 375\n",
897
  " })\n",
898
  "})"
899
  ]
900
  },
901
+ "execution_count": 21,
902
  "metadata": {},
903
  "output_type": "execute_result"
904
  }
905
  ],
906
  "source": [
907
+ "new_data= dataset_df.train_test_split(test_size=0.25)\n",
908
  "new_data"
909
  ]
910
  },
911
  {
912
  "cell_type": "code",
913
+ "execution_count": 22,
914
  "metadata": {},
915
  "outputs": [],
916
  "source": [
 
921
  },
922
  {
923
  "cell_type": "code",
924
+ "execution_count": 23,
925
  "metadata": {},
926
  "outputs": [],
927
  "source": [
 
931
  },
932
  {
933
  "cell_type": "code",
934
+ "execution_count": 24,
935
  "metadata": {},
936
  "outputs": [
937
  {
938
  "name": "stderr",
939
  "output_type": "stream",
940
  "text": [
941
+ "Map: 100%|██████████| 1125/1125 [00:00<00:00, 31352.56 examples/s]\n",
942
+ "Map: 100%|██████████| 375/375 [00:00<00:00, 29503.00 examples/s]\n"
 
 
 
 
 
 
 
943
  ]
944
  }
945
  ],
 
949
  },
950
  {
951
  "cell_type": "code",
952
+ "execution_count": 25,
953
  "metadata": {},
954
  "outputs": [
955
  {
956
  "name": "stderr",
957
  "output_type": "stream",
958
  "text": [
959
+ "2023-11-04 12:46:03.199613: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
960
+ "2023-11-04 12:46:03.249373: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
961
+ "2023-11-04 12:46:03.249409: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
962
+ "2023-11-04 12:46:03.249439: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
963
+ "2023-11-04 12:46:03.257947: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
964
+ "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
965
+ "2023-11-04 12:46:04.345188: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
966
  ]
967
  }
968
  ],
 
981
  },
982
  {
983
  "cell_type": "code",
984
+ "execution_count": 26,
985
  "metadata": {},
986
  "outputs": [],
987
  "source": [
 
992
  },
993
  {
994
  "cell_type": "code",
995
+ "execution_count": 27,
996
  "metadata": {},
997
  "outputs": [],
998
  "source": [
 
1007
  },
1008
  {
1009
  "cell_type": "code",
1010
+ "execution_count": 28,
1011
  "metadata": {},
1012
  "outputs": [
1013
  {
1014
  "name": "stderr",
1015
  "output_type": "stream",
1016
  "text": [
1017
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
1018
  "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
1019
  ]
1020
  }
 
1024
  "\n",
1025
  "model = AutoModelForSequenceClassification.from_pretrained(\n",
1026
  " # \"distilbert-base-uncased\", num_labels=5, id2label=id2label, label2id=label2id\n",
1027
+ " # \"distilbert-base-uncased\", num_labels=4, id2label=id2label, label2id=label2id # removed local\n",
1028
+ " \"bert-base-uncased\", num_labels=4, id2label=id2label, label2id=label2id # removed local\n",
1029
  ")"
1030
  ]
1031
  },
1032
  {
1033
  "cell_type": "code",
1034
+ "execution_count": 29,
1035
  "metadata": {},
1036
  "outputs": [
1037
  {
 
1047
  "\n",
1048
  " <div>\n",
1049
  " \n",
1050
+ " <progress value='426' max='426' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1051
+ " [426/426 00:56, Epoch 6/6]\n",
1052
  " </div>\n",
1053
  " <table border=\"1\" class=\"dataframe\">\n",
1054
  " <thead>\n",
 
1063
  " <tr>\n",
1064
  " <td>1</td>\n",
1065
  " <td>No log</td>\n",
1066
+ " <td>0.350181</td>\n",
1067
+ " <td>0.957333</td>\n",
1068
  " </tr>\n",
1069
  " <tr>\n",
1070
  " <td>2</td>\n",
1071
  " <td>No log</td>\n",
1072
+ " <td>0.107043</td>\n",
1073
+ " <td>0.973333</td>\n",
1074
  " </tr>\n",
1075
  " <tr>\n",
1076
  " <td>3</td>\n",
1077
  " <td>No log</td>\n",
1078
+ " <td>0.087978</td>\n",
1079
+ " <td>0.978667</td>\n",
1080
  " </tr>\n",
1081
  " <tr>\n",
1082
  " <td>4</td>\n",
1083
  " <td>No log</td>\n",
1084
+ " <td>0.085274</td>\n",
1085
+ " <td>0.973333</td>\n",
1086
  " </tr>\n",
1087
  " <tr>\n",
1088
  " <td>5</td>\n",
1089
  " <td>No log</td>\n",
1090
+ " <td>0.086987</td>\n",
1091
+ " <td>0.973333</td>\n",
1092
  " </tr>\n",
1093
  " <tr>\n",
1094
  " <td>6</td>\n",
1095
+ " <td>No log</td>\n",
1096
+ " <td>0.093197</td>\n",
1097
+ " <td>0.970667</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1098
  " </tr>\n",
1099
  " </tbody>\n",
1100
  "</table><p>"
 
1109
  {
1110
  "data": {
1111
  "text/plain": [
1112
+ "TrainOutput(global_step=426, training_loss=0.1806535676051753, metrics={'train_runtime': 57.2339, 'train_samples_per_second': 117.937, 'train_steps_per_second': 7.443, 'total_flos': 44042600979624.0, 'train_loss': 0.1806535676051753, 'epoch': 6.0})"
1113
  ]
1114
  },
1115
+ "execution_count": 29,
1116
  "metadata": {},
1117
  "output_type": "execute_result"
1118
  }
1119
  ],
1120
  "source": [
1121
  "training_args = TrainingArguments(\n",
1122
+ " output_dir=\"intent_classification_model_without_metatitle_with_local23\",\n",
1123
  " learning_rate=2e-5,\n",
1124
  " per_device_train_batch_size=16,\n",
1125
  " per_device_eval_batch_size=16,\n",
1126
+ " num_train_epochs=6,\n",
1127
  " weight_decay=0.01,\n",
1128
  " evaluation_strategy=\"epoch\",\n",
1129
  " save_strategy=\"epoch\",\n",
research/14_keyword_intent.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
utils/__pycache__/client.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/client.cpython-310.pyc and b/utils/__pycache__/client.cpython-310.pyc differ
 
utils/client.py CHANGED
@@ -40,6 +40,28 @@ client = RestClient(data_for_seo_email, data_for_seo_password)
40
  # client = RestClient("deepankar@warewe.com", "cb1661e9ec7c1fba")
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def generate_seo_metatitle(keyword, num_query_results=10):
44
  post_data = dict()
45
  # You can set only one task at a time
 
40
  # client = RestClient("deepankar@warewe.com", "cb1661e9ec7c1fba")
41
 
42
 
43
+
44
def generate_keyword_intent_list(list_of_keywords: list):
    """Request search-intent data for a batch of keywords.

    Sends one task to the DataForSEO Labs ``search_intent/live`` endpoint
    through the module-level ``client`` and returns the ``items`` entry of
    the first result of the first task.

    Args:
        list_of_keywords: Keywords to submit; passed unchanged as the task's
            ``keywords`` field together with ``language_name="English"``.

    Returns:
        The ``items`` value of the first result, or ``None`` when the request
        fails, the response holds no task, or the task holds no result. An
        error line is printed in each of those cases.
    """
    # The payload is a dict of tasks keyed by index; only one task is sent.
    post_data = {
        0: dict(
            keywords=list_of_keywords,
            language_name="English",
        )
    }
    # POST /v3/dataforseo_labs/google/search_intent/live
    response = client.post("/v3/dataforseo_labs/google/search_intent/live", post_data)

    # you can find the full list of the response codes here https://docs.dataforseo.com/v3/appendix/errors
    if response["status_code"] != 20000:
        print("error. Code: %d Message: %s" % (response["status_code"], response["status_message"]))
        return None

    # A successful envelope does not guarantee a usable task: guard every
    # level before indexing so a bad task reports an error instead of raising.
    tasks = response.get("tasks") or []
    if not tasks:
        print("error. Code: %s Message: %s" % (response["status_code"], "response contains no tasks"))
        return None

    task = tasks[0]
    # NOTE(review): per the DataForSEO docs each task carries its own
    # status_code/status_message and a null result on failure -- confirm.
    results = task.get("result") or []
    if not results:
        print("error. Code: %s Message: %s" % (task.get("status_code"), task.get("status_message")))
        return None

    return results[0]["items"]
60
+
61
+
62
+
63
+
64
+
65
  def generate_seo_metatitle(keyword, num_query_results=10):
66
  post_data = dict()
67
  # You can set only one task at a time