added stuffs

Browse files

Files changed (9) hide show

.gitignore +2 -1
__pycache__/keys.cpython-310.pyc +0 -0
data/data_for_seo_new_intent.csv +3 -0
data/intent_data_dataforseo.json +0 -0
requirements.txt +2 -1
research/11_intent_classification_using_distilbert.ipynb +431 -399
research/14_keyword_intent.ipynb +0 -0
utils/__pycache__/client.cpython-310.pyc +0 -0
utils/client.py +22 -0

.gitignore CHANGED Viewed

@@ -6,4 +6,5 @@ intent_classification_model_with_metatitle_with_local2/
 intent_classification_model_with_metatitle_with_local1/
 intent_classification_model_with_metatitle_with_local/
 intent_classification_model_with_metatitle/
-intent_classification_model_with_metatitle_with_local2/

 intent_classification_model_with_metatitle_with_local1/
 intent_classification_model_with_metatitle_with_local/
 intent_classification_model_with_metatitle/
+intent_classification_model_with_metatitle_with_local2/
+intent_classification_model_without_metatitle_with_local23/

__pycache__/keys.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/keys.cpython-310.pyc and b/__pycache__/keys.cpython-310.pyc differ

data/data_for_seo_new_intent.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da30bf15a41177fee836996b499bc5a0d59fd45853f8d12f995d146f99339210
+size 2357733

data/intent_data_dataforseo.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -12,4 +12,5 @@ evaluate
 openpyxl
 summa
 git+https://github.com/LIAAD/yake
-multi_rake

 openpyxl
 summa
 git+https://github.com/LIAAD/yake
+multi_rake
+accelerate

research/11_intent_classification_using_distilbert.ipynb CHANGED Viewed

@@ -46,64 +46,45 @@
        "      <th></th>\n",
        "      <th>keyword</th>\n",
        "      <th>intent</th>\n",
-       "      <th>id</th>\n",
-       "      <th>metatitle</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>citalopram vs prozac</td>\n",
-       "      <td>Commercial</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Celexa vs Prozac - ClarityX clarityxdna.com ht...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>who is the oldest football player</td>\n",
-       "      <td>Informational</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Oldest active NFL players and in league histor...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>t mobile town east</td>\n",
-       "      <td>Navigational</td>\n",
-       "      <td>2</td>\n",
-       "      <td>T-Mobile Town East Blvd &amp; Pavillion Ct | Mesqu...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>starbucks</td>\n",
-       "      <td>Navigational</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Starbucks Coffee Company www.starbucks.com htt...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>tech crunch</td>\n",
-       "      <td>Navigational</td>\n",
-       "      <td>2</td>\n",
-       "      <td>TechCrunch | Startup and Technology News techc...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                             keyword         intent  id  \\\n",
-       "0               citalopram vs prozac     Commercial   0   \n",
-       "1  who is the oldest football player  Informational   1   \n",
-       "2                 t mobile town east   Navigational   2   \n",
-       "3                          starbucks   Navigational   2   \n",
-       "4                        tech crunch   Navigational   2   \n",
-       "\n",
-       "                                           metatitle  \n",
-       "0  Celexa vs Prozac - ClarityX clarityxdna.com ht...  \n",
-       "1  Oldest active NFL players and in league histor...  \n",
-       "2  T-Mobile Town East Blvd & Pavillion Ct | Mesqu...  \n",
-       "3  Starbucks Coffee Company www.starbucks.com htt...  \n",
-       "4  TechCrunch | Startup and Technology News techc...  "
       ]
      },
      "execution_count": 3,
@@ -112,10 +93,20 @@
     }
    ],
    "source": [
-    "original_df= pd.read_csv(\"data_intent/intent_with_metatitle.csv\")\n",
     "original_df.head()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -123,10 +114,66 @@
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "False    1659\n",
-       "True      343\n",
-       "Name: count, dtype: int64"
       ]
      },
      "execution_count": 4,
@@ -135,7 +182,10 @@
     }
    ],
    "source": [
-    "original_df.duplicated().value_counts()"
    ]
   },
   {
@@ -144,7 +194,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# original_df.drop_duplicates(inplace=True)"
    ]
   },
   {
@@ -155,8 +214,8 @@
     {
      "data": {
       "text/plain": [
-       "False    1659\n",
-       "True      343\n",
        "Name: count, dtype: int64"
       ]
      },
@@ -175,16 +234,29 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "original_df= original_df[original_df.intent!='Local']"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "intents= original_df.intent.unique().tolist()"
    ]
   },
   {
@@ -192,6 +264,36 @@
    "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
     "id2label= {}\n",
     "label2id= {}\n",
@@ -202,16 +304,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{0: 'Commercial', 1: 'Informational', 2: 'Navigational', 3: 'Transactional'}"
       ]
      },
-     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -222,16 +324,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'Commercial': 0, 'Informational': 1, 'Navigational': 2, 'Transactional': 3}"
       ]
      },
-     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -242,7 +344,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -252,7 +354,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -279,123 +381,98 @@
        "      <th>keyword</th>\n",
        "      <th>intent</th>\n",
        "      <th>id</th>\n",
-       "      <th>metatitle</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>citalopram vs prozac</td>\n",
-       "      <td>Commercial</td>\n",
        "      <td>0</td>\n",
-       "      <td>Celexa vs Prozac - ClarityX clarityxdna.com ht...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>who is the oldest football player</td>\n",
-       "      <td>Informational</td>\n",
        "      <td>1</td>\n",
-       "      <td>Oldest active NFL players and in league histor...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>t mobile town east</td>\n",
-       "      <td>Navigational</td>\n",
        "      <td>2</td>\n",
-       "      <td>T-Mobile Town East Blvd &amp; Pavillion Ct | Mesqu...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>starbucks</td>\n",
-       "      <td>Navigational</td>\n",
        "      <td>2</td>\n",
-       "      <td>Starbucks Coffee Company www.starbucks.com htt...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>tech crunch</td>\n",
-       "      <td>Navigational</td>\n",
        "      <td>2</td>\n",
-       "      <td>TechCrunch | Startup and Technology News techc...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
-       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1997</th>\n",
        "      <td>How to make homemade pet accessories from recy...</td>\n",
-       "      <td>Informational</td>\n",
        "      <td>1</td>\n",
-       "      <td>Try These Dog Products Made From Recycled Mate...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1998</th>\n",
        "      <td>Top 10 science fiction book series that take r...</td>\n",
-       "      <td>Informational</td>\n",
        "      <td>1</td>\n",
-       "      <td>10 Sci-Fi and Fantasy Books About Fantastical ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1999</th>\n",
        "      <td>How to start a car restoration and customizati...</td>\n",
-       "      <td>Informational</td>\n",
        "      <td>1</td>\n",
-       "      <td>What to Consider When Starting an Auto Restora...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2000</th>\n",
        "      <td>Ancient Mesopotamian architecture and its infl...</td>\n",
-       "      <td>Informational</td>\n",
        "      <td>1</td>\n",
-       "      <td>Mesopotamian art and architecture | Characteri...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2001</th>\n",
        "      <td>Benefits of a flexitarian diet for those seeki...</td>\n",
-       "      <td>Informational</td>\n",
        "      <td>1</td>\n",
-       "      <td>The Flexitarian Diet: A Detailed Beginner's Gu...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>1786 rows × 4 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                                keyword         intent  id  \\\n",
-       "0                                  citalopram vs prozac     Commercial   0   \n",
-       "1                     who is the oldest football player  Informational   1   \n",
-       "2                                    t mobile town east   Navigational   2   \n",
-       "3                                             starbucks   Navigational   2   \n",
-       "4                                           tech crunch   Navigational   2   \n",
-       "...                                                 ...            ...  ..   \n",
-       "1997  How to make homemade pet accessories from recy...  Informational   1   \n",
-       "1998  Top 10 science fiction book series that take r...  Informational   1   \n",
-       "1999  How to start a car restoration and customizati...  Informational   1   \n",
-       "2000  Ancient Mesopotamian architecture and its infl...  Informational   1   \n",
-       "2001  Benefits of a flexitarian diet for those seeki...  Informational   1   \n",
-       "\n",
-       "                                              metatitle  \n",
-       "0     Celexa vs Prozac - ClarityX clarityxdna.com ht...  \n",
-       "1     Oldest active NFL players and in league histor...  \n",
-       "2     T-Mobile Town East Blvd & Pavillion Ct | Mesqu...  \n",
-       "3     Starbucks Coffee Company www.starbucks.com htt...  \n",
-       "4     TechCrunch | Startup and Technology News techc...  \n",
-       "...                                                 ...  \n",
-       "1997  Try These Dog Products Made From Recycled Mate...  \n",
-       "1998  10 Sci-Fi and Fantasy Books About Fantastical ...  \n",
-       "1999  What to Consider When Starting an Auto Restora...  \n",
-       "2000  Mesopotamian art and architecture | Characteri...  \n",
-       "2001  The Flexitarian Diet: A Detailed Beginner's Gu...  \n",
        "\n",
-       "[1786 rows x 4 columns]"
       ]
      },
-     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -407,7 +484,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -431,34 +508,34 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>metatitle</th>\n",
        "      <th>id</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>Celexa vs Prozac - ClarityX clarityxdna.com ht...</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>Oldest active NFL players and in league histor...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>T-Mobile Town East Blvd &amp; Pavillion Ct | Mesqu...</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>Starbucks Coffee Company www.starbucks.com htt...</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>TechCrunch | Startup and Technology News techc...</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -467,72 +544,192 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1997</th>\n",
-       "      <td>Try These Dog Products Made From Recycled Mate...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1998</th>\n",
-       "      <td>10 Sci-Fi and Fantasy Books About Fantastical ...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1999</th>\n",
-       "      <td>What to Consider When Starting an Auto Restora...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2000</th>\n",
-       "      <td>Mesopotamian art and architecture | Characteri...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2001</th>\n",
-       "      <td>The Flexitarian Diet: A Detailed Beginner's Gu...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>1786 rows × 2 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                              metatitle  id\n",
-       "0     Celexa vs Prozac - ClarityX clarityxdna.com ht...   0\n",
-       "1     Oldest active NFL players and in league histor...   1\n",
-       "2     T-Mobile Town East Blvd & Pavillion Ct | Mesqu...   2\n",
-       "3     Starbucks Coffee Company www.starbucks.com htt...   2\n",
-       "4     TechCrunch | Startup and Technology News techc...   2\n",
        "...                                                 ...  ..\n",
-       "1997  Try These Dog Products Made From Recycled Mate...   1\n",
-       "1998  10 Sci-Fi and Fantasy Books About Fantastical ...   1\n",
-       "1999  What to Consider When Starting an Auto Restora...   1\n",
-       "2000  Mesopotamian art and architecture | Characteri...   1\n",
-       "2001  The Flexitarian Diet: A Detailed Beginner's Gu...   1\n",
        "\n",
-       "[1786 rows x 2 columns]"
       ]
      },
-     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df= original_df[['metatitle', 'id']]\n",
     "df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
       "  from .autonotebook import tqdm as notebook_tqdm\n"
      ]
     }
@@ -543,20 +740,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_401416/1659657905.py:1: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame\n",
-      "\n",
-      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
-      "  df.rename(columns={\n"
-     ]
-    },
     {
      "data": {
       "text/html": [
@@ -584,54 +770,54 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>287</th>\n",
-       "      <td>Vanilla Pudding Recipe www.bettycrocker.com ht...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>730</th>\n",
-       "      <td>Garden Outlet - Garden Sale + Free Shipping ww...</td>\n",
-       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>897</th>\n",
-       "      <td>Party Supplies on Sale | Oriental Trading Comp...</td>\n",
-       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>95</th>\n",
-       "      <td>My Chemical Romance www.mychemicalromance.com ...</td>\n",
-       "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1832</th>\n",
-       "      <td>Einstein's Special Theory of Relativity | PBS ...</td>\n",
-       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1751</th>\n",
-       "      <td>12 Breathtaking Places to Go Kayaking in the U...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1870</th>\n",
-       "      <td>Nuclear “Power Balls” May Make Meltdowns a Thi...</td>\n",
-       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>795</th>\n",
-       "      <td>Natural &amp; Organic Makeup and Skincare – INIKA ...</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1707</th>\n",
-       "      <td>10 Best Ski Resorts in the US www.travelandlei...</td>\n",
-       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>92</th>\n",
-       "      <td>Hozier | Unreal Unearth THE NEW ALBUM OUT NOW ...</td>\n",
-       "      <td>2</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -639,27 +825,27 @@
       ],
       "text/plain": [
        "                                                   text  label\n",
-       "287   Vanilla Pudding Recipe www.bettycrocker.com ht...      1\n",
-       "730   Garden Outlet - Garden Sale + Free Shipping ww...      0\n",
-       "897   Party Supplies on Sale | Oriental Trading Comp...      3\n",
-       "95    My Chemical Romance www.mychemicalromance.com ...      2\n",
-       "1832  Einstein's Special Theory of Relativity | PBS ...      1\n",
-       "1751  12 Breathtaking Places to Go Kayaking in the U...      1\n",
-       "1870  Nuclear “Power Balls” May Make Meltdowns a Thi...      1\n",
-       "795   Natural & Organic Makeup and Skincare – INIKA ...      0\n",
-       "1707  10 Best Ski Resorts in the US www.travelandlei...      1\n",
-       "92    Hozier | Unreal Unearth THE NEW ALBUM OUT NOW ...      2"
       ]
      },
-     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "df.rename(columns={\n",
-    "    # \"keyword\": \"text\", \n",
-    "    \"metatitle\": \"text\", \n",
     "    \"id\": \"label\"\n",
     "}, \n",
     "          inplace=True\n",
@@ -670,27 +856,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:373: FutureWarning: is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.\n",
-      "  if _pandas_api.is_sparse(col):\n"
-     ]
-    },
     {
      "data": {
       "text/plain": [
        "Dataset({\n",
-       "    features: ['text', 'label', '__index_level_0__'],\n",
-       "    num_rows: 1786\n",
        "})"
       ]
      },
-     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -702,7 +880,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -710,29 +888,29 @@
       "text/plain": [
        "DatasetDict({\n",
        "    train: Dataset({\n",
-       "        features: ['text', 'label', '__index_level_0__'],\n",
-       "        num_rows: 1428\n",
        "    })\n",
        "    test: Dataset({\n",
-       "        features: ['text', 'label', '__index_level_0__'],\n",
-       "        num_rows: 358\n",
        "    })\n",
        "})"
       ]
      },
-     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "new_data= dataset_df.train_test_split(test_size=0.2)\n",
     "new_data"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -743,7 +921,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -753,22 +931,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Map: 100%|██████████| 1428/1428 [00:01<00:00, 1417.99 examples/s]\n",
-      "Map:   0%|          | 0/358 [00:00<?, ? examples/s]"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Map: 100%|██████████| 358/358 [00:00<00:00, 1509.84 examples/s]\n"
      ]
     }
    ],
@@ -778,16 +949,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2023-10-18 11:22:35.000690: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
-      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-      "2023-10-18 11:22:36.451442: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
      ]
     }
    ],
@@ -806,7 +981,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -817,7 +992,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -832,14 +1007,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']\n",
       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
      ]
     }
@@ -849,13 +1024,14 @@
     "\n",
     "model = AutoModelForSequenceClassification.from_pretrained(\n",
     "    # \"distilbert-base-uncased\", num_labels=5, id2label=id2label, label2id=label2id\n",
-    "    \"distilbert-base-uncased\", num_labels=4, id2label=id2label, label2id=label2id # removed local\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -871,8 +1047,8 @@
        "\n",
        "    <div>\n",
        "      \n",
-       "      <progress value='2700' max='2700' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-       "      [2700/2700 12:13, Epoch 30/30]\n",
        "    </div>\n",
        "    <table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
@@ -887,182 +1063,38 @@
        "    <tr>\n",
        "      <td>1</td>\n",
        "      <td>No log</td>\n",
-       "      <td>0.386599</td>\n",
-       "      <td>0.927374</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>2</td>\n",
        "      <td>No log</td>\n",
-       "      <td>0.187701</td>\n",
-       "      <td>0.944134</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>3</td>\n",
        "      <td>No log</td>\n",
-       "      <td>0.219236</td>\n",
-       "      <td>0.938547</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>4</td>\n",
        "      <td>No log</td>\n",
-       "      <td>0.212073</td>\n",
-       "      <td>0.935754</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>5</td>\n",
        "      <td>No log</td>\n",
-       "      <td>0.157072</td>\n",
-       "      <td>0.958101</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>6</td>\n",
-       "      <td>0.244800</td>\n",
-       "      <td>0.149268</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>7</td>\n",
-       "      <td>0.244800</td>\n",
-       "      <td>0.138416</td>\n",
-       "      <td>0.963687</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>8</td>\n",
-       "      <td>0.244800</td>\n",
-       "      <td>0.129277</td>\n",
-       "      <td>0.969274</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>9</td>\n",
-       "      <td>0.244800</td>\n",
-       "      <td>0.155066</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>10</td>\n",
-       "      <td>0.244800</td>\n",
-       "      <td>0.132079</td>\n",
-       "      <td>0.966480</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>11</td>\n",
-       "      <td>0.244800</td>\n",
-       "      <td>0.138543</td>\n",
-       "      <td>0.969274</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>12</td>\n",
-       "      <td>0.040300</td>\n",
-       "      <td>0.162308</td>\n",
-       "      <td>0.966480</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>13</td>\n",
-       "      <td>0.040300</td>\n",
-       "      <td>0.132775</td>\n",
-       "      <td>0.969274</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>14</td>\n",
-       "      <td>0.040300</td>\n",
-       "      <td>0.169590</td>\n",
-       "      <td>0.966480</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>15</td>\n",
-       "      <td>0.040300</td>\n",
-       "      <td>0.151754</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>16</td>\n",
-       "      <td>0.040300</td>\n",
-       "      <td>0.150127</td>\n",
-       "      <td>0.972067</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>17</td>\n",
-       "      <td>0.024200</td>\n",
-       "      <td>0.159291</td>\n",
-       "      <td>0.963687</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>18</td>\n",
-       "      <td>0.024200</td>\n",
-       "      <td>0.162419</td>\n",
-       "      <td>0.963687</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>19</td>\n",
-       "      <td>0.024200</td>\n",
-       "      <td>0.172608</td>\n",
-       "      <td>0.963687</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>20</td>\n",
-       "      <td>0.024200</td>\n",
-       "      <td>0.176368</td>\n",
-       "      <td>0.963687</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>21</td>\n",
-       "      <td>0.024200</td>\n",
-       "      <td>0.179977</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>22</td>\n",
-       "      <td>0.024200</td>\n",
-       "      <td>0.175084</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>23</td>\n",
-       "      <td>0.016700</td>\n",
-       "      <td>0.186994</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>24</td>\n",
-       "      <td>0.016700</td>\n",
-       "      <td>0.177934</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>25</td>\n",
-       "      <td>0.016700</td>\n",
-       "      <td>0.183129</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>26</td>\n",
-       "      <td>0.016700</td>\n",
-       "      <td>0.180832</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>27</td>\n",
-       "      <td>0.016700</td>\n",
-       "      <td>0.179173</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>28</td>\n",
-       "      <td>0.016300</td>\n",
-       "      <td>0.182724</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>29</td>\n",
-       "      <td>0.016300</td>\n",
-       "      <td>0.181777</td>\n",
-       "      <td>0.960894</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>30</td>\n",
-       "      <td>0.016300</td>\n",
-       "      <td>0.182771</td>\n",
-       "      <td>0.960894</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table><p>"
@@ -1077,21 +1109,21 @@
     {
      "data": {
       "text/plain": [
-       "TrainOutput(global_step=2700, training_loss=0.0646134274094193, metrics={'train_runtime': 734.5773, 'train_samples_per_second': 58.319, 'train_steps_per_second': 3.676, 'total_flos': 5675105766113280.0, 'train_loss': 0.0646134274094193, 'epoch': 30.0})"
       ]
      },
-     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "training_args = TrainingArguments(\n",
-    "    output_dir=\"intent_classification_model_with_metatitle_with_local2\",\n",
     "    learning_rate=2e-5,\n",
     "    per_device_train_batch_size=16,\n",
     "    per_device_eval_batch_size=16,\n",
-    "    num_train_epochs=30,\n",
     "    weight_decay=0.01,\n",
     "    evaluation_strategy=\"epoch\",\n",
     "    save_strategy=\"epoch\",\n",

        "      <th></th>\n",
        "      <th>keyword</th>\n",
        "      <th>intent</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
+       "      <td>social media groups</td>\n",
+       "      <td>informational</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
+       "      <td>social media groups</td>\n",
+       "      <td>navigational</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
+       "      <td>internet forums</td>\n",
+       "      <td>navigational</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
+       "      <td>virtual communities</td>\n",
+       "      <td>navigational</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
+       "      <td>online discussion boards</td>\n",
+       "      <td>commercial</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
+       "                    keyword         intent\n",
+       "0       social media groups  informational\n",
+       "1       social media groups   navigational\n",
+       "2           internet forums   navigational\n",
+       "3       virtual communities   navigational\n",
+       "4  online discussion boards     commercial"
       ]
      },
      "execution_count": 3,
     }
    ],
    "source": [
+    "original_df= pd.read_csv(\"data/data_for_seo_new_intent.csv\")\n",
     "original_df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def map_intent(intent:str):\n",
+    "    return intent.lower()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>keyword</th>\n",
+       "      <th>intent</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>citalopram vs prozac</td>\n",
+       "      <td>commercial</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>who is the oldest football player</td>\n",
+       "      <td>informational</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>t mobile town east</td>\n",
+       "      <td>navigational</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>starbucks</td>\n",
+       "      <td>navigational</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>tech crunch</td>\n",
+       "      <td>navigational</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
+       "                             keyword         intent\n",
+       "0               citalopram vs prozac     commercial\n",
+       "1  who is the oldest football player  informational\n",
+       "2                 t mobile town east   navigational\n",
+       "3                          starbucks   navigational\n",
+       "4                        tech crunch   navigational"
       ]
      },
      "execution_count": 4,
     }
    ],
    "source": [
+    "temp_df= pd.read_csv(\"data_intent/intent_data.csv\")\n",
+    "temp_df.intent= temp_df.intent.map(map_intent)\n",
+    "temp_df= temp_df[temp_df.intent!=\"local\"]\n",
+    "temp_df.head()"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# original_df= temp_df.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "original_df= pd.concat([original_df, temp_df])"
    ]
   },
   {
     {
      "data": {
       "text/plain": [
+       "False    1304\n",
+       "True      196\n",
        "Name: count, dtype: int64"
       ]
      },
    "metadata": {},
    "outputs": [],
    "source": [
+    "# original_df.drop_duplicates(inplace=True)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False    1304\n",
+       "True      196\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "original_df.duplicated().value_counts()"
    ]
   },
   {
    "execution_count": 9,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "original_df= original_df[original_df.intent!='Local']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['commercial', 'informational', 'navigational', 'transactional']"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "intents= original_df.intent.unique().tolist()\n",
+    "intents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "id2label= {}\n",
     "label2id= {}\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "{0: 'commercial', 1: 'informational', 2: 'navigational', 3: 'transactional'}"
       ]
      },
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "{'commercial': 0, 'informational': 1, 'navigational': 2, 'transactional': 3}"
       ]
      },
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
        "      <th>keyword</th>\n",
        "      <th>intent</th>\n",
        "      <th>id</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>citalopram vs prozac</td>\n",
+       "      <td>commercial</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>who is the oldest football player</td>\n",
+       "      <td>informational</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>t mobile town east</td>\n",
+       "      <td>navigational</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>starbucks</td>\n",
+       "      <td>navigational</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>tech crunch</td>\n",
+       "      <td>navigational</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1703</th>\n",
        "      <td>How to make homemade pet accessories from recy...</td>\n",
+       "      <td>informational</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1704</th>\n",
        "      <td>Top 10 science fiction book series that take r...</td>\n",
+       "      <td>informational</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1705</th>\n",
        "      <td>How to start a car restoration and customizati...</td>\n",
+       "      <td>informational</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1706</th>\n",
        "      <td>Ancient Mesopotamian architecture and its infl...</td>\n",
+       "      <td>informational</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1707</th>\n",
        "      <td>Benefits of a flexitarian diet for those seeki...</td>\n",
+       "      <td>informational</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>1500 rows × 3 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
+       "                                                keyword         intent  id\n",
+       "0                                  citalopram vs prozac     commercial   0\n",
+       "1                     who is the oldest football player  informational   1\n",
+       "2                                    t mobile town east   navigational   2\n",
+       "3                                             starbucks   navigational   2\n",
+       "4                                           tech crunch   navigational   2\n",
+       "...                                                 ...            ...  ..\n",
+       "1703  How to make homemade pet accessories from recy...  informational   1\n",
+       "1704  Top 10 science fiction book series that take r...  informational   1\n",
+       "1705  How to start a car restoration and customizati...  informational   1\n",
+       "1706  Ancient Mesopotamian architecture and its infl...  informational   1\n",
+       "1707  Benefits of a flexitarian diet for those seeki...  informational   1\n",
        "\n",
+       "[1500 rows x 3 columns]"
       ]
      },
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
+       "      <th>keyword</th>\n",
        "      <th>id</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
+       "      <td>citalopram vs prozac</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
+       "      <td>who is the oldest football player</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
+       "      <td>t mobile town east</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
+       "      <td>starbucks</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
+       "      <td>tech crunch</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1703</th>\n",
+       "      <td>How to make homemade pet accessories from recy...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1704</th>\n",
+       "      <td>Top 10 science fiction book series that take r...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1705</th>\n",
+       "      <td>How to start a car restoration and customizati...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1706</th>\n",
+       "      <td>Ancient Mesopotamian architecture and its infl...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1707</th>\n",
+       "      <td>Benefits of a flexitarian diet for those seeki...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>1500 rows × 2 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
+       "                                                keyword  id\n",
+       "0                                  citalopram vs prozac   0\n",
+       "1                     who is the oldest football player   1\n",
+       "2                                    t mobile town east   2\n",
+       "3                                             starbucks   2\n",
+       "4                                           tech crunch   2\n",
        "...                                                 ...  ..\n",
+       "1703  How to make homemade pet accessories from recy...   1\n",
+       "1704  Top 10 science fiction book series that take r...   1\n",
+       "1705  How to start a car restoration and customizati...   1\n",
+       "1706  Ancient Mesopotamian architecture and its infl...   1\n",
+       "1707  Benefits of a flexitarian diet for those seeki...   1\n",
        "\n",
+       "[1500 rows x 2 columns]"
       ]
      },
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# df= original_df[['metatitle', 'id']]\n",
+    "df= original_df[['keyword', 'id']]\n",
     "df"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>keyword</th>\n",
+       "      <th>id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Buy baby stroller</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Why do leaves change color in the fall?</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>How to improve your leadership skills</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>sneakers amazon</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Shop for photography equipment</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1495</th>\n",
+       "      <td>Why do stars twinkle?</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1496</th>\n",
+       "      <td>Buy eco-friendly beauty products</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1497</th>\n",
+       "      <td>Order makeup kit</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1498</th>\n",
+       "      <td>Lowe's</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1499</th>\n",
+       "      <td>Get photography equipment</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1500 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                      keyword  id\n",
+       "0                           Buy baby stroller   3\n",
+       "1     Why do leaves change color in the fall?   1\n",
+       "2       How to improve your leadership skills   1\n",
+       "3                             sneakers amazon   3\n",
+       "4              Shop for photography equipment   3\n",
+       "...                                       ...  ..\n",
+       "1495                    Why do stars twinkle?   1\n",
+       "1496         Buy eco-friendly beauty products   0\n",
+       "1497                         Order makeup kit   3\n",
+       "1498                                   Lowe's   2\n",
+       "1499                Get photography equipment   3\n",
+       "\n",
+       "[1500 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df= df.sample(frac=1).reset_index(drop=True)\n",
+    "\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "/home/ubuntu/FineTunedDistilledBertAIChecker/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
       "  from .autonotebook import tqdm as notebook_tqdm\n"
      ]
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
+       "      <th>870</th>\n",
+       "      <td>Important space missions to the New Horizons s...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>947</th>\n",
+       "      <td>How to start a travel and adventure blog</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>477</th>\n",
+       "      <td>How to improve your critical thinking skills</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>174</th>\n",
+       "      <td>How to make homemade baby food</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1369</th>\n",
+       "      <td>Cheap sustainable clothing brands</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>396</th>\n",
+       "      <td>Exploring the mysteries of the deep ocean</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>206</th>\n",
+       "      <td>Discounted eco-friendly patio decor</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>191</th>\n",
+       "      <td>Cheap eco-friendly office products</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>533</th>\n",
+       "      <td>Affordable pet supplies</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1398</th>\n",
+       "      <td>Travel tips for Japan</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
       ],
       "text/plain": [
        "                                                   text  label\n",
+       "870   Important space missions to the New Horizons s...      1\n",
+       "947            How to start a travel and adventure blog      1\n",
+       "477        How to improve your critical thinking skills      1\n",
+       "174                      How to make homemade baby food      1\n",
+       "1369                  Cheap sustainable clothing brands      0\n",
+       "396           Exploring the mysteries of the deep ocean      1\n",
+       "206                 Discounted eco-friendly patio decor      0\n",
+       "191                  Cheap eco-friendly office products      0\n",
+       "533                             Affordable pet supplies      0\n",
+       "1398                              Travel tips for Japan      1"
       ]
      },
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "df.rename(columns={\n",
+    "    \"keyword\": \"text\", \n",
+    "    # \"metatitle\": \"text\", \n",
     "    \"id\": \"label\"\n",
     "}, \n",
     "          inplace=True\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "Dataset({\n",
+       "    features: ['text', 'label'],\n",
+       "    num_rows: 1500\n",
        "})"
       ]
      },
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
       "text/plain": [
        "DatasetDict({\n",
        "    train: Dataset({\n",
+       "        features: ['text', 'label'],\n",
+       "        num_rows: 1125\n",
        "    })\n",
        "    test: Dataset({\n",
+       "        features: ['text', 'label'],\n",
+       "        num_rows: 375\n",
        "    })\n",
        "})"
       ]
      },
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "new_data= dataset_df.train_test_split(test_size=0.25)\n",
     "new_data"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "Map: 100%|██████████| 1125/1125 [00:00<00:00, 31352.56 examples/s]\n",
+      "Map: 100%|██████████| 375/375 [00:00<00:00, 29503.00 examples/s]\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2023-11-04 12:46:03.199613: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2023-11-04 12:46:03.249373: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+      "2023-11-04 12:46:03.249409: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+      "2023-11-04 12:46:03.249439: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+      "2023-11-04 12:46:03.257947: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2023-11-04 12:46:04.345188: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
      ]
     }
     "\n",
     "model = AutoModelForSequenceClassification.from_pretrained(\n",
     "    # \"distilbert-base-uncased\", num_labels=5, id2label=id2label, label2id=label2id\n",
+    "    # \"distilbert-base-uncased\", num_labels=4, id2label=id2label, label2id=label2id # removed local\n",
+    "    \"bert-base-uncased\", num_labels=4, id2label=id2label, label2id=label2id # removed local\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
        "\n",
        "    <div>\n",
        "      \n",
+       "      <progress value='426' max='426' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [426/426 00:56, Epoch 6/6]\n",
        "    </div>\n",
        "    <table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <td>1</td>\n",
        "      <td>No log</td>\n",
+       "      <td>0.350181</td>\n",
+       "      <td>0.957333</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>2</td>\n",
        "      <td>No log</td>\n",
+       "      <td>0.107043</td>\n",
+       "      <td>0.973333</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>3</td>\n",
        "      <td>No log</td>\n",
+       "      <td>0.087978</td>\n",
+       "      <td>0.978667</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>4</td>\n",
        "      <td>No log</td>\n",
+       "      <td>0.085274</td>\n",
+       "      <td>0.973333</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>5</td>\n",
        "      <td>No log</td>\n",
+       "      <td>0.086987</td>\n",
+       "      <td>0.973333</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>6</td>\n",
+       "      <td>No log</td>\n",
+       "      <td>0.093197</td>\n",
+       "      <td>0.970667</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table><p>"
     {
      "data": {
       "text/plain": [
+       "TrainOutput(global_step=426, training_loss=0.1806535676051753, metrics={'train_runtime': 57.2339, 'train_samples_per_second': 117.937, 'train_steps_per_second': 7.443, 'total_flos': 44042600979624.0, 'train_loss': 0.1806535676051753, 'epoch': 6.0})"
       ]
      },
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "training_args = TrainingArguments(\n",
+    "    output_dir=\"intent_classification_model_without_metatitle_with_local23\",\n",
     "    learning_rate=2e-5,\n",
     "    per_device_train_batch_size=16,\n",
     "    per_device_eval_batch_size=16,\n",
+    "    num_train_epochs=6,\n",
     "    weight_decay=0.01,\n",
     "    evaluation_strategy=\"epoch\",\n",
     "    save_strategy=\"epoch\",\n",

research/14_keyword_intent.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

utils/__pycache__/client.cpython-310.pyc CHANGED Viewed

Binary files a/utils/__pycache__/client.cpython-310.pyc and b/utils/__pycache__/client.cpython-310.pyc differ

utils/client.py CHANGED Viewed

@@ -40,6 +40,28 @@ client = RestClient(data_for_seo_email, data_for_seo_password)
 # client = RestClient("deepankar@warewe.com", "cb1661e9ec7c1fba")
 def generate_seo_metatitle(keyword, num_query_results=10):
     post_data = dict()
     # You can set only one task at a time

 # client = RestClient("deepankar@warewe.com", "cb1661e9ec7c1fba")
+def generate_keyword_intent_list(list_of_keywords: list):
+    post_data = dict()
+    # simple way to set a task
+    post_data[len(post_data)] = dict(
+        keywords= list_of_keywords,
+        language_name="English"
+    )
+    # POST /v3/dataforseo_labs/google/search_intent/live
+    response = client.post("/v3/dataforseo_labs/google/search_intent/live", post_data)
+    # you can find the full list of the response codes here https://docs.dataforseo.com/v3/appendix/errors
+    if response["status_code"] == 20000:
+        # print(response)
+        return response["tasks"][0]["result"][0]["items"]
+        # do something with result
+    else:
+        print("error. Code: %d Message: %s" % (response["status_code"], response["status_message"]))
 def generate_seo_metatitle(keyword, num_query_results=10):
     post_data = dict()
     # You can set only one task at a time