Ubuntu committed on
Commit e77b318
1 Parent(s): d0702fa

added intent classification using DistilBERT

Files changed (29)
  1. data_intent/intent_data.csv +3 -0
  2. intent_classification_model/checkpoint-324/added_tokens.json +7 -0
  3. intent_classification_model/checkpoint-324/config.json +39 -0
  4. intent_classification_model/checkpoint-324/optimizer.pt +3 -0
  5. intent_classification_model/checkpoint-324/pytorch_model.bin +3 -0
  6. intent_classification_model/checkpoint-324/rng_state.pth +0 -0
  7. intent_classification_model/checkpoint-324/scheduler.pt +3 -0
  8. intent_classification_model/checkpoint-324/special_tokens_map.json +7 -0
  9. intent_classification_model/checkpoint-324/tokenizer.json +0 -0
  10. intent_classification_model/checkpoint-324/tokenizer_config.json +56 -0
  11. intent_classification_model/checkpoint-324/trainer_state.json +73 -0
  12. intent_classification_model/checkpoint-324/training_args.bin +3 -0
  13. intent_classification_model/checkpoint-324/vocab.txt +0 -0
  14. intent_classification_model/runs/Oct13_09-06-59_ip-172-31-95-165/events.out.tfevents.1697188019.ip-172-31-95-165.137562.0 +0 -0
  15. intent_classification_model/runs/Oct13_09-08-12_ip-172-31-95-165/events.out.tfevents.1697188092.ip-172-31-95-165.137562.1 +0 -0
  16. intent_classification_model/runs/Oct13_09-08-49_ip-172-31-95-165/events.out.tfevents.1697188130.ip-172-31-95-165.137562.2 +0 -0
  17. intent_classification_model/runs/Oct13_09-09-35_ip-172-31-95-165/events.out.tfevents.1697188176.ip-172-31-95-165.137562.3 +0 -0
  18. intent_classification_model/runs/Oct13_09-10-07_ip-172-31-95-165/events.out.tfevents.1697188208.ip-172-31-95-165.138160.0 +0 -0
  19. research/04_inference.ipynb +217 -0
  20. research/10_demo_test_data.ipynb +19 -10
  21. research/11_evaluation.html +0 -0
  22. research/11_evaluation.ipynb +290 -0
  23. research/11_intent_classification_using_distilbert.ipynb +898 -0
  24. utils/__pycache__/get_category.cpython-310.pyc +0 -0
  25. utils/__pycache__/get_intent.cpython-310.pyc +0 -0
  26. utils/__pycache__/get_sentence_status.cpython-310.pyc +0 -0
  27. utils/get_category.py +8 -4
  28. utils/get_intent.py +69 -0
  29. utils/get_sentence_status.py +48 -1
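
For orientation, a minimal usage sketch of the classifier added in this commit (hedged: it assumes the repository root as the working directory, the checkpoint committed here, and a CUDA device, since utils/get_intent.py moves the model to "cuda"; it mirrors the calls recorded in research/11_evaluation.ipynb):

# Sketch only — mirrors research/11_evaluation.ipynb from this commit.
# Assumes intent_classification_model/checkpoint-324 exists locally and a GPU is available.
from utils.get_intent import get_top_intent

scores = get_top_intent("best cat ear headphones")
# get_top_intent returns (label, sigmoid probability) pairs sorted by score, e.g.
# [('Commercial', 0.969), ('Transactional', 0.673), ('Informational', 0.237),
#  ('Navigational', 0.215), ('Local', 0.155)]
print(scores)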
data_intent/intent_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24091e2e977d444be178138ac717fa57b8d16534dcf5e66d4084cf3f77e6f6ce
+ size 39551
intent_classification_model/checkpoint-324/added_tokens.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "[CLS]": 101,
+ "[MASK]": 103,
+ "[PAD]": 0,
+ "[SEP]": 102,
+ "[UNK]": 100
+ }
intent_classification_model/checkpoint-324/config.json ADDED
@@ -0,0 +1,39 @@
+ {
+ "_name_or_path": "distilbert-base-uncased",
+ "activation": "gelu",
+ "architectures": [
+ "DistilBertForSequenceClassification"
+ ],
+ "attention_dropout": 0.1,
+ "dim": 768,
+ "dropout": 0.1,
+ "hidden_dim": 3072,
+ "id2label": {
+ "0": "Commercial",
+ "1": "Informational",
+ "2": "Navigational",
+ "3": "Local",
+ "4": "Transactional"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "Commercial": 0,
+ "Informational": 1,
+ "Local": 3,
+ "Navigational": 2,
+ "Transactional": 4
+ },
+ "max_position_embeddings": 512,
+ "model_type": "distilbert",
+ "n_heads": 12,
+ "n_layers": 6,
+ "pad_token_id": 0,
+ "problem_type": "single_label_classification",
+ "qa_dropout": 0.1,
+ "seq_classif_dropout": 0.2,
+ "sinusoidal_pos_embds": false,
+ "tie_weights_": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.34.0",
+ "vocab_size": 30522
+ }
intent_classification_model/checkpoint-324/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a50f88f7a9097ecddb2b3c7e3d38747deec4ca3a386132fac9e0e4efaa82ae0e
+ size 535745722
intent_classification_model/checkpoint-324/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b339df5c0d892e025a1749d085ab010e551f4b249eb497812a1a3bd7ebd5fd99
+ size 267865194
intent_classification_model/checkpoint-324/rng_state.pth ADDED
Binary file (14.2 kB).
 
intent_classification_model/checkpoint-324/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73f74582c189fe624f606122980ccb279125588a1db45b4052dc704fa2b51184
+ size 1064
intent_classification_model/checkpoint-324/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
intent_classification_model/checkpoint-324/tokenizer.json ADDED
The diff for this file is too large to render.
 
intent_classification_model/checkpoint-324/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [],
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_lower_case": true,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "DistilBertTokenizer",
+ "unk_token": "[UNK]"
+ }
intent_classification_model/checkpoint-324/trainer_state.json ADDED
@@ -0,0 +1,73 @@
+ {
+ "best_metric": 0.16397738456726074,
+ "best_model_checkpoint": "intent_classification_model/checkpoint-270",
+ "epoch": 6.0,
+ "eval_steps": 500,
+ "global_step": 324,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 1.0,
+ "eval_accuracy": 0.9488372093023256,
+ "eval_loss": 0.4676927328109741,
+ "eval_runtime": 0.1185,
+ "eval_samples_per_second": 1814.083,
+ "eval_steps_per_second": 118.126,
+ "step": 54
+ },
+ {
+ "epoch": 2.0,
+ "eval_accuracy": 0.9534883720930233,
+ "eval_loss": 0.20428764820098877,
+ "eval_runtime": 0.0972,
+ "eval_samples_per_second": 2210.83,
+ "eval_steps_per_second": 143.961,
+ "step": 108
+ },
+ {
+ "epoch": 3.0,
+ "eval_accuracy": 0.9674418604651163,
+ "eval_loss": 0.16401757299900055,
+ "eval_runtime": 0.1015,
+ "eval_samples_per_second": 2118.828,
+ "eval_steps_per_second": 137.97,
+ "step": 162
+ },
+ {
+ "epoch": 4.0,
+ "eval_accuracy": 0.9674418604651163,
+ "eval_loss": 0.16496841609477997,
+ "eval_runtime": 0.0941,
+ "eval_samples_per_second": 2284.398,
+ "eval_steps_per_second": 148.752,
+ "step": 216
+ },
+ {
+ "epoch": 5.0,
+ "eval_accuracy": 0.9674418604651163,
+ "eval_loss": 0.16397738456726074,
+ "eval_runtime": 0.0975,
+ "eval_samples_per_second": 2204.851,
+ "eval_steps_per_second": 143.572,
+ "step": 270
+ },
+ {
+ "epoch": 6.0,
+ "eval_accuracy": 0.9674418604651163,
+ "eval_loss": 0.16553252935409546,
+ "eval_runtime": 0.0947,
+ "eval_samples_per_second": 2271.063,
+ "eval_steps_per_second": 147.883,
+ "step": 324
+ }
+ ],
+ "logging_steps": 500,
+ "max_steps": 324,
+ "num_train_epochs": 6,
+ "save_steps": 500,
+ "total_flos": 13032177536640.0,
+ "trial_name": null,
+ "trial_params": null
+ }
intent_classification_model/checkpoint-324/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c27308f0087e544f12e1806abafb33d65745a5791fb1559d9e521f3670215df9
+ size 4536
intent_classification_model/checkpoint-324/vocab.txt ADDED
The diff for this file is too large to render.
 
intent_classification_model/runs/Oct13_09-06-59_ip-172-31-95-165/events.out.tfevents.1697188019.ip-172-31-95-165.137562.0 ADDED
Binary file (5.3 kB).
 
intent_classification_model/runs/Oct13_09-08-12_ip-172-31-95-165/events.out.tfevents.1697188092.ip-172-31-95-165.137562.1 ADDED
Binary file (6.02 kB).
 
intent_classification_model/runs/Oct13_09-08-49_ip-172-31-95-165/events.out.tfevents.1697188130.ip-172-31-95-165.137562.2 ADDED
Binary file (5.93 kB).
 
intent_classification_model/runs/Oct13_09-09-35_ip-172-31-95-165/events.out.tfevents.1697188176.ip-172-31-95-165.137562.3 ADDED
Binary file (4.73 kB).
 
intent_classification_model/runs/Oct13_09-10-07_ip-172-31-95-165/events.out.tfevents.1697188208.ip-172-31-95-165.138160.0 ADDED
Binary file (6.6 kB).
 
research/04_inference.ipynb CHANGED
@@ -673,6 +673,223 @@
673
  "There are a few reasons why language modeling people like perplexity instead of just using entropy. One is that, because of the exponent, improvements in perplexity \"feel\" like they are more substantial than the equivalent improvement in entropy. Another is that before they started using perplexity, the complexity of a language model was reported using a simplistic branching factor measurement that is more similar to perplexity than it is to entropy.''')"
674
  ]
675
  },
676
+ {
677
+ "cell_type": "code",
678
+ "execution_count": null,
679
+ "metadata": {},
680
+ "outputs": [],
681
+ "source": []
682
+ },
683
+ {
684
+ "cell_type": "code",
685
+ "execution_count": 1,
686
+ "metadata": {},
687
+ "outputs": [],
688
+ "source": [
689
+ "import os; os.chdir(\n",
690
+ " '..'\n",
691
+ ")"
692
+ ]
693
+ },
694
+ {
695
+ "cell_type": "code",
696
+ "execution_count": 2,
697
+ "metadata": {},
698
+ "outputs": [
699
+ {
700
+ "name": "stderr",
701
+ "output_type": "stream",
702
+ "text": [
703
+ "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
704
+ " from .autonotebook import tqdm as notebook_tqdm\n"
705
+ ]
706
+ },
707
+ {
708
+ "name": "stderr",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
712
+ ]
713
+ }
714
+ ],
715
+ "source": [
716
+ "from utils.get_sentence_status import get_top_labels"
717
+ ]
718
+ },
719
+ {
720
+ "cell_type": "code",
721
+ "execution_count": 4,
722
+ "metadata": {},
723
+ "outputs": [
724
+ {
725
+ "data": {
726
+ "text/plain": [
727
+ "[('Human Written', 0.999), ('AI written', 0.002)]"
728
+ ]
729
+ },
730
+ "execution_count": 4,
731
+ "metadata": {},
732
+ "output_type": "execute_result"
733
+ }
734
+ ],
735
+ "source": [
736
+ "get_top_labels('''12\n",
737
+ "\n",
738
+ "Yes, the perplexity is always equal to two to the power of the entropy. It doesn't matter what type of model you have, n-gram, unigram, or neural network.\n",
739
+ "\n",
740
+ "There are a few reasons why language modeling people like perplexity instead of just using entropy. One is that, because of the exponent, improvements in perplexity \"feel\" like they are more substantial than the equivalent improvement in entropy. Another is that before they started using perplexity, the complexity of a language model was reported using a simplistic branching factor measurement that is more similar to perplexity than it is to entropy.''')"
741
+ ]
742
+ },
743
+ {
744
+ "cell_type": "code",
745
+ "execution_count": 3,
746
+ "metadata": {},
747
+ "outputs": [
748
+ {
749
+ "data": {
750
+ "text/plain": [
751
+ "[('AI written', 1.0), ('Human Written', 0.0)]"
752
+ ]
753
+ },
754
+ "execution_count": 3,
755
+ "metadata": {},
756
+ "output_type": "execute_result"
757
+ }
758
+ ],
759
+ "source": [
760
+ "get_top_labels(\n",
761
+ " 'My name is deepankar'\n",
762
+ ")"
763
+ ]
764
+ },
765
+ {
766
+ "cell_type": "code",
767
+ "execution_count": 6,
768
+ "metadata": {},
769
+ "outputs": [
770
+ {
771
+ "data": {
772
+ "text/plain": [
773
+ "[('AI written', 0.999), ('Human Written', 0.001)]"
774
+ ]
775
+ },
776
+ "execution_count": 6,
777
+ "metadata": {},
778
+ "output_type": "execute_result"
779
+ }
780
+ ],
781
+ "source": [
782
+ "get_top_labels(\n",
783
+ " '''Hate speech or discriminatory content: Hate speech is speech, conduct, writing, or expressions that discriminate or promote discrimination against individuals or groups based on attributes such as race, religion, nationality, gender, sexual orientation, disability, or other characteristics. It often includes offensive language, stereotypes, or harmful stereotypes and can contribute to a hostile or unsafe environment for affected individuals.\n",
784
+ "\n",
785
+ "Explicit or adult content: Explicit or adult content typically refers to material that is sexually explicit, pornographic, or contains graphic depictions of sexual acts. This content may not be suitable for all audiences and is subject to age restrictions and content regulations in many jurisdictions.'''\n",
786
+ ")"
787
+ ]
788
+ },
789
+ {
790
+ "cell_type": "code",
791
+ "execution_count": 8,
792
+ "metadata": {},
793
+ "outputs": [
794
+ {
795
+ "data": {
796
+ "text/plain": [
797
+ "[('AI written', 0.912), ('Human Written', 0.115)]"
798
+ ]
799
+ },
800
+ "execution_count": 8,
801
+ "metadata": {},
802
+ "output_type": "execute_result"
803
+ }
804
+ ],
805
+ "source": [
806
+ "get_top_labels(\n",
807
+ " '''Of course, I can provide a more detailed explanation of these topics:\n",
808
+ "\n",
809
+ "1. **Hate speech or discriminatory content:** Hate speech is speech, conduct, writing, or expressions that discriminate or promote discrimination against individuals or groups based on attributes such as race, religion, nationality, gender, sexual orientation, disability, or other characteristics. It often includes offensive language, stereotypes, or harmful stereotypes and can contribute to a hostile or unsafe environment for affected individuals.\n",
810
+ "\n",
811
+ "2. **Explicit or adult content:** Explicit or adult content typically refers to material that is sexually explicit, pornographic, or contains graphic depictions of sexual acts. This content may not be suitable for all audiences and is subject to age restrictions and content regulations in many jurisdictions.\n",
812
+ "\n",
813
+ "9. **Inflammatory or extremist viewpoints:** Inflammatory viewpoints are those that are deliberately provocative, offensive, or designed to incite anger or outrage. Extreme or extremist viewpoints often involve radical ideologies and can contribute to division and hostility in discussions. Engaging in conversations that promote understanding and open dialogue is generally more constructive.\n",
814
+ "\n",
815
+ "In summary, these topics can be divisive, offensive, and harmful. When discussing or encountering them, it's essential to approach with respect, empathy, and a focus on maintaining a positive and safe environment for everyone involved.'''\n",
816
+ ")"
817
+ ]
818
+ },
819
+ {
820
+ "cell_type": "code",
821
+ "execution_count": 9,
822
+ "metadata": {},
823
+ "outputs": [
824
+ {
825
+ "data": {
826
+ "text/plain": [
827
+ "[('AI written', 0.998), ('Human Written', 0.003)]"
828
+ ]
829
+ },
830
+ "execution_count": 9,
831
+ "metadata": {},
832
+ "output_type": "execute_result"
833
+ }
834
+ ],
835
+ "source": [
836
+ "get_top_labels(\n",
837
+ " '''The situation in Israel remains tense. More than 1,200 people have been killed so far in the terror attacks by Hamas groups, both by their infiltration and rockets. The southern part of Israel which shares borders with the Gaza Strip still remains vulnerable. \n",
838
+ "\n",
839
+ "Ashkelon, one of the biggest cities in South Israel, has become a ghost town. Life is no longer normal here. Post noon till midnight there are a number of siren alarms, creating a constant atmosphere of panic ever since rockets were pounded into the city.'''\n",
840
+ ")"
841
+ ]
842
+ },
843
+ {
844
+ "cell_type": "code",
845
+ "execution_count": 10,
846
+ "metadata": {},
847
+ "outputs": [
848
+ {
849
+ "data": {
850
+ "text/plain": [
851
+ "[('AI written', 1.0), ('Human Written', 0.0)]"
852
+ ]
853
+ },
854
+ "execution_count": 10,
855
+ "metadata": {},
856
+ "output_type": "execute_result"
857
+ }
858
+ ],
859
+ "source": [
860
+ "get_top_labels(\n",
861
+ " '''Optical illusions are fascinating pictures that trick our eyes, making us doubt what's real. They come in different types and can make us question what we see, think, and understand about the world. Even scientists sometimes struggle to figure out these puzzling illusions.\n",
862
+ "\n",
863
+ "These illusions have various purposes. They challenge our minds, testing our thinking abilities. But they also provide a special way to delve into our personalities, revealing hidden aspects of who we are.\n",
864
+ "\n",
865
+ "The task is simple: look at the image and note what you see first. Your initial observation can unveil your deepest insecurity. Most people see either a ditch surrounded by trees or an eye.'''\n",
866
+ ")"
867
+ ]
868
+ },
869
+ {
870
+ "cell_type": "code",
871
+ "execution_count": 11,
872
+ "metadata": {},
873
+ "outputs": [
874
+ {
875
+ "data": {
876
+ "text/plain": [
877
+ "[('Human Written', 0.941), ('AI written', 0.056)]"
878
+ ]
879
+ },
880
+ "execution_count": 11,
881
+ "metadata": {},
882
+ "output_type": "execute_result"
883
+ }
884
+ ],
885
+ "source": [
886
+ "get_top_labels(\n",
887
+ " '''Learn from IIT Faculty & Industry Experts with Guaranteed Job Interviews.\n",
888
+ "Campus Immersion at IIT Roorkee.\n",
889
+ "Master machine learning and artificial intelligence skills with this advanced data science and artificial intelligence course from iHub IIT Roorkee. Learn from IIT faculty and industry experts with 1:1 mentorship in this intensive online bootcamp. Top 2 performers from each batch may get a fellowship worth Rs. 80,000, plus the opportunity to showcase their startup ideas and secure incubation support of upto Rs. 50 Lakhs for their startup from iHUB DivyaSampark, IIT Roorkee.'''\n",
890
+ ")"
891
+ ]
892
+ },
893
  {
894
  "cell_type": "code",
895
  "execution_count": null,
research/10_demo_test_data.ipynb CHANGED
@@ -768,7 +768,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -785,7 +785,13 @@
 "output_type": "stream",
 "text": [
 "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
 "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
 ]
 }
@@ -802,7 +808,10 @@
 {
 "data": {
 "text/plain": [
- "[('Food_and_Drink', 0.99), ('Computers_and_Electronics', 0.973)]"
+ "[('Food_and_Drink', 0.989),\n",
+ " ('Computers_and_Electronics', 0.973),\n",
+ " ('Games', 0.172),\n",
+ " ('Shopping', 0.134)]"
 ]
 },
 "execution_count": 3,
@@ -818,38 +827,38 @@
 },
 {
 "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 4,
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
- "[('Pets_and_Animals', 0.583)]"
+ "[('Computers_and_Electronics', 0.999), ('Shopping', 0.993)]"
 ]
 },
- "execution_count": 6,
+ "execution_count": 4,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
 "get_top_labels(\n",
- " 'turtle beach shaped headset guide'\n",
+ " 'amazon mindkoo headsets with discount'\n",
 ")"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 17,
+ "execution_count": 5,
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
- "[('Home_and_Garden', 1.0)]"
+ "[('Home_and_Garden', 0.999), ('Computers_and_Electronics', 0.243)]"
 ]
 },
- "execution_count": 17,
+ "execution_count": 5,
 "metadata": {},
 "output_type": "execute_result"
 }
research/11_evaluation.html ADDED
The diff for this file is too large to render.
 
research/11_evaluation.ipynb ADDED
@@ -0,0 +1,290 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os; os.chdir('..')"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from utils.get_intent import get_top_intent"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/plain": [
29
+ "[('Commercial', 0.969),\n",
30
+ " ('Transactional', 0.673),\n",
31
+ " ('Informational', 0.237),\n",
32
+ " ('Navigational', 0.215),\n",
33
+ " ('Local', 0.155)]"
34
+ ]
35
+ },
36
+ "execution_count": 3,
37
+ "metadata": {},
38
+ "output_type": "execute_result"
39
+ }
40
+ ],
41
+ "source": [
42
+ "get_top_intent(\"best cat ear headphones\")"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 4,
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "data": {
52
+ "text/plain": [
53
+ "[('Transactional', 0.987),\n",
54
+ " ('Navigational', 0.317),\n",
55
+ " ('Commercial', 0.27),\n",
56
+ " ('Informational', 0.249),\n",
57
+ " ('Local', 0.229)]"
58
+ ]
59
+ },
60
+ "execution_count": 4,
61
+ "metadata": {},
62
+ "output_type": "execute_result"
63
+ }
64
+ ],
65
+ "source": [
66
+ "get_top_intent(\"buy cat ear headphones\")"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 5,
72
+ "metadata": {},
73
+ "outputs": [
74
+ {
75
+ "data": {
76
+ "text/plain": [
77
+ "[('Informational', 0.984),\n",
78
+ " ('Local', 0.244),\n",
79
+ " ('Commercial', 0.237),\n",
80
+ " ('Transactional', 0.212),\n",
81
+ " ('Navigational', 0.194)]"
82
+ ]
83
+ },
84
+ "execution_count": 5,
85
+ "metadata": {},
86
+ "output_type": "execute_result"
87
+ }
88
+ ],
89
+ "source": [
90
+ "get_top_intent(\"how to create a facebook account\")"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 6,
96
+ "metadata": {},
97
+ "outputs": [
98
+ {
99
+ "data": {
100
+ "text/plain": [
101
+ "[('Local', 0.988),\n",
102
+ " ('Informational', 0.3),\n",
103
+ " ('Commercial', 0.278),\n",
104
+ " ('Navigational', 0.273),\n",
105
+ " ('Transactional', 0.234)]"
106
+ ]
107
+ },
108
+ "execution_count": 6,
109
+ "metadata": {},
110
+ "output_type": "execute_result"
111
+ }
112
+ ],
113
+ "source": [
114
+ "get_top_intent(\"barber shops in USA\")"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 7,
120
+ "metadata": {},
121
+ "outputs": [
122
+ {
123
+ "data": {
124
+ "text/plain": [
125
+ "[('Informational', 0.763),\n",
126
+ " ('Navigational', 0.638),\n",
127
+ " ('Transactional', 0.433),\n",
128
+ " ('Commercial', 0.286),\n",
129
+ " ('Local', 0.236)]"
130
+ ]
131
+ },
132
+ "execution_count": 7,
133
+ "metadata": {},
134
+ "output_type": "execute_result"
135
+ }
136
+ ],
137
+ "source": [
138
+ "get_top_intent(\"Razer Kraken Headsets\")"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 8,
144
+ "metadata": {},
145
+ "outputs": [
146
+ {
147
+ "data": {
148
+ "text/plain": [
149
+ "[('Navigational', 0.861),\n",
150
+ " ('Transactional', 0.725),\n",
151
+ " ('Local', 0.422),\n",
152
+ " ('Commercial', 0.287),\n",
153
+ " ('Informational', 0.202)]"
154
+ ]
155
+ },
156
+ "execution_count": 8,
157
+ "metadata": {},
158
+ "output_type": "execute_result"
159
+ }
160
+ ],
161
+ "source": [
162
+ "get_top_intent(\"Amazon Great indian festival\")"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 9,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "data": {
172
+ "text/plain": [
173
+ "[('Navigational', 0.983),\n",
174
+ " ('Transactional', 0.27),\n",
175
+ " ('Local', 0.23),\n",
176
+ " ('Informational', 0.209),\n",
177
+ " ('Commercial', 0.192)]"
178
+ ]
179
+ },
180
+ "execution_count": 9,
181
+ "metadata": {},
182
+ "output_type": "execute_result"
183
+ }
184
+ ],
185
+ "source": [
186
+ "get_top_intent(\"facebook\")"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 10,
192
+ "metadata": {},
193
+ "outputs": [
194
+ {
195
+ "data": {
196
+ "text/plain": [
197
+ "[('Navigational', 0.983),\n",
198
+ " ('Transactional', 0.256),\n",
199
+ " ('Informational', 0.241),\n",
200
+ " ('Local', 0.214),\n",
201
+ " ('Commercial', 0.184)]"
202
+ ]
203
+ },
204
+ "execution_count": 10,
205
+ "metadata": {},
206
+ "output_type": "execute_result"
207
+ }
208
+ ],
209
+ "source": [
210
+ "get_top_intent(\"spotify\")"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": 11,
216
+ "metadata": {},
217
+ "outputs": [
218
+ {
219
+ "data": {
220
+ "text/plain": [
221
+ "[('Local', 0.988),\n",
222
+ " ('Informational', 0.294),\n",
223
+ " ('Navigational', 0.284),\n",
224
+ " ('Commercial', 0.252),\n",
225
+ " ('Transactional', 0.235)]"
226
+ ]
227
+ },
228
+ "execution_count": 11,
229
+ "metadata": {},
230
+ "output_type": "execute_result"
231
+ }
232
+ ],
233
+ "source": [
234
+ "get_top_intent(\"parlours in dubai\")"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 12,
240
+ "metadata": {},
241
+ "outputs": [
242
+ {
243
+ "data": {
244
+ "text/plain": [
245
+ "[('Informational', 0.984),\n",
246
+ " ('Local', 0.245),\n",
247
+ " ('Commercial', 0.242),\n",
248
+ " ('Transactional', 0.226),\n",
249
+ " ('Navigational', 0.189)]"
250
+ ]
251
+ },
252
+ "execution_count": 12,
253
+ "metadata": {},
254
+ "output_type": "execute_result"
255
+ }
256
+ ],
257
+ "source": [
258
+ "get_top_intent(\"how to wear headphones\")"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": null,
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": []
267
+ }
268
+ ],
269
+ "metadata": {
270
+ "kernelspec": {
271
+ "display_name": "venv",
272
+ "language": "python",
273
+ "name": "python3"
274
+ },
275
+ "language_info": {
276
+ "codemirror_mode": {
277
+ "name": "ipython",
278
+ "version": 3
279
+ },
280
+ "file_extension": ".py",
281
+ "mimetype": "text/x-python",
282
+ "name": "python",
283
+ "nbconvert_exporter": "python",
284
+ "pygments_lexer": "ipython3",
285
+ "version": "3.10.12"
286
+ }
287
+ },
288
+ "nbformat": 4,
289
+ "nbformat_minor": 2
290
+ }
research/11_intent_classification_using_distilbert.ipynb ADDED
@@ -0,0 +1,898 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os; os.chdir('..')"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pandas as pd"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/html": [
29
+ "<div>\n",
30
+ "<style scoped>\n",
31
+ " .dataframe tbody tr th:only-of-type {\n",
32
+ " vertical-align: middle;\n",
33
+ " }\n",
34
+ "\n",
35
+ " .dataframe tbody tr th {\n",
36
+ " vertical-align: top;\n",
37
+ " }\n",
38
+ "\n",
39
+ " .dataframe thead th {\n",
40
+ " text-align: right;\n",
41
+ " }\n",
42
+ "</style>\n",
43
+ "<table border=\"1\" class=\"dataframe\">\n",
44
+ " <thead>\n",
45
+ " <tr style=\"text-align: right;\">\n",
46
+ " <th></th>\n",
47
+ " <th>keyword</th>\n",
48
+ " <th>intent</th>\n",
49
+ " </tr>\n",
50
+ " </thead>\n",
51
+ " <tbody>\n",
52
+ " <tr>\n",
53
+ " <th>0</th>\n",
54
+ " <td>citalopram vs prozac</td>\n",
55
+ " <td>Commercial</td>\n",
56
+ " </tr>\n",
57
+ " <tr>\n",
58
+ " <th>1</th>\n",
59
+ " <td>who is the oldest football player</td>\n",
60
+ " <td>Informational</td>\n",
61
+ " </tr>\n",
62
+ " <tr>\n",
63
+ " <th>2</th>\n",
64
+ " <td>t mobile town east</td>\n",
65
+ " <td>Navigational</td>\n",
66
+ " </tr>\n",
67
+ " <tr>\n",
68
+ " <th>3</th>\n",
69
+ " <td>starbucks</td>\n",
70
+ " <td>Navigational</td>\n",
71
+ " </tr>\n",
72
+ " <tr>\n",
73
+ " <th>4</th>\n",
74
+ " <td>tech crunch</td>\n",
75
+ " <td>Navigational</td>\n",
76
+ " </tr>\n",
77
+ " </tbody>\n",
78
+ "</table>\n",
79
+ "</div>"
80
+ ],
81
+ "text/plain": [
82
+ " keyword intent\n",
83
+ "0 citalopram vs prozac Commercial\n",
84
+ "1 who is the oldest football player Informational\n",
85
+ "2 t mobile town east Navigational\n",
86
+ "3 starbucks Navigational\n",
87
+ "4 tech crunch Navigational"
88
+ ]
89
+ },
90
+ "execution_count": 3,
91
+ "metadata": {},
92
+ "output_type": "execute_result"
93
+ }
94
+ ],
95
+ "source": [
96
+ "original_df= pd.read_csv(\"data_intent/intent_data.csv\")\n",
97
+ "original_df.head()"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 4,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "intents= original_df.intent.unique().tolist()"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 5,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "id2label= {}\n",
116
+ "label2id= {}\n",
117
+ "for i in range(len(intents)):\n",
118
+ " id2label[i]= intents[i]\n",
119
+ " label2id[intents[i]]= i"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 6,
125
+ "metadata": {},
126
+ "outputs": [
127
+ {
128
+ "data": {
129
+ "text/plain": [
130
+ "{0: 'Commercial',\n",
131
+ " 1: 'Informational',\n",
132
+ " 2: 'Navigational',\n",
133
+ " 3: 'Local',\n",
134
+ " 4: 'Transactional'}"
135
+ ]
136
+ },
137
+ "execution_count": 6,
138
+ "metadata": {},
139
+ "output_type": "execute_result"
140
+ }
141
+ ],
142
+ "source": [
143
+ "id2label"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 7,
149
+ "metadata": {},
150
+ "outputs": [
151
+ {
152
+ "data": {
153
+ "text/plain": [
154
+ "{'Commercial': 0,\n",
155
+ " 'Informational': 1,\n",
156
+ " 'Navigational': 2,\n",
157
+ " 'Local': 3,\n",
158
+ " 'Transactional': 4}"
159
+ ]
160
+ },
161
+ "execution_count": 7,
162
+ "metadata": {},
163
+ "output_type": "execute_result"
164
+ }
165
+ ],
166
+ "source": [
167
+ "label2id"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 8,
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "def make_label2id(label):\n",
177
+ " return label2id[label]"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 9,
183
+ "metadata": {},
184
+ "outputs": [
185
+ {
186
+ "data": {
187
+ "text/html": [
188
+ "<div>\n",
189
+ "<style scoped>\n",
190
+ " .dataframe tbody tr th:only-of-type {\n",
191
+ " vertical-align: middle;\n",
192
+ " }\n",
193
+ "\n",
194
+ " .dataframe tbody tr th {\n",
195
+ " vertical-align: top;\n",
196
+ " }\n",
197
+ "\n",
198
+ " .dataframe thead th {\n",
199
+ " text-align: right;\n",
200
+ " }\n",
201
+ "</style>\n",
202
+ "<table border=\"1\" class=\"dataframe\">\n",
203
+ " <thead>\n",
204
+ " <tr style=\"text-align: right;\">\n",
205
+ " <th></th>\n",
206
+ " <th>keyword</th>\n",
207
+ " <th>intent</th>\n",
208
+ " <th>id</th>\n",
209
+ " </tr>\n",
210
+ " </thead>\n",
211
+ " <tbody>\n",
212
+ " <tr>\n",
213
+ " <th>0</th>\n",
214
+ " <td>citalopram vs prozac</td>\n",
215
+ " <td>Commercial</td>\n",
216
+ " <td>0</td>\n",
217
+ " </tr>\n",
218
+ " <tr>\n",
219
+ " <th>1</th>\n",
220
+ " <td>who is the oldest football player</td>\n",
221
+ " <td>Informational</td>\n",
222
+ " <td>1</td>\n",
223
+ " </tr>\n",
224
+ " <tr>\n",
225
+ " <th>2</th>\n",
226
+ " <td>t mobile town east</td>\n",
227
+ " <td>Navigational</td>\n",
228
+ " <td>2</td>\n",
229
+ " </tr>\n",
230
+ " <tr>\n",
231
+ " <th>3</th>\n",
232
+ " <td>starbucks</td>\n",
233
+ " <td>Navigational</td>\n",
234
+ " <td>2</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>tech crunch</td>\n",
239
+ " <td>Navigational</td>\n",
240
+ " <td>2</td>\n",
241
+ " </tr>\n",
242
+ " <tr>\n",
243
+ " <th>...</th>\n",
244
+ " <td>...</td>\n",
245
+ " <td>...</td>\n",
246
+ " <td>...</td>\n",
247
+ " </tr>\n",
248
+ " <tr>\n",
249
+ " <th>1066</th>\n",
250
+ " <td>How to make a paper flower?</td>\n",
251
+ " <td>Informational</td>\n",
252
+ " <td>1</td>\n",
253
+ " </tr>\n",
254
+ " <tr>\n",
255
+ " <th>1067</th>\n",
256
+ " <td>Why do some animals camouflage?</td>\n",
257
+ " <td>Informational</td>\n",
258
+ " <td>1</td>\n",
259
+ " </tr>\n",
260
+ " <tr>\n",
261
+ " <th>1068</th>\n",
262
+ " <td>What is the history of ancient civilizations?</td>\n",
263
+ " <td>Informational</td>\n",
264
+ " <td>1</td>\n",
265
+ " </tr>\n",
266
+ " <tr>\n",
267
+ " <th>1069</th>\n",
268
+ " <td>How to make a simple machine?</td>\n",
269
+ " <td>Informational</td>\n",
270
+ " <td>1</td>\n",
271
+ " </tr>\n",
272
+ " <tr>\n",
273
+ " <th>1070</th>\n",
274
+ " <td>Why do we see the phases of the moon?</td>\n",
275
+ " <td>Informational</td>\n",
276
+ " <td>1</td>\n",
277
+ " </tr>\n",
278
+ " </tbody>\n",
279
+ "</table>\n",
280
+ "<p>1071 rows × 3 columns</p>\n",
281
+ "</div>"
282
+ ],
283
+ "text/plain": [
284
+ " keyword intent id\n",
285
+ "0 citalopram vs prozac Commercial 0\n",
286
+ "1 who is the oldest football player Informational 1\n",
287
+ "2 t mobile town east Navigational 2\n",
288
+ "3 starbucks Navigational 2\n",
289
+ "4 tech crunch Navigational 2\n",
290
+ "... ... ... ..\n",
291
+ "1066 How to make a paper flower? Informational 1\n",
292
+ "1067 Why do some animals camouflage? Informational 1\n",
293
+ "1068 What is the history of ancient civilizations? Informational 1\n",
294
+ "1069 How to make a simple machine? Informational 1\n",
295
+ "1070 Why do we see the phases of the moon? Informational 1\n",
296
+ "\n",
297
+ "[1071 rows x 3 columns]"
298
+ ]
299
+ },
300
+ "execution_count": 9,
301
+ "metadata": {},
302
+ "output_type": "execute_result"
303
+ }
304
+ ],
305
+ "source": [
306
+ "original_df['id']= original_df.intent.map(make_label2id)\n",
307
+ "original_df"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 10,
313
+ "metadata": {},
314
+ "outputs": [
315
+ {
316
+ "data": {
317
+ "text/html": [
318
+ "<div>\n",
319
+ "<style scoped>\n",
320
+ " .dataframe tbody tr th:only-of-type {\n",
321
+ " vertical-align: middle;\n",
322
+ " }\n",
323
+ "\n",
324
+ " .dataframe tbody tr th {\n",
325
+ " vertical-align: top;\n",
326
+ " }\n",
327
+ "\n",
328
+ " .dataframe thead th {\n",
329
+ " text-align: right;\n",
330
+ " }\n",
331
+ "</style>\n",
332
+ "<table border=\"1\" class=\"dataframe\">\n",
333
+ " <thead>\n",
334
+ " <tr style=\"text-align: right;\">\n",
335
+ " <th></th>\n",
336
+ " <th>keyword</th>\n",
337
+ " <th>id</th>\n",
338
+ " </tr>\n",
339
+ " </thead>\n",
340
+ " <tbody>\n",
341
+ " <tr>\n",
342
+ " <th>0</th>\n",
343
+ " <td>citalopram vs prozac</td>\n",
344
+ " <td>0</td>\n",
345
+ " </tr>\n",
346
+ " <tr>\n",
347
+ " <th>1</th>\n",
348
+ " <td>who is the oldest football player</td>\n",
349
+ " <td>1</td>\n",
350
+ " </tr>\n",
351
+ " <tr>\n",
352
+ " <th>2</th>\n",
353
+ " <td>t mobile town east</td>\n",
354
+ " <td>2</td>\n",
355
+ " </tr>\n",
356
+ " <tr>\n",
357
+ " <th>3</th>\n",
358
+ " <td>starbucks</td>\n",
359
+ " <td>2</td>\n",
360
+ " </tr>\n",
361
+ " <tr>\n",
362
+ " <th>4</th>\n",
363
+ " <td>tech crunch</td>\n",
364
+ " <td>2</td>\n",
365
+ " </tr>\n",
366
+ " <tr>\n",
367
+ " <th>...</th>\n",
368
+ " <td>...</td>\n",
369
+ " <td>...</td>\n",
370
+ " </tr>\n",
371
+ " <tr>\n",
372
+ " <th>1066</th>\n",
373
+ " <td>How to make a paper flower?</td>\n",
374
+ " <td>1</td>\n",
375
+ " </tr>\n",
376
+ " <tr>\n",
377
+ " <th>1067</th>\n",
378
+ " <td>Why do some animals camouflage?</td>\n",
379
+ " <td>1</td>\n",
380
+ " </tr>\n",
381
+ " <tr>\n",
382
+ " <th>1068</th>\n",
383
+ " <td>What is the history of ancient civilizations?</td>\n",
384
+ " <td>1</td>\n",
385
+ " </tr>\n",
386
+ " <tr>\n",
387
+ " <th>1069</th>\n",
388
+ " <td>How to make a simple machine?</td>\n",
389
+ " <td>1</td>\n",
390
+ " </tr>\n",
391
+ " <tr>\n",
392
+ " <th>1070</th>\n",
393
+ " <td>Why do we see the phases of the moon?</td>\n",
394
+ " <td>1</td>\n",
395
+ " </tr>\n",
396
+ " </tbody>\n",
397
+ "</table>\n",
398
+ "<p>1071 rows × 2 columns</p>\n",
399
+ "</div>"
400
+ ],
401
+ "text/plain": [
402
+ " keyword id\n",
403
+ "0 citalopram vs prozac 0\n",
404
+ "1 who is the oldest football player 1\n",
405
+ "2 t mobile town east 2\n",
406
+ "3 starbucks 2\n",
407
+ "4 tech crunch 2\n",
408
+ "... ... ..\n",
409
+ "1066 How to make a paper flower? 1\n",
410
+ "1067 Why do some animals camouflage? 1\n",
411
+ "1068 What is the history of ancient civilizations? 1\n",
412
+ "1069 How to make a simple machine? 1\n",
413
+ "1070 Why do we see the phases of the moon? 1\n",
414
+ "\n",
415
+ "[1071 rows x 2 columns]"
416
+ ]
417
+ },
418
+ "execution_count": 10,
419
+ "metadata": {},
420
+ "output_type": "execute_result"
421
+ }
422
+ ],
423
+ "source": [
424
+ "df= original_df[['keyword', 'id']]\n",
425
+ "df"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 11,
431
+ "metadata": {},
432
+ "outputs": [
433
+ {
434
+ "name": "stderr",
435
+ "output_type": "stream",
436
+ "text": [
437
+ "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
438
+ " from .autonotebook import tqdm as notebook_tqdm\n"
439
+ ]
440
+ }
441
+ ],
442
+ "source": [
443
+ "from datasets import Dataset, load_dataset\n"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 12,
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "name": "stderr",
453
+ "output_type": "stream",
454
+ "text": [
455
+ "/tmp/ipykernel_138160/1635098052.py:1: SettingWithCopyWarning: \n",
456
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
457
+ "\n",
458
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
459
+ " df.rename(columns={\n"
460
+ ]
461
+ },
462
+ {
463
+ "data": {
464
+ "text/html": [
465
+ "<div>\n",
466
+ "<style scoped>\n",
467
+ " .dataframe tbody tr th:only-of-type {\n",
468
+ " vertical-align: middle;\n",
469
+ " }\n",
470
+ "\n",
471
+ " .dataframe tbody tr th {\n",
472
+ " vertical-align: top;\n",
473
+ " }\n",
474
+ "\n",
475
+ " .dataframe thead th {\n",
476
+ " text-align: right;\n",
477
+ " }\n",
478
+ "</style>\n",
479
+ "<table border=\"1\" class=\"dataframe\">\n",
480
+ " <thead>\n",
481
+ " <tr style=\"text-align: right;\">\n",
482
+ " <th></th>\n",
483
+ " <th>text</th>\n",
484
+ " <th>label</th>\n",
485
+ " </tr>\n",
486
+ " </thead>\n",
487
+ " <tbody>\n",
488
+ " <tr>\n",
489
+ " <th>706</th>\n",
490
+ " <td>Purchase DJ equipment</td>\n",
491
+ " <td>4</td>\n",
492
+ " </tr>\n",
493
+ " <tr>\n",
494
+ " <th>24</th>\n",
495
+ " <td>best headphones quora</td>\n",
496
+ " <td>2</td>\n",
497
+ " </tr>\n",
498
+ " <tr>\n",
499
+ " <th>727</th>\n",
500
+ " <td>Purchase fitness tracker</td>\n",
501
+ " <td>4</td>\n",
502
+ " </tr>\n",
503
+ " <tr>\n",
504
+ " <th>17</th>\n",
505
+ " <td>facebook</td>\n",
506
+ " <td>2</td>\n",
507
+ " </tr>\n",
508
+ " <tr>\n",
509
+ " <th>808</th>\n",
510
+ " <td>Outdoor activities in Lake Tahoe</td>\n",
511
+ " <td>3</td>\n",
512
+ " </tr>\n",
513
+ " <tr>\n",
514
+ " <th>946</th>\n",
515
+ " <td>Wine bars in Napa Valley</td>\n",
516
+ " <td>3</td>\n",
517
+ " </tr>\n",
518
+ " <tr>\n",
519
+ " <th>944</th>\n",
520
+ " <td>Art installations in Chicago</td>\n",
521
+ " <td>3</td>\n",
522
+ " </tr>\n",
523
+ " <tr>\n",
524
+ " <th>899</th>\n",
525
+ " <td>Snowboarding parks in Utah</td>\n",
526
+ " <td>3</td>\n",
527
+ " </tr>\n",
528
+ " <tr>\n",
529
+ " <th>36</th>\n",
530
+ " <td>Mission Immpossible</td>\n",
531
+ " <td>1</td>\n",
532
+ " </tr>\n",
533
+ " <tr>\n",
534
+ " <th>129</th>\n",
535
+ " <td>Instagram</td>\n",
536
+ " <td>2</td>\n",
537
+ " </tr>\n",
538
+ " </tbody>\n",
539
+ "</table>\n",
540
+ "</div>"
541
+ ],
542
+ "text/plain": [
543
+ " text label\n",
544
+ "706 Purchase DJ equipment 4\n",
545
+ "24 best headphones quora 2\n",
546
+ "727 Purchase fitness tracker 4\n",
547
+ "17 facebook 2\n",
548
+ "808 Outdoor activities in Lake Tahoe 3\n",
549
+ "946 Wine bars in Napa Valley 3\n",
550
+ "944 Art installations in Chicago 3\n",
551
+ "899 Snowboarding parks in Utah 3\n",
552
+ "36 Mission Immpossible 1\n",
553
+ "129 Instagram 2"
554
+ ]
555
+ },
556
+ "execution_count": 12,
557
+ "metadata": {},
558
+ "output_type": "execute_result"
559
+ }
560
+ ],
561
+ "source": [
562
+ "df.rename(columns={\n",
563
+ " \"keyword\": \"text\", \n",
564
+ " \"id\": \"label\"\n",
565
+ "}, \n",
566
+ " inplace=True\n",
567
+ ")\n",
568
+ "\n",
569
+ "df.sample(10)"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 13,
575
+ "metadata": {},
576
+ "outputs": [
577
+ {
578
+ "name": "stderr",
579
+ "output_type": "stream",
580
+ "text": [
581
+ "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:373: FutureWarning: is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.\n",
582
+ " if _pandas_api.is_sparse(col):\n"
583
+ ]
584
+ },
585
+ {
586
+ "data": {
587
+ "text/plain": [
588
+ "Dataset({\n",
589
+ " features: ['text', 'label'],\n",
590
+ " num_rows: 1071\n",
591
+ "})"
592
+ ]
593
+ },
594
+ "execution_count": 13,
595
+ "metadata": {},
596
+ "output_type": "execute_result"
597
+ }
598
+ ],
599
+ "source": [
600
+ "dataset_df= Dataset.from_pandas(df)\n",
601
+ "dataset_df"
602
+ ]
603
+ },
604
+ {
605
+ "cell_type": "code",
606
+ "execution_count": 14,
607
+ "metadata": {},
608
+ "outputs": [
609
+ {
610
+ "data": {
611
+ "text/plain": [
612
+ "DatasetDict({\n",
613
+ " train: Dataset({\n",
614
+ " features: ['text', 'label'],\n",
615
+ " num_rows: 856\n",
616
+ " })\n",
617
+ " test: Dataset({\n",
618
+ " features: ['text', 'label'],\n",
619
+ " num_rows: 215\n",
620
+ " })\n",
621
+ "})"
622
+ ]
623
+ },
624
+ "execution_count": 14,
625
+ "metadata": {},
626
+ "output_type": "execute_result"
627
+ }
628
+ ],
629
+ "source": [
630
+ "new_data= dataset_df.train_test_split(test_size=0.2)\n",
631
+ "new_data"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": 15,
637
+ "metadata": {},
638
+ "outputs": [],
639
+ "source": [
640
+ "from transformers import AutoTokenizer\n",
641
+ "\n",
642
+ "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": 16,
648
+ "metadata": {},
649
+ "outputs": [],
650
+ "source": [
651
+ "def preprocess_function(examples):\n",
652
+ " return tokenizer(examples[\"text\"], truncation=True)"
653
+ ]
654
+ },
655
+ {
656
+ "cell_type": "code",
657
+ "execution_count": 17,
658
+ "metadata": {},
659
+ "outputs": [
660
+ {
661
+ "name": "stderr",
662
+ "output_type": "stream",
663
+ "text": [
664
+ "Map: 100%|██████████| 856/856 [00:00<00:00, 18779.12 examples/s]\n",
665
+ "Map: 100%|██████████| 215/215 [00:00<00:00, 27520.84 examples/s]\n"
666
+ ]
667
+ }
668
+ ],
669
+ "source": [
670
+ "tokenized_df = new_data.map(preprocess_function, batched=True)\n"
671
+ ]
672
+ },
673
+ {
674
+ "cell_type": "code",
675
+ "execution_count": 18,
676
+ "metadata": {},
677
+ "outputs": [
678
+ {
679
+ "name": "stderr",
680
+ "output_type": "stream",
681
+ "text": [
682
+ "2023-10-13 09:10:00.122326: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
683
+ "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
684
+ "2023-10-13 09:10:01.611782: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
685
+ ]
686
+ }
687
+ ],
688
+ "source": [
689
+ "# from transformers import DataCollatorWithPadding\n",
690
+ "\n",
691
+ "# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")\n",
692
+ "\n",
693
+ "\n",
694
+ "\n",
695
+ "\n",
696
+ "from transformers import DataCollatorWithPadding\n",
697
+ "\n",
698
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "execution_count": 19,
704
+ "metadata": {},
705
+ "outputs": [],
706
+ "source": [
707
+ "import evaluate\n",
708
+ "\n",
709
+ "accuracy = evaluate.load(\"accuracy\")"
710
+ ]
711
+ },
712
+ {
713
+ "cell_type": "code",
714
+ "execution_count": 20,
715
+ "metadata": {},
716
+ "outputs": [],
717
+ "source": [
718
+ "import numpy as np\n",
719
+ "\n",
720
+ "\n",
721
+ "def compute_metrics(eval_pred):\n",
722
+ " predictions, labels = eval_pred\n",
723
+ " predictions = np.argmax(predictions, axis=1)\n",
724
+ " return accuracy.compute(predictions=predictions, references=labels)"
725
+ ]
726
+ },
727
+ {
728
+ "cell_type": "code",
729
+ "execution_count": 21,
730
+ "metadata": {},
731
+ "outputs": [
732
+ {
733
+ "name": "stderr",
734
+ "output_type": "stream",
735
+ "text": [
736
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']\n",
737
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
738
+ ]
739
+ }
740
+ ],
741
+ "source": [
742
+ "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
743
+ "\n",
744
+ "model = AutoModelForSequenceClassification.from_pretrained(\n",
745
+ " \"distilbert-base-uncased\", num_labels=5, id2label=id2label, label2id=label2id\n",
746
+ ")"
747
+ ]
748
+ },
749
+ {
750
+ "cell_type": "code",
751
+ "execution_count": 22,
752
+ "metadata": {},
753
+ "outputs": [
754
+ {
755
+ "name": "stderr",
756
+ "output_type": "stream",
757
+ "text": [
758
+ "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
759
+ ]
760
+ },
761
+ {
762
+ "data": {
763
+ "text/html": [
764
+ "\n",
765
+ " <div>\n",
766
+ " \n",
767
+ " <progress value='324' max='324' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
768
+ " [324/324 00:39, Epoch 6/6]\n",
769
+ " </div>\n",
770
+ " <table border=\"1\" class=\"dataframe\">\n",
771
+ " <thead>\n",
772
+ " <tr style=\"text-align: left;\">\n",
773
+ " <th>Epoch</th>\n",
774
+ " <th>Training Loss</th>\n",
775
+ " <th>Validation Loss</th>\n",
776
+ " <th>Accuracy</th>\n",
777
+ " </tr>\n",
778
+ " </thead>\n",
779
+ " <tbody>\n",
780
+ " <tr>\n",
781
+ " <td>1</td>\n",
782
+ " <td>No log</td>\n",
783
+ " <td>0.467693</td>\n",
784
+ " <td>0.948837</td>\n",
785
+ " </tr>\n",
786
+ " <tr>\n",
787
+ " <td>2</td>\n",
788
+ " <td>No log</td>\n",
789
+ " <td>0.204288</td>\n",
790
+ " <td>0.953488</td>\n",
791
+ " </tr>\n",
792
+ " <tr>\n",
793
+ " <td>3</td>\n",
794
+ " <td>No log</td>\n",
795
+ " <td>0.164018</td>\n",
796
+ " <td>0.967442</td>\n",
797
+ " </tr>\n",
798
+ " <tr>\n",
799
+ " <td>4</td>\n",
800
+ " <td>No log</td>\n",
801
+ " <td>0.164968</td>\n",
802
+ " <td>0.967442</td>\n",
803
+ " </tr>\n",
804
+ " <tr>\n",
805
+ " <td>5</td>\n",
806
+ " <td>No log</td>\n",
807
+ " <td>0.163977</td>\n",
808
+ " <td>0.967442</td>\n",
809
+ " </tr>\n",
810
+ " <tr>\n",
811
+ " <td>6</td>\n",
812
+ " <td>No log</td>\n",
813
+ " <td>0.165533</td>\n",
814
+ " <td>0.967442</td>\n",
815
+ " </tr>\n",
816
+ " </tbody>\n",
817
+ "</table><p>"
818
+ ],
819
+ "text/plain": [
820
+ "<IPython.core.display.HTML object>"
821
+ ]
822
+ },
823
+ "metadata": {},
824
+ "output_type": "display_data"
825
+ },
826
+ {
827
+ "data": {
828
+ "text/plain": [
829
+ "TrainOutput(global_step=324, training_loss=0.2842947171058184, metrics={'train_runtime': 40.8212, 'train_samples_per_second': 125.817, 'train_steps_per_second': 7.937, 'total_flos': 13032177536640.0, 'train_loss': 0.2842947171058184, 'epoch': 6.0})"
830
+ ]
831
+ },
832
+ "execution_count": 22,
833
+ "metadata": {},
834
+ "output_type": "execute_result"
835
+ }
836
+ ],
837
+ "source": [
838
+ "training_args = TrainingArguments(\n",
839
+ " output_dir=\"intent_classification_model\",\n",
840
+ " learning_rate=2e-5,\n",
841
+ " per_device_train_batch_size=16,\n",
842
+ " per_device_eval_batch_size=16,\n",
843
+ " num_train_epochs=6,\n",
844
+ " weight_decay=0.01,\n",
845
+ " evaluation_strategy=\"epoch\",\n",
846
+ " save_strategy=\"epoch\",\n",
847
+ " load_best_model_at_end=True,\n",
848
+ " # push_to_hub=True,\n",
849
+ ")\n",
850
+ "\n",
851
+ "trainer = Trainer(\n",
852
+ " model=model,\n",
853
+ " args=training_args,\n",
854
+ " train_dataset=tokenized_df[\"train\"],\n",
855
+ " eval_dataset=tokenized_df[\"test\"],\n",
856
+ " tokenizer=tokenizer,\n",
857
+ " data_collator=data_collator,\n",
858
+ " compute_metrics=compute_metrics,\n",
859
+ ")\n",
860
+ "\n",
861
+ "trainer.train()"
862
+ ]
863
+ },
864
+ {
865
+ "cell_type": "code",
866
+ "execution_count": null,
867
+ "metadata": {},
868
+ "outputs": [],
869
+ "source": []
870
+ },
871
+ {
872
+ "cell_type": "markdown",
873
+ "metadata": {},
874
+ "source": []
875
+ }
876
+ ],
877
+ "metadata": {
878
+ "kernelspec": {
879
+ "display_name": "venv",
880
+ "language": "python",
881
+ "name": "python3"
882
+ },
883
+ "language_info": {
884
+ "codemirror_mode": {
885
+ "name": "ipython",
886
+ "version": 3
887
+ },
888
+ "file_extension": ".py",
889
+ "mimetype": "text/x-python",
890
+ "name": "python",
891
+ "nbconvert_exporter": "python",
892
+ "pygments_lexer": "ipython3",
893
+ "version": "3.10.12"
894
+ }
895
+ },
896
+ "nbformat": 4,
897
+ "nbformat_minor": 2
898
+ }
utils/__pycache__/get_category.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/get_category.cpython-310.pyc and b/utils/__pycache__/get_category.cpython-310.pyc differ
 
utils/__pycache__/get_intent.cpython-310.pyc ADDED
Binary file (1.5 kB).
 
utils/__pycache__/get_sentence_status.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/get_sentence_status.cpython-310.pyc and b/utils/__pycache__/get_sentence_status.cpython-310.pyc differ
 
utils/get_category.py CHANGED
@@ -93,16 +93,20 @@ def get_top_labels(keyword: str):
 
 for i in range(27):
 score= individual_probabilities_scores[i]
- if score>=0.5:
+ if score>=0.1:
 score_list.append(
- (id2label[i], score)
- )
+ (id2label[i], score)
+ )
+ # if score>=0.5:
+ # score_list.append(
+ # (id2label[i], score)
+ # )
 
 
 score_list.sort(
 key= lambda x: x[1], reverse=True
 )
 
- return score_list
+ return score_list[:5]
 
 
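
In effect, get_top_labels() in utils/get_category.py now keeps every category whose sigmoid score is at least 0.1 and returns at most the five highest-scoring labels, where it previously returned only labels scoring 0.5 or higher. A minimal sketch of the new behaviour (hedged; the query string below is a made-up example, and the output format mirrors the updated cells in research/10_demo_test_data.ipynb):

# Sketch only — hypothetical query; output shape mirrors research/10_demo_test_data.ipynb.
from utils.get_category import get_top_labels

print(get_top_labels("slow cooker dinner recipes"))
# Up to five (label, score) pairs with score >= 0.1, e.g.
# [('Food_and_Drink', 0.989), ('Computers_and_Electronics', 0.973), ('Games', 0.172), ('Shopping', 0.134)]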
utils/get_intent.py ADDED
@@ -0,0 +1,69 @@
+ from transformers import AutoTokenizer
+ from transformers import AutoModelForSequenceClassification
+ import torch
+ from torch.nn import functional as F
+ import numpy as np
+ import json
+
+
+
+ label2id= json.load(
+     open('data/categories_refined.json', 'r')
+ )
+ id2label= {}
+ for key in label2id.keys():
+     id2label[label2id[key]] = key
+
+
+
+ model_name= "intent_classification_model/checkpoint-324"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cuda")
+
+
+ # probabilities = 1 / (1 + np.exp(-logit_score))
+ def logit2prob(logit):
+     # odds =np.exp(logit)
+     # prob = odds / (1 + odds)
+     prob= 1/(1+ np.exp(-logit))
+     return np.round(prob, 3)
+
+
+
+
+ def get_top_intent(keyword: str):
+     '''
+     Returns score list
+     '''
+     inputs = tokenizer(keyword, return_tensors="pt").to("cuda")
+     with torch.no_grad():
+         logits = model(**inputs).logits
+
+     # print("logits: ", logits)
+     # predicted_class_id = logits.argmax().item()
+
+     # get probabilities using softmax from logit score and convert it to numpy array
+     # probabilities_scores = F.softmax(logits.cpu(), dim = -1).numpy()[0]
+     individual_probabilities_scores = logit2prob(logits.cpu().numpy()[0])
+
+     score_list= []
+
+     for i in range(5):
+         label= model.config.id2label[i]
+
+         score= individual_probabilities_scores[i]
+         score_list.append(
+             (label, score)
+         )
+         # if score>=0.5:
+         # score_list.append(
+         # (id2label[i], score)
+         # )
+
+
+     score_list.sort(
+         key= lambda x: x[1], reverse=True
+     )
+
+     return score_list
utils/get_sentence_status.py CHANGED
@@ -12,6 +12,13 @@ tokenizer_v2 = AutoTokenizer.from_pretrained("gpt2-large")
 model = AutoModelForSequenceClassification.from_pretrained("gpt3_finetuned_model/checkpoint-30048").to("cuda")
 
 
+ # probabilities = 1 / (1 + np.exp(-logit_score))
+ def logit2prob(logit):
+     # odds =np.exp(logit)
+     # prob = odds / (1 + odds)
+     prob= 1/(1+ np.exp(-logit))
+     return np.round(prob, 3)
+
 def split_sentence(sentence:str):
     # Create a regular expression pattern from the list of separators
     sentence= sentence.replace('\n', '')
@@ -98,4 +105,44 @@ def complete_sentence_analysis(sentence:str):
         "label": label,
         "variance": variance,
         "avg_length": avg_length
-     }
+     }
+
+
+
+
+
+ def get_top_labels(keyword: str):
+     '''
+     Returns score list
+     '''
+     inputs = tokenizer(keyword, return_tensors="pt").to("cuda")
+     with torch.no_grad():
+         logits = model(**inputs).logits
+
+     # print("logits: ", logits)
+     # predicted_class_id = logits.argmax().item()
+
+     # get probabilities using softmax from logit score and convert it to numpy array
+     # probabilities_scores = F.softmax(logits.cpu(), dim = -1).numpy()[0]
+     individual_probabilities_scores = logit2prob(logits.cpu().numpy()[0])
+
+     score_list= []
+
+     for i in range(2):
+         label= "Human Written" if model.config.id2label[i]=='NEGATIVE' else 'AI written'
+
+         score= individual_probabilities_scores[i]
+         score_list.append(
+             (label, score)
+         )
+         # if score>=0.5:
+         # score_list.append(
+         # (id2label[i], score)
+         # )
+
+
+     score_list.sort(
+         key= lambda x: x[1], reverse=True
+     )
+
+     return score_list[:5]
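
Usage note for the helper added above (a minimal sketch, assuming the gpt3_finetuned_model/checkpoint-30048 detector checkpoint referenced in this file and a CUDA device; it mirrors research/04_inference.ipynb from this commit):

# Sketch only — mirrors research/04_inference.ipynb.
from utils.get_sentence_status import get_top_labels

human_text = "..."   # e.g. the human-written perplexity explanation used in the notebook
print(get_top_labels(human_text))
# Returns ('Human Written' | 'AI written', probability) pairs sorted by score;
# the notebook records [('Human Written', 0.999), ('AI written', 0.002)] for that passage.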