amlanscloud committed
Commit 0bcc91b
1 Parent(s): 293c45c

Upload text_sentiment_analysis_blog_notebook.ipynb

text_sentiment_analysis_blog_notebook.ipynb CHANGED
@@ -2768,7 +2768,7 @@
2768
  },
2769
  {
2770
  "cell_type": "code",
2771
- "execution_count": 1,
2772
  "metadata": {
2773
  "colab": {
2774
  "base_uri": "https://localhost:8080/"
@@ -2893,7 +2893,8 @@
2893
  {
2894
  "cell_type": "markdown",
2895
  "source": [
2896
- "# Get and process Dataset"
 
2897
  ],
2898
  "metadata": {
2899
  "id": "1f0WYksXHPAM"
@@ -3003,7 +3004,7 @@
3003
  "id": "mqwYf9xWHUuh",
3004
  "outputId": "66965e88-8115-4d73-db90-6706371a0654"
3005
  },
3006
- "execution_count": 2,
3007
  "outputs": [
3008
  {
3009
  "output_type": "display_data",
@@ -3155,7 +3156,7 @@
3155
  "metadata": {
3156
  "id": "GuCNFWV_HXlf"
3157
  },
3158
- "execution_count": 3,
3159
  "outputs": []
3160
  },
3161
  {
@@ -3169,7 +3170,7 @@
3169
  "metadata": {
3170
  "id": "NiSQuUUgHdLn"
3171
  },
3172
- "execution_count": 4,
3173
  "outputs": []
3174
  },
3175
  {
@@ -3181,7 +3182,7 @@
3181
  "metadata": {
3182
  "id": "W_50sqXGHetc"
3183
  },
3184
- "execution_count": 5,
3185
  "outputs": []
3186
  },
3187
  {
@@ -3197,7 +3198,7 @@
3197
  "id": "TntoBCAvHg0D",
3198
  "outputId": "3820572d-2293-4b08-b9c5-7a6953c19ce8"
3199
  },
3200
- "execution_count": 6,
3201
  "outputs": [
3202
  {
3203
  "output_type": "execute_result",
@@ -3385,7 +3386,7 @@
3385
  "metadata": {
3386
  "id": "asZstARkHjz_"
3387
  },
3388
- "execution_count": 7,
3389
  "outputs": []
3390
  },
3391
  {
@@ -3403,7 +3404,7 @@
3403
  "id": "qPKqxIVwHu_9",
3404
  "outputId": "112271bf-d4db-47cd-9410-b92482488dd2"
3405
  },
3406
- "execution_count": 8,
3407
  "outputs": [
3408
  {
3409
  "output_type": "execute_result",
@@ -3575,7 +3576,7 @@
3575
  "id": "QwE34N-1Hw4O",
3576
  "outputId": "7a0ec80d-1c2e-497b-bc3a-a5eceae4fc11"
3577
  },
3578
- "execution_count": 9,
3579
  "outputs": [
3580
  {
3581
  "output_type": "execute_result",
@@ -3601,7 +3602,7 @@
3601
  "id": "R-bQ7kIfH4Lr",
3602
  "outputId": "1514f226-26dc-4ed4-adbc-edfef2c25e31"
3603
  },
3604
- "execution_count": 10,
3605
  "outputs": [
3606
  {
3607
  "output_type": "execute_result",
@@ -3629,7 +3630,7 @@
3629
  "id": "vZ91HtgCH5-B",
3630
  "outputId": "45ffe43b-9718-421d-c8d0-8fc2be5d9c5c"
3631
  },
3632
- "execution_count": 11,
3633
  "outputs": [
3634
  {
3635
  "output_type": "stream",
@@ -3651,7 +3652,8 @@
3651
  {
3652
  "cell_type": "markdown",
3653
  "source": [
3654
- "# Limit dataset for quick training"
 
3655
  ],
3656
  "metadata": {
3657
  "id": "6PQddaR0H-g-"
@@ -3666,7 +3668,7 @@
3666
  "metadata": {
3667
  "id": "DrtZRy3gH8IC"
3668
  },
3669
- "execution_count": 12,
3670
  "outputs": []
3671
  },
3672
  {
@@ -3681,7 +3683,8 @@
3681
  {
3682
  "cell_type": "markdown",
3683
  "source": [
3684
- "# Train test split"
 
3685
  ],
3686
  "metadata": {
3687
  "id": "bUEBfxgkIHQv"
@@ -3698,7 +3701,7 @@
3698
  "metadata": {
3699
  "id": "W7qXhZvdIJni"
3700
  },
3701
- "execution_count": 13,
3702
  "outputs": []
3703
  },
3704
  {
@@ -3716,7 +3719,7 @@
3716
  "id": "fxqHXivAILVc",
3717
  "outputId": "d5ce0660-92b9-445a-9e26-52445a7c1bb7"
3718
  },
3719
- "execution_count": 14,
3720
  "outputs": [
3721
  {
3722
  "output_type": "stream",
@@ -3739,7 +3742,7 @@
3739
  "metadata": {
3740
  "id": "c4a1lNf2INRx"
3741
  },
3742
- "execution_count": 15,
3743
  "outputs": []
3744
  },
3745
  {
@@ -3755,6 +3758,7 @@
3755
  "cell_type": "markdown",
3756
  "source": [
3757
  "# Pre process steps \n",
 
3758
  "\n",
3759
  "1. Stemming and Lemmatization\n",
3760
  "2. Tokenizer\n",
@@ -3810,7 +3814,7 @@
3810
  "metadata": {
3811
  "id": "wrs4BuXGIaQP"
3812
  },
3813
- "execution_count": 16,
3814
  "outputs": []
3815
  },
3816
  {
@@ -3838,7 +3842,7 @@
3838
  "metadata": {
3839
  "id": "z4bO5f5KIinV"
3840
  },
3841
- "execution_count": 17,
3842
  "outputs": []
3843
  },
3844
  {
@@ -3865,7 +3869,7 @@
3865
  "id": "gWpOtBFtIqnm",
3866
  "outputId": "d4a8c94c-65c8-4f7a-9248-071d0d427772"
3867
  },
3868
- "execution_count": 18,
3869
  "outputs": [
3870
  {
3871
  "output_type": "execute_result",
@@ -3896,7 +3900,7 @@
3896
  "metadata": {
3897
  "id": "kropkedQI0zE"
3898
  },
3899
- "execution_count": 19,
3900
  "outputs": []
3901
  },
3902
  {
@@ -3927,7 +3931,7 @@
3927
  "id": "x0Kp2C5LI9nP",
3928
  "outputId": "b13990c0-8aaf-42a2-ee6d-7122bf0f72a7"
3929
  },
3930
- "execution_count": 20,
3931
  "outputs": [
3932
  {
3933
  "output_type": "execute_result",
@@ -3950,7 +3954,8 @@
3950
  {
3951
  "cell_type": "markdown",
3952
  "source": [
3953
- "# Create Model"
 
3954
  ],
3955
  "metadata": {
3956
  "id": "OnZDEGl2JJ2c"
@@ -3970,7 +3975,7 @@
3970
  "metadata": {
3971
  "id": "8ENmS8h0JHcm"
3972
  },
3973
- "execution_count": 21,
3974
  "outputs": []
3975
  },
3976
  {
@@ -3992,7 +3997,7 @@
3992
  "id": "7sByLvCUJQZP",
3993
  "outputId": "16daa2e5-3955-4885-d8bc-84f05399bd52"
3994
  },
3995
- "execution_count": 22,
3996
  "outputs": [
3997
  {
3998
  "output_type": "stream",
@@ -4023,7 +4028,8 @@
4023
  {
4024
  "cell_type": "markdown",
4025
  "source": [
4026
- "# Train Model"
 
4027
  ],
4028
  "metadata": {
4029
  "id": "CxpgHz4AJZm1"
@@ -4041,7 +4047,7 @@
4041
  "id": "m5nEI4mUJV0C",
4042
  "outputId": "7fb00a14-e2c2-41ee-b0b1-0d9e7325937a"
4043
  },
4044
- "execution_count": 23,
4045
  "outputs": [
4046
  {
4047
  "output_type": "stream",
@@ -4105,7 +4111,7 @@
4105
  "source": [
4106
  "# Evaluate model \n",
4107
  "\n",
4108
- "A very bad case of overfitting happening in this trained model because of the limited data I used. Can be improved by increasing training data and tuning other parameters."
4109
  ],
4110
  "metadata": {
4111
  "id": "J69DUun-JtTH"
@@ -4136,7 +4142,7 @@
4136
  "id": "I0bm7lZLJwZH",
4137
  "outputId": "f92ef16a-dac1-454b-ceb6-d9077dea4d59"
4138
  },
4139
- "execution_count": 24,
4140
  "outputs": [
4141
  {
4142
  "output_type": "execute_result",
@@ -4163,7 +4169,8 @@
4163
  {
4164
  "cell_type": "markdown",
4165
  "source": [
4166
- "# Peform Inference"
 
4167
  ],
4168
  "metadata": {
4169
  "id": "AWOv0GINKJGj"
@@ -4189,7 +4196,7 @@
4189
  "id": "lF-CiYmbJ1Eb",
4190
  "outputId": "c70f4f33-a9b7-4338-83e8-5f3357cbd787"
4191
  },
4192
- "execution_count": 25,
4193
  "outputs": [
4194
  {
4195
  "output_type": "stream",
@@ -4204,7 +4211,8 @@
4204
  {
4205
  "cell_type": "markdown",
4206
  "source": [
4207
- "# Save the Model files"
 
4208
  ],
4209
  "metadata": {
4210
  "id": "_GfgPLt6KQRn"
@@ -4250,7 +4258,7 @@
4250
  "metadata": {
4251
  "id": "tVlsCgDMKkom"
4252
  },
4253
- "execution_count": 27,
4254
  "outputs": []
4255
  },
4256
  {
 
2768
  },
2769
  {
2770
  "cell_type": "code",
2771
+ "execution_count": null,
2772
  "metadata": {
2773
  "colab": {
2774
  "base_uri": "https://localhost:8080/"
 
2893
  {
2894
  "cell_type": "markdown",
2895
  "source": [
2896
+ "# Get and process Dataset\n",
2897
+ "Downloading and loading the dataset from Huggingface. The dataset package is used to get the dataset."
2898
  ],
2899
  "metadata": {
2900
  "id": "1f0WYksXHPAM"
 
3004
  "id": "mqwYf9xWHUuh",
3005
  "outputId": "66965e88-8115-4d73-db90-6706371a0654"
3006
  },
3007
+ "execution_count": null,
3008
  "outputs": [
3009
  {
3010
  "output_type": "display_data",
 
3156
  "metadata": {
3157
  "id": "GuCNFWV_HXlf"
3158
  },
3159
+ "execution_count": null,
3160
  "outputs": []
3161
  },
3162
  {
 
3170
  "metadata": {
3171
  "id": "NiSQuUUgHdLn"
3172
  },
3173
+ "execution_count": null,
3174
  "outputs": []
3175
  },
3176
  {
 
3182
  "metadata": {
3183
  "id": "W_50sqXGHetc"
3184
  },
3185
+ "execution_count": null,
3186
  "outputs": []
3187
  },
3188
  {
 
3198
  "id": "TntoBCAvHg0D",
3199
  "outputId": "3820572d-2293-4b08-b9c5-7a6953c19ce8"
3200
  },
3201
+ "execution_count": null,
3202
  "outputs": [
3203
  {
3204
  "output_type": "execute_result",
 
3386
  "metadata": {
3387
  "id": "asZstARkHjz_"
3388
  },
3389
+ "execution_count": null,
3390
  "outputs": []
3391
  },
3392
  {
 
3404
  "id": "qPKqxIVwHu_9",
3405
  "outputId": "112271bf-d4db-47cd-9410-b92482488dd2"
3406
  },
3407
+ "execution_count": null,
3408
  "outputs": [
3409
  {
3410
  "output_type": "execute_result",
 
3576
  "id": "QwE34N-1Hw4O",
3577
  "outputId": "7a0ec80d-1c2e-497b-bc3a-a5eceae4fc11"
3578
  },
3579
+ "execution_count": null,
3580
  "outputs": [
3581
  {
3582
  "output_type": "execute_result",
 
3602
  "id": "R-bQ7kIfH4Lr",
3603
  "outputId": "1514f226-26dc-4ed4-adbc-edfef2c25e31"
3604
  },
3605
+ "execution_count": null,
3606
  "outputs": [
3607
  {
3608
  "output_type": "execute_result",
 
3630
  "id": "vZ91HtgCH5-B",
3631
  "outputId": "45ffe43b-9718-421d-c8d0-8fc2be5d9c5c"
3632
  },
3633
+ "execution_count": null,
3634
  "outputs": [
3635
  {
3636
  "output_type": "stream",
 
3652
  {
3653
  "cell_type": "markdown",
3654
  "source": [
3655
+ "# Limit dataset for quick training\n",
3656
+ "This step is only done for this post example. In real scenario, good amount of data will be needed for the training."
3657
  ],
3658
  "metadata": {
3659
  "id": "6PQddaR0H-g-"
 
3668
  "metadata": {
3669
  "id": "DrtZRy3gH8IC"
3670
  },
3671
+ "execution_count": null,
3672
  "outputs": []
3673
  },
3674
  {
 
3683
  {
3684
  "cell_type": "markdown",
3685
  "source": [
3686
+ "# Train test split \n",
3687
+ "Splitting the dataset into Training and Testing sets. The Train set will be used for training and the Test one will be used for evaluating the model."
3688
  ],
3689
  "metadata": {
3690
  "id": "bUEBfxgkIHQv"
 
3701
  "metadata": {
3702
  "id": "W7qXhZvdIJni"
3703
  },
3704
+ "execution_count": null,
3705
  "outputs": []
3706
  },
3707
  {
 
3719
  "id": "fxqHXivAILVc",
3720
  "outputId": "d5ce0660-92b9-445a-9e26-52445a7c1bb7"
3721
  },
3722
+ "execution_count": null,
3723
  "outputs": [
3724
  {
3725
  "output_type": "stream",
 
3742
  "metadata": {
3743
  "id": "c4a1lNf2INRx"
3744
  },
3745
+ "execution_count": null,
3746
  "outputs": []
3747
  },
3748
  {
 
3758
  "cell_type": "markdown",
3759
  "source": [
3760
  "# Pre process steps \n",
3761
+ "For an efficient training, dataset need to be pre-processed to get better results. Below are the steps I am handling here.\n",
3762
  "\n",
3763
  "1. Stemming and Lemmatization\n",
3764
  "2. Tokenizer\n",
 
3814
  "metadata": {
3815
  "id": "wrs4BuXGIaQP"
3816
  },
3817
+ "execution_count": null,
3818
  "outputs": []
3819
  },
3820
  {
 
3842
  "metadata": {
3843
  "id": "z4bO5f5KIinV"
3844
  },
3845
+ "execution_count": null,
3846
  "outputs": []
3847
  },
3848
  {
 
3869
  "id": "gWpOtBFtIqnm",
3870
  "outputId": "d4a8c94c-65c8-4f7a-9248-071d0d427772"
3871
  },
3872
+ "execution_count": null,
3873
  "outputs": [
3874
  {
3875
  "output_type": "execute_result",
 
3900
  "metadata": {
3901
  "id": "kropkedQI0zE"
3902
  },
3903
+ "execution_count": null,
3904
  "outputs": []
3905
  },
3906
  {
 
3931
  "id": "x0Kp2C5LI9nP",
3932
  "outputId": "b13990c0-8aaf-42a2-ee6d-7122bf0f72a7"
3933
  },
3934
+ "execution_count": null,
3935
  "outputs": [
3936
  {
3937
  "output_type": "execute_result",
 
3954
  {
3955
  "cell_type": "markdown",
3956
  "source": [
3957
+ "# Create Model \n",
3958
+ "I am creating an LSTM model with dropout layer for this example"
3959
  ],
3960
  "metadata": {
3961
  "id": "OnZDEGl2JJ2c"
 
3975
  "metadata": {
3976
  "id": "8ENmS8h0JHcm"
3977
  },
3978
+ "execution_count": null,
3979
  "outputs": []
3980
  },
3981
  {
 
3997
  "id": "7sByLvCUJQZP",
3998
  "outputId": "16daa2e5-3955-4885-d8bc-84f05399bd52"
3999
  },
4000
+ "execution_count": null,
4001
  "outputs": [
4002
  {
4003
  "output_type": "stream",
 
4028
  {
4029
  "cell_type": "markdown",
4030
  "source": [
4031
+ "# Train Model \n",
4032
+ "The actual training step for the model"
4033
  ],
4034
  "metadata": {
4035
  "id": "CxpgHz4AJZm1"
 
4047
  "id": "m5nEI4mUJV0C",
4048
  "outputId": "7fb00a14-e2c2-41ee-b0b1-0d9e7325937a"
4049
  },
4050
+ "execution_count": null,
4051
  "outputs": [
4052
  {
4053
  "output_type": "stream",
 
4111
  "source": [
4112
  "# Evaluate model \n",
4113
  "\n",
4114
+ "Evaluting the performance of the model. A very bad case of overfitting happening in this trained model because of the limited data I used. Can be improved by increasing training data and tuning other parameters."
4115
  ],
4116
  "metadata": {
4117
  "id": "J69DUun-JtTH"
 
4142
  "id": "I0bm7lZLJwZH",
4143
  "outputId": "f92ef16a-dac1-454b-ceb6-d9077dea4d59"
4144
  },
4145
+ "execution_count": null,
4146
  "outputs": [
4147
  {
4148
  "output_type": "execute_result",
 
4169
  {
4170
  "cell_type": "markdown",
4171
  "source": [
4172
+ "# Peform Inference \n",
4173
+ "Here the model is being tested with some text input"
4174
  ],
4175
  "metadata": {
4176
  "id": "AWOv0GINKJGj"
 
4196
  "id": "lF-CiYmbJ1Eb",
4197
  "outputId": "c70f4f33-a9b7-4338-83e8-5f3357cbd787"
4198
  },
4199
+ "execution_count": null,
4200
  "outputs": [
4201
  {
4202
  "output_type": "stream",
 
4211
  {
4212
  "cell_type": "markdown",
4213
  "source": [
4214
+ "# Save the Model files \n",
4215
+ "Using MLEM package to save the model files for deployment"
4216
  ],
4217
  "metadata": {
4218
  "id": "_GfgPLt6KQRn"
 
4258
  "metadata": {
4259
  "id": "tVlsCgDMKkom"
4260
  },
4261
+ "execution_count": null,
4262
  "outputs": []
4263
  },
4264
  {