Ubuntu committed on
Commit
07eb0e9
β€’
1 Parent(s): e77b318

added Azure NER

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. __pycache__/keys.cpython-310.pyc +0 -0
  2. data/wolf_cut_labelled.csv +3 -0
  3. data/wolf_cut_temp.csv +3 -0
  4. data_intent/intent_data.csv +2 -2
  5. data_intent/temp.csv +3 -0
  6. finetuned_entity_categorical_classification/checkpoint-1681/optimizer.pt +1 -1
  7. finetuned_entity_categorical_classification/checkpoint-1681/pytorch_model.bin +1 -1
  8. finetuned_entity_categorical_classification/checkpoint-1681/rng_state.pth +0 -0
  9. finetuned_entity_categorical_classification/checkpoint-1681/trainer_state.json +10 -10
  10. finetuned_entity_categorical_classification/checkpoint-1681/training_args.bin +1 -1
  11. finetuned_entity_categorical_classification/checkpoint-3362/optimizer.pt +1 -1
  12. finetuned_entity_categorical_classification/checkpoint-3362/pytorch_model.bin +1 -1
  13. finetuned_entity_categorical_classification/checkpoint-3362/rng_state.pth +0 -0
  14. finetuned_entity_categorical_classification/checkpoint-3362/trainer_state.json +18 -18
  15. finetuned_entity_categorical_classification/checkpoint-3362/training_args.bin +1 -1
  16. finetuned_entity_categorical_classification/runs/Oct13_10-29-55_ip-172-31-95-165/events.out.tfevents.1697192996.ip-172-31-95-165.139501.0 +0 -0
  17. intent_classification_model/{checkpoint-324 → checkpoint-1216}/added_tokens.json +0 -0
  18. intent_classification_model/{checkpoint-324 → checkpoint-1216}/config.json +0 -0
  19. intent_classification_model/{checkpoint-324 → checkpoint-1216}/optimizer.pt +1 -1
  20. intent_classification_model/{checkpoint-324 → checkpoint-1216}/pytorch_model.bin +1 -1
  21. intent_classification_model/checkpoint-1216/rng_state.pth +0 -0
  22. intent_classification_model/{checkpoint-324 → checkpoint-1216}/scheduler.pt +1 -1
  23. intent_classification_model/{checkpoint-324 → checkpoint-1216}/special_tokens_map.json +0 -0
  24. intent_classification_model/{checkpoint-324 → checkpoint-1216}/tokenizer.json +0 -0
  25. intent_classification_model/{checkpoint-324 → checkpoint-1216}/tokenizer_config.json +0 -0
  26. intent_classification_model/checkpoint-1216/trainer_state.json +175 -0
  27. intent_classification_model/{checkpoint-324 → checkpoint-1216}/training_args.bin +1 -1
  28. intent_classification_model/{checkpoint-324 → checkpoint-1216}/vocab.txt +0 -0
  29. intent_classification_model/checkpoint-1376/added_tokens.json +7 -0
  30. intent_classification_model/checkpoint-1376/config.json +39 -0
  31. intent_classification_model/checkpoint-1376/optimizer.pt +3 -0
  32. intent_classification_model/checkpoint-1376/pytorch_model.bin +3 -0
  33. intent_classification_model/{checkpoint-324 → checkpoint-1376}/rng_state.pth +0 -0
  34. intent_classification_model/checkpoint-1376/scheduler.pt +3 -0
  35. intent_classification_model/checkpoint-1376/special_tokens_map.json +7 -0
  36. intent_classification_model/checkpoint-1376/tokenizer.json +0 -0
  37. intent_classification_model/checkpoint-1376/tokenizer_config.json +56 -0
  38. intent_classification_model/checkpoint-1376/trainer_state.json +175 -0
  39. intent_classification_model/checkpoint-1376/training_args.bin +3 -0
  40. intent_classification_model/checkpoint-1376/vocab.txt +0 -0
  41. intent_classification_model/checkpoint-324/trainer_state.json +0 -73
  42. intent_classification_model/runs/Oct13_10-35-17_ip-172-31-95-165/events.out.tfevents.1697193318.ip-172-31-95-165.139816.0 +0 -0
  43. intent_classification_model/runs/Oct13_10-49-20_ip-172-31-95-165/events.out.tfevents.1697194161.ip-172-31-95-165.140238.0 +0 -0
  44. research/09_fine_tuning_for_datacategories.ipynb +122 -115
  45. research/11_evaluation.ipynb +258 -50
  46. research/11_intent_classification_using_distilbert.ipynb +255 -143
  47. research/12_text_analytics_using_azure.ipynb +407 -0
  48. research/13_data_categories.ipynb +0 -0
  49. utils/__pycache__/get_category.cpython-310.pyc +0 -0
  50. utils/__pycache__/get_intent.cpython-310.pyc +0 -0
__pycache__/keys.cpython-310.pyc CHANGED
Binary files a/__pycache__/keys.cpython-310.pyc and b/__pycache__/keys.cpython-310.pyc differ
 
data/wolf_cut_labelled.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:809d5432ceb512c742171eaefe4862dcc283674b8eab13eacf17ff15595fc16a
3
+ size 278211
data/wolf_cut_temp.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7a72974667af5a81b8012edba66f761f6c6784d03658413c37db06b0e94f0fb
3
+ size 52781
data_intent/intent_data.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24091e2e977d444be178138ac717fa57b8d16534dcf5e66d4084cf3f77e6f6ce
3
- size 39551
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ee34445e32b84ac258ad523d7c6b1c6babf326a6932ae05f4a9aeae01ae4366
3
+ size 72303
data_intent/temp.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c89381303aa0fec070d7141d2e3ad2699daf9d0fb0c2a99eec7625c41977b62
3
+ size 632216
finetuned_entity_categorical_classification/checkpoint-1681/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7482411d85a2d5cf5f632c997d2e07449fe4217bcf4b1aad0b38f9138d1acd0a
3
  size 535881018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ddb82ef6b7ce9d69183007173cd0480840f0e859a1284293e8d83debea834d5
3
  size 535881018
finetuned_entity_categorical_classification/checkpoint-1681/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f30aacfea59fa26f3b7edc0f510fe6d083c82c0a92e3118f80f0b13f375cb74e
3
  size 267932842
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1026e1cb049c206c60d220d76f2ad9cccabbb8a8e435bf46049bfcbb6b973a7f
3
  size 267932842
finetuned_entity_categorical_classification/checkpoint-1681/rng_state.pth CHANGED
Binary files a/finetuned_entity_categorical_classification/checkpoint-1681/rng_state.pth and b/finetuned_entity_categorical_classification/checkpoint-1681/rng_state.pth differ
 
finetuned_entity_categorical_classification/checkpoint-1681/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.10296357423067093,
3
  "best_model_checkpoint": "finetuned_entity_categorical_classification/checkpoint-1681",
4
  "epoch": 1.0,
5
  "eval_steps": 500,
@@ -11,28 +11,28 @@
11
  {
12
  "epoch": 0.3,
13
  "learning_rate": 1.7025580011897683e-05,
14
- "loss": 0.1045,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.59,
19
  "learning_rate": 1.405116002379536e-05,
20
- "loss": 0.1056,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.89,
25
  "learning_rate": 1.1076740035693041e-05,
26
- "loss": 0.1041,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 1.0,
31
- "eval_accuracy": 0.9721850364420646,
32
- "eval_loss": 0.10296357423067093,
33
- "eval_runtime": 2.316,
34
- "eval_samples_per_second": 2902.854,
35
- "eval_steps_per_second": 181.779,
36
  "step": 1681
37
  }
38
  ],
@@ -40,7 +40,7 @@
40
  "max_steps": 3362,
41
  "num_train_epochs": 2,
42
  "save_steps": 500,
43
- "total_flos": 108413372385396.0,
44
  "trial_name": null,
45
  "trial_params": null
46
  }
 
1
  {
2
+ "best_metric": 0.07765195518732071,
3
  "best_model_checkpoint": "finetuned_entity_categorical_classification/checkpoint-1681",
4
  "epoch": 1.0,
5
  "eval_steps": 500,
 
11
  {
12
  "epoch": 0.3,
13
  "learning_rate": 1.7025580011897683e-05,
14
+ "loss": 0.1008,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.59,
19
  "learning_rate": 1.405116002379536e-05,
20
+ "loss": 0.1133,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.89,
25
  "learning_rate": 1.1076740035693041e-05,
26
+ "loss": 0.1023,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 1.0,
31
+ "eval_accuracy": 0.9753086419753086,
32
+ "eval_loss": 0.07765195518732071,
33
+ "eval_runtime": 2.2887,
34
+ "eval_samples_per_second": 2937.427,
35
+ "eval_steps_per_second": 183.944,
36
  "step": 1681
37
  }
38
  ],
 
40
  "max_steps": 3362,
41
  "num_train_epochs": 2,
42
  "save_steps": 500,
43
+ "total_flos": 106434534943386.0,
44
  "trial_name": null,
45
  "trial_params": null
46
  }
finetuned_entity_categorical_classification/checkpoint-1681/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2de83bc1893d1870cbe886f5287e02f718e1fe0be09dba843ccfc561aeb95ec6
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38ca296b683b24f6f80d4f29a9a0c986a837732910bd0a31303095257578ddfb
3
  size 4600
finetuned_entity_categorical_classification/checkpoint-3362/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d015879f29a2744736a3ba7748885a4ec943584a74c779bc00637389c2d90ccd
3
  size 535881018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:167b28137ba8f1cd7b5e16c91eb0e53bf3273a77a9f450b8f88896a8fc0333a5
3
  size 535881018
finetuned_entity_categorical_classification/checkpoint-3362/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2f9ac5b4263d73b4fe5715bd69766cb18cb5925f401945d0c67275a65364524
3
  size 267932842
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4394c17645f6749fa890492765494e6f6dcf094a971ee68dff1d187d6339a1d
3
  size 267932842
finetuned_entity_categorical_classification/checkpoint-3362/rng_state.pth CHANGED
Binary files a/finetuned_entity_categorical_classification/checkpoint-3362/rng_state.pth and b/finetuned_entity_categorical_classification/checkpoint-3362/rng_state.pth differ
 
finetuned_entity_categorical_classification/checkpoint-3362/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.10296357423067093,
3
  "best_model_checkpoint": "finetuned_entity_categorical_classification/checkpoint-1681",
4
  "epoch": 2.0,
5
  "eval_steps": 500,
@@ -11,55 +11,55 @@
11
  {
12
  "epoch": 0.3,
13
  "learning_rate": 1.7025580011897683e-05,
14
- "loss": 0.1045,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.59,
19
  "learning_rate": 1.405116002379536e-05,
20
- "loss": 0.1056,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.89,
25
  "learning_rate": 1.1076740035693041e-05,
26
- "loss": 0.1041,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 1.0,
31
- "eval_accuracy": 0.9721850364420646,
32
- "eval_loss": 0.10296357423067093,
33
- "eval_runtime": 2.316,
34
- "eval_samples_per_second": 2902.854,
35
- "eval_steps_per_second": 181.779,
36
  "step": 1681
37
  },
38
  {
39
  "epoch": 1.19,
40
  "learning_rate": 8.10232004759072e-06,
41
- "loss": 0.0776,
42
  "step": 2000
43
  },
44
  {
45
  "epoch": 1.49,
46
  "learning_rate": 5.1279000594884e-06,
47
- "loss": 0.0675,
48
  "step": 2500
49
  },
50
  {
51
  "epoch": 1.78,
52
  "learning_rate": 2.1534800713860798e-06,
53
- "loss": 0.0773,
54
  "step": 3000
55
  },
56
  {
57
  "epoch": 2.0,
58
- "eval_accuracy": 0.9708463483563885,
59
- "eval_loss": 0.11056160181760788,
60
- "eval_runtime": 2.2742,
61
- "eval_samples_per_second": 2956.182,
62
- "eval_steps_per_second": 185.119,
63
  "step": 3362
64
  }
65
  ],
@@ -67,7 +67,7 @@
67
  "max_steps": 3362,
68
  "num_train_epochs": 2,
69
  "save_steps": 500,
70
- "total_flos": 216609059710134.0,
71
  "trial_name": null,
72
  "trial_params": null
73
  }
 
1
  {
2
+ "best_metric": 0.07765195518732071,
3
  "best_model_checkpoint": "finetuned_entity_categorical_classification/checkpoint-1681",
4
  "epoch": 2.0,
5
  "eval_steps": 500,
 
11
  {
12
  "epoch": 0.3,
13
  "learning_rate": 1.7025580011897683e-05,
14
+ "loss": 0.1008,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.59,
19
  "learning_rate": 1.405116002379536e-05,
20
+ "loss": 0.1133,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.89,
25
  "learning_rate": 1.1076740035693041e-05,
26
+ "loss": 0.1023,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 1.0,
31
+ "eval_accuracy": 0.9753086419753086,
32
+ "eval_loss": 0.07765195518732071,
33
+ "eval_runtime": 2.2887,
34
+ "eval_samples_per_second": 2937.427,
35
+ "eval_steps_per_second": 183.944,
36
  "step": 1681
37
  },
38
  {
39
  "epoch": 1.19,
40
  "learning_rate": 8.10232004759072e-06,
41
+ "loss": 0.0827,
42
  "step": 2000
43
  },
44
  {
45
  "epoch": 1.49,
46
  "learning_rate": 5.1279000594884e-06,
47
+ "loss": 0.0702,
48
  "step": 2500
49
  },
50
  {
51
  "epoch": 1.78,
52
  "learning_rate": 2.1534800713860798e-06,
53
+ "loss": 0.0834,
54
  "step": 3000
55
  },
56
  {
57
  "epoch": 2.0,
58
+ "eval_accuracy": 0.9747136694927859,
59
+ "eval_loss": 0.08629146963357925,
60
+ "eval_runtime": 2.3024,
61
+ "eval_samples_per_second": 2919.969,
62
+ "eval_steps_per_second": 182.851,
63
  "step": 3362
64
  }
65
  ],
 
67
  "max_steps": 3362,
68
  "num_train_epochs": 2,
69
  "save_steps": 500,
70
+ "total_flos": 213673546900476.0,
71
  "trial_name": null,
72
  "trial_params": null
73
  }
finetuned_entity_categorical_classification/checkpoint-3362/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2de83bc1893d1870cbe886f5287e02f718e1fe0be09dba843ccfc561aeb95ec6
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38ca296b683b24f6f80d4f29a9a0c986a837732910bd0a31303095257578ddfb
3
  size 4600
finetuned_entity_categorical_classification/runs/Oct13_10-29-55_ip-172-31-95-165/events.out.tfevents.1697192996.ip-172-31-95-165.139501.0 ADDED
Binary file (7.68 kB). View file
 
intent_classification_model/{checkpoint-324 → checkpoint-1216}/added_tokens.json RENAMED
File without changes
intent_classification_model/{checkpoint-324 → checkpoint-1216}/config.json RENAMED
File without changes
intent_classification_model/{checkpoint-324 → checkpoint-1216}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a50f88f7a9097ecddb2b3c7e3d38747deec4ca3a386132fac9e0e4efaa82ae0e
3
  size 535745722
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97791790fb47e0d2262cfd6c379f3e36d956e7ef05ddcfcd905abba63c990209
3
  size 535745722
intent_classification_model/{checkpoint-324 → checkpoint-1216}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b339df5c0d892e025a1749d085ab010e551f4b249eb497812a1a3bd7ebd5fd99
3
  size 267865194
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d83acd64be6fc794a8e6c94f48eb095fd23679e7c612bd83712b5738588b1b8
3
  size 267865194
intent_classification_model/checkpoint-1216/rng_state.pth ADDED
Binary file (14.2 kB). View file
 
intent_classification_model/{checkpoint-324 → checkpoint-1216}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73f74582c189fe624f606122980ccb279125588a1db45b4052dc704fa2b51184
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a94db5976ef19e649b033b8c416b03f555990a66e540f81cc5eccc167168f1bc
3
  size 1064
intent_classification_model/{checkpoint-324 → checkpoint-1216}/special_tokens_map.json RENAMED
File without changes
intent_classification_model/{checkpoint-324 → checkpoint-1216}/tokenizer.json RENAMED
File without changes
intent_classification_model/{checkpoint-324 → checkpoint-1216}/tokenizer_config.json RENAMED
File without changes
intent_classification_model/checkpoint-1216/trainer_state.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.06275933235883713,
3
+ "best_model_checkpoint": "intent_classification_model/checkpoint-152",
4
+ "epoch": 16.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1216,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.9867549668874173,
14
+ "eval_loss": 0.20886486768722534,
15
+ "eval_runtime": 0.1475,
16
+ "eval_samples_per_second": 2048.099,
17
+ "eval_steps_per_second": 128.854,
18
+ "step": 76
19
+ },
20
+ {
21
+ "epoch": 2.0,
22
+ "eval_accuracy": 0.9834437086092715,
23
+ "eval_loss": 0.06275933235883713,
24
+ "eval_runtime": 0.1586,
25
+ "eval_samples_per_second": 1904.103,
26
+ "eval_steps_per_second": 119.795,
27
+ "step": 152
28
+ },
29
+ {
30
+ "epoch": 3.0,
31
+ "eval_accuracy": 0.9867549668874173,
32
+ "eval_loss": 0.06509935110807419,
33
+ "eval_runtime": 0.1445,
34
+ "eval_samples_per_second": 2090.586,
35
+ "eval_steps_per_second": 131.527,
36
+ "step": 228
37
+ },
38
+ {
39
+ "epoch": 4.0,
40
+ "eval_accuracy": 0.9768211920529801,
41
+ "eval_loss": 0.08112386614084244,
42
+ "eval_runtime": 0.1335,
43
+ "eval_samples_per_second": 2262.833,
44
+ "eval_steps_per_second": 142.364,
45
+ "step": 304
46
+ },
47
+ {
48
+ "epoch": 5.0,
49
+ "eval_accuracy": 0.9701986754966887,
50
+ "eval_loss": 0.11257749050855637,
51
+ "eval_runtime": 0.134,
52
+ "eval_samples_per_second": 2253.71,
53
+ "eval_steps_per_second": 141.79,
54
+ "step": 380
55
+ },
56
+ {
57
+ "epoch": 6.0,
58
+ "eval_accuracy": 0.9735099337748344,
59
+ "eval_loss": 0.11174333095550537,
60
+ "eval_runtime": 0.1339,
61
+ "eval_samples_per_second": 2255.512,
62
+ "eval_steps_per_second": 141.903,
63
+ "step": 456
64
+ },
65
+ {
66
+ "epoch": 6.58,
67
+ "learning_rate": 1.1776315789473684e-05,
68
+ "loss": 0.1883,
69
+ "step": 500
70
+ },
71
+ {
72
+ "epoch": 7.0,
73
+ "eval_accuracy": 0.9768211920529801,
74
+ "eval_loss": 0.10020075738430023,
75
+ "eval_runtime": 0.145,
76
+ "eval_samples_per_second": 2083.04,
77
+ "eval_steps_per_second": 131.052,
78
+ "step": 532
79
+ },
80
+ {
81
+ "epoch": 8.0,
82
+ "eval_accuracy": 0.9735099337748344,
83
+ "eval_loss": 0.116866335272789,
84
+ "eval_runtime": 0.1348,
85
+ "eval_samples_per_second": 2240.912,
86
+ "eval_steps_per_second": 140.985,
87
+ "step": 608
88
+ },
89
+ {
90
+ "epoch": 9.0,
91
+ "eval_accuracy": 0.9701986754966887,
92
+ "eval_loss": 0.14152054488658905,
93
+ "eval_runtime": 0.1308,
94
+ "eval_samples_per_second": 2309.736,
95
+ "eval_steps_per_second": 145.314,
96
+ "step": 684
97
+ },
98
+ {
99
+ "epoch": 10.0,
100
+ "eval_accuracy": 0.9735099337748344,
101
+ "eval_loss": 0.1344088315963745,
102
+ "eval_runtime": 0.1195,
103
+ "eval_samples_per_second": 2526.256,
104
+ "eval_steps_per_second": 158.937,
105
+ "step": 760
106
+ },
107
+ {
108
+ "epoch": 11.0,
109
+ "eval_accuracy": 0.9735099337748344,
110
+ "eval_loss": 0.13409321010112762,
111
+ "eval_runtime": 0.1399,
112
+ "eval_samples_per_second": 2159.267,
113
+ "eval_steps_per_second": 135.848,
114
+ "step": 836
115
+ },
116
+ {
117
+ "epoch": 12.0,
118
+ "eval_accuracy": 0.9735099337748344,
119
+ "eval_loss": 0.12705937027931213,
120
+ "eval_runtime": 0.1366,
121
+ "eval_samples_per_second": 2210.321,
122
+ "eval_steps_per_second": 139.06,
123
+ "step": 912
124
+ },
125
+ {
126
+ "epoch": 13.0,
127
+ "eval_accuracy": 0.9735099337748344,
128
+ "eval_loss": 0.13874845206737518,
129
+ "eval_runtime": 0.1374,
130
+ "eval_samples_per_second": 2197.254,
131
+ "eval_steps_per_second": 138.238,
132
+ "step": 988
133
+ },
134
+ {
135
+ "epoch": 13.16,
136
+ "learning_rate": 3.5526315789473687e-06,
137
+ "loss": 0.018,
138
+ "step": 1000
139
+ },
140
+ {
141
+ "epoch": 14.0,
142
+ "eval_accuracy": 0.9735099337748344,
143
+ "eval_loss": 0.13716736435890198,
144
+ "eval_runtime": 0.1193,
145
+ "eval_samples_per_second": 2530.546,
146
+ "eval_steps_per_second": 159.207,
147
+ "step": 1064
148
+ },
149
+ {
150
+ "epoch": 15.0,
151
+ "eval_accuracy": 0.9735099337748344,
152
+ "eval_loss": 0.13588877022266388,
153
+ "eval_runtime": 0.1396,
154
+ "eval_samples_per_second": 2163.789,
155
+ "eval_steps_per_second": 136.132,
156
+ "step": 1140
157
+ },
158
+ {
159
+ "epoch": 16.0,
160
+ "eval_accuracy": 0.9735099337748344,
161
+ "eval_loss": 0.13579562306404114,
162
+ "eval_runtime": 0.1288,
163
+ "eval_samples_per_second": 2345.226,
164
+ "eval_steps_per_second": 147.547,
165
+ "step": 1216
166
+ }
167
+ ],
168
+ "logging_steps": 500,
169
+ "max_steps": 1216,
170
+ "num_train_epochs": 16,
171
+ "save_steps": 500,
172
+ "total_flos": 62384098266840.0,
173
+ "trial_name": null,
174
+ "trial_params": null
175
+ }
intent_classification_model/{checkpoint-324 → checkpoint-1216}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c27308f0087e544f12e1806abafb33d65745a5791fb1559d9e521f3670215df9
3
  size 4536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40b975e2b309584fec6c9097bbbfc4736c3bbe492681259866398911daf0ae0c
3
  size 4536
intent_classification_model/{checkpoint-324 → checkpoint-1216}/vocab.txt RENAMED
File without changes
intent_classification_model/checkpoint-1376/added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "[CLS]": 101,
3
+ "[MASK]": 103,
4
+ "[PAD]": 0,
5
+ "[SEP]": 102,
6
+ "[UNK]": 100
7
+ }
intent_classification_model/checkpoint-1376/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert-base-uncased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "Commercial",
13
+ "1": "Informational",
14
+ "2": "Navigational",
15
+ "3": "Local",
16
+ "4": "Transactional"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "label2id": {
20
+ "Commercial": 0,
21
+ "Informational": 1,
22
+ "Local": 3,
23
+ "Navigational": 2,
24
+ "Transactional": 4
25
+ },
26
+ "max_position_embeddings": 512,
27
+ "model_type": "distilbert",
28
+ "n_heads": 12,
29
+ "n_layers": 6,
30
+ "pad_token_id": 0,
31
+ "problem_type": "single_label_classification",
32
+ "qa_dropout": 0.1,
33
+ "seq_classif_dropout": 0.2,
34
+ "sinusoidal_pos_embds": false,
35
+ "tie_weights_": true,
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.34.0",
38
+ "vocab_size": 30522
39
+ }
intent_classification_model/checkpoint-1376/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f2ed586c32f48dd2cece37baf89590cc951fda221ec175eadd3034e996abe25
3
+ size 535745722
intent_classification_model/checkpoint-1376/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:153cb325de818e493f8a0a7aa1fbcc5cf3d8fa27d07339fbfd1d8e238d8cb38b
3
+ size 267865194
intent_classification_model/{checkpoint-324 → checkpoint-1376}/rng_state.pth RENAMED
Binary files a/intent_classification_model/checkpoint-324/rng_state.pth and b/intent_classification_model/checkpoint-1376/rng_state.pth differ
 
intent_classification_model/checkpoint-1376/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c80c9f7b843dea09bd3b8739eafa7b84f67f346b13150be7548d804af238e2c
3
+ size 1064
intent_classification_model/checkpoint-1376/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
intent_classification_model/checkpoint-1376/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
intent_classification_model/checkpoint-1376/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "additional_special_tokens": [],
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": true,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
intent_classification_model/checkpoint-1376/trainer_state.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.10133440792560577,
3
+ "best_model_checkpoint": "intent_classification_model/checkpoint-344",
4
+ "epoch": 16.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1376,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.956140350877193,
14
+ "eval_loss": 0.24781915545463562,
15
+ "eval_runtime": 0.1669,
16
+ "eval_samples_per_second": 2049.176,
17
+ "eval_steps_per_second": 131.818,
18
+ "step": 86
19
+ },
20
+ {
21
+ "epoch": 2.0,
22
+ "eval_accuracy": 0.9766081871345029,
23
+ "eval_loss": 0.10303749144077301,
24
+ "eval_runtime": 0.2792,
25
+ "eval_samples_per_second": 1224.804,
26
+ "eval_steps_per_second": 78.789,
27
+ "step": 172
28
+ },
29
+ {
30
+ "epoch": 3.0,
31
+ "eval_accuracy": 0.9736842105263158,
32
+ "eval_loss": 0.12486349791288376,
33
+ "eval_runtime": 0.1527,
34
+ "eval_samples_per_second": 2239.207,
35
+ "eval_steps_per_second": 144.043,
36
+ "step": 258
37
+ },
38
+ {
39
+ "epoch": 4.0,
40
+ "eval_accuracy": 0.9766081871345029,
41
+ "eval_loss": 0.10133440792560577,
42
+ "eval_runtime": 0.1513,
43
+ "eval_samples_per_second": 2260.581,
44
+ "eval_steps_per_second": 145.418,
45
+ "step": 344
46
+ },
47
+ {
48
+ "epoch": 5.0,
49
+ "eval_accuracy": 0.9766081871345029,
50
+ "eval_loss": 0.11906354874372482,
51
+ "eval_runtime": 0.1397,
52
+ "eval_samples_per_second": 2448.535,
53
+ "eval_steps_per_second": 157.508,
54
+ "step": 430
55
+ },
56
+ {
57
+ "epoch": 5.81,
58
+ "learning_rate": 1.2732558139534886e-05,
59
+ "loss": 0.1903,
60
+ "step": 500
61
+ },
62
+ {
63
+ "epoch": 6.0,
64
+ "eval_accuracy": 0.9678362573099415,
65
+ "eval_loss": 0.14922283589839935,
66
+ "eval_runtime": 0.1511,
67
+ "eval_samples_per_second": 2264.082,
68
+ "eval_steps_per_second": 145.643,
69
+ "step": 516
70
+ },
71
+ {
72
+ "epoch": 7.0,
73
+ "eval_accuracy": 0.9736842105263158,
74
+ "eval_loss": 0.10685376077890396,
75
+ "eval_runtime": 0.1562,
76
+ "eval_samples_per_second": 2189.014,
77
+ "eval_steps_per_second": 140.814,
78
+ "step": 602
79
+ },
80
+ {
81
+ "epoch": 8.0,
82
+ "eval_accuracy": 0.9736842105263158,
83
+ "eval_loss": 0.12596090137958527,
84
+ "eval_runtime": 0.1543,
85
+ "eval_samples_per_second": 2216.873,
86
+ "eval_steps_per_second": 142.606,
87
+ "step": 688
88
+ },
89
+ {
90
+ "epoch": 9.0,
91
+ "eval_accuracy": 0.9707602339181286,
92
+ "eval_loss": 0.129041388630867,
93
+ "eval_runtime": 0.1334,
94
+ "eval_samples_per_second": 2563.696,
95
+ "eval_steps_per_second": 164.916,
96
+ "step": 774
97
+ },
98
+ {
99
+ "epoch": 10.0,
100
+ "eval_accuracy": 0.9736842105263158,
101
+ "eval_loss": 0.12375017255544662,
102
+ "eval_runtime": 0.1513,
103
+ "eval_samples_per_second": 2261.041,
104
+ "eval_steps_per_second": 145.447,
105
+ "step": 860
106
+ },
107
+ {
108
+ "epoch": 11.0,
109
+ "eval_accuracy": 0.9736842105263158,
110
+ "eval_loss": 0.12813875079154968,
111
+ "eval_runtime": 0.1546,
112
+ "eval_samples_per_second": 2212.042,
113
+ "eval_steps_per_second": 142.295,
114
+ "step": 946
115
+ },
116
+ {
117
+ "epoch": 11.63,
118
+ "learning_rate": 5.465116279069767e-06,
119
+ "loss": 0.0258,
120
+ "step": 1000
121
+ },
122
+ {
123
+ "epoch": 12.0,
124
+ "eval_accuracy": 0.9736842105263158,
125
+ "eval_loss": 0.13388033211231232,
126
+ "eval_runtime": 0.1607,
127
+ "eval_samples_per_second": 2128.444,
128
+ "eval_steps_per_second": 136.917,
129
+ "step": 1032
130
+ },
131
+ {
132
+ "epoch": 13.0,
133
+ "eval_accuracy": 0.9736842105263158,
134
+ "eval_loss": 0.1308409869670868,
135
+ "eval_runtime": 0.1401,
136
+ "eval_samples_per_second": 2441.546,
137
+ "eval_steps_per_second": 157.058,
138
+ "step": 1118
139
+ },
140
+ {
141
+ "epoch": 14.0,
142
+ "eval_accuracy": 0.9736842105263158,
143
+ "eval_loss": 0.13211463391780853,
144
+ "eval_runtime": 0.1539,
145
+ "eval_samples_per_second": 2222.296,
146
+ "eval_steps_per_second": 142.955,
147
+ "step": 1204
148
+ },
149
+ {
150
+ "epoch": 15.0,
151
+ "eval_accuracy": 0.9736842105263158,
152
+ "eval_loss": 0.13366281986236572,
153
+ "eval_runtime": 0.1507,
154
+ "eval_samples_per_second": 2269.433,
155
+ "eval_steps_per_second": 145.987,
156
+ "step": 1290
157
+ },
158
+ {
159
+ "epoch": 16.0,
160
+ "eval_accuracy": 0.9736842105263158,
161
+ "eval_loss": 0.13524049520492554,
162
+ "eval_runtime": 0.1603,
163
+ "eval_samples_per_second": 2133.42,
164
+ "eval_steps_per_second": 137.238,
165
+ "step": 1376
166
+ }
167
+ ],
168
+ "logging_steps": 500,
169
+ "max_steps": 1376,
170
+ "num_train_epochs": 16,
171
+ "save_steps": 500,
172
+ "total_flos": 70181981180580.0,
173
+ "trial_name": null,
174
+ "trial_params": null
175
+ }
intent_classification_model/checkpoint-1376/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b92fcfbb60dcd18505e69a8641e67a12b1dbb1bb4cf8cf1817bb473e3ed0dc
3
+ size 4536
intent_classification_model/checkpoint-1376/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
intent_classification_model/checkpoint-324/trainer_state.json DELETED
@@ -1,73 +0,0 @@
1
- {
2
- "best_metric": 0.16397738456726074,
3
- "best_model_checkpoint": "intent_classification_model/checkpoint-270",
4
- "epoch": 6.0,
5
- "eval_steps": 500,
6
- "global_step": 324,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.9488372093023256,
14
- "eval_loss": 0.4676927328109741,
15
- "eval_runtime": 0.1185,
16
- "eval_samples_per_second": 1814.083,
17
- "eval_steps_per_second": 118.126,
18
- "step": 54
19
- },
20
- {
21
- "epoch": 2.0,
22
- "eval_accuracy": 0.9534883720930233,
23
- "eval_loss": 0.20428764820098877,
24
- "eval_runtime": 0.0972,
25
- "eval_samples_per_second": 2210.83,
26
- "eval_steps_per_second": 143.961,
27
- "step": 108
28
- },
29
- {
30
- "epoch": 3.0,
31
- "eval_accuracy": 0.9674418604651163,
32
- "eval_loss": 0.16401757299900055,
33
- "eval_runtime": 0.1015,
34
- "eval_samples_per_second": 2118.828,
35
- "eval_steps_per_second": 137.97,
36
- "step": 162
37
- },
38
- {
39
- "epoch": 4.0,
40
- "eval_accuracy": 0.9674418604651163,
41
- "eval_loss": 0.16496841609477997,
42
- "eval_runtime": 0.0941,
43
- "eval_samples_per_second": 2284.398,
44
- "eval_steps_per_second": 148.752,
45
- "step": 216
46
- },
47
- {
48
- "epoch": 5.0,
49
- "eval_accuracy": 0.9674418604651163,
50
- "eval_loss": 0.16397738456726074,
51
- "eval_runtime": 0.0975,
52
- "eval_samples_per_second": 2204.851,
53
- "eval_steps_per_second": 143.572,
54
- "step": 270
55
- },
56
- {
57
- "epoch": 6.0,
58
- "eval_accuracy": 0.9674418604651163,
59
- "eval_loss": 0.16553252935409546,
60
- "eval_runtime": 0.0947,
61
- "eval_samples_per_second": 2271.063,
62
- "eval_steps_per_second": 147.883,
63
- "step": 324
64
- }
65
- ],
66
- "logging_steps": 500,
67
- "max_steps": 324,
68
- "num_train_epochs": 6,
69
- "save_steps": 500,
70
- "total_flos": 13032177536640.0,
71
- "trial_name": null,
72
- "trial_params": null
73
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
intent_classification_model/runs/Oct13_10-35-17_ip-172-31-95-165/events.out.tfevents.1697193318.ip-172-31-95-165.139816.0 ADDED
Binary file (10.2 kB). View file
 
intent_classification_model/runs/Oct13_10-49-20_ip-172-31-95-165/events.out.tfevents.1697194161.ip-172-31-95-165.140238.0 ADDED
Binary file (10.2 kB). View file
 
research/09_fine_tuning_for_datacategories.ipynb CHANGED
@@ -62,93 +62,93 @@
62
  " </thead>\n",
63
  " <tbody>\n",
64
  " <tr>\n",
65
- " <th>22910</th>\n",
66
- " <td>Retirement income streams explanation</td>\n",
67
- " <td>Finance</td>\n",
68
- " <td>18</td>\n",
69
  " </tr>\n",
70
  " <tr>\n",
71
- " <th>3202</th>\n",
72
- " <td>Social justice strategies</td>\n",
73
- " <td>People_and_Society</td>\n",
74
- " <td>10</td>\n",
75
- " </tr>\n",
76
- " <tr>\n",
77
- " <th>23191</th>\n",
78
- " <td>Nanomaterials engineering</td>\n",
79
  " <td>Science</td>\n",
80
  " <td>2</td>\n",
81
  " </tr>\n",
82
  " <tr>\n",
83
- " <th>25025</th>\n",
84
- " <td>Acrylic nails</td>\n",
85
- " <td>Beauty_and_Fitness</td>\n",
86
- " <td>9</td>\n",
87
  " </tr>\n",
88
  " <tr>\n",
89
- " <th>14018</th>\n",
90
- " <td>Substance abuse recovery strategies</td>\n",
91
- " <td>People_and_Society</td>\n",
92
- " <td>10</td>\n",
93
  " </tr>\n",
94
  " <tr>\n",
95
- " <th>30887</th>\n",
96
- " <td>Facebook privacy</td>\n",
97
- " <td>Online Communities</td>\n",
98
- " <td>8</td>\n",
99
  " </tr>\n",
100
  " <tr>\n",
101
- " <th>5716</th>\n",
102
- " <td>disability</td>\n",
103
- " <td>Sensitive Subjects</td>\n",
104
- " <td>23</td>\n",
105
  " </tr>\n",
106
  " <tr>\n",
107
- " <th>25854</th>\n",
108
- " <td>Zumba dance fitness</td>\n",
109
- " <td>Beauty_and_Fitness</td>\n",
110
- " <td>9</td>\n",
111
  " </tr>\n",
112
  " <tr>\n",
113
- " <th>25032</th>\n",
114
- " <td>Enjoy dick porn</td>\n",
115
- " <td>Adult</td>\n",
116
- " <td>6</td>\n",
117
  " </tr>\n",
118
  " <tr>\n",
119
- " <th>2008</th>\n",
120
- " <td>iPhone Face ID</td>\n",
121
- " <td>Computers_and_Electronics</td>\n",
122
- " <td>7</td>\n",
 
 
 
 
 
 
123
  " </tr>\n",
124
  " </tbody>\n",
125
  "</table>\n",
126
  "</div>"
127
  ],
128
  "text/plain": [
129
- " category label \\\n",
130
- "22910 Retirement income streams explanation Finance \n",
131
- "3202 Social justice strategies People_and_Society \n",
132
- "23191 Nanomaterials engineering Science \n",
133
- "25025 Acrylic nails Beauty_and_Fitness \n",
134
- "14018 Substance abuse recovery strategies People_and_Society \n",
135
- "30887 Facebook privacy Online Communities \n",
136
- "5716 disability Sensitive Subjects \n",
137
- "25854 Zumba dance fitness Beauty_and_Fitness \n",
138
- "25032 Enjoy dick porn Adult \n",
139
- "2008 iPhone Face ID Computers_and_Electronics \n",
140
  "\n",
141
- " label_id \n",
142
- "22910 18 \n",
143
- "3202 10 \n",
144
- "23191 2 \n",
145
- "25025 9 \n",
146
- "14018 10 \n",
147
- "30887 8 \n",
148
- "5716 23 \n",
149
- "25854 9 \n",
150
- "25032 6 \n",
151
- "2008 7 "
152
  ]
153
  },
154
  "execution_count": 3,
@@ -273,7 +273,7 @@
273
  "name": "stderr",
274
  "output_type": "stream",
275
  "text": [
276
- "/tmp/ipykernel_129502/984288843.py:1: SettingWithCopyWarning: \n",
277
  "A value is trying to be set on a copy of a slice from a DataFrame\n",
278
  "\n",
279
  "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
@@ -307,71 +307,71 @@
307
  " </thead>\n",
308
  " <tbody>\n",
309
  " <tr>\n",
310
- " <th>7152</th>\n",
311
- " <td>Social justice strategies</td>\n",
312
- " <td>10</td>\n",
313
  " </tr>\n",
314
  " <tr>\n",
315
- " <th>31780</th>\n",
316
- " <td>LinkedIn job search for food writing organizat...</td>\n",
317
- " <td>21</td>\n",
318
  " </tr>\n",
319
  " <tr>\n",
320
- " <th>20244</th>\n",
321
- " <td>Nobel Prize in Literature news</td>\n",
322
  " <td>1</td>\n",
323
  " </tr>\n",
324
  " <tr>\n",
325
- " <th>16634</th>\n",
326
- " <td>Job search for people with public health impai...</td>\n",
327
- " <td>21</td>\n",
328
  " </tr>\n",
329
  " <tr>\n",
330
- " <th>8603</th>\n",
331
- " <td>Car insurance for luxury cars</td>\n",
332
- " <td>3</td>\n",
333
  " </tr>\n",
334
  " <tr>\n",
335
- " <th>30042</th>\n",
336
- " <td>Personal development and self-help techniques ...</td>\n",
337
- " <td>8</td>\n",
338
  " </tr>\n",
339
  " <tr>\n",
340
- " <th>9345</th>\n",
341
- " <td>Smartwatch features</td>\n",
342
- " <td>7</td>\n",
343
  " </tr>\n",
344
  " <tr>\n",
345
- " <th>19660</th>\n",
346
- " <td>Travel deals for beachfront chalets</td>\n",
347
- " <td>14</td>\n",
348
  " </tr>\n",
349
  " <tr>\n",
350
- " <th>27349</th>\n",
351
- " <td>Choosing energy-efficient HVAC</td>\n",
352
- " <td>20</td>\n",
353
  " </tr>\n",
354
  " <tr>\n",
355
- " <th>12660</th>\n",
356
- " <td>Advocacy for native land rights</td>\n",
357
- " <td>10</td>\n",
358
  " </tr>\n",
359
  " </tbody>\n",
360
  "</table>\n",
361
  "</div>"
362
  ],
363
  "text/plain": [
364
- " text label\n",
365
- "7152 Social justice strategies 10\n",
366
- "31780 LinkedIn job search for food writing organizat... 21\n",
367
- "20244 Nobel Prize in Literature news 1\n",
368
- "16634 Job search for people with public health impai... 21\n",
369
- "8603 Car insurance for luxury cars 3\n",
370
- "30042 Personal development and self-help techniques ... 8\n",
371
- "9345 Smartwatch features 7\n",
372
- "19660 Travel deals for beachfront chalets 14\n",
373
- "27349 Choosing energy-efficient HVAC 20\n",
374
- "12660 Advocacy for native land rights 10"
375
  ]
376
  },
377
  "execution_count": 6,
@@ -483,8 +483,15 @@
483
  "name": "stderr",
484
  "output_type": "stream",
485
  "text": [
486
- "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 26889/26889 [00:00<00:00, 33262.24 examples/s]\n",
487
- "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6723/6723 [00:00<00:00, 42992.17 examples/s]\n"
 
 
 
 
 
 
 
488
  ]
489
  }
490
  ],
@@ -501,9 +508,9 @@
501
  "name": "stderr",
502
  "output_type": "stream",
503
  "text": [
504
- "2023-10-12 11:59:02.472987: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
505
  "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
506
- "2023-10-12 11:59:03.211664: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
507
  ]
508
  }
509
  ],
@@ -686,7 +693,7 @@
686
  " <div>\n",
687
  " \n",
688
  " <progress value='3362' max='3362' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
689
- " [3362/3362 01:46, Epoch 2/2]\n",
690
  " </div>\n",
691
  " <table border=\"1\" class=\"dataframe\">\n",
692
  " <thead>\n",
@@ -700,15 +707,15 @@
700
  " <tbody>\n",
701
  " <tr>\n",
702
  " <td>1</td>\n",
703
- " <td>0.104100</td>\n",
704
- " <td>0.102964</td>\n",
705
- " <td>0.972185</td>\n",
706
  " </tr>\n",
707
  " <tr>\n",
708
  " <td>2</td>\n",
709
- " <td>0.077300</td>\n",
710
- " <td>0.110562</td>\n",
711
- " <td>0.970846</td>\n",
712
  " </tr>\n",
713
  " </tbody>\n",
714
  "</table><p>"
@@ -723,7 +730,7 @@
723
  {
724
  "data": {
725
  "text/plain": [
726
- "TrainOutput(global_step=3362, training_loss=0.08810693149691462, metrics={'train_runtime': 106.8757, 'train_samples_per_second': 503.183, 'train_steps_per_second': 31.457, 'total_flos': 216609059710134.0, 'train_loss': 0.08810693149691462, 'epoch': 2.0})"
727
  ]
728
  },
729
  "execution_count": 19,
 
62
  " </thead>\n",
63
  " <tbody>\n",
64
  " <tr>\n",
65
+ " <th>3982</th>\n",
66
+ " <td>Citation context relevance assessment platforms</td>\n",
67
+ " <td>Reference</td>\n",
68
+ " <td>12</td>\n",
69
  " </tr>\n",
70
  " <tr>\n",
71
+ " <th>24651</th>\n",
72
+ " <td>Geology fieldwork</td>\n",
 
 
 
 
 
 
73
  " <td>Science</td>\n",
74
  " <td>2</td>\n",
75
  " </tr>\n",
76
  " <tr>\n",
77
+ " <th>28113</th>\n",
78
+ " <td>Password management for individuals</td>\n",
79
+ " <td>Computers_and_Electronics</td>\n",
80
+ " <td>7</td>\n",
81
  " </tr>\n",
82
  " <tr>\n",
83
+ " <th>10999</th>\n",
84
+ " <td>Real estate market statistics</td>\n",
85
+ " <td>Real Estate</td>\n",
86
+ " <td>24</td>\n",
87
  " </tr>\n",
88
  " <tr>\n",
89
+ " <th>17096</th>\n",
90
+ " <td>Running gear for women</td>\n",
91
+ " <td>Beauty_and_Fitness</td>\n",
92
+ " <td>9</td>\n",
93
  " </tr>\n",
94
  " <tr>\n",
95
+ " <th>2374</th>\n",
96
+ " <td>Sports Team Fan Pride</td>\n",
97
+ " <td>Sports</td>\n",
98
+ " <td>26</td>\n",
99
  " </tr>\n",
100
  " <tr>\n",
101
+ " <th>9932</th>\n",
102
+ " <td>Wine and food events</td>\n",
103
+ " <td>Food_and_Drink</td>\n",
104
+ " <td>15</td>\n",
105
  " </tr>\n",
106
  " <tr>\n",
107
+ " <th>2953</th>\n",
108
+ " <td>College admissions for aspiring dancers</td>\n",
109
+ " <td>Jobs_and_Education</td>\n",
110
+ " <td>21</td>\n",
111
  " </tr>\n",
112
  " <tr>\n",
113
+ " <th>25038</th>\n",
114
+ " <td>Software development best practices forums</td>\n",
115
+ " <td>Online Communities</td>\n",
116
+ " <td>8</td>\n",
117
+ " </tr>\n",
118
+ " <tr>\n",
119
+ " <th>29703</th>\n",
120
+ " <td>Quantum physics theories</td>\n",
121
+ " <td>Science</td>\n",
122
+ " <td>2</td>\n",
123
  " </tr>\n",
124
  " </tbody>\n",
125
  "</table>\n",
126
  "</div>"
127
  ],
128
  "text/plain": [
129
+ " category \\\n",
130
+ "3982 Citation context relevance assessment platforms \n",
131
+ "24651 Geology fieldwork \n",
132
+ "28113 Password management for individuals \n",
133
+ "10999 Real estate market statistics \n",
134
+ "17096 Running gear for women \n",
135
+ "2374 Sports Team Fan Pride \n",
136
+ "9932 Wine and food events \n",
137
+ "2953 College admissions for aspiring dancers \n",
138
+ "25038 Software development best practices forums \n",
139
+ "29703 Quantum physics theories \n",
140
  "\n",
141
+ " label label_id \n",
142
+ "3982 Reference 12 \n",
143
+ "24651 Science 2 \n",
144
+ "28113 Computers_and_Electronics 7 \n",
145
+ "10999 Real Estate 24 \n",
146
+ "17096 Beauty_and_Fitness 9 \n",
147
+ "2374 Sports 26 \n",
148
+ "9932 Food_and_Drink 15 \n",
149
+ "2953 Jobs_and_Education 21 \n",
150
+ "25038 Online Communities 8 \n",
151
+ "29703 Science 2 "
152
  ]
153
  },
154
  "execution_count": 3,
 
273
  "name": "stderr",
274
  "output_type": "stream",
275
  "text": [
276
+ "/tmp/ipykernel_139501/984288843.py:1: SettingWithCopyWarning: \n",
277
  "A value is trying to be set on a copy of a slice from a DataFrame\n",
278
  "\n",
279
  "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
 
307
  " </thead>\n",
308
  " <tbody>\n",
309
  " <tr>\n",
310
+ " <th>2925</th>\n",
311
+ " <td>Kids' toy stores online</td>\n",
312
+ " <td>13</td>\n",
313
  " </tr>\n",
314
  " <tr>\n",
315
+ " <th>31108</th>\n",
316
+ " <td>Birdwatching apps for bird behavior</td>\n",
317
+ " <td>5</td>\n",
318
  " </tr>\n",
319
  " <tr>\n",
320
+ " <th>6817</th>\n",
321
+ " <td>Legal developments</td>\n",
322
  " <td>1</td>\n",
323
  " </tr>\n",
324
  " <tr>\n",
325
+ " <th>20037</th>\n",
326
+ " <td>Citation context relevance assessment tools</td>\n",
327
+ " <td>12</td>\n",
328
  " </tr>\n",
329
  " <tr>\n",
330
+ " <th>18928</th>\n",
331
+ " <td>Orchid care guide</td>\n",
332
+ " <td>20</td>\n",
333
  " </tr>\n",
334
  " <tr>\n",
335
+ " <th>33358</th>\n",
336
+ " <td>Scientific publications and journals</td>\n",
337
+ " <td>2</td>\n",
338
  " </tr>\n",
339
  " <tr>\n",
340
+ " <th>16499</th>\n",
341
+ " <td>Service dog etiquette</td>\n",
342
+ " <td>5</td>\n",
343
  " </tr>\n",
344
  " <tr>\n",
345
+ " <th>26484</th>\n",
346
+ " <td>Social media trends analysis</td>\n",
347
+ " <td>25</td>\n",
348
  " </tr>\n",
349
  " <tr>\n",
350
+ " <th>15543</th>\n",
351
+ " <td>Troubleshooting computer issues</td>\n",
352
+ " <td>7</td>\n",
353
  " </tr>\n",
354
  " <tr>\n",
355
+ " <th>15854</th>\n",
356
+ " <td>large</td>\n",
357
+ " <td>23</td>\n",
358
  " </tr>\n",
359
  " </tbody>\n",
360
  "</table>\n",
361
  "</div>"
362
  ],
363
  "text/plain": [
364
+ " text label\n",
365
+ "2925 Kids' toy stores online 13\n",
366
+ "31108 Birdwatching apps for bird behavior 5\n",
367
+ "6817 Legal developments 1\n",
368
+ "20037 Citation context relevance assessment tools 12\n",
369
+ "18928 Orchid care guide 20\n",
370
+ "33358 Scientific publications and journals 2\n",
371
+ "16499 Service dog etiquette 5\n",
372
+ "26484 Social media trends analysis 25\n",
373
+ "15543 Troubleshooting computer issues 7\n",
374
+ "15854 large 23"
375
  ]
376
  },
377
  "execution_count": 6,
 
483
  "name": "stderr",
484
  "output_type": "stream",
485
  "text": [
486
+ "Map: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 13000/26889 [00:00<00:00, 32226.42 examples/s]"
487
+ ]
488
+ },
489
+ {
490
+ "name": "stderr",
491
+ "output_type": "stream",
492
+ "text": [
493
+ "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 26889/26889 [00:00<00:00, 34388.34 examples/s]\n",
494
+ "Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6723/6723 [00:00<00:00, 41978.69 examples/s]\n"
495
  ]
496
  }
497
  ],
 
508
  "name": "stderr",
509
  "output_type": "stream",
510
  "text": [
511
+ "2023-10-13 10:29:49.212220: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
512
  "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
513
+ "2023-10-13 10:29:50.573292: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
514
  ]
515
  }
516
  ],
 
693
  " <div>\n",
694
  " \n",
695
  " <progress value='3362' max='3362' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
696
+ " [3362/3362 01:52, Epoch 2/2]\n",
697
  " </div>\n",
698
  " <table border=\"1\" class=\"dataframe\">\n",
699
  " <thead>\n",
 
707
  " <tbody>\n",
708
  " <tr>\n",
709
  " <td>1</td>\n",
710
+ " <td>0.102300</td>\n",
711
+ " <td>0.077652</td>\n",
712
+ " <td>0.975309</td>\n",
713
  " </tr>\n",
714
  " <tr>\n",
715
  " <td>2</td>\n",
716
+ " <td>0.083400</td>\n",
717
+ " <td>0.086291</td>\n",
718
+ " <td>0.974714</td>\n",
719
  " </tr>\n",
720
  " </tbody>\n",
721
  "</table><p>"
 
730
  {
731
  "data": {
732
  "text/plain": [
733
+ "TrainOutput(global_step=3362, training_loss=0.08880683540376008, metrics={'train_runtime': 113.5357, 'train_samples_per_second': 473.666, 'train_steps_per_second': 29.612, 'total_flos': 213673546900476.0, 'train_loss': 0.08880683540376008, 'epoch': 2.0})"
734
  ]
735
  },
736
  "execution_count": 19,
research/11_evaluation.ipynb CHANGED
@@ -13,7 +13,17 @@
13
  "cell_type": "code",
14
  "execution_count": 2,
15
  "metadata": {},
16
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
17
  "source": [
18
  "from utils.get_intent import get_top_intent"
19
  ]
@@ -26,11 +36,11 @@
26
  {
27
  "data": {
28
  "text/plain": [
29
- "[('Commercial', 0.969),\n",
30
- " ('Transactional', 0.673),\n",
31
- " ('Informational', 0.237),\n",
32
- " ('Navigational', 0.215),\n",
33
- " ('Local', 0.155)]"
34
  ]
35
  },
36
  "execution_count": 3,
@@ -50,11 +60,11 @@
50
  {
51
  "data": {
52
  "text/plain": [
53
- "[('Transactional', 0.987),\n",
54
- " ('Navigational', 0.317),\n",
55
- " ('Commercial', 0.27),\n",
56
- " ('Informational', 0.249),\n",
57
- " ('Local', 0.229)]"
58
  ]
59
  },
60
  "execution_count": 4,
@@ -74,11 +84,11 @@
74
  {
75
  "data": {
76
  "text/plain": [
77
- "[('Informational', 0.984),\n",
78
- " ('Local', 0.244),\n",
79
- " ('Commercial', 0.237),\n",
80
- " ('Transactional', 0.212),\n",
81
- " ('Navigational', 0.194)]"
82
  ]
83
  },
84
  "execution_count": 5,
@@ -98,11 +108,11 @@
98
  {
99
  "data": {
100
  "text/plain": [
101
- "[('Local', 0.988),\n",
102
- " ('Informational', 0.3),\n",
103
- " ('Commercial', 0.278),\n",
104
- " ('Navigational', 0.273),\n",
105
- " ('Transactional', 0.234)]"
106
  ]
107
  },
108
  "execution_count": 6,
@@ -122,11 +132,11 @@
122
  {
123
  "data": {
124
  "text/plain": [
125
- "[('Informational', 0.763),\n",
126
- " ('Navigational', 0.638),\n",
127
- " ('Transactional', 0.433),\n",
128
- " ('Commercial', 0.286),\n",
129
- " ('Local', 0.236)]"
130
  ]
131
  },
132
  "execution_count": 7,
@@ -146,11 +156,11 @@
146
  {
147
  "data": {
148
  "text/plain": [
149
- "[('Navigational', 0.861),\n",
150
- " ('Transactional', 0.725),\n",
151
- " ('Local', 0.422),\n",
152
- " ('Commercial', 0.287),\n",
153
- " ('Informational', 0.202)]"
154
  ]
155
  },
156
  "execution_count": 8,
@@ -170,11 +180,11 @@
170
  {
171
  "data": {
172
  "text/plain": [
173
- "[('Navigational', 0.983),\n",
174
- " ('Transactional', 0.27),\n",
175
- " ('Local', 0.23),\n",
176
- " ('Informational', 0.209),\n",
177
- " ('Commercial', 0.192)]"
178
  ]
179
  },
180
  "execution_count": 9,
@@ -194,11 +204,11 @@
194
  {
195
  "data": {
196
  "text/plain": [
197
- "[('Navigational', 0.983),\n",
198
  " ('Transactional', 0.256),\n",
199
- " ('Informational', 0.241),\n",
200
- " ('Local', 0.214),\n",
201
- " ('Commercial', 0.184)]"
202
  ]
203
  },
204
  "execution_count": 10,
@@ -218,11 +228,11 @@
218
  {
219
  "data": {
220
  "text/plain": [
221
- "[('Local', 0.988),\n",
222
- " ('Informational', 0.294),\n",
223
- " ('Navigational', 0.284),\n",
224
- " ('Commercial', 0.252),\n",
225
- " ('Transactional', 0.235)]"
226
  ]
227
  },
228
  "execution_count": 11,
@@ -242,11 +252,11 @@
242
  {
243
  "data": {
244
  "text/plain": [
245
- "[('Informational', 0.984),\n",
246
- " ('Local', 0.245),\n",
247
- " ('Commercial', 0.242),\n",
248
- " ('Transactional', 0.226),\n",
249
- " ('Navigational', 0.189)]"
250
  ]
251
  },
252
  "execution_count": 12,
@@ -258,6 +268,204 @@
258
  "get_top_intent(\"how to wear headphones\")"
259
  ]
260
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  {
262
  "cell_type": "code",
263
  "execution_count": null,
 
13
  "cell_type": "code",
14
  "execution_count": 2,
15
  "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
22
+ " from .autonotebook import tqdm as notebook_tqdm\n",
23
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
24
+ ]
25
+ }
26
+ ],
27
  "source": [
28
  "from utils.get_intent import get_top_intent"
29
  ]
 
36
  {
37
  "data": {
38
  "text/plain": [
39
+ "[('Commercial', 0.997),\n",
40
+ " ('Transactional', 0.199),\n",
41
+ " ('Local', 0.132),\n",
42
+ " ('Navigational', 0.099),\n",
43
+ " ('Informational', 0.088)]"
44
  ]
45
  },
46
  "execution_count": 3,
 
60
  {
61
  "data": {
62
  "text/plain": [
63
+ "[('Transactional', 0.996),\n",
64
+ " ('Commercial', 0.315),\n",
65
+ " ('Navigational', 0.149),\n",
66
+ " ('Local', 0.146),\n",
67
+ " ('Informational', 0.133)]"
68
  ]
69
  },
70
  "execution_count": 4,
 
84
  {
85
  "data": {
86
  "text/plain": [
87
+ "[('Informational', 0.999),\n",
88
+ " ('Transactional', 0.116),\n",
89
+ " ('Local', 0.094),\n",
90
+ " ('Commercial', 0.075),\n",
91
+ " ('Navigational', 0.075)]"
92
  ]
93
  },
94
  "execution_count": 5,
 
108
  {
109
  "data": {
110
  "text/plain": [
111
+ "[('Local', 0.997),\n",
112
+ " ('Commercial', 0.134),\n",
113
+ " ('Informational', 0.122),\n",
114
+ " ('Navigational', 0.121),\n",
115
+ " ('Transactional', 0.12)]"
116
  ]
117
  },
118
  "execution_count": 6,
 
132
  {
133
  "data": {
134
  "text/plain": [
135
+ "[('Informational', 0.892),\n",
136
+ " ('Transactional', 0.685),\n",
137
+ " ('Navigational', 0.533),\n",
138
+ " ('Commercial', 0.123),\n",
139
+ " ('Local', 0.072)]"
140
  ]
141
  },
142
  "execution_count": 7,
 
156
  {
157
  "data": {
158
  "text/plain": [
159
+ "[('Informational', 0.993),\n",
160
+ " ('Commercial', 0.183),\n",
161
+ " ('Transactional', 0.173),\n",
162
+ " ('Local', 0.123),\n",
163
+ " ('Navigational', 0.082)]"
164
  ]
165
  },
166
  "execution_count": 8,
 
180
  {
181
  "data": {
182
  "text/plain": [
183
+ "[('Navigational', 0.998),\n",
184
+ " ('Transactional', 0.271),\n",
185
+ " ('Local', 0.164),\n",
186
+ " ('Commercial', 0.134),\n",
187
+ " ('Informational', 0.129)]"
188
  ]
189
  },
190
  "execution_count": 9,
 
204
  {
205
  "data": {
206
  "text/plain": [
207
+ "[('Navigational', 0.998),\n",
208
  " ('Transactional', 0.256),\n",
209
+ " ('Local', 0.171),\n",
210
+ " ('Informational', 0.151),\n",
211
+ " ('Commercial', 0.127)]"
212
  ]
213
  },
214
  "execution_count": 10,
 
228
  {
229
  "data": {
230
  "text/plain": [
231
+ "[('Local', 0.997),\n",
232
+ " ('Commercial', 0.136),\n",
233
+ " ('Transactional', 0.124),\n",
234
+ " ('Informational', 0.119),\n",
235
+ " ('Navigational', 0.118)]"
236
  ]
237
  },
238
  "execution_count": 11,
 
252
  {
253
  "data": {
254
  "text/plain": [
255
+ "[('Informational', 0.999),\n",
256
+ " ('Transactional', 0.131),\n",
257
+ " ('Local', 0.09),\n",
258
+ " ('Commercial', 0.072),\n",
259
+ " ('Navigational', 0.069)]"
260
  ]
261
  },
262
  "execution_count": 12,
 
268
  "get_top_intent(\"how to wear headphones\")"
269
  ]
270
  },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 13,
274
+ "metadata": {},
275
+ "outputs": [
276
+ {
277
+ "data": {
278
+ "text/plain": [
279
+ "[('Navigational', 0.997),\n",
280
+ " ('Transactional', 0.452),\n",
281
+ " ('Local', 0.127),\n",
282
+ " ('Informational', 0.126),\n",
283
+ " ('Commercial', 0.12)]"
284
+ ]
285
+ },
286
+ "execution_count": 13,
287
+ "metadata": {},
288
+ "output_type": "execute_result"
289
+ }
290
+ ],
291
+ "source": [
292
+ "get_top_intent(\"receiptify\")"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": 14,
298
+ "metadata": {},
299
+ "outputs": [
300
+ {
301
+ "data": {
302
+ "text/plain": [
303
+ "[('Transactional', 0.995),\n",
304
+ " ('Commercial', 0.27),\n",
305
+ " ('Informational', 0.181),\n",
306
+ " ('Local', 0.162),\n",
307
+ " ('Navigational', 0.133)]"
308
+ ]
309
+ },
310
+ "execution_count": 14,
311
+ "metadata": {},
312
+ "output_type": "execute_result"
313
+ }
314
+ ],
315
+ "source": [
316
+ "get_top_intent(\"cat ear headphones\")"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 15,
322
+ "metadata": {},
323
+ "outputs": [
324
+ {
325
+ "data": {
326
+ "text/plain": [
327
+ "[('Transactional', 0.977),\n",
328
+ " ('Navigational', 0.808),\n",
329
+ " ('Commercial', 0.254),\n",
330
+ " ('Informational', 0.107),\n",
331
+ " ('Local', 0.081)]"
332
+ ]
333
+ },
334
+ "execution_count": 15,
335
+ "metadata": {},
336
+ "output_type": "execute_result"
337
+ }
338
+ ],
339
+ "source": [
340
+ "get_top_intent(\"sony headphones guide\")"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": 16,
346
+ "metadata": {},
347
+ "outputs": [
348
+ {
349
+ "data": {
350
+ "text/plain": [
351
+ "[('Navigational', 0.949),\n",
352
+ " ('Transactional', 0.89),\n",
353
+ " ('Informational', 0.328),\n",
354
+ " ('Commercial', 0.113),\n",
355
+ " ('Local', 0.069)]"
356
+ ]
357
+ },
358
+ "execution_count": 16,
359
+ "metadata": {},
360
+ "output_type": "execute_result"
361
+ }
362
+ ],
363
+ "source": [
364
+ "get_top_intent(\"wolf cut\") # informational"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "code",
369
+ "execution_count": 17,
370
+ "metadata": {},
371
+ "outputs": [
372
+ {
373
+ "data": {
374
+ "text/plain": [
375
+ "[('Transactional', 0.996),\n",
376
+ " ('Commercial', 0.217),\n",
377
+ " ('Informational', 0.199),\n",
378
+ " ('Navigational', 0.17),\n",
379
+ " ('Local', 0.136)]"
380
+ ]
381
+ },
382
+ "execution_count": 17,
383
+ "metadata": {},
384
+ "output_type": "execute_result"
385
+ }
386
+ ],
387
+ "source": [
388
+ "get_top_intent(\"help plumbing supply\") # informational"
389
+ ]
390
+ },
391
+ {
392
+ "cell_type": "code",
393
+ "execution_count": 18,
394
+ "metadata": {},
395
+ "outputs": [
396
+ {
397
+ "data": {
398
+ "text/plain": [
399
+ "[('Informational', 0.969),\n",
400
+ " ('Commercial', 0.677),\n",
401
+ " ('Transactional', 0.276),\n",
402
+ " ('Local', 0.071),\n",
403
+ " ('Navigational', 0.035)]"
404
+ ]
405
+ },
406
+ "execution_count": 18,
407
+ "metadata": {},
408
+ "output_type": "execute_result"
409
+ }
410
+ ],
411
+ "source": [
412
+ "get_top_intent('yoga purpose') # informational"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "metadata": {},
419
+ "outputs": [],
420
+ "source": []
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": null,
425
+ "metadata": {},
426
+ "outputs": [],
427
+ "source": []
428
+ },
429
+ {
430
+ "cell_type": "code",
431
+ "execution_count": 1,
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "import os; os.chdir('..')"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": 2,
441
+ "metadata": {},
442
+ "outputs": [],
443
+ "source": [
444
+ "from utils.get_category import get_top_labels"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": 3,
450
+ "metadata": {},
451
+ "outputs": [
452
+ {
453
+ "data": {
454
+ "text/plain": [
455
+ "[('Computers_and_Electronics', 1.0), ('Shopping', 0.182)]"
456
+ ]
457
+ },
458
+ "execution_count": 3,
459
+ "metadata": {},
460
+ "output_type": "execute_result"
461
+ }
462
+ ],
463
+ "source": [
464
+ "get_top_labels(\n",
465
+ " \"best cat ear headphones\"\n",
466
+ ")"
467
+ ]
468
+ },
469
  {
470
  "cell_type": "code",
471
  "execution_count": null,
research/11_intent_classification_using_distilbert.ipynb CHANGED
@@ -20,7 +20,7 @@
20
  },
21
  {
22
  "cell_type": "code",
23
- "execution_count": 3,
24
  "metadata": {},
25
  "outputs": [
26
  {
@@ -87,7 +87,7 @@
87
  "4 tech crunch Navigational"
88
  ]
89
  },
90
- "execution_count": 3,
91
  "metadata": {},
92
  "output_type": "execute_result"
93
  }
@@ -99,7 +99,59 @@
99
  },
100
  {
101
  "cell_type": "code",
102
- "execution_count": 4,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  "metadata": {},
104
  "outputs": [],
105
  "source": [
@@ -108,7 +160,7 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 5,
112
  "metadata": {},
113
  "outputs": [],
114
  "source": [
@@ -121,7 +173,7 @@
121
  },
122
  {
123
  "cell_type": "code",
124
- "execution_count": 6,
125
  "metadata": {},
126
  "outputs": [
127
  {
@@ -134,7 +186,7 @@
134
  " 4: 'Transactional'}"
135
  ]
136
  },
137
- "execution_count": 6,
138
  "metadata": {},
139
  "output_type": "execute_result"
140
  }
@@ -145,7 +197,7 @@
145
  },
146
  {
147
  "cell_type": "code",
148
- "execution_count": 7,
149
  "metadata": {},
150
  "outputs": [
151
  {
@@ -158,7 +210,7 @@
158
  " 'Transactional': 4}"
159
  ]
160
  },
161
- "execution_count": 7,
162
  "metadata": {},
163
  "output_type": "execute_result"
164
  }
@@ -169,7 +221,7 @@
169
  },
170
  {
171
  "cell_type": "code",
172
- "execution_count": 8,
173
  "metadata": {},
174
  "outputs": [],
175
  "source": [
@@ -179,7 +231,7 @@
179
  },
180
  {
181
  "cell_type": "code",
182
- "execution_count": 9,
183
  "metadata": {},
184
  "outputs": [
185
  {
@@ -246,58 +298,58 @@
246
  " <td>...</td>\n",
247
  " </tr>\n",
248
  " <tr>\n",
249
- " <th>1066</th>\n",
250
- " <td>How to make a paper flower?</td>\n",
251
  " <td>Informational</td>\n",
252
  " <td>1</td>\n",
253
  " </tr>\n",
254
  " <tr>\n",
255
- " <th>1067</th>\n",
256
- " <td>Why do some animals camouflage?</td>\n",
257
  " <td>Informational</td>\n",
258
  " <td>1</td>\n",
259
  " </tr>\n",
260
  " <tr>\n",
261
- " <th>1068</th>\n",
262
- " <td>What is the history of ancient civilizations?</td>\n",
263
  " <td>Informational</td>\n",
264
  " <td>1</td>\n",
265
  " </tr>\n",
266
  " <tr>\n",
267
- " <th>1069</th>\n",
268
- " <td>How to make a simple machine?</td>\n",
269
  " <td>Informational</td>\n",
270
  " <td>1</td>\n",
271
  " </tr>\n",
272
  " <tr>\n",
273
- " <th>1070</th>\n",
274
- " <td>Why do we see the phases of the moon?</td>\n",
275
  " <td>Informational</td>\n",
276
  " <td>1</td>\n",
277
  " </tr>\n",
278
  " </tbody>\n",
279
  "</table>\n",
280
- "<p>1071 rows Γ— 3 columns</p>\n",
281
  "</div>"
282
  ],
283
  "text/plain": [
284
- " keyword intent id\n",
285
- "0 citalopram vs prozac Commercial 0\n",
286
- "1 who is the oldest football player Informational 1\n",
287
- "2 t mobile town east Navigational 2\n",
288
- "3 starbucks Navigational 2\n",
289
- "4 tech crunch Navigational 2\n",
290
- "... ... ... ..\n",
291
- "1066 How to make a paper flower? Informational 1\n",
292
- "1067 Why do some animals camouflage? Informational 1\n",
293
- "1068 What is the history of ancient civilizations? Informational 1\n",
294
- "1069 How to make a simple machine? Informational 1\n",
295
- "1070 Why do we see the phases of the moon? Informational 1\n",
296
  "\n",
297
- "[1071 rows x 3 columns]"
298
  ]
299
  },
300
- "execution_count": 9,
301
  "metadata": {},
302
  "output_type": "execute_result"
303
  }
@@ -309,7 +361,7 @@
309
  },
310
  {
311
  "cell_type": "code",
312
- "execution_count": 10,
313
  "metadata": {},
314
  "outputs": [
315
  {
@@ -369,53 +421,53 @@
369
  " <td>...</td>\n",
370
  " </tr>\n",
371
  " <tr>\n",
372
- " <th>1066</th>\n",
373
- " <td>How to make a paper flower?</td>\n",
374
  " <td>1</td>\n",
375
  " </tr>\n",
376
  " <tr>\n",
377
- " <th>1067</th>\n",
378
- " <td>Why do some animals camouflage?</td>\n",
379
  " <td>1</td>\n",
380
  " </tr>\n",
381
  " <tr>\n",
382
- " <th>1068</th>\n",
383
- " <td>What is the history of ancient civilizations?</td>\n",
384
  " <td>1</td>\n",
385
  " </tr>\n",
386
  " <tr>\n",
387
- " <th>1069</th>\n",
388
- " <td>How to make a simple machine?</td>\n",
389
  " <td>1</td>\n",
390
  " </tr>\n",
391
  " <tr>\n",
392
- " <th>1070</th>\n",
393
- " <td>Why do we see the phases of the moon?</td>\n",
394
  " <td>1</td>\n",
395
  " </tr>\n",
396
  " </tbody>\n",
397
  "</table>\n",
398
- "<p>1071 rows × 2 columns</p>\n",
399
  "</div>"
400
  ],
401
  "text/plain": [
402
- " keyword id\n",
403
- "0 citalopram vs prozac 0\n",
404
- "1 who is the oldest football player 1\n",
405
- "2 t mobile town east 2\n",
406
- "3 starbucks 2\n",
407
- "4 tech crunch 2\n",
408
- "... ... ..\n",
409
- "1066 How to make a paper flower? 1\n",
410
- "1067 Why do some animals camouflage? 1\n",
411
- "1068 What is the history of ancient civilizations? 1\n",
412
- "1069 How to make a simple machine? 1\n",
413
- "1070 Why do we see the phases of the moon? 1\n",
414
  "\n",
415
- "[1071 rows x 2 columns]"
416
  ]
417
  },
418
- "execution_count": 10,
419
  "metadata": {},
420
  "output_type": "execute_result"
421
  }
@@ -427,7 +479,7 @@
427
  },
428
  {
429
  "cell_type": "code",
430
- "execution_count": 11,
431
  "metadata": {},
432
  "outputs": [
433
  {
@@ -445,14 +497,14 @@
445
  },
446
  {
447
  "cell_type": "code",
448
- "execution_count": 12,
449
  "metadata": {},
450
  "outputs": [
451
  {
452
  "name": "stderr",
453
  "output_type": "stream",
454
  "text": [
455
- "/tmp/ipykernel_138160/1635098052.py:1: SettingWithCopyWarning: \n",
456
  "A value is trying to be set on a copy of a slice from a DataFrame\n",
457
  "\n",
458
  "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
@@ -486,74 +538,74 @@
486
  " </thead>\n",
487
  " <tbody>\n",
488
  " <tr>\n",
489
- " <th>706</th>\n",
490
- " <td>Purchase DJ equipment</td>\n",
491
  " <td>4</td>\n",
492
  " </tr>\n",
493
  " <tr>\n",
494
- " <th>24</th>\n",
495
- " <td>best headphones quora</td>\n",
496
- " <td>2</td>\n",
497
  " </tr>\n",
498
  " <tr>\n",
499
- " <th>727</th>\n",
500
- " <td>Purchase fitness tracker</td>\n",
501
  " <td>4</td>\n",
502
  " </tr>\n",
503
  " <tr>\n",
504
- " <th>17</th>\n",
505
- " <td>facebook</td>\n",
506
- " <td>2</td>\n",
507
  " </tr>\n",
508
  " <tr>\n",
509
- " <th>808</th>\n",
510
- " <td>Outdoor activities in Lake Tahoe</td>\n",
511
- " <td>3</td>\n",
512
  " </tr>\n",
513
  " <tr>\n",
514
- " <th>946</th>\n",
515
- " <td>Wine bars in Napa Valley</td>\n",
516
- " <td>3</td>\n",
517
  " </tr>\n",
518
  " <tr>\n",
519
- " <th>944</th>\n",
520
- " <td>Art installations in Chicago</td>\n",
521
- " <td>3</td>\n",
522
  " </tr>\n",
523
  " <tr>\n",
524
- " <th>899</th>\n",
525
- " <td>Snowboarding parks in Utah</td>\n",
526
- " <td>3</td>\n",
527
  " </tr>\n",
528
  " <tr>\n",
529
- " <th>36</th>\n",
530
- " <td>Mission Immpossible</td>\n",
531
  " <td>1</td>\n",
532
  " </tr>\n",
533
  " <tr>\n",
534
- " <th>129</th>\n",
535
- " <td>Instagram</td>\n",
536
- " <td>2</td>\n",
537
  " </tr>\n",
538
  " </tbody>\n",
539
  "</table>\n",
540
  "</div>"
541
  ],
542
  "text/plain": [
543
- " text label\n",
544
- "706 Purchase DJ equipment 4\n",
545
- "24 best headphones quora 2\n",
546
- "727 Purchase fitness tracker 4\n",
547
- "17 facebook 2\n",
548
- "808 Outdoor activities in Lake Tahoe 3\n",
549
- "946 Wine bars in Napa Valley 3\n",
550
- "944 Art installations in Chicago 3\n",
551
- "899 Snowboarding parks in Utah 3\n",
552
- "36 Mission Immpossible 1\n",
553
- "129 Instagram 2"
554
  ]
555
  },
556
- "execution_count": 12,
557
  "metadata": {},
558
  "output_type": "execute_result"
559
  }
@@ -571,7 +623,7 @@
571
  },
572
  {
573
  "cell_type": "code",
574
- "execution_count": 13,
575
  "metadata": {},
576
  "outputs": [
577
  {
@@ -586,12 +638,12 @@
586
  "data": {
587
  "text/plain": [
588
  "Dataset({\n",
589
- " features: ['text', 'label'],\n",
590
- " num_rows: 1071\n",
591
  "})"
592
  ]
593
  },
594
- "execution_count": 13,
595
  "metadata": {},
596
  "output_type": "execute_result"
597
  }
@@ -603,7 +655,7 @@
603
  },
604
  {
605
  "cell_type": "code",
606
- "execution_count": 14,
607
  "metadata": {},
608
  "outputs": [
609
  {
@@ -611,17 +663,17 @@
611
  "text/plain": [
612
  "DatasetDict({\n",
613
  " train: Dataset({\n",
614
- " features: ['text', 'label'],\n",
615
- " num_rows: 856\n",
616
  " })\n",
617
  " test: Dataset({\n",
618
- " features: ['text', 'label'],\n",
619
- " num_rows: 215\n",
620
  " })\n",
621
  "})"
622
  ]
623
  },
624
- "execution_count": 14,
625
  "metadata": {},
626
  "output_type": "execute_result"
627
  }
@@ -633,7 +685,7 @@
633
  },
634
  {
635
  "cell_type": "code",
636
- "execution_count": 15,
637
  "metadata": {},
638
  "outputs": [],
639
  "source": [
@@ -644,7 +696,7 @@
644
  },
645
  {
646
  "cell_type": "code",
647
- "execution_count": 16,
648
  "metadata": {},
649
  "outputs": [],
650
  "source": [
@@ -654,15 +706,15 @@
654
  },
655
  {
656
  "cell_type": "code",
657
- "execution_count": 17,
658
  "metadata": {},
659
  "outputs": [
660
  {
661
  "name": "stderr",
662
  "output_type": "stream",
663
  "text": [
664
- "Map: 100%|██████████| 856/856 [00:00<00:00, 18779.12 examples/s]\n",
665
- "Map: 100%|██████████| 215/215 [00:00<00:00, 27520.84 examples/s]\n"
666
  ]
667
  }
668
  ],
@@ -672,16 +724,16 @@
672
  },
673
  {
674
  "cell_type": "code",
675
- "execution_count": 18,
676
  "metadata": {},
677
  "outputs": [
678
  {
679
  "name": "stderr",
680
  "output_type": "stream",
681
  "text": [
682
- "2023-10-13 09:10:00.122326: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
683
  "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
684
- "2023-10-13 09:10:01.611782: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
685
  ]
686
  }
687
  ],
@@ -700,7 +752,7 @@
700
  },
701
  {
702
  "cell_type": "code",
703
- "execution_count": 19,
704
  "metadata": {},
705
  "outputs": [],
706
  "source": [
@@ -711,7 +763,7 @@
711
  },
712
  {
713
  "cell_type": "code",
714
- "execution_count": 20,
715
  "metadata": {},
716
  "outputs": [],
717
  "source": [
@@ -726,14 +778,14 @@
726
  },
727
  {
728
  "cell_type": "code",
729
- "execution_count": 21,
730
  "metadata": {},
731
  "outputs": [
732
  {
733
  "name": "stderr",
734
  "output_type": "stream",
735
  "text": [
736
- "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']\n",
737
  "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
738
  ]
739
  }
@@ -748,7 +800,7 @@
748
  },
749
  {
750
  "cell_type": "code",
751
- "execution_count": 22,
752
  "metadata": {},
753
  "outputs": [
754
  {
@@ -764,8 +816,8 @@
764
  "\n",
765
  " <div>\n",
766
  " \n",
767
- " <progress value='324' max='324' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
768
- " [324/324 00:39, Epoch 6/6]\n",
769
  " </div>\n",
770
  " <table border=\"1\" class=\"dataframe\">\n",
771
  " <thead>\n",
@@ -780,38 +832,98 @@
780
  " <tr>\n",
781
  " <td>1</td>\n",
782
  " <td>No log</td>\n",
783
- " <td>0.467693</td>\n",
784
- " <td>0.948837</td>\n",
785
  " </tr>\n",
786
  " <tr>\n",
787
  " <td>2</td>\n",
788
  " <td>No log</td>\n",
789
- " <td>0.204288</td>\n",
790
- " <td>0.953488</td>\n",
791
  " </tr>\n",
792
  " <tr>\n",
793
  " <td>3</td>\n",
794
  " <td>No log</td>\n",
795
- " <td>0.164018</td>\n",
796
- " <td>0.967442</td>\n",
797
  " </tr>\n",
798
  " <tr>\n",
799
  " <td>4</td>\n",
800
  " <td>No log</td>\n",
801
- " <td>0.164968</td>\n",
802
- " <td>0.967442</td>\n",
803
  " </tr>\n",
804
  " <tr>\n",
805
  " <td>5</td>\n",
806
  " <td>No log</td>\n",
807
- " <td>0.163977</td>\n",
808
- " <td>0.967442</td>\n",
809
  " </tr>\n",
810
  " <tr>\n",
811
  " <td>6</td>\n",
812
  " <td>No log</td>\n",
813
- " <td>0.165533</td>\n",
814
- " <td>0.967442</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
815
  " </tr>\n",
816
  " </tbody>\n",
817
  "</table><p>"
@@ -826,10 +938,10 @@
826
  {
827
  "data": {
828
  "text/plain": [
829
- "TrainOutput(global_step=324, training_loss=0.2842947171058184, metrics={'train_runtime': 40.8212, 'train_samples_per_second': 125.817, 'train_steps_per_second': 7.937, 'total_flos': 13032177536640.0, 'train_loss': 0.2842947171058184, 'epoch': 6.0})"
830
  ]
831
  },
832
- "execution_count": 22,
833
  "metadata": {},
834
  "output_type": "execute_result"
835
  }
@@ -840,7 +952,7 @@
840
  " learning_rate=2e-5,\n",
841
  " per_device_train_batch_size=16,\n",
842
  " per_device_eval_batch_size=16,\n",
843
- " num_train_epochs=6,\n",
844
  " weight_decay=0.01,\n",
845
  " evaluation_strategy=\"epoch\",\n",
846
  " save_strategy=\"epoch\",\n",
 
20
  },
21
  {
22
  "cell_type": "code",
23
+ "execution_count": 10,
24
  "metadata": {},
25
  "outputs": [
26
  {
 
87
  "4 tech crunch Navigational"
88
  ]
89
  },
90
+ "execution_count": 10,
91
  "metadata": {},
92
  "output_type": "execute_result"
93
  }
 
99
  },
100
  {
101
  "cell_type": "code",
102
+ "execution_count": 16,
103
+ "metadata": {},
104
+ "outputs": [
105
+ {
106
+ "data": {
107
+ "text/plain": [
108
+ "False 1506\n",
109
+ "True 202\n",
110
+ "Name: count, dtype: int64"
111
+ ]
112
+ },
113
+ "execution_count": 16,
114
+ "metadata": {},
115
+ "output_type": "execute_result"
116
+ }
117
+ ],
118
+ "source": [
119
+ "original_df.duplicated().value_counts()"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 17,
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "original_df.drop_duplicates(inplace=True)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 18,
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "data": {
138
+ "text/plain": [
139
+ "False 1506\n",
140
+ "Name: count, dtype: int64"
141
+ ]
142
+ },
143
+ "execution_count": 18,
144
+ "metadata": {},
145
+ "output_type": "execute_result"
146
+ }
147
+ ],
148
+ "source": [
149
+ "original_df.duplicated().value_counts()"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 19,
155
  "metadata": {},
156
  "outputs": [],
157
  "source": [
 
160
  },
161
  {
162
  "cell_type": "code",
163
+ "execution_count": 20,
164
  "metadata": {},
165
  "outputs": [],
166
  "source": [
 
173
  },
174
  {
175
  "cell_type": "code",
176
+ "execution_count": 21,
177
  "metadata": {},
178
  "outputs": [
179
  {
 
186
  " 4: 'Transactional'}"
187
  ]
188
  },
189
+ "execution_count": 21,
190
  "metadata": {},
191
  "output_type": "execute_result"
192
  }
 
197
  },
198
  {
199
  "cell_type": "code",
200
+ "execution_count": 22,
201
  "metadata": {},
202
  "outputs": [
203
  {
 
210
  " 'Transactional': 4}"
211
  ]
212
  },
213
+ "execution_count": 22,
214
  "metadata": {},
215
  "output_type": "execute_result"
216
  }
 
221
  },
222
  {
223
  "cell_type": "code",
224
+ "execution_count": 23,
225
  "metadata": {},
226
  "outputs": [],
227
  "source": [
 
231
  },
232
  {
233
  "cell_type": "code",
234
+ "execution_count": 24,
235
  "metadata": {},
236
  "outputs": [
237
  {
 
298
  " <td>...</td>\n",
299
  " </tr>\n",
300
  " <tr>\n",
301
+ " <th>1703</th>\n",
302
+ " <td>How to make homemade pet accessories from recy...</td>\n",
303
  " <td>Informational</td>\n",
304
  " <td>1</td>\n",
305
  " </tr>\n",
306
  " <tr>\n",
307
+ " <th>1704</th>\n",
308
+ " <td>Top 10 science fiction book series that take r...</td>\n",
309
  " <td>Informational</td>\n",
310
  " <td>1</td>\n",
311
  " </tr>\n",
312
  " <tr>\n",
313
+ " <th>1705</th>\n",
314
+ " <td>How to start a car restoration and customizati...</td>\n",
315
  " <td>Informational</td>\n",
316
  " <td>1</td>\n",
317
  " </tr>\n",
318
  " <tr>\n",
319
+ " <th>1706</th>\n",
320
+ " <td>Ancient Mesopotamian architecture and its infl...</td>\n",
321
  " <td>Informational</td>\n",
322
  " <td>1</td>\n",
323
  " </tr>\n",
324
  " <tr>\n",
325
+ " <th>1707</th>\n",
326
+ " <td>Benefits of a flexitarian diet for those seeki...</td>\n",
327
  " <td>Informational</td>\n",
328
  " <td>1</td>\n",
329
  " </tr>\n",
330
  " </tbody>\n",
331
  "</table>\n",
332
+ "<p>1506 rows × 3 columns</p>\n",
333
  "</div>"
334
  ],
335
  "text/plain": [
336
+ " keyword intent id\n",
337
+ "0 citalopram vs prozac Commercial 0\n",
338
+ "1 who is the oldest football player Informational 1\n",
339
+ "2 t mobile town east Navigational 2\n",
340
+ "3 starbucks Navigational 2\n",
341
+ "4 tech crunch Navigational 2\n",
342
+ "... ... ... ..\n",
343
+ "1703 How to make homemade pet accessories from recy... Informational 1\n",
344
+ "1704 Top 10 science fiction book series that take r... Informational 1\n",
345
+ "1705 How to start a car restoration and customizati... Informational 1\n",
346
+ "1706 Ancient Mesopotamian architecture and its infl... Informational 1\n",
347
+ "1707 Benefits of a flexitarian diet for those seeki... Informational 1\n",
348
  "\n",
349
+ "[1506 rows x 3 columns]"
350
  ]
351
  },
352
+ "execution_count": 24,
353
  "metadata": {},
354
  "output_type": "execute_result"
355
  }
 
361
  },
362
  {
363
  "cell_type": "code",
364
+ "execution_count": 25,
365
  "metadata": {},
366
  "outputs": [
367
  {
 
421
  " <td>...</td>\n",
422
  " </tr>\n",
423
  " <tr>\n",
424
+ " <th>1703</th>\n",
425
+ " <td>How to make homemade pet accessories from recy...</td>\n",
426
  " <td>1</td>\n",
427
  " </tr>\n",
428
  " <tr>\n",
429
+ " <th>1704</th>\n",
430
+ " <td>Top 10 science fiction book series that take r...</td>\n",
431
  " <td>1</td>\n",
432
  " </tr>\n",
433
  " <tr>\n",
434
+ " <th>1705</th>\n",
435
+ " <td>How to start a car restoration and customizati...</td>\n",
436
  " <td>1</td>\n",
437
  " </tr>\n",
438
  " <tr>\n",
439
+ " <th>1706</th>\n",
440
+ " <td>Ancient Mesopotamian architecture and its infl...</td>\n",
441
  " <td>1</td>\n",
442
  " </tr>\n",
443
  " <tr>\n",
444
+ " <th>1707</th>\n",
445
+ " <td>Benefits of a flexitarian diet for those seeki...</td>\n",
446
  " <td>1</td>\n",
447
  " </tr>\n",
448
  " </tbody>\n",
449
  "</table>\n",
450
+ "<p>1506 rows × 2 columns</p>\n",
451
  "</div>"
452
  ],
453
  "text/plain": [
454
+ " keyword id\n",
455
+ "0 citalopram vs prozac 0\n",
456
+ "1 who is the oldest football player 1\n",
457
+ "2 t mobile town east 2\n",
458
+ "3 starbucks 2\n",
459
+ "4 tech crunch 2\n",
460
+ "... ... ..\n",
461
+ "1703 How to make homemade pet accessories from recy... 1\n",
462
+ "1704 Top 10 science fiction book series that take r... 1\n",
463
+ "1705 How to start a car restoration and customizati... 1\n",
464
+ "1706 Ancient Mesopotamian architecture and its infl... 1\n",
465
+ "1707 Benefits of a flexitarian diet for those seeki... 1\n",
466
  "\n",
467
+ "[1506 rows x 2 columns]"
468
  ]
469
  },
470
+ "execution_count": 25,
471
  "metadata": {},
472
  "output_type": "execute_result"
473
  }
 
479
  },
480
  {
481
  "cell_type": "code",
482
+ "execution_count": 26,
483
  "metadata": {},
484
  "outputs": [
485
  {
 
497
  },
498
  {
499
  "cell_type": "code",
500
+ "execution_count": 27,
501
  "metadata": {},
502
  "outputs": [
503
  {
504
  "name": "stderr",
505
  "output_type": "stream",
506
  "text": [
507
+ "/tmp/ipykernel_140238/1635098052.py:1: SettingWithCopyWarning: \n",
508
  "A value is trying to be set on a copy of a slice from a DataFrame\n",
509
  "\n",
510
  "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
 
538
  " </thead>\n",
539
  " <tbody>\n",
540
  " <tr>\n",
541
+ " <th>26</th>\n",
542
+ " <td>Iphone 13 prices</td>\n",
543
  " <td>4</td>\n",
544
  " </tr>\n",
545
  " <tr>\n",
546
+ " <th>1604</th>\n",
547
+ " <td>Basics of string theory and its applications</td>\n",
548
+ " <td>1</td>\n",
549
  " </tr>\n",
550
  " <tr>\n",
551
+ " <th>622</th>\n",
552
+ " <td>Purchase air purifier</td>\n",
553
  " <td>4</td>\n",
554
  " </tr>\n",
555
  " <tr>\n",
556
+ " <th>841</th>\n",
557
+ " <td>Art studios in Asheville</td>\n",
558
+ " <td>3</td>\n",
559
  " </tr>\n",
560
  " <tr>\n",
561
+ " <th>1504</th>\n",
562
+ " <td>What is epigenetic inheritance?</td>\n",
563
+ " <td>1</td>\n",
564
  " </tr>\n",
565
  " <tr>\n",
566
+ " <th>311</th>\n",
567
+ " <td>Target Business login</td>\n",
568
+ " <td>2</td>\n",
569
  " </tr>\n",
570
  " <tr>\n",
571
+ " <th>61</th>\n",
572
+ " <td>How to get Spotify Premium</td>\n",
573
+ " <td>1</td>\n",
574
  " </tr>\n",
575
  " <tr>\n",
576
+ " <th>980</th>\n",
577
+ " <td>How to meditate?</td>\n",
578
+ " <td>1</td>\n",
579
  " </tr>\n",
580
  " <tr>\n",
581
+ " <th>1428</th>\n",
582
+ " <td>Basics of black holes</td>\n",
583
  " <td>1</td>\n",
584
  " </tr>\n",
585
  " <tr>\n",
586
+ " <th>1266</th>\n",
587
+ " <td>Ancient Chinese dynasties</td>\n",
588
+ " <td>1</td>\n",
589
  " </tr>\n",
590
  " </tbody>\n",
591
  "</table>\n",
592
  "</div>"
593
  ],
594
  "text/plain": [
595
+ " text label\n",
596
+ "26 Iphone 13 prices 4\n",
597
+ "1604 Basics of string theory and its applications 1\n",
598
+ "622 Purchase air purifier 4\n",
599
+ "841 Art studios in Asheville 3\n",
600
+ "1504 What is epigenetic inheritance? 1\n",
601
+ "311 Target Business login 2\n",
602
+ "61 How to get Spotify Premium 1\n",
603
+ "980 How to meditate? 1\n",
604
+ "1428 Basics of black holes 1\n",
605
+ "1266 Ancient Chinese dynasties 1"
606
  ]
607
  },
608
+ "execution_count": 27,
609
  "metadata": {},
610
  "output_type": "execute_result"
611
  }
 
623
  },
624
  {
625
  "cell_type": "code",
626
+ "execution_count": 28,
627
  "metadata": {},
628
  "outputs": [
629
  {
 
638
  "data": {
639
  "text/plain": [
640
  "Dataset({\n",
641
+ " features: ['text', 'label', '__index_level_0__'],\n",
642
+ " num_rows: 1506\n",
643
  "})"
644
  ]
645
  },
646
+ "execution_count": 28,
647
  "metadata": {},
648
  "output_type": "execute_result"
649
  }
 
655
  },
656
  {
657
  "cell_type": "code",
658
+ "execution_count": 29,
659
  "metadata": {},
660
  "outputs": [
661
  {
 
663
  "text/plain": [
664
  "DatasetDict({\n",
665
  " train: Dataset({\n",
666
+ " features: ['text', 'label', '__index_level_0__'],\n",
667
+ " num_rows: 1204\n",
668
  " })\n",
669
  " test: Dataset({\n",
670
+ " features: ['text', 'label', '__index_level_0__'],\n",
671
+ " num_rows: 302\n",
672
  " })\n",
673
  "})"
674
  ]
675
  },
676
+ "execution_count": 29,
677
  "metadata": {},
678
  "output_type": "execute_result"
679
  }
 
685
  },
686
  {
687
  "cell_type": "code",
688
+ "execution_count": 30,
689
  "metadata": {},
690
  "outputs": [],
691
  "source": [
 
696
  },
697
  {
698
  "cell_type": "code",
699
+ "execution_count": 31,
700
  "metadata": {},
701
  "outputs": [],
702
  "source": [
 
706
  },
707
  {
708
  "cell_type": "code",
709
+ "execution_count": 32,
710
  "metadata": {},
711
  "outputs": [
712
  {
713
  "name": "stderr",
714
  "output_type": "stream",
715
  "text": [
716
+ "Map: 100%|██████████| 1204/1204 [00:00<00:00, 14009.91 examples/s]\n",
717
+ "Map: 100%|██████████| 302/302 [00:00<00:00, 24935.62 examples/s]\n"
718
  ]
719
  }
720
  ],
 
724
  },
725
  {
726
  "cell_type": "code",
727
+ "execution_count": 33,
728
  "metadata": {},
729
  "outputs": [
730
  {
731
  "name": "stderr",
732
  "output_type": "stream",
733
  "text": [
734
+ "2023-10-13 10:49:11.199157: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
735
  "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
736
+ "2023-10-13 10:49:12.962522: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
737
  ]
738
  }
739
  ],
 
752
  },
753
  {
754
  "cell_type": "code",
755
+ "execution_count": 34,
756
  "metadata": {},
757
  "outputs": [],
758
  "source": [
 
763
  },
764
  {
765
  "cell_type": "code",
766
+ "execution_count": 35,
767
  "metadata": {},
768
  "outputs": [],
769
  "source": [
 
778
  },
779
  {
780
  "cell_type": "code",
781
+ "execution_count": 36,
782
  "metadata": {},
783
  "outputs": [
784
  {
785
  "name": "stderr",
786
  "output_type": "stream",
787
  "text": [
788
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']\n",
789
  "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
790
  ]
791
  }
 
800
  },
801
  {
802
  "cell_type": "code",
803
+ "execution_count": 37,
804
  "metadata": {},
805
  "outputs": [
806
  {
 
816
  "\n",
817
  " <div>\n",
818
  " \n",
819
+ " <progress value='1216' max='1216' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
820
+ " [1216/1216 02:51, Epoch 16/16]\n",
821
  " </div>\n",
822
  " <table border=\"1\" class=\"dataframe\">\n",
823
  " <thead>\n",
 
832
  " <tr>\n",
833
  " <td>1</td>\n",
834
  " <td>No log</td>\n",
835
+ " <td>0.208865</td>\n",
836
+ " <td>0.986755</td>\n",
837
  " </tr>\n",
838
  " <tr>\n",
839
  " <td>2</td>\n",
840
  " <td>No log</td>\n",
841
+ " <td>0.062759</td>\n",
842
+ " <td>0.983444</td>\n",
843
  " </tr>\n",
844
  " <tr>\n",
845
  " <td>3</td>\n",
846
  " <td>No log</td>\n",
847
+ " <td>0.065099</td>\n",
848
+ " <td>0.986755</td>\n",
849
  " </tr>\n",
850
  " <tr>\n",
851
  " <td>4</td>\n",
852
  " <td>No log</td>\n",
853
+ " <td>0.081124</td>\n",
854
+ " <td>0.976821</td>\n",
855
  " </tr>\n",
856
  " <tr>\n",
857
  " <td>5</td>\n",
858
  " <td>No log</td>\n",
859
+ " <td>0.112577</td>\n",
860
+ " <td>0.970199</td>\n",
861
  " </tr>\n",
862
  " <tr>\n",
863
  " <td>6</td>\n",
864
  " <td>No log</td>\n",
865
+ " <td>0.111743</td>\n",
866
+ " <td>0.973510</td>\n",
867
+ " </tr>\n",
868
+ " <tr>\n",
869
+ " <td>7</td>\n",
870
+ " <td>0.188300</td>\n",
871
+ " <td>0.100201</td>\n",
872
+ " <td>0.976821</td>\n",
873
+ " </tr>\n",
874
+ " <tr>\n",
875
+ " <td>8</td>\n",
876
+ " <td>0.188300</td>\n",
877
+ " <td>0.116866</td>\n",
878
+ " <td>0.973510</td>\n",
879
+ " </tr>\n",
880
+ " <tr>\n",
881
+ " <td>9</td>\n",
882
+ " <td>0.188300</td>\n",
883
+ " <td>0.141521</td>\n",
884
+ " <td>0.970199</td>\n",
885
+ " </tr>\n",
886
+ " <tr>\n",
887
+ " <td>10</td>\n",
888
+ " <td>0.188300</td>\n",
889
+ " <td>0.134409</td>\n",
890
+ " <td>0.973510</td>\n",
891
+ " </tr>\n",
892
+ " <tr>\n",
893
+ " <td>11</td>\n",
894
+ " <td>0.188300</td>\n",
895
+ " <td>0.134093</td>\n",
896
+ " <td>0.973510</td>\n",
897
+ " </tr>\n",
898
+ " <tr>\n",
899
+ " <td>12</td>\n",
900
+ " <td>0.188300</td>\n",
901
+ " <td>0.127059</td>\n",
902
+ " <td>0.973510</td>\n",
903
+ " </tr>\n",
904
+ " <tr>\n",
905
+ " <td>13</td>\n",
906
+ " <td>0.188300</td>\n",
907
+ " <td>0.138748</td>\n",
908
+ " <td>0.973510</td>\n",
909
+ " </tr>\n",
910
+ " <tr>\n",
911
+ " <td>14</td>\n",
912
+ " <td>0.018000</td>\n",
913
+ " <td>0.137167</td>\n",
914
+ " <td>0.973510</td>\n",
915
+ " </tr>\n",
916
+ " <tr>\n",
917
+ " <td>15</td>\n",
918
+ " <td>0.018000</td>\n",
919
+ " <td>0.135889</td>\n",
920
+ " <td>0.973510</td>\n",
921
+ " </tr>\n",
922
+ " <tr>\n",
923
+ " <td>16</td>\n",
924
+ " <td>0.018000</td>\n",
925
+ " <td>0.135796</td>\n",
926
+ " <td>0.973510</td>\n",
927
  " </tr>\n",
928
  " </tbody>\n",
929
  "</table><p>"
 
938
  {
939
  "data": {
940
  "text/plain": [
941
+ "TrainOutput(global_step=1216, training_loss=0.08689324734242339, metrics={'train_runtime': 172.7465, 'train_samples_per_second': 111.516, 'train_steps_per_second': 7.039, 'total_flos': 62384098266840.0, 'train_loss': 0.08689324734242339, 'epoch': 16.0})"
942
  ]
943
  },
944
+ "execution_count": 37,
945
  "metadata": {},
946
  "output_type": "execute_result"
947
  }
 
952
  " learning_rate=2e-5,\n",
953
  " per_device_train_batch_size=16,\n",
954
  " per_device_eval_batch_size=16,\n",
955
+ " num_train_epochs=16,\n",
956
  " weight_decay=0.01,\n",
957
  " evaluation_strategy=\"epoch\",\n",
958
  " save_strategy=\"epoch\",\n",
research/12_text_analytics_using_azure.ipynb ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# ! pip install --upgrade azure-ai-textanalytics"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 16,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "key = \"198414c4d7e54bde91ec77bf776d5211\"\n",
19
+ "endpoint = \"https://new-entity.cognitiveservices.azure.com/\"\n",
20
+ "# endpoint = \"https://eastus.api.cognitive.microsoft.com/\"\n",
21
+ "\n",
22
+ "from azure.ai.textanalytics import TextAnalyticsClient\n",
23
+ "from azure.core.credentials import AzureKeyCredential\n",
24
+ "\n",
25
+ "# Authenticate the client using your key and endpoint \n",
26
+ "def authenticate_client():\n",
27
+ " ta_credential = AzureKeyCredential(key)\n",
28
+ " text_analytics_client = TextAnalyticsClient(\n",
29
+ " endpoint=endpoint, \n",
30
+ " credential=ta_credential)\n",
31
+ " return text_analytics_client\n",
32
+ "\n",
33
+ "client = authenticate_client()\n"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 21,
39
+ "metadata": {},
40
+ "outputs": [
41
+ {
42
+ "name": "stdout",
43
+ "output_type": "stream",
44
+ "text": [
45
+ "Named Entities:\n",
46
+ "\n",
47
+ "\tText: \t razor kraken \tCategory: \t Organization \tSubCategory: \t None \n",
48
+ "\tConfidence Score: \t 0.54 \tLength: \t 12 \tOffset: \t 0 \n",
49
+ "\n",
50
+ "\tText: \t headphones \tCategory: \t Product \tSubCategory: \t None \n",
51
+ "\tConfidence Score: \t 0.5 \tLength: \t 10 \tOffset: \t 13 \n",
52
+ "\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "key = \"2fd114e7967a4da58854be231fd766a3\"\n",
58
+ "endpoint = \"https://entity-collection.cognitiveservices.azure.com/\"\n",
59
+ "# endpoint = \"https://eastus.api.cognitive.microsoft.com/\"\n",
60
+ "\n",
61
+ "from azure.ai.textanalytics import TextAnalyticsClient\n",
62
+ "from azure.core.credentials import AzureKeyCredential\n",
63
+ "\n",
64
+ "# Authenticate the client using your key and endpoint \n",
65
+ "def authenticate_client():\n",
66
+ " ta_credential = AzureKeyCredential(key)\n",
67
+ " text_analytics_client = TextAnalyticsClient(\n",
68
+ " endpoint=endpoint, \n",
69
+ " credential=ta_credential)\n",
70
+ " return text_analytics_client\n",
71
+ "\n",
72
+ "client = authenticate_client()\n",
73
+ "\n",
74
+ "# Example function for recognizing entities from text\n",
75
+ "def entity_recognition_example(client):\n",
76
+ "\n",
77
+ " try:\n",
78
+ " documents = [\"razor kraken headphones\"]\n",
79
+ " result = client.recognize_entities(documents = documents)[0]\n",
80
+ "\n",
81
+ " print(\"Named Entities:\\n\")\n",
82
+ " for entity in result.entities:\n",
83
+ " print(\"\\tText: \\t\", entity.text, \"\\tCategory: \\t\", entity.category, \"\\tSubCategory: \\t\", entity.subcategory,\n",
84
+ " \"\\n\\tConfidence Score: \\t\", round(entity.confidence_score, 2), \"\\tLength: \\t\", entity.length, \"\\tOffset: \\t\", entity.offset, \"\\n\")\n",
85
+ "\n",
86
+ " except Exception as err:\n",
87
+ " print(\"Encountered exception. {}\".format(err))\n",
88
+ "entity_recognition_example(client)"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 25,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "def replace_original_text(original_text:str):\n",
98
+ " try:\n",
99
+ " result = client.recognize_entities(documents = [original_text])[0]\n",
100
+ "\n",
101
+ " for entity in result.entities:\n",
102
+ " # print(\"\\tText: \\t\", entity.text, \"\\tCategory: \\t\", entity.category, \"\\tSubCategory: \\t\", entity.subcategory,\n",
103
+ " # \"\\n\\tConfidence Score: \\t\", round(entity.confidence_score, 2), \"\\tLength: \\t\", entity.length, \"\\tOffset: \\t\", entity.offset, \"\\n\")\n",
104
+ " original_text= original_text.replace(\n",
105
+ " entity.text, \n",
106
+ " entity.text+ f' ({entity.category}) '\n",
107
+ " )\n",
108
+ " return original_text\n",
109
+ "\n",
110
+ " except Exception as err:\n",
111
+ " \n",
112
+ " print(\"Encountered exception. {}\".format(err))\n",
113
+ " return original_text\n",
114
+ " "
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 26,
120
+ "metadata": {},
121
+ "outputs": [
122
+ {
123
+ "data": {
124
+ "text/plain": [
125
+ "'best cat ear headphones (Product) '"
126
+ ]
127
+ },
128
+ "execution_count": 26,
129
+ "metadata": {},
130
+ "output_type": "execute_result"
131
+ }
132
+ ],
133
+ "source": [
134
+ "replace_original_text(original_text=\"best cat ear headphones\")"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 29,
140
+ "metadata": {},
141
+ "outputs": [
142
+ {
143
+ "data": {
144
+ "text/plain": [
145
+ "'Barack Obama (Person) in the White House (Location) '"
146
+ ]
147
+ },
148
+ "execution_count": 29,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
+ }
152
+ ],
153
+ "source": [
154
+ "replace_original_text(\n",
155
+ " 'Barack Obama in the White House'\n",
156
+ ")"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": []
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": []
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": null,
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": []
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 1,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "from azure.core.credentials import AzureKeyCredential\n",
187
+ "from azure.ai.textanalytics import TextAnalyticsClient\n",
188
+ "\n",
189
+ "credential = AzureKeyCredential(\"c8b849064d6649ea87cbd8fbbd39f708\")\n",
190
+ "text_analytics_client = TextAnalyticsClient(endpoint=\"https://entity-retrieval.cognitiveservices.azure.com/\", credential=credential)\n",
191
+ "# text_analytics_client = TextAnalyticsClient(endpoint=\"https://ktitji5.eastus.cognitiveservices.azure.com/\", credential=credential)"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": 2,
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "# Get the endpoint for the Language service resource\n",
201
+ "# ! az cognitiveservices account show --name \"resource-name\" --resource-group \"resource-group-name\" --query \"properties.endpoint\""
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": 3,
207
+ "metadata": {},
208
+ "outputs": [],
209
+ "source": [
210
+ "documents = [\n",
211
+ " {\"id\": \"1\", \"language\": \"en\", \"text\": \"I hated the movie. It was so slow!\"},\n",
212
+ " {\"id\": \"2\", \"language\": \"en\", \"text\": \"The movie made it into my top ten favorites. What a great movie!\"},\n",
213
+ "]"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": 4,
219
+ "metadata": {},
220
+ "outputs": [
221
+ {
222
+ "ename": "ClientAuthenticationError",
223
+ "evalue": "(401) Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.\nCode: 401\nMessage: Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.",
224
+ "output_type": "error",
225
+ "traceback": [
226
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
227
+ "\u001b[0;31mClientAuthenticationError\u001b[0m Traceback (most recent call last)",
228
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_text_analytics_client.py:991\u001b[0m, in \u001b[0;36mTextAnalyticsClient.analyze_sentiment\u001b[0;34m(self, documents, **kwargs)\u001b[0m\n\u001b[1;32m 988\u001b[0m models \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_client\u001b[39m.\u001b[39mmodels(api_version\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_api_version)\n\u001b[1;32m 989\u001b[0m \u001b[39mreturn\u001b[39;00m cast(\n\u001b[1;32m 990\u001b[0m List[Union[AnalyzeSentimentResult, DocumentError]],\n\u001b[0;32m--> 991\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_client\u001b[39m.\u001b[39;49manalyze_text(\n\u001b[1;32m 992\u001b[0m body\u001b[39m=\u001b[39;49mmodels\u001b[39m.\u001b[39;49mAnalyzeTextSentimentAnalysisInput(\n\u001b[1;32m 993\u001b[0m analysis_input\u001b[39m=\u001b[39;49m{\u001b[39m\"\u001b[39;49m\u001b[39mdocuments\u001b[39;49m\u001b[39m\"\u001b[39;49m: docs},\n\u001b[1;32m 994\u001b[0m parameters\u001b[39m=\u001b[39;49mmodels\u001b[39m.\u001b[39;49mSentimentAnalysisTaskParameters(\n\u001b[1;32m 995\u001b[0m logging_opt_out\u001b[39m=\u001b[39;49mdisable_service_logs,\n\u001b[1;32m 996\u001b[0m model_version\u001b[39m=\u001b[39;49mmodel_version,\n\u001b[1;32m 997\u001b[0m string_index_type\u001b[39m=\u001b[39;49mstring_index_type_compatibility(string_index_type),\n\u001b[1;32m 998\u001b[0m opinion_mining\u001b[39m=\u001b[39;49mshow_opinion_mining,\n\u001b[1;32m 999\u001b[0m )\n\u001b[1;32m 1000\u001b[0m ),\n\u001b[1;32m 1001\u001b[0m show_stats\u001b[39m=\u001b[39;49mshow_stats,\n\u001b[1;32m 1002\u001b[0m \u001b[39mcls\u001b[39;49m\u001b[39m=\u001b[39;49mkwargs\u001b[39m.\u001b[39;49mpop(\u001b[39m\"\u001b[39;49m\u001b[39mcls\u001b[39;49m\u001b[39m\"\u001b[39;49m, sentiment_result),\n\u001b[1;32m 1003\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs\n\u001b[1;32m 1004\u001b[0m )\n\u001b[1;32m 
1005\u001b[0m )\n\u001b[1;32m 1007\u001b[0m \u001b[39m# api_versions 3.0, 3.1\u001b[39;00m\n",
229
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_generated/_operations_mixin.py:109\u001b[0m, in \u001b[0;36mTextAnalyticsClientOperationsMixin.analyze_text\u001b[0;34m(self, body, show_stats, **kwargs)\u001b[0m\n\u001b[1;32m 108\u001b[0m mixin_instance\u001b[39m.\u001b[39m_deserialize \u001b[39m=\u001b[39m Deserializer(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_models_dict(api_version))\n\u001b[0;32m--> 109\u001b[0m \u001b[39mreturn\u001b[39;00m mixin_instance\u001b[39m.\u001b[39;49manalyze_text(body, show_stats, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
230
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/core/tracing/decorator.py:78\u001b[0m, in \u001b[0;36mdistributed_trace.<locals>.decorator.<locals>.wrapper_use_tracer\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[39mif\u001b[39;00m span_impl_type \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 80\u001b[0m \u001b[39m# Merge span is parameter is set, but only if no explicit parent are passed\u001b[39;00m\n",
231
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_generated/v2022_05_01/operations/_text_analytics_client_operations.py:299\u001b[0m, in \u001b[0;36mTextAnalyticsClientOperationsMixin.analyze_text\u001b[0;34m(self, body, show_stats, **kwargs)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[39mif\u001b[39;00m response\u001b[39m.\u001b[39mstatus_code \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m [\u001b[39m200\u001b[39m]:\n\u001b[0;32m--> 299\u001b[0m map_error(status_code\u001b[39m=\u001b[39;49mresponse\u001b[39m.\u001b[39;49mstatus_code, response\u001b[39m=\u001b[39;49mresponse, error_map\u001b[39m=\u001b[39;49merror_map)\n\u001b[1;32m 300\u001b[0m error \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_deserialize\u001b[39m.\u001b[39mfailsafe_deserialize(_models\u001b[39m.\u001b[39mErrorResponse, pipeline_response)\n",
232
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/core/exceptions.py:165\u001b[0m, in \u001b[0;36mmap_error\u001b[0;34m(status_code, response, error_map)\u001b[0m\n\u001b[1;32m 164\u001b[0m error \u001b[39m=\u001b[39m error_type(response\u001b[39m=\u001b[39mresponse)\n\u001b[0;32m--> 165\u001b[0m \u001b[39mraise\u001b[39;00m error\n",
233
+ "\u001b[0;31mClientAuthenticationError\u001b[0m: (401) Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.\nCode: 401\nMessage: Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.",
234
+ "\nThe above exception was the direct cause of the following exception:\n",
235
+ "\u001b[0;31mClientAuthenticationError\u001b[0m Traceback (most recent call last)",
236
+ "\u001b[1;32m/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb Cell 12\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W4sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m response \u001b[39m=\u001b[39m text_analytics_client\u001b[39m.\u001b[39;49manalyze_sentiment(documents)\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W4sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m successful_responses \u001b[39m=\u001b[39m [doc \u001b[39mfor\u001b[39;00m doc \u001b[39min\u001b[39;00m response \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m doc\u001b[39m.\u001b[39mis_error]\n",
237
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/core/tracing/decorator.py:78\u001b[0m, in \u001b[0;36mdistributed_trace.<locals>.decorator.<locals>.wrapper_use_tracer\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 76\u001b[0m span_impl_type \u001b[39m=\u001b[39m settings\u001b[39m.\u001b[39mtracing_implementation()\n\u001b[1;32m 77\u001b[0m \u001b[39mif\u001b[39;00m span_impl_type \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 80\u001b[0m \u001b[39m# Merge span is parameter is set, but only if no explicit parent are passed\u001b[39;00m\n\u001b[1;32m 81\u001b[0m \u001b[39mif\u001b[39;00m merge_span \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m passed_in_parent:\n",
238
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_validate.py:74\u001b[0m, in \u001b[0;36mvalidate_multiapi_args.<locals>.decorator.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[39m# the latest version is selected, we assume all features supported\u001b[39;00m\n\u001b[1;32m 73\u001b[0m \u001b[39mif\u001b[39;00m selected_api_version \u001b[39m==\u001b[39m VERSIONS_SUPPORTED[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m]:\n\u001b[0;32m---> 74\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 76\u001b[0m \u001b[39mif\u001b[39;00m version_method_added \u001b[39mand\u001b[39;00m version_method_added \u001b[39m!=\u001b[39m selected_api_version \u001b[39mand\u001b[39;00m \\\n\u001b[1;32m 77\u001b[0m VERSIONS_SUPPORTED\u001b[39m.\u001b[39mindex(selected_api_version) \u001b[39m<\u001b[39m VERSIONS_SUPPORTED\u001b[39m.\u001b[39mindex(version_method_added):\n\u001b[1;32m 78\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 79\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mclient\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m{\u001b[39;00mfunc\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m is not available in API version \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 80\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mselected_api_version\u001b[39m}\u001b[39;00m\u001b[39m. Use service API version \u001b[39m\u001b[39m{\u001b[39;00mversion_method_added\u001b[39m}\u001b[39;00m\u001b[39m or newer.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 81\u001b[0m )\n",
239
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_text_analytics_client.py:1022\u001b[0m, in \u001b[0;36mTextAnalyticsClient.analyze_sentiment\u001b[0;34m(self, documents, **kwargs)\u001b[0m\n\u001b[1;32m 1008\u001b[0m \u001b[39mreturn\u001b[39;00m cast(\n\u001b[1;32m 1009\u001b[0m List[Union[AnalyzeSentimentResult, DocumentError]],\n\u001b[1;32m 1010\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_client\u001b[39m.\u001b[39msentiment(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1019\u001b[0m )\n\u001b[1;32m 1020\u001b[0m )\n\u001b[1;32m 1021\u001b[0m \u001b[39mexcept\u001b[39;00m HttpResponseError \u001b[39mas\u001b[39;00m error:\n\u001b[0;32m-> 1022\u001b[0m \u001b[39mreturn\u001b[39;00m process_http_response_error(error)\n",
240
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_response_handlers.py:60\u001b[0m, in \u001b[0;36mprocess_http_response_error\u001b[0;34m(error)\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[39mif\u001b[39;00m error\u001b[39m.\u001b[39mstatus_code \u001b[39m==\u001b[39m \u001b[39m404\u001b[39m:\n\u001b[1;32m 59\u001b[0m raise_error \u001b[39m=\u001b[39m ResourceNotFoundError\n\u001b[0;32m---> 60\u001b[0m \u001b[39mraise\u001b[39;00m raise_error(response\u001b[39m=\u001b[39merror\u001b[39m.\u001b[39mresponse, error_format\u001b[39m=\u001b[39mCSODataV4Format) \u001b[39mfrom\u001b[39;00m \u001b[39merror\u001b[39;00m\n",
241
+ "\u001b[0;31mClientAuthenticationError\u001b[0m: (401) Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.\nCode: 401\nMessage: Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource."
242
+ ]
243
+ }
244
+ ],
245
+ "source": [
246
+ "response = text_analytics_client.analyze_sentiment(documents)\n",
247
+ "successful_responses = [doc for doc in response if not doc.is_error]"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": null,
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": []
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": null,
260
+ "metadata": {},
261
+ "outputs": [],
262
+ "source": []
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 4,
267
+ "metadata": {},
268
+ "outputs": [
269
+ {
270
+ "name": "stdout",
271
+ "output_type": "stream",
272
+ "text": [
273
+ "In this sample, we want to find the articles that mention Microsoft to read.\n"
274
+ ]
275
+ },
276
+ {
277
+ "ename": "ClientAuthenticationError",
278
+ "evalue": "(401) Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.\nCode: 401\nMessage: Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.",
279
+ "output_type": "error",
280
+ "traceback": [
281
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
282
+ "\u001b[0;31mClientAuthenticationError\u001b[0m Traceback (most recent call last)",
283
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_text_analytics_client.py:900\u001b[0m, in \u001b[0;36mTextAnalyticsClient.extract_key_phrases\u001b[0;34m(self, documents, disable_service_logs, language, model_version, show_stats, **kwargs)\u001b[0m\n\u001b[1;32m 897\u001b[0m models \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_client\u001b[39m.\u001b[39mmodels(api_version\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_api_version)\n\u001b[1;32m 898\u001b[0m \u001b[39mreturn\u001b[39;00m cast(\n\u001b[1;32m 899\u001b[0m List[Union[ExtractKeyPhrasesResult, DocumentError]],\n\u001b[0;32m--> 900\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_client\u001b[39m.\u001b[39;49manalyze_text(\n\u001b[1;32m 901\u001b[0m body\u001b[39m=\u001b[39;49mmodels\u001b[39m.\u001b[39;49mAnalyzeTextKeyPhraseExtractionInput(\n\u001b[1;32m 902\u001b[0m analysis_input\u001b[39m=\u001b[39;49m{\u001b[39m\"\u001b[39;49m\u001b[39mdocuments\u001b[39;49m\u001b[39m\"\u001b[39;49m: docs},\n\u001b[1;32m 903\u001b[0m parameters\u001b[39m=\u001b[39;49mmodels\u001b[39m.\u001b[39;49mKeyPhraseTaskParameters(\n\u001b[1;32m 904\u001b[0m logging_opt_out\u001b[39m=\u001b[39;49mdisable_service_logs,\n\u001b[1;32m 905\u001b[0m model_version\u001b[39m=\u001b[39;49mmodel_version,\n\u001b[1;32m 906\u001b[0m )\n\u001b[1;32m 907\u001b[0m ),\n\u001b[1;32m 908\u001b[0m show_stats\u001b[39m=\u001b[39;49mshow_stats,\n\u001b[1;32m 909\u001b[0m \u001b[39mcls\u001b[39;49m\u001b[39m=\u001b[39;49mkwargs\u001b[39m.\u001b[39;49mpop(\u001b[39m\"\u001b[39;49m\u001b[39mcls\u001b[39;49m\u001b[39m\"\u001b[39;49m, key_phrases_result),\n\u001b[1;32m 910\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs\n\u001b[1;32m 911\u001b[0m )\n\u001b[1;32m 912\u001b[0m )\n\u001b[1;32m 914\u001b[0m \u001b[39m# api_versions 3.0, 3.1\u001b[39;00m\n",
284
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_generated/_operations_mixin.py:111\u001b[0m, in \u001b[0;36mTextAnalyticsClientOperationsMixin.analyze_text\u001b[0;34m(self, body, show_stats, **kwargs)\u001b[0m\n\u001b[1;32m 110\u001b[0m mixin_instance\u001b[39m.\u001b[39m_deserialize \u001b[39m=\u001b[39m Deserializer(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_models_dict(api_version))\n\u001b[0;32m--> 111\u001b[0m \u001b[39mreturn\u001b[39;00m mixin_instance\u001b[39m.\u001b[39;49manalyze_text(body, show_stats, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
285
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/core/tracing/decorator.py:78\u001b[0m, in \u001b[0;36mdistributed_trace.<locals>.decorator.<locals>.wrapper_use_tracer\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[39mif\u001b[39;00m span_impl_type \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 80\u001b[0m \u001b[39m# Merge span is parameter is set, but only if no explicit parent are passed\u001b[39;00m\n",
286
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_generated/v2023_04_01/operations/_text_analytics_client_operations.py:299\u001b[0m, in \u001b[0;36mTextAnalyticsClientOperationsMixin.analyze_text\u001b[0;34m(self, body, show_stats, **kwargs)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[39mif\u001b[39;00m response\u001b[39m.\u001b[39mstatus_code \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m [\u001b[39m200\u001b[39m]:\n\u001b[0;32m--> 299\u001b[0m map_error(status_code\u001b[39m=\u001b[39;49mresponse\u001b[39m.\u001b[39;49mstatus_code, response\u001b[39m=\u001b[39;49mresponse, error_map\u001b[39m=\u001b[39;49merror_map)\n\u001b[1;32m 300\u001b[0m error \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_deserialize\u001b[39m.\u001b[39mfailsafe_deserialize(_models\u001b[39m.\u001b[39mErrorResponse, pipeline_response)\n",
287
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/core/exceptions.py:165\u001b[0m, in \u001b[0;36mmap_error\u001b[0;34m(status_code, response, error_map)\u001b[0m\n\u001b[1;32m 164\u001b[0m error \u001b[39m=\u001b[39m error_type(response\u001b[39m=\u001b[39mresponse)\n\u001b[0;32m--> 165\u001b[0m \u001b[39mraise\u001b[39;00m error\n",
288
+ "\u001b[0;31mClientAuthenticationError\u001b[0m: (401) Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.\nCode: 401\nMessage: Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.",
289
+ "\nThe above exception was the direct cause of the following exception:\n",
290
+ "\u001b[0;31mClientAuthenticationError\u001b[0m Traceback (most recent call last)",
291
+ "\u001b[1;32m/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb Cell 8\u001b[0m line \u001b[0;36m7\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=65'>66</a>\u001b[0m \u001b[39mprint\u001b[39m(\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=66'>67</a>\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mThe articles that mention Microsoft are articles number: \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m. Those are the ones I\u001b[39m\u001b[39m'\u001b[39m\u001b[39mm interested in reading.\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=67'>68</a>\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m, \u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mjoin(articles_that_mention_microsoft)\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=68'>69</a>\u001b[0m )\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=69'>70</a>\u001b[0m )\n\u001b[1;32m <a 
href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=72'>73</a>\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m__name__\u001b[39m \u001b[39m==\u001b[39m \u001b[39m'\u001b[39m\u001b[39m__main__\u001b[39m\u001b[39m'\u001b[39m:\n\u001b[0;32m---> <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=73'>74</a>\u001b[0m sample_extract_key_phrases()\n",
292
+ "\u001b[1;32m/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb Cell 8\u001b[0m line \u001b[0;36m5\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=37'>38</a>\u001b[0m text_analytics_client \u001b[39m=\u001b[39m TextAnalyticsClient(endpoint\u001b[39m=\u001b[39mendpoint, credential\u001b[39m=\u001b[39mAzureKeyCredential(key))\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=38'>39</a>\u001b[0m articles \u001b[39m=\u001b[39m [\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=39'>40</a>\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=40'>41</a>\u001b[0m \u001b[39m Washington, D.C. Autumn in DC is a uniquely beautiful season. 
The leaves fall from the trees\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=51'>52</a>\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=52'>53</a>\u001b[0m ]\n\u001b[0;32m---> <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=54'>55</a>\u001b[0m result \u001b[39m=\u001b[39m text_analytics_client\u001b[39m.\u001b[39;49mextract_key_phrases(articles)\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=55'>56</a>\u001b[0m \u001b[39mfor\u001b[39;00m idx, doc \u001b[39min\u001b[39;00m \u001b[39menumerate\u001b[39m(result):\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/research/12_text_analytics_using_azure.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=56'>57</a>\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m doc\u001b[39m.\u001b[39mis_error:\n",
293
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/core/tracing/decorator.py:78\u001b[0m, in \u001b[0;36mdistributed_trace.<locals>.decorator.<locals>.wrapper_use_tracer\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 76\u001b[0m span_impl_type \u001b[39m=\u001b[39m settings\u001b[39m.\u001b[39mtracing_implementation()\n\u001b[1;32m 77\u001b[0m \u001b[39mif\u001b[39;00m span_impl_type \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 80\u001b[0m \u001b[39m# Merge span is parameter is set, but only if no explicit parent are passed\u001b[39;00m\n\u001b[1;32m 81\u001b[0m \u001b[39mif\u001b[39;00m merge_span \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m passed_in_parent:\n",
294
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_validate.py:79\u001b[0m, in \u001b[0;36mvalidate_multiapi_args.<locals>.decorator.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[39m# the latest version is selected, we assume all features supported\u001b[39;00m\n\u001b[1;32m 78\u001b[0m \u001b[39mif\u001b[39;00m selected_api_version \u001b[39m==\u001b[39m VERSIONS_SUPPORTED[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m]:\n\u001b[0;32m---> 79\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 81\u001b[0m \u001b[39mif\u001b[39;00m version_method_added \u001b[39mand\u001b[39;00m version_method_added \u001b[39m!=\u001b[39m selected_api_version \u001b[39mand\u001b[39;00m \\\n\u001b[1;32m 82\u001b[0m VERSIONS_SUPPORTED\u001b[39m.\u001b[39mindex(selected_api_version) \u001b[39m<\u001b[39m VERSIONS_SUPPORTED\u001b[39m.\u001b[39mindex(version_method_added):\n\u001b[1;32m 83\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 84\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mclient\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m{\u001b[39;00mfunc\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m is not available in API version \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 85\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mselected_api_version\u001b[39m}\u001b[39;00m\u001b[39m. Use service API version \u001b[39m\u001b[39m{\u001b[39;00mversion_method_added\u001b[39m}\u001b[39;00m\u001b[39m or newer.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 86\u001b[0m )\n",
295
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_text_analytics_client.py:927\u001b[0m, in \u001b[0;36mTextAnalyticsClient.extract_key_phrases\u001b[0;34m(self, documents, disable_service_logs, language, model_version, show_stats, **kwargs)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[39mreturn\u001b[39;00m cast(\n\u001b[1;32m 916\u001b[0m List[Union[ExtractKeyPhrasesResult, DocumentError]],\n\u001b[1;32m 917\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_client\u001b[39m.\u001b[39mkey_phrases(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 924\u001b[0m )\n\u001b[1;32m 925\u001b[0m )\n\u001b[1;32m 926\u001b[0m \u001b[39mexcept\u001b[39;00m HttpResponseError \u001b[39mas\u001b[39;00m error:\n\u001b[0;32m--> 927\u001b[0m \u001b[39mreturn\u001b[39;00m process_http_response_error(error)\n",
296
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/azure/ai/textanalytics/_response_handlers.py:63\u001b[0m, in \u001b[0;36mprocess_http_response_error\u001b[0;34m(error)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[39mif\u001b[39;00m error\u001b[39m.\u001b[39mstatus_code \u001b[39m==\u001b[39m \u001b[39m404\u001b[39m:\n\u001b[1;32m 62\u001b[0m raise_error \u001b[39m=\u001b[39m ResourceNotFoundError\n\u001b[0;32m---> 63\u001b[0m \u001b[39mraise\u001b[39;00m raise_error(response\u001b[39m=\u001b[39merror\u001b[39m.\u001b[39mresponse, error_format\u001b[39m=\u001b[39mCSODataV4Format) \u001b[39mfrom\u001b[39;00m \u001b[39merror\u001b[39;00m\n",
297
+ "\u001b[0;31mClientAuthenticationError\u001b[0m: (401) Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.\nCode: 401\nMessage: Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource."
298
+ ]
299
+ }
300
+ ],
301
+ "source": [
302
+ "# -------------------------------------------------------------------------\n",
303
+ "# Copyright (c) Microsoft Corporation. All rights reserved.\n",
304
+ "# Licensed under the MIT License. See License.txt in the project root for\n",
305
+ "# license information.\n",
306
+ "# --------------------------------------------------------------------------\n",
307
+ "\n",
308
+ "\"\"\"\n",
309
+ "FILE: sample_extract_key_phrases.py\n",
310
+ "\n",
311
+ "DESCRIPTION:\n",
312
+ " This sample demonstrates how to extract key talking points from a batch of documents.\n",
313
+ "\n",
314
+ " In this sample, we want to go over articles and read the ones that mention Microsoft.\n",
315
+ " We're going to use the SDK to create a rudimentary search algorithm to find these articles.\n",
316
+ "\n",
317
+ "USAGE:\n",
318
+ " python sample_extract_key_phrases.py\n",
319
+ "\n",
320
+ " Set the environment variables with your own values before running the sample:\n",
321
+ " 1) AZURE_LANGUAGE_ENDPOINT - the endpoint to your Language resource.\n",
322
+ " 2) AZURE_LANGUAGE_KEY - your Language subscription key\n",
323
+ "\"\"\"\n",
324
+ "\n",
325
+ "\n",
326
+ "def sample_extract_key_phrases() -> None:\n",
327
+ " print(\n",
328
+ " \"In this sample, we want to find the articles that mention Microsoft to read.\"\n",
329
+ " )\n",
330
+ " articles_that_mention_microsoft = []\n",
331
+ " # [START extract_key_phrases]\n",
332
+ " import os\n",
333
+ " from azure.core.credentials import AzureKeyCredential\n",
334
+ " from azure.ai.textanalytics import TextAnalyticsClient\n",
335
+ "\n",
336
+ " endpoint = \"https://xouhou-1234.cognitiveservices.azure.com/\"\n",
337
+ " key = \"d7fcbf17455647adbca355b021334c83\"\n",
338
+ "\n",
339
+ " text_analytics_client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))\n",
340
+ " articles = [\n",
341
+ " \"\"\"\n",
342
+ " Washington, D.C. Autumn in DC is a uniquely beautiful season. The leaves fall from the trees\n",
343
+ " in a city chock-full of forests, leaving yellow leaves on the ground and a clearer view of the\n",
344
+ " blue sky above...\n",
345
+ " \"\"\",\n",
346
+ " \"\"\"\n",
347
+ " Redmond, WA. In the past few days, Microsoft has decided to further postpone the start date of\n",
348
+ " its United States workers, due to the pandemic that rages with no end in sight...\n",
349
+ " \"\"\",\n",
350
+ " \"\"\"\n",
351
+ " Redmond, WA. Employees at Microsoft can be excited about the new coffee shop that will open on campus\n",
352
+ " once workers no longer have to work remotely...\n",
353
+ " \"\"\"\n",
354
+ " ]\n",
355
+ "\n",
356
+ " result = text_analytics_client.extract_key_phrases(articles)\n",
357
+ " for idx, doc in enumerate(result):\n",
358
+ " if not doc.is_error:\n",
359
+ " print(\"Key phrases in article #{}: {}\".format(\n",
360
+ " idx + 1,\n",
361
+ " \", \".join(doc.key_phrases)\n",
362
+ " ))\n",
363
+ " # [END extract_key_phrases]\n",
364
+ " if \"Microsoft\" in doc.key_phrases:\n",
365
+ " articles_that_mention_microsoft.append(str(idx + 1))\n",
366
+ "\n",
367
+ " print(\n",
368
+ " \"The articles that mention Microsoft are articles number: {}. Those are the ones I'm interested in reading.\".format(\n",
369
+ " \", \".join(articles_that_mention_microsoft)\n",
370
+ " )\n",
371
+ " )\n",
372
+ "\n",
373
+ "\n",
374
+ "if __name__ == '__main__':\n",
375
+ " sample_extract_key_phrases()"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": null,
381
+ "metadata": {},
382
+ "outputs": [],
383
+ "source": []
384
+ }
385
+ ],
386
+ "metadata": {
387
+ "kernelspec": {
388
+ "display_name": "venv",
389
+ "language": "python",
390
+ "name": "python3"
391
+ },
392
+ "language_info": {
393
+ "codemirror_mode": {
394
+ "name": "ipython",
395
+ "version": 3
396
+ },
397
+ "file_extension": ".py",
398
+ "mimetype": "text/x-python",
399
+ "name": "python",
400
+ "nbconvert_exporter": "python",
401
+ "pygments_lexer": "ipython3",
402
+ "version": "3.10.12"
403
+ }
404
+ },
405
+ "nbformat": 4,
406
+ "nbformat_minor": 2
407
+ }
research/13_data_categories.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
utils/__pycache__/get_category.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/get_category.cpython-310.pyc and b/utils/__pycache__/get_category.cpython-310.pyc differ
 
utils/__pycache__/get_intent.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/get_intent.cpython-310.pyc and b/utils/__pycache__/get_intent.cpython-310.pyc differ