Ubuntu committed on
Commit
d0702fa
•
1 Parent(s): ed39e1a

finetuned the bert model again to classify things right

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. data_categories/Computers_and_Electronics.csv +2 -2
  2. data_categories/Final_Category_Data_With_Labels.csv +2 -2
  3. data_categories/Food_and_Drink.csv +2 -2
  4. data_categories/Pets_and_Animals.csv +2 -2
  5. data_categories/Real Estate.csv +2 -2
  6. data_categories/Reference.csv +2 -2
  7. data_categories/Sensitive Subjects.csv +2 -2
  8. data_categories/Shopping.csv +2 -2
  9. data_test/keywords-2.csv +3 -0
  10. data_test/labelled_data.csv +3 -0
  11. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/added_tokens.json +0 -0
  12. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/config.json +53 -53
  13. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/optimizer.pt +1 -1
  14. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/pytorch_model.bin +1 -1
  15. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/rng_state.pth +0 -0
  16. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/scheduler.pt +1 -1
  17. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/special_tokens_map.json +0 -0
  18. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/tokenizer.json +0 -0
  19. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/tokenizer_config.json +0 -0
  20. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/trainer_state.json +21 -21
  21. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/training_args.bin +1 -1
  22. finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/vocab.txt +0 -0
  23. finetuned_entity_categorical_classification/checkpoint-3346/added_tokens.json +7 -0
  24. finetuned_entity_categorical_classification/checkpoint-3346/config.json +83 -0
  25. finetuned_entity_categorical_classification/checkpoint-3346/optimizer.pt +3 -0
  26. finetuned_entity_categorical_classification/checkpoint-3346/pytorch_model.bin +3 -0
  27. finetuned_entity_categorical_classification/checkpoint-3346/rng_state.pth +0 -0
  28. finetuned_entity_categorical_classification/checkpoint-3346/scheduler.pt +3 -0
  29. finetuned_entity_categorical_classification/checkpoint-3346/special_tokens_map.json +7 -0
  30. finetuned_entity_categorical_classification/checkpoint-3346/tokenizer.json +0 -0
  31. finetuned_entity_categorical_classification/checkpoint-3346/tokenizer_config.json +56 -0
  32. finetuned_entity_categorical_classification/checkpoint-3346/trainer_state.json +73 -0
  33. finetuned_entity_categorical_classification/checkpoint-3346/training_args.bin +3 -0
  34. finetuned_entity_categorical_classification/checkpoint-3346/vocab.txt +0 -0
  35. finetuned_entity_categorical_classification/checkpoint-3362/added_tokens.json +7 -0
  36. finetuned_entity_categorical_classification/checkpoint-3362/config.json +83 -0
  37. finetuned_entity_categorical_classification/checkpoint-3362/optimizer.pt +3 -0
  38. finetuned_entity_categorical_classification/checkpoint-3362/pytorch_model.bin +3 -0
  39. finetuned_entity_categorical_classification/checkpoint-3362/rng_state.pth +0 -0
  40. finetuned_entity_categorical_classification/checkpoint-3362/scheduler.pt +3 -0
  41. finetuned_entity_categorical_classification/checkpoint-3362/special_tokens_map.json +7 -0
  42. finetuned_entity_categorical_classification/checkpoint-3362/tokenizer.json +0 -0
  43. finetuned_entity_categorical_classification/checkpoint-3362/tokenizer_config.json +56 -0
  44. finetuned_entity_categorical_classification/checkpoint-3362/trainer_state.json +73 -0
  45. finetuned_entity_categorical_classification/checkpoint-3362/training_args.bin +3 -0
  46. finetuned_entity_categorical_classification/checkpoint-3362/vocab.txt +0 -0
  47. finetuned_entity_categorical_classification/runs/Oct12_11-19-39_ip-172-31-95-165/events.out.tfevents.1697109579.ip-172-31-95-165.128350.0 +0 -0
  48. finetuned_entity_categorical_classification/runs/Oct12_11-43-16_ip-172-31-95-165/events.out.tfevents.1697110996.ip-172-31-95-165.128941.0 +0 -0
  49. finetuned_entity_categorical_classification/runs/Oct12_11-59-06_ip-172-31-95-165/events.out.tfevents.1697111947.ip-172-31-95-165.129502.0 +0 -0
  50. research/08_organizing_entire_datacategories.ipynb +222 -222
data_categories/Computers_and_Electronics.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8926ed9c6d555331be7f050a8074b9898260e6e9ec698ddf6a57b5954d2d7adb
3
- size 36145
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bfb344e958c0625df92cecfd61ee937ad46f4ae3c1fe7b4a43d64bc66ea025b
3
+ size 53312
data_categories/Final_Category_Data_With_Labels.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:381bf850c184b58a8deea31f12421b126212f7c1702a8c73ed3e4176d2b56785
3
- size 1685102
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e98ec1b4ff3e48cf46b76010bdb651b013e29b73a2ba8afde2691ff2c7ffd89
3
+ size 1755664
data_categories/Food_and_Drink.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aadb7c36f6a07a555ffc31e245efc6062af7a18c58d977eb6877c1807b4e9251
3
- size 47597
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f93635d1d2a6cb1bcce6f246efe265a22a21f9b0f9e09ad64ff4f4135e9a873
3
+ size 50513
data_categories/Pets_and_Animals.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c88bc062d741826064efa5b0b2135f45abafe10aed9be7122f372769b780daba
3
- size 57220
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f02553cb1d35b2874332bdc31f355f85f17bf22d93024ddbe3ed174897c5c60
3
+ size 60136
data_categories/Real Estate.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98584873072b57c4b8629f8071ae0ffe4548f2c1a953260693659f2f740c43a6
3
- size 36020
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d2c95f708a885aa30ff47132f588c8b1f69ea65587a9ff2dcdb9012e11754c3
3
+ size 40030
data_categories/Reference.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d77e7335dab3965e50b81a91d2f4312e5408705847ad425b0261a0e45acd136
3
- size 55091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4f2e6a675d15a45ca557f267488141912d0898e231e890f40075b2dad1bf1ce
3
+ size 57698
data_categories/Sensitive Subjects.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6851357c45f0aa9baad91b8263d4f85421ae16952682e44279b193e289e9ed32
3
- size 10172
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a2594ab9518c2def44acaef1c8661194ae16c95755face04b58d255bac1b33f
3
+ size 11256
data_categories/Shopping.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88fe8138d63bd43921db7982b432d2d78bbcf8c17d182cf55877c616e856675f
3
- size 46132
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc8a3ba5ed07db6b06b247d33d1b91bded1647f55b9ebcb1d50b1072a51eeecf
3
+ size 56890
data_test/keywords-2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc87cd18c79ecaa87b16dca63019b9577f72073c987ac18c624b059252e32d0f
3
+ size 8356
data_test/labelled_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f208650b882688e9a5cd9e0f2b3787dfd81ca7f5cb524a98c6d1e75d4aadfbf5
3
+ size 19932
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/added_tokens.json RENAMED
File without changes
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/config.json RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "distilbert-base-uncased",
3
  "activation": "gelu",
4
  "architectures": [
5
  "DistilBertForSequenceClassification"
@@ -9,63 +9,63 @@
9
  "dropout": 0.1,
10
  "hidden_dim": 3072,
11
  "id2label": {
12
- "0": "Beauty_and_Fitness",
13
- "1": "People_and_Society",
14
- "2": "Travel_and_Transportation",
15
- "3": "Shopping",
16
- "4": "Adult",
17
- "5": "Sports",
18
- "6": "Science",
19
- "7": "Food_and_Drink",
20
- "8": "News",
21
- "9": "Sensitive Subjects",
22
- "10": "Autos_and_Vehicles",
23
- "11": "Law_and_Government",
24
- "12": "Business_and_Industrial",
25
- "13": "Health",
26
- "14": "Real Estate",
27
- "15": "Books_and_Literature",
28
- "16": "Computers_and_Electronics",
29
- "17": "Internet_and_Telecom",
30
- "18": "Home_and_Garden",
31
- "19": "Jobs_and_Education",
32
- "20": "Online Communities",
33
- "21": "Finance",
34
  "22": "Arts_and_Entertainment",
35
- "23": "Games",
36
- "24": "Hobbies_and_Leisure",
37
- "25": "Reference",
38
- "26": "Pets_and_Animals"
39
  },
40
  "initializer_range": 0.02,
41
  "label2id": {
42
- "Adult": 4,
43
  "Arts_and_Entertainment": 22,
44
- "Autos_and_Vehicles": 10,
45
- "Beauty_and_Fitness": 0,
46
- "Books_and_Literature": 15,
47
- "Business_and_Industrial": 12,
48
- "Computers_and_Electronics": 16,
49
- "Finance": 21,
50
- "Food_and_Drink": 7,
51
- "Games": 23,
52
- "Health": 13,
53
- "Hobbies_and_Leisure": 24,
54
- "Home_and_Garden": 18,
55
- "Internet_and_Telecom": 17,
56
- "Jobs_and_Education": 19,
57
- "Law_and_Government": 11,
58
- "News": 8,
59
- "Online Communities": 20,
60
- "People_and_Society": 1,
61
- "Pets_and_Animals": 26,
62
- "Real Estate": 14,
63
- "Reference": 25,
64
- "Science": 6,
65
- "Sensitive Subjects": 9,
66
- "Shopping": 3,
67
- "Sports": 5,
68
- "Travel_and_Transportation": 2
69
  },
70
  "max_position_embeddings": 512,
71
  "model_type": "distilbert",
 
1
  {
2
+ "_name_or_path": "finetuned_entity_categorical_classification/checkpoint-3346",
3
  "activation": "gelu",
4
  "architectures": [
5
  "DistilBertForSequenceClassification"
 
9
  "dropout": 0.1,
10
  "hidden_dim": 3072,
11
  "id2label": {
12
+ "0": "Hobbies_and_Leisure",
13
+ "1": "News",
14
+ "2": "Science",
15
+ "3": "Autos_and_Vehicles",
16
+ "4": "Health",
17
+ "5": "Pets_and_Animals",
18
+ "6": "Adult",
19
+ "7": "Computers_and_Electronics",
20
+ "8": "Online Communities",
21
+ "9": "Beauty_and_Fitness",
22
+ "10": "People_and_Society",
23
+ "11": "Business_and_Industrial",
24
+ "12": "Reference",
25
+ "13": "Shopping",
26
+ "14": "Travel_and_Transportation",
27
+ "15": "Food_and_Drink",
28
+ "16": "Law_and_Government",
29
+ "17": "Books_and_Literature",
30
+ "18": "Finance",
31
+ "19": "Games",
32
+ "20": "Home_and_Garden",
33
+ "21": "Jobs_and_Education",
34
  "22": "Arts_and_Entertainment",
35
+ "23": "Sensitive Subjects",
36
+ "24": "Real Estate",
37
+ "25": "Internet_and_Telecom",
38
+ "26": "Sports"
39
  },
40
  "initializer_range": 0.02,
41
  "label2id": {
42
+ "Adult": 6,
43
  "Arts_and_Entertainment": 22,
44
+ "Autos_and_Vehicles": 3,
45
+ "Beauty_and_Fitness": 9,
46
+ "Books_and_Literature": 17,
47
+ "Business_and_Industrial": 11,
48
+ "Computers_and_Electronics": 7,
49
+ "Finance": 18,
50
+ "Food_and_Drink": 15,
51
+ "Games": 19,
52
+ "Health": 4,
53
+ "Hobbies_and_Leisure": 0,
54
+ "Home_and_Garden": 20,
55
+ "Internet_and_Telecom": 25,
56
+ "Jobs_and_Education": 21,
57
+ "Law_and_Government": 16,
58
+ "News": 1,
59
+ "Online Communities": 8,
60
+ "People_and_Society": 10,
61
+ "Pets_and_Animals": 5,
62
+ "Real Estate": 24,
63
+ "Reference": 12,
64
+ "Science": 2,
65
+ "Sensitive Subjects": 23,
66
+ "Shopping": 13,
67
+ "Sports": 26,
68
+ "Travel_and_Transportation": 14
69
  },
70
  "max_position_embeddings": 512,
71
  "model_type": "distilbert",
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e20e493b3480b24380280eb5e21c5fd12d4881adfb9cb57a5dd4559f3e85680
3
  size 535881018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7482411d85a2d5cf5f632c997d2e07449fe4217bcf4b1aad0b38f9138d1acd0a
3
  size 535881018
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:732fd46b0cb9afaec1d14b6595994279ae8d82e40715c5b60b128db60718e69b
3
  size 267932842
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f30aacfea59fa26f3b7edc0f510fe6d083c82c0a92e3118f80f0b13f375cb74e
3
  size 267932842
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/rng_state.pth RENAMED
Binary files a/finetuned_entity_categorical_classification/checkpoint-1576/rng_state.pth and b/finetuned_entity_categorical_classification/checkpoint-1681/rng_state.pth differ
 
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:774c81fc50874cca433d334a482dfa90f04ccfdc642ea608612d6233e8d60700
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c77a82e248c93cca9760dd3358cd21c9eded35e9713e3141aaaa12789322001
3
  size 1064
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/special_tokens_map.json RENAMED
File without changes
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/tokenizer.json RENAMED
File without changes
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/tokenizer_config.json RENAMED
File without changes
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/trainer_state.json RENAMED
@@ -1,46 +1,46 @@
1
  {
2
- "best_metric": 0.2187376469373703,
3
- "best_model_checkpoint": "finetuned_entity_categorical_classification/checkpoint-1576",
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 1576,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.32,
13
- "learning_rate": 1.957698815566836e-05,
14
- "loss": 1.5567,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.63,
19
- "learning_rate": 1.915397631133672e-05,
20
- "loss": 0.3944,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.95,
25
- "learning_rate": 1.873096446700508e-05,
26
- "loss": 0.2773,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 1.0,
31
- "eval_accuracy": 0.9374900840869427,
32
- "eval_loss": 0.2187376469373703,
33
- "eval_runtime": 2.2114,
34
- "eval_samples_per_second": 2850.256,
35
- "eval_steps_per_second": 178.169,
36
- "step": 1576
37
  }
38
  ],
39
  "logging_steps": 500,
40
- "max_steps": 23640,
41
- "num_train_epochs": 15,
42
  "save_steps": 500,
43
- "total_flos": 100414675899720.0,
44
  "trial_name": null,
45
  "trial_params": null
46
  }
 
1
  {
2
+ "best_metric": 0.10296357423067093,
3
+ "best_model_checkpoint": "finetuned_entity_categorical_classification/checkpoint-1681",
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 1681,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.3,
13
+ "learning_rate": 1.7025580011897683e-05,
14
+ "loss": 0.1045,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.59,
19
+ "learning_rate": 1.405116002379536e-05,
20
+ "loss": 0.1056,
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.89,
25
+ "learning_rate": 1.1076740035693041e-05,
26
+ "loss": 0.1041,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 1.0,
31
+ "eval_accuracy": 0.9721850364420646,
32
+ "eval_loss": 0.10296357423067093,
33
+ "eval_runtime": 2.316,
34
+ "eval_samples_per_second": 2902.854,
35
+ "eval_steps_per_second": 181.779,
36
+ "step": 1681
37
  }
38
  ],
39
  "logging_steps": 500,
40
+ "max_steps": 3362,
41
+ "num_train_epochs": 2,
42
  "save_steps": 500,
43
+ "total_flos": 108413372385396.0,
44
  "trial_name": null,
45
  "trial_params": null
46
  }
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:665da3b0732e752c339d7e2fda57582d0f87bae18392c4edd4fe1327453b2e44
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2de83bc1893d1870cbe886f5287e02f718e1fe0be09dba843ccfc561aeb95ec6
3
  size 4600
finetuned_entity_categorical_classification/{checkpoint-1576 → checkpoint-1681}/vocab.txt RENAMED
File without changes
finetuned_entity_categorical_classification/checkpoint-3346/added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "[CLS]": 101,
3
+ "[MASK]": 103,
4
+ "[PAD]": 0,
5
+ "[SEP]": 102,
6
+ "[UNK]": 100
7
+ }
finetuned_entity_categorical_classification/checkpoint-3346/config.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "finetuned_entity_categorical_classification/checkpoint-3338",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "Hobbies_and_Leisure",
13
+ "1": "News",
14
+ "2": "Science",
15
+ "3": "Autos_and_Vehicles",
16
+ "4": "Health",
17
+ "5": "Pets_and_Animals",
18
+ "6": "Adult",
19
+ "7": "Computers_and_Electronics",
20
+ "8": "Online Communities",
21
+ "9": "Beauty_and_Fitness",
22
+ "10": "People_and_Society",
23
+ "11": "Business_and_Industrial",
24
+ "12": "Reference",
25
+ "13": "Shopping",
26
+ "14": "Travel_and_Transportation",
27
+ "15": "Food_and_Drink",
28
+ "16": "Law_and_Government",
29
+ "17": "Books_and_Literature",
30
+ "18": "Finance",
31
+ "19": "Games",
32
+ "20": "Home_and_Garden",
33
+ "21": "Jobs_and_Education",
34
+ "22": "Arts_and_Entertainment",
35
+ "23": "Sensitive Subjects",
36
+ "24": "Real Estate",
37
+ "25": "Internet_and_Telecom",
38
+ "26": "Sports"
39
+ },
40
+ "initializer_range": 0.02,
41
+ "label2id": {
42
+ "Adult": 6,
43
+ "Arts_and_Entertainment": 22,
44
+ "Autos_and_Vehicles": 3,
45
+ "Beauty_and_Fitness": 9,
46
+ "Books_and_Literature": 17,
47
+ "Business_and_Industrial": 11,
48
+ "Computers_and_Electronics": 7,
49
+ "Finance": 18,
50
+ "Food_and_Drink": 15,
51
+ "Games": 19,
52
+ "Health": 4,
53
+ "Hobbies_and_Leisure": 0,
54
+ "Home_and_Garden": 20,
55
+ "Internet_and_Telecom": 25,
56
+ "Jobs_and_Education": 21,
57
+ "Law_and_Government": 16,
58
+ "News": 1,
59
+ "Online Communities": 8,
60
+ "People_and_Society": 10,
61
+ "Pets_and_Animals": 5,
62
+ "Real Estate": 24,
63
+ "Reference": 12,
64
+ "Science": 2,
65
+ "Sensitive Subjects": 23,
66
+ "Shopping": 13,
67
+ "Sports": 26,
68
+ "Travel_and_Transportation": 14
69
+ },
70
+ "max_position_embeddings": 512,
71
+ "model_type": "distilbert",
72
+ "n_heads": 12,
73
+ "n_layers": 6,
74
+ "pad_token_id": 0,
75
+ "problem_type": "single_label_classification",
76
+ "qa_dropout": 0.1,
77
+ "seq_classif_dropout": 0.2,
78
+ "sinusoidal_pos_embds": false,
79
+ "tie_weights_": true,
80
+ "torch_dtype": "float32",
81
+ "transformers_version": "4.34.0",
82
+ "vocab_size": 30522
83
+ }
finetuned_entity_categorical_classification/checkpoint-3346/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f7fd88a2fd3f16fd9c954418fb3e47832af4a6e96026f465481de95dd8e4b99
3
+ size 535881018
finetuned_entity_categorical_classification/checkpoint-3346/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0316f1198be89b32f1ee6ecde222febe6895b798b64d12f8e12b0f5bdaba754
3
+ size 267932842
finetuned_entity_categorical_classification/checkpoint-3346/rng_state.pth ADDED
Binary file (14.2 kB). View file
 
finetuned_entity_categorical_classification/checkpoint-3346/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:836b9ae7a26190d3866515097d559222a1c62e5f96c298c8360e09e55b2cf8a4
3
+ size 1064
finetuned_entity_categorical_classification/checkpoint-3346/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
finetuned_entity_categorical_classification/checkpoint-3346/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
finetuned_entity_categorical_classification/checkpoint-3346/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "additional_special_tokens": [],
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": true,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
finetuned_entity_categorical_classification/checkpoint-3346/trainer_state.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.10625720769166946,
3
+ "best_model_checkpoint": "finetuned_entity_categorical_classification/checkpoint-1673",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3346,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.3,
13
+ "learning_rate": 1.7011356843992828e-05,
14
+ "loss": 0.1126,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.6,
19
+ "learning_rate": 1.4022713687985656e-05,
20
+ "loss": 0.1165,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.9,
25
+ "learning_rate": 1.1034070531978483e-05,
26
+ "loss": 0.117,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 1.0,
31
+ "eval_accuracy": 0.9715951562266407,
32
+ "eval_loss": 0.10625720769166946,
33
+ "eval_runtime": 2.3554,
34
+ "eval_samples_per_second": 2839.909,
35
+ "eval_steps_per_second": 177.892,
36
+ "step": 1673
37
+ },
38
+ {
39
+ "epoch": 1.2,
40
+ "learning_rate": 8.04542737597131e-06,
41
+ "loss": 0.0894,
42
+ "step": 2000
43
+ },
44
+ {
45
+ "epoch": 1.49,
46
+ "learning_rate": 5.056784219964137e-06,
47
+ "loss": 0.0827,
48
+ "step": 2500
49
+ },
50
+ {
51
+ "epoch": 1.79,
52
+ "learning_rate": 2.068141063956964e-06,
53
+ "loss": 0.0755,
54
+ "step": 3000
55
+ },
56
+ {
57
+ "epoch": 2.0,
58
+ "eval_accuracy": 0.9706981611601136,
59
+ "eval_loss": 0.1135576069355011,
60
+ "eval_runtime": 2.4092,
61
+ "eval_samples_per_second": 2776.427,
62
+ "eval_steps_per_second": 173.916,
63
+ "step": 3346
64
+ }
65
+ ],
66
+ "logging_steps": 500,
67
+ "max_steps": 3346,
68
+ "num_train_epochs": 2,
69
+ "save_steps": 500,
70
+ "total_flos": 209706294909150.0,
71
+ "trial_name": null,
72
+ "trial_params": null
73
+ }
finetuned_entity_categorical_classification/checkpoint-3346/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7813e2902539bc577c8459ca958658172359e179c4ef494972d6db5de3ada53e
3
+ size 4600
finetuned_entity_categorical_classification/checkpoint-3346/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
finetuned_entity_categorical_classification/checkpoint-3362/added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "[CLS]": 101,
3
+ "[MASK]": 103,
4
+ "[PAD]": 0,
5
+ "[SEP]": 102,
6
+ "[UNK]": 100
7
+ }
finetuned_entity_categorical_classification/checkpoint-3362/config.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "finetuned_entity_categorical_classification/checkpoint-3346",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "Hobbies_and_Leisure",
13
+ "1": "News",
14
+ "2": "Science",
15
+ "3": "Autos_and_Vehicles",
16
+ "4": "Health",
17
+ "5": "Pets_and_Animals",
18
+ "6": "Adult",
19
+ "7": "Computers_and_Electronics",
20
+ "8": "Online Communities",
21
+ "9": "Beauty_and_Fitness",
22
+ "10": "People_and_Society",
23
+ "11": "Business_and_Industrial",
24
+ "12": "Reference",
25
+ "13": "Shopping",
26
+ "14": "Travel_and_Transportation",
27
+ "15": "Food_and_Drink",
28
+ "16": "Law_and_Government",
29
+ "17": "Books_and_Literature",
30
+ "18": "Finance",
31
+ "19": "Games",
32
+ "20": "Home_and_Garden",
33
+ "21": "Jobs_and_Education",
34
+ "22": "Arts_and_Entertainment",
35
+ "23": "Sensitive Subjects",
36
+ "24": "Real Estate",
37
+ "25": "Internet_and_Telecom",
38
+ "26": "Sports"
39
+ },
40
+ "initializer_range": 0.02,
41
+ "label2id": {
42
+ "Adult": 6,
43
+ "Arts_and_Entertainment": 22,
44
+ "Autos_and_Vehicles": 3,
45
+ "Beauty_and_Fitness": 9,
46
+ "Books_and_Literature": 17,
47
+ "Business_and_Industrial": 11,
48
+ "Computers_and_Electronics": 7,
49
+ "Finance": 18,
50
+ "Food_and_Drink": 15,
51
+ "Games": 19,
52
+ "Health": 4,
53
+ "Hobbies_and_Leisure": 0,
54
+ "Home_and_Garden": 20,
55
+ "Internet_and_Telecom": 25,
56
+ "Jobs_and_Education": 21,
57
+ "Law_and_Government": 16,
58
+ "News": 1,
59
+ "Online Communities": 8,
60
+ "People_and_Society": 10,
61
+ "Pets_and_Animals": 5,
62
+ "Real Estate": 24,
63
+ "Reference": 12,
64
+ "Science": 2,
65
+ "Sensitive Subjects": 23,
66
+ "Shopping": 13,
67
+ "Sports": 26,
68
+ "Travel_and_Transportation": 14
69
+ },
70
+ "max_position_embeddings": 512,
71
+ "model_type": "distilbert",
72
+ "n_heads": 12,
73
+ "n_layers": 6,
74
+ "pad_token_id": 0,
75
+ "problem_type": "single_label_classification",
76
+ "qa_dropout": 0.1,
77
+ "seq_classif_dropout": 0.2,
78
+ "sinusoidal_pos_embds": false,
79
+ "tie_weights_": true,
80
+ "torch_dtype": "float32",
81
+ "transformers_version": "4.34.0",
82
+ "vocab_size": 30522
83
+ }
finetuned_entity_categorical_classification/checkpoint-3362/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d015879f29a2744736a3ba7748885a4ec943584a74c779bc00637389c2d90ccd
3
+ size 535881018
finetuned_entity_categorical_classification/checkpoint-3362/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2f9ac5b4263d73b4fe5715bd69766cb18cb5925f401945d0c67275a65364524
3
+ size 267932842
finetuned_entity_categorical_classification/checkpoint-3362/rng_state.pth ADDED
Binary file (14.2 kB). View file
 
finetuned_entity_categorical_classification/checkpoint-3362/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8af53710b40243eb9329cc845f9ef3a957c0e1972618f070ad4cc3c95bc43973
3
+ size 1064
finetuned_entity_categorical_classification/checkpoint-3362/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
finetuned_entity_categorical_classification/checkpoint-3362/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
finetuned_entity_categorical_classification/checkpoint-3362/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "additional_special_tokens": [],
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": true,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
finetuned_entity_categorical_classification/checkpoint-3362/trainer_state.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.10296357423067093,
3
+ "best_model_checkpoint": "finetuned_entity_categorical_classification/checkpoint-1681",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3362,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.3,
13
+ "learning_rate": 1.7025580011897683e-05,
14
+ "loss": 0.1045,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.59,
19
+ "learning_rate": 1.405116002379536e-05,
20
+ "loss": 0.1056,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.89,
25
+ "learning_rate": 1.1076740035693041e-05,
26
+ "loss": 0.1041,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 1.0,
31
+ "eval_accuracy": 0.9721850364420646,
32
+ "eval_loss": 0.10296357423067093,
33
+ "eval_runtime": 2.316,
34
+ "eval_samples_per_second": 2902.854,
35
+ "eval_steps_per_second": 181.779,
36
+ "step": 1681
37
+ },
38
+ {
39
+ "epoch": 1.19,
40
+ "learning_rate": 8.10232004759072e-06,
41
+ "loss": 0.0776,
42
+ "step": 2000
43
+ },
44
+ {
45
+ "epoch": 1.49,
46
+ "learning_rate": 5.1279000594884e-06,
47
+ "loss": 0.0675,
48
+ "step": 2500
49
+ },
50
+ {
51
+ "epoch": 1.78,
52
+ "learning_rate": 2.1534800713860798e-06,
53
+ "loss": 0.0773,
54
+ "step": 3000
55
+ },
56
+ {
57
+ "epoch": 2.0,
58
+ "eval_accuracy": 0.9708463483563885,
59
+ "eval_loss": 0.11056160181760788,
60
+ "eval_runtime": 2.2742,
61
+ "eval_samples_per_second": 2956.182,
62
+ "eval_steps_per_second": 185.119,
63
+ "step": 3362
64
+ }
65
+ ],
66
+ "logging_steps": 500,
67
+ "max_steps": 3362,
68
+ "num_train_epochs": 2,
69
+ "save_steps": 500,
70
+ "total_flos": 216609059710134.0,
71
+ "trial_name": null,
72
+ "trial_params": null
73
+ }
finetuned_entity_categorical_classification/checkpoint-3362/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2de83bc1893d1870cbe886f5287e02f718e1fe0be09dba843ccfc561aeb95ec6
3
+ size 4600
finetuned_entity_categorical_classification/checkpoint-3362/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
finetuned_entity_categorical_classification/runs/Oct12_11-19-39_ip-172-31-95-165/events.out.tfevents.1697109579.ip-172-31-95-165.128350.0 ADDED
Binary file (7.68 kB). View file
 
finetuned_entity_categorical_classification/runs/Oct12_11-43-16_ip-172-31-95-165/events.out.tfevents.1697110996.ip-172-31-95-165.128941.0 ADDED
Binary file (7.68 kB). View file
 
finetuned_entity_categorical_classification/runs/Oct12_11-59-06_ip-172-31-95-165/events.out.tfevents.1697111947.ip-172-31-95-165.129502.0 ADDED
Binary file (7.68 kB). View file
 
research/08_organizing_entire_datacategories.ipynb CHANGED
@@ -438,173 +438,173 @@
438
  " </thead>\n",
439
  " <tbody>\n",
440
  " <tr>\n",
441
- " <th>920</th>\n",
442
- " <td>Virtual reality in therapy 2025</td>\n",
443
- " <td>Computers_and_Electronics</td>\n",
444
- " <td>7</td>\n",
445
- " </tr>\n",
446
- " <tr>\n",
447
- " <th>368</th>\n",
448
- " <td>Real estate networking tips</td>\n",
449
- " <td>Real Estate</td>\n",
450
- " <td>24</td>\n",
451
- " </tr>\n",
452
- " <tr>\n",
453
- " <th>173</th>\n",
454
- " <td>Real estate market outlook</td>\n",
455
- " <td>Real Estate</td>\n",
456
- " <td>24</td>\n",
457
- " </tr>\n",
458
- " <tr>\n",
459
- " <th>1045</th>\n",
460
  " <td>Plus-size clothing stores and shops</td>\n",
461
  " <td>Shopping</td>\n",
462
  " <td>13</td>\n",
463
  " </tr>\n",
464
  " <tr>\n",
465
- " <th>910</th>\n",
466
- " <td>Canyoning trips</td>\n",
467
- " <td>Hobbies_and_Leisure</td>\n",
468
- " <td>0</td>\n",
469
  " </tr>\n",
470
  " <tr>\n",
471
- " <th>1098</th>\n",
472
- " <td>Food storage organization</td>\n",
473
- " <td>Food_and_Drink</td>\n",
474
- " <td>15</td>\n",
475
  " </tr>\n",
476
  " <tr>\n",
477
- " <th>788</th>\n",
478
- " <td>Political party systems</td>\n",
479
- " <td>Law_and_Government</td>\n",
480
- " <td>16</td>\n",
481
  " </tr>\n",
482
  " <tr>\n",
483
- " <th>999</th>\n",
484
- " <td>Voting systems security</td>\n",
 
 
 
 
 
 
485
  " <td>Law_and_Government</td>\n",
486
  " <td>16</td>\n",
487
  " </tr>\n",
488
  " <tr>\n",
489
- " <th>547</th>\n",
490
- " <td>Cottage garden outdoor seating</td>\n",
491
  " <td>Home_and_Garden</td>\n",
492
  " <td>20</td>\n",
493
  " </tr>\n",
494
  " <tr>\n",
495
- " <th>210</th>\n",
496
- " <td>Sustainable fashion and ethical clothing networks</td>\n",
497
- " <td>Online Communities</td>\n",
498
- " <td>8</td>\n",
499
  " </tr>\n",
500
  " <tr>\n",
501
- " <th>466</th>\n",
502
- " <td>Medicaid</td>\n",
 
 
 
 
 
 
503
  " <td>Health</td>\n",
504
  " <td>4</td>\n",
505
  " </tr>\n",
506
  " <tr>\n",
507
- " <th>908</th>\n",
508
- " <td>Shakespearean adaptations in historical contex...</td>\n",
509
- " <td>Books_and_Literature</td>\n",
510
- " <td>17</td>\n",
511
  " </tr>\n",
512
  " <tr>\n",
513
- " <th>831</th>\n",
514
- " <td>Travel destinations and vacation planning advi...</td>\n",
515
- " <td>Online Communities</td>\n",
516
- " <td>8</td>\n",
517
  " </tr>\n",
518
  " <tr>\n",
519
- " <th>746</th>\n",
520
- " <td>Social and political activism campaigns and ev...</td>\n",
521
- " <td>Online Communities</td>\n",
522
- " <td>8</td>\n",
523
  " </tr>\n",
524
  " <tr>\n",
525
- " <th>715</th>\n",
526
- " <td>Community development resources</td>\n",
527
- " <td>People_and_Society</td>\n",
528
- " <td>10</td>\n",
529
  " </tr>\n",
530
  " <tr>\n",
531
- " <th>639</th>\n",
532
- " <td>Scientific data analysis software</td>\n",
533
- " <td>Science</td>\n",
534
- " <td>2</td>\n",
535
  " </tr>\n",
536
  " <tr>\n",
537
- " <th>830</th>\n",
538
- " <td>Real estate market reports and insights</td>\n",
539
  " <td>Real Estate</td>\n",
540
  " <td>24</td>\n",
541
  " </tr>\n",
542
  " <tr>\n",
543
- " <th>978</th>\n",
544
- " <td>Reference citation context importance measurem...</td>\n",
545
- " <td>Reference</td>\n",
546
- " <td>12</td>\n",
547
  " </tr>\n",
548
  " <tr>\n",
549
- " <th>938</th>\n",
550
- " <td>Industrial companies in Asia</td>\n",
551
- " <td>Business_and_Industrial</td>\n",
552
- " <td>11</td>\n",
553
  " </tr>\n",
554
  " <tr>\n",
555
- " <th>8</th>\n",
556
- " <td>Legal system</td>\n",
557
- " <td>Law_and_Government</td>\n",
558
- " <td>16</td>\n",
 
 
 
 
 
 
559
  " </tr>\n",
560
  " </tbody>\n",
561
  "</table>\n",
562
  "</div>"
563
  ],
564
  "text/plain": [
565
- " category \\\n",
566
- "920 Virtual reality in therapy 2025 \n",
567
- "368 Real estate networking tips \n",
568
- "173 Real estate market outlook \n",
569
- "1045 Plus-size clothing stores and shops \n",
570
- "910 Canyoning trips \n",
571
- "1098 Food storage organization \n",
572
- "788 Political party systems \n",
573
- "999 Voting systems security \n",
574
- "547 Cottage garden outdoor seating \n",
575
- "210 Sustainable fashion and ethical clothing networks \n",
576
- "466 Medicaid \n",
577
- "908 Shakespearean adaptations in historical contex... \n",
578
- "831 Travel destinations and vacation planning advi... \n",
579
- "746 Social and political activism campaigns and ev... \n",
580
- "715 Community development resources \n",
581
- "639 Scientific data analysis software \n",
582
- "830 Real estate market reports and insights \n",
583
- "978 Reference citation context importance measurem... \n",
584
- "938 Industrial companies in Asia \n",
585
- "8 Legal system \n",
586
  "\n",
587
  " label label_id \n",
588
- "920 Computers_and_Electronics 7 \n",
589
- "368 Real Estate 24 \n",
590
- "173 Real Estate 24 \n",
591
- "1045 Shopping 13 \n",
592
- "910 Hobbies_and_Leisure 0 \n",
593
- "1098 Food_and_Drink 15 \n",
594
- "788 Law_and_Government 16 \n",
595
- "999 Law_and_Government 16 \n",
596
- "547 Home_and_Garden 20 \n",
597
- "210 Online Communities 8 \n",
598
- "466 Health 4 \n",
599
- "908 Books_and_Literature 17 \n",
600
- "831 Online Communities 8 \n",
601
- "746 Online Communities 8 \n",
602
- "715 People_and_Society 10 \n",
603
- "639 Science 2 \n",
604
- "830 Real Estate 24 \n",
605
- "978 Reference 12 \n",
606
- "938 Business_and_Industrial 11 \n",
607
- "8 Law_and_Government 16 "
608
  ]
609
  },
610
  "execution_count": 10,
@@ -625,21 +625,22 @@
625
  "data": {
626
  "text/plain": [
627
  "label\n",
628
- "Food_and_Drink 1755\n",
629
- "Shopping 1505\n",
630
- "Computers_and_Electronics 1432\n",
 
631
  "Sports 1399\n",
632
  "Online Communities 1396\n",
633
  "Travel_and_Transportation 1355\n",
634
  "Internet_and_Telecom 1353\n",
635
- "Reference 1315\n",
636
  "Beauty_and_Fitness 1259\n",
637
  "People_and_Society 1250\n",
638
- "Pets_and_Animals 1228\n",
639
  "Law_and_Government 1226\n",
640
  "Home_and_Garden 1200\n",
641
  "News 1199\n",
642
  "Jobs_and_Education 1188\n",
 
643
  "Arts_and_Entertainment 1162\n",
644
  "Business_and_Industrial 1124\n",
645
  "Adult 1100\n",
@@ -647,11 +648,10 @@
647
  "Autos_and_Vehicles 1072\n",
648
  "Science 1055\n",
649
  "Hobbies_and_Leisure 1049\n",
650
- "Finance 1000\n",
651
- "Real Estate 1000\n",
652
  "Books_and_Literature 1000\n",
 
 
653
  "Games 700\n",
654
- "Sensitive Subjects 688\n",
655
  "Name: count, dtype: int64"
656
  ]
657
  },
@@ -698,121 +698,121 @@
698
  " <tbody>\n",
699
  " <tr>\n",
700
  " <th>0</th>\n",
701
- " <td>DIY woodworking projects</td>\n",
702
- " <td>Home_and_Garden</td>\n",
703
- " <td>20</td>\n",
704
  " </tr>\n",
705
  " <tr>\n",
706
  " <th>1</th>\n",
707
- " <td>Music festivals lineup leaks</td>\n",
708
- " <td>Arts_and_Entertainment</td>\n",
709
- " <td>22</td>\n",
710
  " </tr>\n",
711
  " <tr>\n",
712
  " <th>2</th>\n",
713
- " <td>Sports Team Fan Love</td>\n",
714
- " <td>Sports</td>\n",
715
- " <td>26</td>\n",
716
  " </tr>\n",
717
  " <tr>\n",
718
  " <th>3</th>\n",
719
- " <td>Food portion control and portion control apps</td>\n",
720
  " <td>Food_and_Drink</td>\n",
721
  " <td>15</td>\n",
722
  " </tr>\n",
723
  " <tr>\n",
724
  " <th>4</th>\n",
725
- " <td>Planting flower beds</td>\n",
726
- " <td>Home_and_Garden</td>\n",
727
- " <td>20</td>\n",
728
  " </tr>\n",
729
  " <tr>\n",
730
  " <th>5</th>\n",
731
- " <td>News articles</td>\n",
732
- " <td>News</td>\n",
733
- " <td>1</td>\n",
734
  " </tr>\n",
735
  " <tr>\n",
736
  " <th>6</th>\n",
737
- " <td>Organic makeup for natural look</td>\n",
738
- " <td>Beauty_and_Fitness</td>\n",
739
- " <td>9</td>\n",
740
  " </tr>\n",
741
  " <tr>\n",
742
  " <th>7</th>\n",
743
- " <td>Art history online courses in art therapy</td>\n",
744
- " <td>Arts_and_Entertainment</td>\n",
745
- " <td>22</td>\n",
746
  " </tr>\n",
747
  " <tr>\n",
748
  " <th>8</th>\n",
749
- " <td>Citation context ranking algorithms</td>\n",
750
- " <td>Reference</td>\n",
751
- " <td>12</td>\n",
752
  " </tr>\n",
753
  " <tr>\n",
754
  " <th>9</th>\n",
755
- " <td>Career assessment quizzes</td>\n",
756
- " <td>Jobs_and_Education</td>\n",
757
- " <td>21</td>\n",
758
  " </tr>\n",
759
  " <tr>\n",
760
  " <th>10</th>\n",
761
- " <td>Game streaming camera choice and setup tips</td>\n",
762
- " <td>Games</td>\n",
763
- " <td>19</td>\n",
764
  " </tr>\n",
765
  " <tr>\n",
766
  " <th>11</th>\n",
767
- " <td>Truck financing</td>\n",
768
- " <td>Autos_and_Vehicles</td>\n",
769
- " <td>3</td>\n",
770
  " </tr>\n",
771
  " <tr>\n",
772
  " <th>12</th>\n",
773
- " <td>Community engagement</td>\n",
774
- " <td>People_and_Society</td>\n",
775
- " <td>10</td>\n",
776
  " </tr>\n",
777
  " <tr>\n",
778
  " <th>13</th>\n",
779
- " <td>Creative writing prompts</td>\n",
780
- " <td>Books_and_Literature</td>\n",
781
- " <td>17</td>\n",
782
  " </tr>\n",
783
  " <tr>\n",
784
  " <th>14</th>\n",
785
- " <td>Reference citation context exploration methods</td>\n",
786
- " <td>Reference</td>\n",
787
- " <td>12</td>\n",
788
  " </tr>\n",
789
  " <tr>\n",
790
  " <th>15</th>\n",
791
- " <td>Online discussion boards</td>\n",
792
- " <td>Online Communities</td>\n",
793
- " <td>8</td>\n",
794
  " </tr>\n",
795
  " <tr>\n",
796
  " <th>16</th>\n",
797
- " <td>Tarantula species care for advanced keepers an...</td>\n",
798
- " <td>Pets_and_Animals</td>\n",
799
- " <td>5</td>\n",
800
  " </tr>\n",
801
  " <tr>\n",
802
  " <th>17</th>\n",
803
- " <td>Cycling workouts</td>\n",
804
- " <td>Beauty_and_Fitness</td>\n",
805
- " <td>9</td>\n",
806
  " </tr>\n",
807
  " <tr>\n",
808
  " <th>18</th>\n",
809
- " <td>Webcomics and graphic novels discussion groups</td>\n",
810
- " <td>Online Communities</td>\n",
811
- " <td>8</td>\n",
812
  " </tr>\n",
813
  " <tr>\n",
814
  " <th>19</th>\n",
815
- " <td>eSports Game Content Creation</td>\n",
816
  " <td>Sports</td>\n",
817
  " <td>26</td>\n",
818
  " </tr>\n",
@@ -821,49 +821,49 @@
821
  "</div>"
822
  ],
823
  "text/plain": [
824
- " category label \\\n",
825
- "0 DIY woodworking projects Home_and_Garden \n",
826
- "1 Music festivals lineup leaks Arts_and_Entertainment \n",
827
- "2 Sports Team Fan Love Sports \n",
828
- "3 Food portion control and portion control apps Food_and_Drink \n",
829
- "4 Planting flower beds Home_and_Garden \n",
830
- "5 News articles News \n",
831
- "6 Organic makeup for natural look Beauty_and_Fitness \n",
832
- "7 Art history online courses in art therapy Arts_and_Entertainment \n",
833
- "8 Citation context ranking algorithms Reference \n",
834
- "9 Career assessment quizzes Jobs_and_Education \n",
835
- "10 Game streaming camera choice and setup tips Games \n",
836
- "11 Truck financing Autos_and_Vehicles \n",
837
- "12 Community engagement People_and_Society \n",
838
- "13 Creative writing prompts Books_and_Literature \n",
839
- "14 Reference citation context exploration methods Reference \n",
840
- "15 Online discussion boards Online Communities \n",
841
- "16 Tarantula species care for advanced keepers an... Pets_and_Animals \n",
842
- "17 Cycling workouts Beauty_and_Fitness \n",
843
- "18 Webcomics and graphic novels discussion groups Online Communities \n",
844
- "19 eSports Game Content Creation Sports \n",
845
  "\n",
846
- " label_id \n",
847
- "0 20 \n",
848
- "1 22 \n",
849
- "2 26 \n",
850
- "3 15 \n",
851
- "4 20 \n",
852
- "5 1 \n",
853
- "6 9 \n",
854
- "7 22 \n",
855
- "8 12 \n",
856
- "9 21 \n",
857
- "10 19 \n",
858
- "11 3 \n",
859
- "12 10 \n",
860
- "13 17 \n",
861
- "14 12 \n",
862
- "15 8 \n",
863
- "16 5 \n",
864
- "17 9 \n",
865
- "18 8 \n",
866
- "19 26 "
867
  ]
868
  },
869
  "execution_count": 12,
 
438
  " </thead>\n",
439
  " <tbody>\n",
440
  " <tr>\n",
441
+ " <th>1201</th>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  " <td>Plus-size clothing stores and shops</td>\n",
443
  " <td>Shopping</td>\n",
444
  " <td>13</td>\n",
445
  " </tr>\n",
446
  " <tr>\n",
447
+ " <th>853</th>\n",
448
+ " <td>Citation context extraction techniques</td>\n",
449
+ " <td>Reference</td>\n",
450
+ " <td>12</td>\n",
451
  " </tr>\n",
452
  " <tr>\n",
453
+ " <th>1034</th>\n",
454
+ " <td>Cat ear headphones with aux</td>\n",
455
+ " <td>Computers_and_Electronics</td>\n",
456
+ " <td>7</td>\n",
457
  " </tr>\n",
458
  " <tr>\n",
459
+ " <th>632</th>\n",
460
+ " <td>promote such behavior</td>\n",
461
+ " <td>Sensitive Subjects</td>\n",
462
+ " <td>23</td>\n",
463
  " </tr>\n",
464
  " <tr>\n",
465
+ " <th>91</th>\n",
466
+ " <td>Literature review references</td>\n",
467
+ " <td>Reference</td>\n",
468
+ " <td>12</td>\n",
469
+ " </tr>\n",
470
+ " <tr>\n",
471
+ " <th>168</th>\n",
472
+ " <td>Freedom of speech cases</td>\n",
473
  " <td>Law_and_Government</td>\n",
474
  " <td>16</td>\n",
475
  " </tr>\n",
476
  " <tr>\n",
477
+ " <th>1111</th>\n",
478
+ " <td>French country kitchen design inspiration DIY</td>\n",
479
  " <td>Home_and_Garden</td>\n",
480
  " <td>20</td>\n",
481
  " </tr>\n",
482
  " <tr>\n",
483
+ " <th>492</th>\n",
484
+ " <td>Credit score improvement techniques overview</td>\n",
485
+ " <td>Finance</td>\n",
486
+ " <td>18</td>\n",
487
  " </tr>\n",
488
  " <tr>\n",
489
+ " <th>657</th>\n",
490
+ " <td>regulated by laws</td>\n",
491
+ " <td>Sensitive Subjects</td>\n",
492
+ " <td>23</td>\n",
493
+ " </tr>\n",
494
+ " <tr>\n",
495
+ " <th>1037</th>\n",
496
+ " <td>Health Education for Seniors</td>\n",
497
  " <td>Health</td>\n",
498
  " <td>4</td>\n",
499
  " </tr>\n",
500
  " <tr>\n",
501
+ " <th>109</th>\n",
502
+ " <td>Quantum mechanics experiments</td>\n",
503
+ " <td>Science</td>\n",
504
+ " <td>2</td>\n",
505
  " </tr>\n",
506
  " <tr>\n",
507
+ " <th>538</th>\n",
508
+ " <td>Healthcare AI applications</td>\n",
509
+ " <td>Science</td>\n",
510
+ " <td>2</td>\n",
511
  " </tr>\n",
512
  " <tr>\n",
513
+ " <th>1386</th>\n",
514
+ " <td>AirPods Pro Case</td>\n",
515
+ " <td>Computers_and_Electronics</td>\n",
516
+ " <td>7</td>\n",
517
  " </tr>\n",
518
  " <tr>\n",
519
+ " <th>844</th>\n",
520
+ " <td>DIY home electrical repairs</td>\n",
521
+ " <td>Home_and_Garden</td>\n",
522
+ " <td>20</td>\n",
523
  " </tr>\n",
524
  " <tr>\n",
525
+ " <th>439</th>\n",
526
+ " <td>tube sex</td>\n",
527
+ " <td>Adult</td>\n",
528
+ " <td>6</td>\n",
529
  " </tr>\n",
530
  " <tr>\n",
531
+ " <th>231</th>\n",
532
+ " <td>Real estate sales tactics</td>\n",
533
  " <td>Real Estate</td>\n",
534
  " <td>24</td>\n",
535
  " </tr>\n",
536
  " <tr>\n",
537
+ " <th>610</th>\n",
538
+ " <td>Home solar panel cleaning and maintenance</td>\n",
539
+ " <td>Home_and_Garden</td>\n",
540
+ " <td>20</td>\n",
541
  " </tr>\n",
542
  " <tr>\n",
543
+ " <th>422</th>\n",
544
+ " <td>Real estate legal issues</td>\n",
545
+ " <td>Real Estate</td>\n",
546
+ " <td>24</td>\n",
547
  " </tr>\n",
548
  " <tr>\n",
549
+ " <th>222</th>\n",
550
+ " <td>Film industry news</td>\n",
551
+ " <td>Arts_and_Entertainment</td>\n",
552
+ " <td>22</td>\n",
553
+ " </tr>\n",
554
+ " <tr>\n",
555
+ " <th>1077</th>\n",
556
+ " <td>Cat ear headphones for PS4</td>\n",
557
+ " <td>Computers_and_Electronics</td>\n",
558
+ " <td>7</td>\n",
559
  " </tr>\n",
560
  " </tbody>\n",
561
  "</table>\n",
562
  "</div>"
563
  ],
564
  "text/plain": [
565
+ " category \\\n",
566
+ "1201 Plus-size clothing stores and shops \n",
567
+ "853 Citation context extraction techniques \n",
568
+ "1034 Cat ear headphones with aux \n",
569
+ "632 promote such behavior \n",
570
+ "91 Literature review references \n",
571
+ "168 Freedom of speech cases \n",
572
+ "1111 French country kitchen design inspiration DIY \n",
573
+ "492 Credit score improvement techniques overview \n",
574
+ "657 regulated by laws \n",
575
+ "1037 Health Education for Seniors \n",
576
+ "109 Quantum mechanics experiments \n",
577
+ "538 Healthcare AI applications \n",
578
+ "1386 AirPods Pro Case \n",
579
+ "844 DIY home electrical repairs \n",
580
+ "439 tube sex \n",
581
+ "231 Real estate sales tactics \n",
582
+ "610 Home solar panel cleaning and maintenance \n",
583
+ "422 Real estate legal issues \n",
584
+ "222 Film industry news \n",
585
+ "1077 Cat ear headphones for PS4 \n",
586
  "\n",
587
  " label label_id \n",
588
+ "1201 Shopping 13 \n",
589
+ "853 Reference 12 \n",
590
+ "1034 Computers_and_Electronics 7 \n",
591
+ "632 Sensitive Subjects 23 \n",
592
+ "91 Reference 12 \n",
593
+ "168 Law_and_Government 16 \n",
594
+ "1111 Home_and_Garden 20 \n",
595
+ "492 Finance 18 \n",
596
+ "657 Sensitive Subjects 23 \n",
597
+ "1037 Health 4 \n",
598
+ "109 Science 2 \n",
599
+ "538 Science 2 \n",
600
+ "1386 Computers_and_Electronics 7 \n",
601
+ "844 Home_and_Garden 20 \n",
602
+ "439 Adult 6 \n",
603
+ "231 Real Estate 24 \n",
604
+ "610 Home_and_Garden 20 \n",
605
+ "422 Real Estate 24 \n",
606
+ "222 Arts_and_Entertainment 22 \n",
607
+ "1077 Computers_and_Electronics 7 "
608
  ]
609
  },
610
  "execution_count": 10,
 
625
  "data": {
626
  "text/plain": [
627
  "label\n",
628
+ "Computers_and_Electronics 1959\n",
629
+ "Shopping 1912\n",
630
+ "Food_and_Drink 1851\n",
631
+ "Reference 1453\n",
632
  "Sports 1399\n",
633
  "Online Communities 1396\n",
634
  "Travel_and_Transportation 1355\n",
635
  "Internet_and_Telecom 1353\n",
636
+ "Pets_and_Animals 1324\n",
637
  "Beauty_and_Fitness 1259\n",
638
  "People_and_Society 1250\n",
 
639
  "Law_and_Government 1226\n",
640
  "Home_and_Garden 1200\n",
641
  "News 1199\n",
642
  "Jobs_and_Education 1188\n",
643
+ "Real Estate 1166\n",
644
  "Arts_and_Entertainment 1162\n",
645
  "Business_and_Industrial 1124\n",
646
  "Adult 1100\n",
 
648
  "Autos_and_Vehicles 1072\n",
649
  "Science 1055\n",
650
  "Hobbies_and_Leisure 1049\n",
 
 
651
  "Books_and_Literature 1000\n",
652
+ "Finance 1000\n",
653
+ "Sensitive Subjects 762\n",
654
  "Games 700\n",
 
655
  "Name: count, dtype: int64"
656
  ]
657
  },
 
698
  " <tbody>\n",
699
  " <tr>\n",
700
  " <th>0</th>\n",
701
+ " <td>Internet usage monitoring</td>\n",
702
+ " <td>Internet_and_Telecom</td>\n",
703
+ " <td>25</td>\n",
704
  " </tr>\n",
705
  " <tr>\n",
706
  " <th>1</th>\n",
707
+ " <td>Food safety guidelines and regulations</td>\n",
708
+ " <td>Food_and_Drink</td>\n",
709
+ " <td>15</td>\n",
710
  " </tr>\n",
711
  " <tr>\n",
712
  " <th>2</th>\n",
713
+ " <td>Internet protocols and edge computing in finance</td>\n",
714
+ " <td>Internet_and_Telecom</td>\n",
715
+ " <td>25</td>\n",
716
  " </tr>\n",
717
  " <tr>\n",
718
  " <th>3</th>\n",
719
+ " <td>Online grocery shopping</td>\n",
720
  " <td>Food_and_Drink</td>\n",
721
  " <td>15</td>\n",
722
  " </tr>\n",
723
  " <tr>\n",
724
  " <th>4</th>\n",
725
+ " <td>Writing retreats for poets and novelists</td>\n",
726
+ " <td>Books_and_Literature</td>\n",
727
+ " <td>17</td>\n",
728
  " </tr>\n",
729
  " <tr>\n",
730
  " <th>5</th>\n",
731
+ " <td>Unicorn cat ear headphones</td>\n",
732
+ " <td>Computers_and_Electronics</td>\n",
733
+ " <td>7</td>\n",
734
  " </tr>\n",
735
  " <tr>\n",
736
  " <th>6</th>\n",
737
+ " <td>Reference citation context tagging techniques</td>\n",
738
+ " <td>Reference</td>\n",
739
+ " <td>12</td>\n",
740
  " </tr>\n",
741
  " <tr>\n",
742
  " <th>7</th>\n",
743
+ " <td>Motorcycle riding tips for beginners gear chec...</td>\n",
744
+ " <td>Autos_and_Vehicles</td>\n",
745
+ " <td>3</td>\n",
746
  " </tr>\n",
747
  " <tr>\n",
748
  " <th>8</th>\n",
749
+ " <td>Space agency missions</td>\n",
750
+ " <td>Science</td>\n",
751
+ " <td>2</td>\n",
752
  " </tr>\n",
753
  " <tr>\n",
754
  " <th>9</th>\n",
755
+ " <td>Game streaming self-promotion and growth tactics</td>\n",
756
+ " <td>Games</td>\n",
757
+ " <td>19</td>\n",
758
  " </tr>\n",
759
  " <tr>\n",
760
  " <th>10</th>\n",
761
+ " <td>sex videos movies</td>\n",
762
+ " <td>Adult</td>\n",
763
+ " <td>6</td>\n",
764
  " </tr>\n",
765
  " <tr>\n",
766
  " <th>11</th>\n",
767
+ " <td>Citation context organization methods</td>\n",
768
+ " <td>Reference</td>\n",
769
+ " <td>12</td>\n",
770
  " </tr>\n",
771
  " <tr>\n",
772
  " <th>12</th>\n",
773
+ " <td>Healthy office snacks</td>\n",
774
+ " <td>Health</td>\n",
775
+ " <td>4</td>\n",
776
  " </tr>\n",
777
  " <tr>\n",
778
  " <th>13</th>\n",
779
+ " <td>Indigenous rights advocacy</td>\n",
780
+ " <td>People_and_Society</td>\n",
781
+ " <td>10</td>\n",
782
  " </tr>\n",
783
  " <tr>\n",
784
  " <th>14</th>\n",
785
+ " <td>News talk shows</td>\n",
786
+ " <td>News</td>\n",
787
+ " <td>1</td>\n",
788
  " </tr>\n",
789
  " <tr>\n",
790
  " <th>15</th>\n",
791
+ " <td>Best facial cleansers</td>\n",
792
+ " <td>Hobbies_and_Leisure</td>\n",
793
+ " <td>0</td>\n",
794
  " </tr>\n",
795
  " <tr>\n",
796
  " <th>16</th>\n",
797
+ " <td>Letter of recommendation</td>\n",
798
+ " <td>Reference</td>\n",
799
+ " <td>12</td>\n",
800
  " </tr>\n",
801
  " <tr>\n",
802
  " <th>17</th>\n",
803
+ " <td>Fossil preservation techniques</td>\n",
804
+ " <td>Science</td>\n",
805
+ " <td>2</td>\n",
806
  " </tr>\n",
807
  " <tr>\n",
808
  " <th>18</th>\n",
809
+ " <td>Marriage equality</td>\n",
810
+ " <td>People_and_Society</td>\n",
811
+ " <td>10</td>\n",
812
  " </tr>\n",
813
  " <tr>\n",
814
  " <th>19</th>\n",
815
+ " <td>eSports Game Esports Player Fan Engagement Ini...</td>\n",
816
  " <td>Sports</td>\n",
817
  " <td>26</td>\n",
818
  " </tr>\n",
 
821
  "</div>"
822
  ],
823
  "text/plain": [
824
+ " category \\\n",
825
+ "0 Internet usage monitoring \n",
826
+ "1 Food safety guidelines and regulations \n",
827
+ "2 Internet protocols and edge computing in finance \n",
828
+ "3 Online grocery shopping \n",
829
+ "4 Writing retreats for poets and novelists \n",
830
+ "5 Unicorn cat ear headphones \n",
831
+ "6 Reference citation context tagging techniques \n",
832
+ "7 Motorcycle riding tips for beginners gear chec... \n",
833
+ "8 Space agency missions \n",
834
+ "9 Game streaming self-promotion and growth tactics \n",
835
+ "10 sex videos movies \n",
836
+ "11 Citation context organization methods \n",
837
+ "12 Healthy office snacks \n",
838
+ "13 Indigenous rights advocacy \n",
839
+ "14 News talk shows \n",
840
+ "15 Best facial cleansers \n",
841
+ "16 Letter of recommendation \n",
842
+ "17 Fossil preservation techniques \n",
843
+ "18 Marriage equality \n",
844
+ "19 eSports Game Esports Player Fan Engagement Ini... \n",
845
  "\n",
846
+ " label label_id \n",
847
+ "0 Internet_and_Telecom 25 \n",
848
+ "1 Food_and_Drink 15 \n",
849
+ "2 Internet_and_Telecom 25 \n",
850
+ "3 Food_and_Drink 15 \n",
851
+ "4 Books_and_Literature 17 \n",
852
+ "5 Computers_and_Electronics 7 \n",
853
+ "6 Reference 12 \n",
854
+ "7 Autos_and_Vehicles 3 \n",
855
+ "8 Science 2 \n",
856
+ "9 Games 19 \n",
857
+ "10 Adult 6 \n",
858
+ "11 Reference 12 \n",
859
+ "12 Health 4 \n",
860
+ "13 People_and_Society 10 \n",
861
+ "14 News 1 \n",
862
+ "15 Hobbies_and_Leisure 0 \n",
863
+ "16 Reference 12 \n",
864
+ "17 Science 2 \n",
865
+ "18 People_and_Society 10 \n",
866
+ "19 Sports 26 "
867
  ]
868
  },
869
  "execution_count": 12,