Pranjal2041 committed on
Commit
4014562
1 Parent(s): 353ec7a

Initial Commit

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitignore +4 -0
  2. .vscode/launch.json +13 -0
  3. ExamplesCreator.ipynb +116 -0
  4. Scrape.ipynb +0 -0
  5. amzn_examples.json +0 -0
  6. cleaned_code/Demo.ipynb +0 -0
  7. cleaned_code/DemoFast.ipynb +875 -0
  8. cleaned_code/bert_coil_map_dict_lemma255K_isotropic.json +0 -0
  9. cleaned_code/ckpt/Amzn13K/amzn_main_model.bin +3 -0
  10. cleaned_code/configs/PredsRemover.ipynb +149 -0
  11. cleaned_code/configs/ablation_amzn_1_coil.yml +85 -0
  12. cleaned_code/configs/ablation_amzn_1_descs.yml +89 -0
  13. cleaned_code/configs/ablation_amzn_1_hier.yml +85 -0
  14. cleaned_code/configs/ablation_amzn_1_relax.yml +86 -0
  15. cleaned_code/configs/ablation_amzn_eda.yml +81 -0
  16. cleaned_code/configs/ablation_amzn_eda_base.yml +85 -0
  17. cleaned_code/configs/ablation_amzn_eda_base2.yml +84 -0
  18. cleaned_code/configs/ablation_eurlex_1_base.yml +85 -0
  19. cleaned_code/configs/ablation_eurlex_1_coil.yml +88 -0
  20. cleaned_code/configs/ablation_eurlex_1_descs.yml +91 -0
  21. cleaned_code/configs/ablation_eurlex_1_hier_descs.yml +91 -0
  22. cleaned_code/configs/ablation_eurlex_1_hierarchy.yml +88 -0
  23. cleaned_code/configs/ablation_eurlex_1_relax.yml +86 -0
  24. cleaned_code/configs/ablation_eurlex_eda.yml +82 -0
  25. cleaned_code/configs/amzn13k_active_hfwnet.yml +79 -0
  26. cleaned_code/configs/amzn13k_active_highfreq.yml +87 -0
  27. cleaned_code/configs/amzn13k_active_random.yml +81 -0
  28. cleaned_code/configs/amzn13k_active_wnet.yml +79 -0
  29. cleaned_code/configs/amzn13k_active_wnet2.yml +86 -0
  30. cleaned_code/configs/amzn13k_baseline.yml +73 -0
  31. cleaned_code/configs/amzn13k_baseline_descs.yml +81 -0
  32. cleaned_code/configs/amzn13k_baseline_descs_edaaug.yml +75 -0
  33. cleaned_code/configs/amzn13k_baseline_descs_fullsup.yml +74 -0
  34. cleaned_code/configs/amzn13k_baseline_descs_masked_0.0.yml +75 -0
  35. cleaned_code/configs/amzn13k_baseline_descs_masked_0.2.yml +75 -0
  36. cleaned_code/configs/amzn13k_baseline_descs_masked_0.5.yml +75 -0
  37. cleaned_code/configs/amzn13k_baseline_descs_masked_0.9.yml +75 -0
  38. cleaned_code/configs/amzn13k_baseline_descs_merge.yml +76 -0
  39. cleaned_code/configs/amzn13k_baseline_fs.yml +80 -0
  40. cleaned_code/configs/amzn13k_baseline_fs2.yml +80 -0
  41. cleaned_code/configs/amzn13k_baseline_fs5.yml +80 -0
  42. cleaned_code/configs/amzn13k_baseline_hierdescs.yml +84 -0
  43. cleaned_code/configs/amzn13k_baseline_hierdescs_seen.yml +82 -0
  44. cleaned_code/configs/baseline.yml +52 -0
  45. cleaned_code/configs/eurlex4.3k_baseline.yml +87 -0
  46. cleaned_code/configs/eurlex4.3k_baseline2.yml +84 -0
  47. cleaned_code/configs/eurlex4.3k_baseline_fs.yml +90 -0
  48. cleaned_code/configs/eurlex4.3k_baseline_fs20.yml +90 -0
  49. cleaned_code/configs/eurlex4.3k_baseline_fs5.yml +78 -0
  50. cleaned_code/configs/eurlex4.3k_baseline_nl.yml +88 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ cleaned_code/temp_file.pkl
+ cleaned_code/precomputed/Amzn13K/amzn_base_labels_data2.pkl
+ cleaned_code/precomputed/Amzn13K/amzn_base_labels_data3.pkl
+ __pycache__
.vscode/launch.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Python: Current File",
+ "type": "python",
+ "request": "launch",
+ "program": "${file}",
+ "console": "integratedTerminal",
+ "justMyCode": true
+ }
+ ]
+ }
ExamplesCreator.ipynb ADDED
@@ -0,0 +1,116 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/n/fs/nlp-pranjal\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd ../../../"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/n/fs/nlp-pranjal\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd SemSup-LMLC/training/datasets/Amzn13K"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "td = [json.loads(x) for x in open('test.jsonl')]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "examples = np.random.choice(td, 100, replace=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "json.dump(list(examples), open('amzn_examples.json','w'), indent=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "90fcbf6f06d9a30c70fdaff45e14c5534421a599dc22a7267c486c9cb67dea6d"
+ },
+ "kernelspec": {
+ "display_name": "Python 3.9.12 ('base')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
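The notebook above amounts to a short sampling script. A standalone sketch of the same steps (the fixed seed is an addition for reproducibility; the original cells draw unseeded):

import json
import numpy as np

# Sketch of ExamplesCreator.ipynb as a plain script; run it from
# SemSup-LMLC/training/datasets/Amzn13K, as the %cd cells above do.
rng = np.random.default_rng(0)  # seeding is an addition; the notebook is unseeded

with open('test.jsonl') as f:
    td = [json.loads(line) for line in f]

# Sample 100 distinct test documents by index, which avoids turning the
# list of dicts into a numpy object array the way np.random.choice(td, ...) does.
idx = rng.choice(len(td), size=100, replace=False)
examples = [td[int(i)] for i in idx]

with open('amzn_examples.json', 'w') as f:
    json.dump(examples, f, indent=2)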
Scrape.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
amzn_examples.json ADDED
The diff for this file is too large to render. See raw diff
 
cleaned_code/Demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
cleaned_code/DemoFast.ipynb ADDED
@@ -0,0 +1,875 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import numpy as np\n",
11
+ "import pickle\n",
12
+ "import h5py\n",
13
+ "from tqdm import tqdm\n",
14
+ "from transformers import AutoTokenizer\n",
15
+ "from scipy.special import expit "
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 2,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "def compute_tok_score_cart(doc_reps, doc_input_ids, qry_reps, qry_input_ids, qry_attention_mask):\n",
25
+ " qry_input_ids = qry_input_ids.unsqueeze(2).unsqueeze(3) # Q * LQ * 1 * 1\n",
26
+ " doc_input_ids = doc_input_ids.unsqueeze(0).unsqueeze(1) # 1 * 1 * D * LD\n",
27
+ " exact_match = doc_input_ids == qry_input_ids # Q * LQ * D * LD\n",
28
+ " exact_match = exact_match.float()\n",
29
+ " scores_no_masking = torch.matmul(\n",
30
+ " qry_reps.view(-1, 16), # (Q * LQ) * d\n",
31
+ " doc_reps.view(-1, 16).transpose(0, 1) # d * (D * LD)\n",
32
+ " )\n",
33
+ " scores_no_masking = scores_no_masking.view(\n",
34
+ " *qry_reps.shape[:2], *doc_reps.shape[:2]) # Q * LQ * D * LD\n",
35
+ " scores, _ = (scores_no_masking * exact_match).max(dim=3) # Q * LQ * D\n",
36
+ " tok_scores = (scores * qry_attention_mask.reshape(-1, qry_attention_mask.shape[-1]).unsqueeze(2))[:, 1:].sum(1)\n",
37
+ " \n",
38
+ " return tok_scores\n",
39
+ "\n",
40
+ "import torch\n",
41
+ "from typing import Optional\n",
42
+ "def coil_fast_eval_forward(\n",
43
+ " input_ids: Optional[torch.Tensor] = None,\n",
44
+ " doc_reps = None,\n",
45
+ " logits: Optional[torch.Tensor] = None,\n",
46
+ " desc_input_ids = None,\n",
47
+ " desc_attention_mask = None,\n",
48
+ " lab_reps = None,\n",
49
+ " label_embeddings = None\n",
50
+ "):\n",
51
+ " tok_scores = compute_tok_score_cart(\n",
52
+ " doc_reps, input_ids,\n",
53
+ " lab_reps, desc_input_ids.reshape(-1, desc_input_ids.shape[-1]), desc_attention_mask\n",
54
+ " )\n",
55
+ " logits = (logits.unsqueeze(0) @ label_embeddings.T)\n",
56
+ " new_tok_scores = torch.zeros(logits.shape, device = logits.device)\n",
57
+ " for i in range(tok_scores.shape[1]):\n",
58
+ " stride = tok_scores.shape[0]//tok_scores.shape[1]\n",
59
+ " new_tok_scores[i] = tok_scores[i*stride: i*stride + stride ,i]\n",
60
+ " return (logits + new_tok_scores).squeeze()"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 3,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "label_list = [x.strip() for x in open('datasets/Amzn13K/all_labels.txt')]\n",
70
+ "unseen_label_list = [x.strip() for x in open('datasets/Amzn13K/unseen_labels_split6500_2.txt')]\n",
71
+ "num_labels = len(label_list)\n",
72
+ "label_list.sort() # For consistency\n",
73
+ "l2i = {v: i for i, v in enumerate(label_list)}\n",
74
+ "unseen_label_indexes = [l2i[x] for x in unseen_label_list]"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 4,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "import json\n",
84
+ "coil_cluster_map = json.load(open('bert_coil_map_dict_lemma255K_isotropic.json')) "
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 22,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "label_preds = pickle.load(open('/n/fs/nlp-pranjal/SemSup-LMLC/training/ablation_amzn_1_main_labels_zsl.pkl','rb'))"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 20,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "label_preds = pickle.load(open('/n/fs/scratch/pranjal/seed_experiments/ablation_amzn_eda_labels_zsl_seed2.pkl','rb'))"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": 38,
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "name": "stderr",
112
+ "output_type": "stream",
113
+ "text": [
114
+ "100%|██████████| 13330/13330 [00:00<00:00, 64680.71it/s]\n"
115
+ ]
116
+ }
117
+ ],
118
+ "source": [
119
+ "all_lab_reps, all_label_embeddings, all_desc_input_ids, all_desc_attention_mask = [], [], [], []\n",
120
+ "for l in tqdm(label_list):\n",
121
+ " ll = label_preds[l]\n",
122
+ " lab_reps, label_embeddings, desc_input_ids, desc_attention_mask = ll[np.random.randint(len(ll))] \n",
123
+ " all_lab_reps.append(lab_reps.squeeze())\n",
124
+ " all_label_embeddings.append(label_embeddings.squeeze())\n",
125
+ " all_desc_input_ids.append(desc_input_ids.squeeze())\n",
126
+ " all_desc_attention_mask.append(desc_attention_mask.squeeze())\n",
127
+ "all_lab_reps = torch.stack(all_lab_reps).cpu()\n",
128
+ "all_label_embeddings = torch.stack(all_label_embeddings).cpu()\n",
129
+ "all_desc_input_ids = torch.stack(all_desc_input_ids).cpu()\n",
130
+ "all_desc_attention_mask = torch.stack(all_desc_attention_mask).cpu()\n",
131
+ "all_desc_input_ids_clus = torch.tensor([[coil_cluster_map[str(x.item())] for x in xx] for xx in all_desc_input_ids])"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "pickle.dump([all_lab_reps, all_label_embeddings, all_desc_input_ids, all_desc_input_ids_clus, all_desc_attention_mask], open('precomputed/Amzn13K/amzn_base_labels_data1_4.pkl','wb'))"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 6,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 7,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "all_lab_reps1, all_label_embeddings1, _, all_desc_input_ids1, all_desc_attention_mask1 = pickle.load(open('precomputed/Amzn13K/amzn_base_labels_data1.pkl','rb'))\n",
159
+ "all_lab_reps2, all_label_embeddings2, _, all_desc_input_ids2, all_desc_attention_mask2 = pickle.load(open('precomputed/Amzn13K/amzn_base_labels_data2.pkl','rb'))\n",
160
+ "all_lab_reps3, all_label_embeddings3, _, all_desc_input_ids3, all_desc_attention_mask3 = pickle.load(open('precomputed/Amzn13K/amzn_base_labels_data3.pkl','rb'))\n",
161
+ "\n",
162
+ "\n",
163
+ "all_lab_reps = [all_lab_reps1.to(device), all_lab_reps2.to(device), all_lab_reps3.to(device)]\n",
164
+ "all_label_embeddings = [all_label_embeddings1.to(device), all_label_embeddings2.to(device), all_label_embeddings3.to(device)]\n",
165
+ "all_desc_input_ids = [all_desc_input_ids1.to(device), all_desc_input_ids2.to(device), all_desc_input_ids3.to(device)]\n",
166
+ "all_desc_attention_mask = [all_desc_attention_mask1.to(device), all_desc_attention_mask2.to(device), all_desc_attention_mask3.to(device)]"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 8,
172
+ "metadata": {},
173
+ "outputs": [
174
+ {
175
+ "name": "stdout",
176
+ "output_type": "stream",
177
+ "text": [
178
+ "Yaml Config is:\n",
179
+ "--------------------------------------------------------------------------------\n",
180
+ "{'task_name': 'amazon13k', 'dataset_name': 'amazon13k', 'dataset_config_name': None, 'max_seq_length': 160, 'overwrite_output_dir': False, 'overwrite_cache': False, 'pad_to_max_length': True, 'load_from_local': True, 'max_train_samples': None, 'max_eval_samples': 15000, 'max_predict_samples': None, 'train_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/train_split6500_2.jsonl', 'validation_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/test_unseen_split6500_2.jsonl', 'test_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/test_unseen_split6500_2.jsonl', 'label_max_seq_length': 160, 'descriptions_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_edaaug.json', 'test_descriptions_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/heir_withdescriptions_v3_v3.json', 'all_labels': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/all_labels.txt', 'test_labels': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/unseen_labels_split6500_2.txt', 'contrastive_learning_samples': 1000, 'cl_min_positive_descs': 1, 'coil_cluster_mapping_path': 'bert_coil_map_dict_lemma255K_isotropic.json', 'model_name_or_path': 'bert-base-uncased', 'config_name': None, 'tokenizer_name': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'model_revision': 'main', 'use_auth_token': False, 'ignore_mismatched_sizes': False, 'negative_sampling': 'none', 'semsup': True, 'label_model_name_or_path': 'prajjwal1/bert-small', 'encoder_model_type': 'bert', 'use_custom_optimizer': 'adamw', 'output_learning_rate': 0.0001, 'arch_type': 2, 'add_label_name': True, 'normalize_embeddings': False, 'tie_weights': False, 'coil': True, 'colbert': False, 'token_dim': 16, 'label_frozen_layers': 2, 'do_train': True, 'do_eval': True, 'do_predict': False, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 8, 'per_device_eval_batch_size': 1, 'learning_rate': 5e-05, 'num_train_epochs': 2, 'save_steps': 4900, 'evaluation_strategy': 'steps', 'eval_steps': 3000000, 'fp16': True, 'fp16_opt_level': 'O1', 'lr_scheduler_type': 'linear', 'dataloader_num_workers': 16, 'label_names': ['labels'], 'scenario': 'unseen_labels', 'ddp_find_unused_parameters': False, 'ignore_data_skip': True, 'seed': -1, 'EXP_NAME': 'semsup_descs_100ep_newds_cosine', 'EXP_DESC': 'SemSup Descriptions ran for 100 epochs', 'output_dir': 'demo_tmp'}\n",
181
+ "--------------------------------------------------------------------------------\n"
182
+ ]
183
+ },
184
+ {
185
+ "name": "stderr",
186
+ "output_type": "stream",
187
+ "text": [
188
+ "Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n",
189
+ "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
190
+ "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
191
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n",
192
+ "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
193
+ "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
194
+ ]
195
+ },
196
+ {
197
+ "name": "stdout",
198
+ "output_type": "stream",
199
+ "text": [
200
+ "Config is BertConfig {\n",
201
+ " \"_name_or_path\": \"bert-base-uncased\",\n",
202
+ " \"arch_type\": 2,\n",
203
+ " \"architectures\": [\n",
204
+ " \"BertForMaskedLM\"\n",
205
+ " ],\n",
206
+ " \"attention_probs_dropout_prob\": 0.1,\n",
207
+ " \"classifier_dropout\": null,\n",
208
+ " \"coil\": true,\n",
209
+ " \"colbert\": false,\n",
210
+ " \"encoder_model_type\": \"bert\",\n",
211
+ " \"finetuning_task\": \"amazon13k\",\n",
212
+ " \"gradient_checkpointing\": false,\n",
213
+ " \"hidden_act\": \"gelu\",\n",
214
+ " \"hidden_dropout_prob\": 0.1,\n",
215
+ " \"hidden_size\": 768,\n",
216
+ " \"initializer_range\": 0.02,\n",
217
+ " \"intermediate_size\": 3072,\n",
218
+ " \"label_hidden_size\": 512,\n",
219
+ " \"layer_norm_eps\": 1e-12,\n",
220
+ " \"max_position_embeddings\": 512,\n",
221
+ " \"model_name_or_path\": \"bert-base-uncased\",\n",
222
+ " \"model_type\": \"bert\",\n",
223
+ " \"negative_sampling\": \"none\",\n",
224
+ " \"num_attention_heads\": 12,\n",
225
+ " \"num_hidden_layers\": 12,\n",
226
+ " \"pad_token_id\": 0,\n",
227
+ " \"position_embedding_type\": \"absolute\",\n",
228
+ " \"problem_type\": \"multi_label_classification\",\n",
229
+ " \"semsup\": true,\n",
230
+ " \"token_dim\": 16,\n",
231
+ " \"transformers_version\": \"4.20.0\",\n",
232
+ " \"type_vocab_size\": 2,\n",
233
+ " \"use_cache\": true,\n",
234
+ " \"vocab_size\": 30522\n",
235
+ "}\n",
236
+ "\n"
237
+ ]
238
+ }
239
+ ],
240
+ "source": [
241
+ "from src import BertForSemanticEmbedding, getLabelModel\n",
242
+ "from src import DataTrainingArguments, ModelArguments, CustomTrainingArguments, read_yaml_config\n",
243
+ "from src import dataset_classification_type\n",
244
+ "from src import SemSupDataset\n",
245
+ "from transformers import AutoConfig, HfArgumentParser, AutoTokenizer\n",
246
+ "import torch\n",
247
+ "\n",
248
+ "import json\n",
249
+ "from tqdm import tqdm\n",
250
+ "\n",
251
+ "ARGS_FILE = 'configs/ablation_amzn_eda.yml'\n",
252
+ "\n",
253
+ "parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CustomTrainingArguments))\n",
254
+ "model_args, data_args, training_args = parser.parse_dict(read_yaml_config(ARGS_FILE, output_dir = 'demo_tmp', extra_args = {}))\n",
255
+ "\n",
256
+ "config = AutoConfig.from_pretrained(\n",
257
+ " model_args.config_name if model_args.config_name else model_args.model_name_or_path,\n",
258
+ " finetuning_task=data_args.task_name,\n",
259
+ " cache_dir=model_args.cache_dir,\n",
260
+ " revision=model_args.model_revision,\n",
261
+ " use_auth_token=True if model_args.use_auth_token else None,\n",
262
+ ")\n",
263
+ "\n",
264
+ "config.model_name_or_path = model_args.model_name_or_path\n",
265
+ "config.problem_type = dataset_classification_type[data_args.task_name]\n",
266
+ "config.negative_sampling = model_args.negative_sampling\n",
267
+ "config.semsup = model_args.semsup\n",
268
+ "config.encoder_model_type = model_args.encoder_model_type\n",
269
+ "config.arch_type = model_args.arch_type\n",
270
+ "config.coil = model_args.coil\n",
271
+ "config.token_dim = model_args.token_dim\n",
272
+ "config.colbert = model_args.colbert\n",
273
+ "\n",
274
+ "label_model, label_tokenizer = getLabelModel(data_args, model_args)\n",
275
+ "config.label_hidden_size = label_model.config.hidden_size\n",
276
+ "model = BertForSemanticEmbedding(config)\n",
277
+ "model.label_model = label_model\n",
278
+ "model.label_tokenizer = label_tokenizer\n",
279
+ "model.config.label2id = {l: i for i, l in enumerate(label_list)}\n",
280
+ "model.config.id2label = {id: label for label, id in config.label2id.items()}\n",
281
+ "\n",
282
+ "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 9,
288
+ "metadata": {},
289
+ "outputs": [
290
+ {
291
+ "data": {
292
+ "text/plain": [
293
+ "BertForSemanticEmbedding(\n",
294
+ " (encoder): BertModel(\n",
295
+ " (embeddings): BertEmbeddings(\n",
296
+ " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
297
+ " (position_embeddings): Embedding(512, 768)\n",
298
+ " (token_type_embeddings): Embedding(2, 768)\n",
299
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
300
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
301
+ " )\n",
302
+ " (encoder): BertEncoder(\n",
303
+ " (layer): ModuleList(\n",
304
+ " (0): BertLayer(\n",
305
+ " (attention): BertAttention(\n",
306
+ " (self): BertSelfAttention(\n",
307
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
308
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
309
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
310
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
311
+ " )\n",
312
+ " (output): BertSelfOutput(\n",
313
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
314
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
315
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
316
+ " )\n",
317
+ " )\n",
318
+ " (intermediate): BertIntermediate(\n",
319
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
320
+ " (intermediate_act_fn): GELUActivation()\n",
321
+ " )\n",
322
+ " (output): BertOutput(\n",
323
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
324
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
325
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
326
+ " )\n",
327
+ " )\n",
328
+ " (1): BertLayer(\n",
329
+ " (attention): BertAttention(\n",
330
+ " (self): BertSelfAttention(\n",
331
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
332
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
333
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
334
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
335
+ " )\n",
336
+ " (output): BertSelfOutput(\n",
337
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
338
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
339
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
340
+ " )\n",
341
+ " )\n",
342
+ " (intermediate): BertIntermediate(\n",
343
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
344
+ " (intermediate_act_fn): GELUActivation()\n",
345
+ " )\n",
346
+ " (output): BertOutput(\n",
347
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
348
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
349
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
350
+ " )\n",
351
+ " )\n",
352
+ " (2): BertLayer(\n",
353
+ " (attention): BertAttention(\n",
354
+ " (self): BertSelfAttention(\n",
355
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
356
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
357
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
358
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
359
+ " )\n",
360
+ " (output): BertSelfOutput(\n",
361
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
362
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
363
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
364
+ " )\n",
365
+ " )\n",
366
+ " (intermediate): BertIntermediate(\n",
367
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
368
+ " (intermediate_act_fn): GELUActivation()\n",
369
+ " )\n",
370
+ " (output): BertOutput(\n",
371
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
372
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
373
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
374
+ " )\n",
375
+ " )\n",
376
+ " (3): BertLayer(\n",
377
+ " (attention): BertAttention(\n",
378
+ " (self): BertSelfAttention(\n",
379
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
380
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
381
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
382
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
383
+ " )\n",
384
+ " (output): BertSelfOutput(\n",
385
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
386
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
387
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
388
+ " )\n",
389
+ " )\n",
390
+ " (intermediate): BertIntermediate(\n",
391
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
392
+ " (intermediate_act_fn): GELUActivation()\n",
393
+ " )\n",
394
+ " (output): BertOutput(\n",
395
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
396
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
397
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
398
+ " )\n",
399
+ " )\n",
400
+ " (4): BertLayer(\n",
401
+ " (attention): BertAttention(\n",
402
+ " (self): BertSelfAttention(\n",
403
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
404
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
405
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
406
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
407
+ " )\n",
408
+ " (output): BertSelfOutput(\n",
409
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
410
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
411
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
412
+ " )\n",
413
+ " )\n",
414
+ " (intermediate): BertIntermediate(\n",
415
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
416
+ " (intermediate_act_fn): GELUActivation()\n",
417
+ " )\n",
418
+ " (output): BertOutput(\n",
419
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
420
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
421
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
422
+ " )\n",
423
+ " )\n",
424
+ " (5): BertLayer(\n",
425
+ " (attention): BertAttention(\n",
426
+ " (self): BertSelfAttention(\n",
427
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
428
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
429
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
430
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
431
+ " )\n",
432
+ " (output): BertSelfOutput(\n",
433
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
434
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
435
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
436
+ " )\n",
437
+ " )\n",
438
+ " (intermediate): BertIntermediate(\n",
439
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
440
+ " (intermediate_act_fn): GELUActivation()\n",
441
+ " )\n",
442
+ " (output): BertOutput(\n",
443
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
444
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
445
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
446
+ " )\n",
447
+ " )\n",
448
+ " (6): BertLayer(\n",
449
+ " (attention): BertAttention(\n",
450
+ " (self): BertSelfAttention(\n",
451
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
452
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
453
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
454
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
455
+ " )\n",
456
+ " (output): BertSelfOutput(\n",
457
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
458
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
459
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
460
+ " )\n",
461
+ " )\n",
462
+ " (intermediate): BertIntermediate(\n",
463
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
464
+ " (intermediate_act_fn): GELUActivation()\n",
465
+ " )\n",
466
+ " (output): BertOutput(\n",
467
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
468
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
469
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
470
+ " )\n",
471
+ " )\n",
472
+ " (7): BertLayer(\n",
473
+ " (attention): BertAttention(\n",
474
+ " (self): BertSelfAttention(\n",
475
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
476
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
477
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
478
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
479
+ " )\n",
480
+ " (output): BertSelfOutput(\n",
481
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
482
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
483
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
484
+ " )\n",
485
+ " )\n",
486
+ " (intermediate): BertIntermediate(\n",
487
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
488
+ " (intermediate_act_fn): GELUActivation()\n",
489
+ " )\n",
490
+ " (output): BertOutput(\n",
491
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
492
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
493
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
494
+ " )\n",
495
+ " )\n",
496
+ " (8): BertLayer(\n",
497
+ " (attention): BertAttention(\n",
498
+ " (self): BertSelfAttention(\n",
499
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
500
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
501
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
502
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
503
+ " )\n",
504
+ " (output): BertSelfOutput(\n",
505
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
506
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
507
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
508
+ " )\n",
509
+ " )\n",
510
+ " (intermediate): BertIntermediate(\n",
511
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
512
+ " (intermediate_act_fn): GELUActivation()\n",
513
+ " )\n",
514
+ " (output): BertOutput(\n",
515
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
516
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
517
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
518
+ " )\n",
519
+ " )\n",
520
+ " (9): BertLayer(\n",
521
+ " (attention): BertAttention(\n",
522
+ " (self): BertSelfAttention(\n",
523
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
524
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
525
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
526
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
527
+ " )\n",
528
+ " (output): BertSelfOutput(\n",
529
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
530
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
531
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
532
+ " )\n",
533
+ " )\n",
534
+ " (intermediate): BertIntermediate(\n",
535
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
536
+ " (intermediate_act_fn): GELUActivation()\n",
537
+ " )\n",
538
+ " (output): BertOutput(\n",
539
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
540
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
541
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
542
+ " )\n",
543
+ " )\n",
544
+ " (10): BertLayer(\n",
545
+ " (attention): BertAttention(\n",
546
+ " (self): BertSelfAttention(\n",
547
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
548
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
549
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
550
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
551
+ " )\n",
552
+ " (output): BertSelfOutput(\n",
553
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
554
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
555
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
556
+ " )\n",
557
+ " )\n",
558
+ " (intermediate): BertIntermediate(\n",
559
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
560
+ " (intermediate_act_fn): GELUActivation()\n",
561
+ " )\n",
562
+ " (output): BertOutput(\n",
563
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
564
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
565
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
566
+ " )\n",
567
+ " )\n",
568
+ " (11): BertLayer(\n",
569
+ " (attention): BertAttention(\n",
570
+ " (self): BertSelfAttention(\n",
571
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
572
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
573
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
574
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
575
+ " )\n",
576
+ " (output): BertSelfOutput(\n",
577
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
578
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
579
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
580
+ " )\n",
581
+ " )\n",
582
+ " (intermediate): BertIntermediate(\n",
583
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
584
+ " (intermediate_act_fn): GELUActivation()\n",
585
+ " )\n",
586
+ " (output): BertOutput(\n",
587
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
588
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
589
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
590
+ " )\n",
591
+ " )\n",
592
+ " )\n",
593
+ " )\n",
594
+ " (pooler): BertPooler(\n",
595
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
596
+ " (activation): Tanh()\n",
597
+ " )\n",
598
+ " )\n",
599
+ " (tok_proj): Linear(in_features=768, out_features=16, bias=True)\n",
600
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
601
+ " (label_projection): Linear(in_features=768, out_features=512, bias=False)\n",
602
+ " (label_model): BertModel(\n",
603
+ " (embeddings): BertEmbeddings(\n",
604
+ " (word_embeddings): Embedding(30522, 512, padding_idx=0)\n",
605
+ " (position_embeddings): Embedding(512, 512)\n",
606
+ " (token_type_embeddings): Embedding(2, 512)\n",
607
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
608
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
609
+ " )\n",
610
+ " (encoder): BertEncoder(\n",
611
+ " (layer): ModuleList(\n",
612
+ " (0): BertLayer(\n",
613
+ " (attention): BertAttention(\n",
614
+ " (self): BertSelfAttention(\n",
615
+ " (query): Linear(in_features=512, out_features=512, bias=True)\n",
616
+ " (key): Linear(in_features=512, out_features=512, bias=True)\n",
617
+ " (value): Linear(in_features=512, out_features=512, bias=True)\n",
618
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
619
+ " )\n",
620
+ " (output): BertSelfOutput(\n",
621
+ " (dense): Linear(in_features=512, out_features=512, bias=True)\n",
622
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
623
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
624
+ " )\n",
625
+ " )\n",
626
+ " (intermediate): BertIntermediate(\n",
627
+ " (dense): Linear(in_features=512, out_features=2048, bias=True)\n",
628
+ " (intermediate_act_fn): GELUActivation()\n",
629
+ " )\n",
630
+ " (output): BertOutput(\n",
631
+ " (dense): Linear(in_features=2048, out_features=512, bias=True)\n",
632
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
633
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
634
+ " )\n",
635
+ " )\n",
636
+ " (1): BertLayer(\n",
637
+ " (attention): BertAttention(\n",
638
+ " (self): BertSelfAttention(\n",
639
+ " (query): Linear(in_features=512, out_features=512, bias=True)\n",
640
+ " (key): Linear(in_features=512, out_features=512, bias=True)\n",
641
+ " (value): Linear(in_features=512, out_features=512, bias=True)\n",
642
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
643
+ " )\n",
644
+ " (output): BertSelfOutput(\n",
645
+ " (dense): Linear(in_features=512, out_features=512, bias=True)\n",
646
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
647
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
648
+ " )\n",
649
+ " )\n",
650
+ " (intermediate): BertIntermediate(\n",
651
+ " (dense): Linear(in_features=512, out_features=2048, bias=True)\n",
652
+ " (intermediate_act_fn): GELUActivation()\n",
653
+ " )\n",
654
+ " (output): BertOutput(\n",
655
+ " (dense): Linear(in_features=2048, out_features=512, bias=True)\n",
656
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
657
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
658
+ " )\n",
659
+ " )\n",
660
+ " (2): BertLayer(\n",
661
+ " (attention): BertAttention(\n",
662
+ " (self): BertSelfAttention(\n",
663
+ " (query): Linear(in_features=512, out_features=512, bias=True)\n",
664
+ " (key): Linear(in_features=512, out_features=512, bias=True)\n",
665
+ " (value): Linear(in_features=512, out_features=512, bias=True)\n",
666
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
667
+ " )\n",
668
+ " (output): BertSelfOutput(\n",
669
+ " (dense): Linear(in_features=512, out_features=512, bias=True)\n",
670
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
671
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
672
+ " )\n",
673
+ " )\n",
674
+ " (intermediate): BertIntermediate(\n",
675
+ " (dense): Linear(in_features=512, out_features=2048, bias=True)\n",
676
+ " (intermediate_act_fn): GELUActivation()\n",
677
+ " )\n",
678
+ " (output): BertOutput(\n",
679
+ " (dense): Linear(in_features=2048, out_features=512, bias=True)\n",
680
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
681
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
682
+ " )\n",
683
+ " )\n",
684
+ " (3): BertLayer(\n",
685
+ " (attention): BertAttention(\n",
686
+ " (self): BertSelfAttention(\n",
687
+ " (query): Linear(in_features=512, out_features=512, bias=True)\n",
688
+ " (key): Linear(in_features=512, out_features=512, bias=True)\n",
689
+ " (value): Linear(in_features=512, out_features=512, bias=True)\n",
690
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
691
+ " )\n",
692
+ " (output): BertSelfOutput(\n",
693
+ " (dense): Linear(in_features=512, out_features=512, bias=True)\n",
694
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
695
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
696
+ " )\n",
697
+ " )\n",
698
+ " (intermediate): BertIntermediate(\n",
699
+ " (dense): Linear(in_features=512, out_features=2048, bias=True)\n",
700
+ " (intermediate_act_fn): GELUActivation()\n",
701
+ " )\n",
702
+ " (output): BertOutput(\n",
703
+ " (dense): Linear(in_features=2048, out_features=512, bias=True)\n",
704
+ " (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
705
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
706
+ " )\n",
707
+ " )\n",
708
+ " )\n",
709
+ " )\n",
710
+ " (pooler): BertPooler(\n",
711
+ " (dense): Linear(in_features=512, out_features=512, bias=True)\n",
712
+ " (activation): Tanh()\n",
713
+ " )\n",
714
+ " )\n",
715
+ ")"
716
+ ]
717
+ },
718
+ "execution_count": 9,
719
+ "metadata": {},
720
+ "output_type": "execute_result"
721
+ }
722
+ ],
723
+ "source": [
724
+ "model.to(device)\n",
725
+ "model.eval()\n",
726
+ "torch.set_grad_enabled(False)"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": 65,
732
+ "metadata": {},
733
+ "outputs": [
734
+ {
735
+ "data": {
736
+ "text/plain": [
737
+ "<All keys matched successfully>"
738
+ ]
739
+ },
740
+ "execution_count": 65,
741
+ "metadata": {},
742
+ "output_type": "execute_result"
743
+ }
744
+ ],
745
+ "source": [
746
+ "model.load_state_dict(torch.load('ckpt/Amzn13K/amzn_main_model.bin', map_location = device))"
747
+ ]
748
+ },
749
+ {
750
+ "cell_type": "code",
751
+ "execution_count": 88,
752
+ "metadata": {},
753
+ "outputs": [],
754
+ "source": [
755
+ "text = '''SanDisk Cruzer Blade 32GB USB Flash Drive\\nUltra-compact and portable USB flash drive,Capless design\n",
756
+ "Share your photos, videos, songs and other files between computers with ease,care number:18001205899/18004195592\n",
757
+ "Protect your private files with included SanDisk SecureAccess software\n",
758
+ "Includes added protection of secure online backup (up to 2GB optionally available) offered by YuuWaa\n",
759
+ "Password-protect your sensitive files. Customer care:IndiaSupport@sandisk.com\n",
760
+ "Importer Details:Rashi Peripherals Pvt. Ltd. Rashi Complex,A Building,Survey186,Dongaripada,Poman Village,Vasai Bhiwandi Road, Dist. Thane,Maharastra 401208, India\n",
761
+ "Share your work files between computers with ease\n",
762
+ "Manufacturer Name & Address: SanDisk International LTD, C/O Unit 100, Airside Business Park, Lakeshore Drive, Swords, Co Dublin, Ireland.\n",
763
+ "Consumer Complaint Details: indiasupport@sandisk.com/18001022055'''"
764
+ ]
765
+ },
766
+ {
767
+ "cell_type": "code",
768
+ "execution_count": 89,
769
+ "metadata": {},
770
+ "outputs": [],
771
+ "source": [
772
+ "item = tokenizer(text, padding='max_length', max_length=data_args.max_seq_length, truncation=True)\n",
773
+ "item = {k:torch.tensor(v, device = device).unsqueeze(0) for k,v in item.items()}\n",
774
+ "\n",
775
+ "outputs_doc, logits = model.forward_input_encoder(**item)\n",
776
+ "doc_reps = model.tok_proj(outputs_doc.last_hidden_state)\n",
777
+ "\n",
778
+ "input_ids = torch.tensor([coil_cluster_map[str(x.item())] for x in item['input_ids'][0]]).to(device).unsqueeze(0)\n",
779
+ "all_logits = []\n",
780
+ "for adi, ada, alr, ale in zip(all_desc_input_ids, all_desc_attention_mask, all_lab_reps, all_label_embeddings):\n",
781
+ " all_logits.append(coil_fast_eval_forward(input_ids, doc_reps, logits, adi, ada, alr, ale))\n",
782
+ "\n",
783
+ "final_logits = sum([expit(x.cpu()) for x in all_logits]) / len(all_logits)\n",
784
+ "\n",
785
+ "outs = torch.topk(final_logits, k = 5)\n",
786
+ "preds_dic = dict()\n",
787
+ "for i,v in zip(outs.indices, outs.values):\n",
788
+ " preds_dic[label_list[i]] = v.item()"
789
+ ]
790
+ },
791
+ {
792
+ "cell_type": "code",
793
+ "execution_count": 90,
794
+ "metadata": {},
795
+ "outputs": [
796
+ {
797
+ "data": {
798
+ "text/plain": [
799
+ "{'electronics': 0.9989226460456848,\n",
800
+ " 'computers & accessories': 0.981508731842041,\n",
801
+ " 'computer components': 0.9518740177154541,\n",
802
+ " 'computer accessories': 0.7639468312263489,\n",
803
+ " 'hardware': 0.6584190726280212}"
804
+ ]
805
+ },
806
+ "execution_count": 90,
807
+ "metadata": {},
808
+ "output_type": "execute_result"
809
+ }
810
+ ],
811
+ "source": [
812
+ "preds_dic"
813
+ ]
814
+ },
815
+ {
816
+ "cell_type": "code",
817
+ "execution_count": null,
818
+ "metadata": {},
819
+ "outputs": [],
820
+ "source": []
821
+ },
822
+ {
823
+ "cell_type": "code",
824
+ "execution_count": 78,
825
+ "metadata": {},
826
+ "outputs": [
827
+ {
828
+ "data": {
829
+ "text/plain": [
830
+ "torch.Size([13330])"
831
+ ]
832
+ },
833
+ "execution_count": 78,
834
+ "metadata": {},
835
+ "output_type": "execute_result"
836
+ }
837
+ ],
838
+ "source": [
839
+ "final_logits.shape"
840
+ ]
841
+ },
842
+ {
843
+ "cell_type": "code",
844
+ "execution_count": null,
845
+ "metadata": {},
846
+ "outputs": [],
847
+ "source": []
848
+ }
849
+ ],
850
+ "metadata": {
851
+ "interpreter": {
852
+ "hash": "90fcbf6f06d9a30c70fdaff45e14c5534421a599dc22a7267c486c9cb67dea6d"
853
+ },
854
+ "kernelspec": {
855
+ "display_name": "Python 3.9.12 ('base')",
856
+ "language": "python",
857
+ "name": "python3"
858
+ },
859
+ "language_info": {
860
+ "codemirror_mode": {
861
+ "name": "ipython",
862
+ "version": 3
863
+ },
864
+ "file_extension": ".py",
865
+ "mimetype": "text/x-python",
866
+ "name": "python",
867
+ "nbconvert_exporter": "python",
868
+ "pygments_lexer": "ipython3",
869
+ "version": "3.9.12"
870
+ },
871
+ "orig_nbformat": 4
872
+ },
873
+ "nbformat": 4,
874
+ "nbformat_minor": 2
875
+ }
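At the core of DemoFast.ipynb, compute_tok_score_cart implements COIL-style exact-match token scoring: every label-description token is dotted against every document token, kept only where the token ids coincide, max-pooled over document positions, and summed over description tokens. Below is a minimal shape-level sketch with random tensors standing in for encoder outputs and an einsum in place of the notebook's flattened matmul (equivalent math, toy sizes); in the notebook the ids are first remapped through bert_coil_map_dict_lemma255K_isotropic.json so related word pieces can match.

import torch

Q, LQ, D, LD, E = 2, 4, 3, 5, 16           # toy sizes; E matches token_dim: 16
qry_reps = torch.randn(Q, LQ, E)            # label-description token vectors
doc_reps = torch.randn(D, LD, E)            # document token vectors
qry_ids = torch.randint(0, 10, (Q, LQ))     # (cluster-mapped) token ids
doc_ids = torch.randint(0, 10, (D, LD))
qry_mask = torch.ones(Q, LQ)

scores = torch.einsum('qle,dme->qldm', qry_reps, doc_reps)     # all token pairs
exact = (qry_ids[:, :, None, None] == doc_ids[None, None, :, :]).float()
tok = (scores * exact).max(dim=3).values                       # Q x LQ x D
tok = (tok * qry_mask.unsqueeze(2))[:, 1:].sum(dim=1)          # drop [CLS], sum
print(tok.shape)                                               # torch.Size([2, 3])

In the notebook this per-token score is added to the dense logits from the label embeddings, and the predictions from three precomputed description sets are sigmoid-averaged (expit) before the top-k labels are read off.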
cleaned_code/bert_coil_map_dict_lemma255K_isotropic.json ADDED
The diff for this file is too large to render. See raw diff
 
cleaned_code/ckpt/Amzn13K/amzn_main_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0899b193b49dc3e8acf2caa984fbaee1520933bbc2f61cbb3e594363a702708
+ size 554726619
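The checkpoint itself is stored with Git LFS, so the commit records only this three-line pointer (spec version, sha256 of the real file, size in bytes). After fetching with git lfs pull, the download can be checked against the pointer, e.g. (a sketch, run from the repo root):

import hashlib

expected = 'd0899b193b49dc3e8acf2caa984fbaee1520933bbc2f61cbb3e594363a702708'
h = hashlib.sha256()
with open('cleaned_code/ckpt/Amzn13K/amzn_main_model.bin', 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):  # hash in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == expected, 'checkpoint does not match its LFS pointer'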
cleaned_code/configs/PredsRemover.ipynb ADDED
@@ -0,0 +1,149 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import json\n",
+ "from os.path import join"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "OUT_DIR = 'output/'\n",
+ "EXP_DIR = join(OUT_DIR, 'semsup_descs_amzn13k_curie_nocoil', 'predictions')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/n/fs/nlp-pranjal/SemSup-LMLC/training\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd .."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "files = dict()\n",
+ "for file in os.listdir(EXP_DIR):\n",
+ "    t = float(file.split('_')[-1].replace('.pkl',''))\n",
+ "    if t not in files:\n",
+ "        files[t] = []\n",
+ "    files[t] += [join(EXP_DIR, file)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "21.792958695441484"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import itertools\n",
+ "tsize = 0\n",
+ "for file in itertools.chain(*files.values()):\n",
+ "    tsize += os.path.getsize(file)\n",
+ "tsize/ (1024**3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "files = {k:files[k] for k in sorted(files.keys())}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10.170047391206026"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import random\n",
+ "tsize = 0\n",
+ "for k in sorted(list(files.keys()))[10:]:\n",
+ "    if random.random() > 0.6:\n",
+ "        continue\n",
+ "    for f in files[k]:\n",
+ "        tsize += os.path.getsize(f)\n",
+ "        os.remove(f)\n",
+ "tsize/ (1024**3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "90fcbf6f06d9a30c70fdaff45e14c5534421a599dc22a7267c486c9cb67dea6d"
+ },
+ "kernelspec": {
+ "display_name": "Python 3.9.12 ('base')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
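The last executed cell above frees disk space destructively: it keeps the 10 smallest thresholds and deletes each remaining group of prediction pickles with probability 0.6, unseeded. A dry-run variant (a sketch reusing the `files` dict built above; the seed and helper name are additions) makes the plan inspectable before anything is removed:

import os
import random

def plan_deletions(files, keep_first=10, drop_prob=0.6, seed=0):
    """Return (paths, GiB) that the deletion cell would remove."""
    rng = random.Random(seed)
    doomed = []
    for k in sorted(files)[keep_first:]:
        if rng.random() < drop_prob:
            doomed.extend(files[k])
    return doomed, sum(os.path.getsize(p) for p in doomed) / 1024**3

paths, gib = plan_deletions(files)
print(f'would delete {len(paths)} files, {gib:.2f} GiB')
# for p in paths: os.remove(p)   # uncomment only after reviewing the plan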
cleaned_code/configs/ablation_amzn_1_coil.yml ADDED
@@ -0,0 +1,85 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 160
+   overwrite_output_dir: false # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 160
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_final.json
+   test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+   all_labels : datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   contrastive_learning_samples: 1000
+   cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+   coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type : 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: false
+   colbert: false
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 8
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+   num_train_epochs: 2
+   save_steps: 4900
+   evaluation_strategy: steps
+   eval_steps: 3000000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 16
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   ignore_data_skip: true
+   # one_hour_job: true
+   seed: -1
+
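Note that despite the file name, this ablation sets coil: false, i.e. it measures the model without COIL token matching. These YAML files are consumed through the same entry point shown in DemoFast.ipynb; a sketch of loading one (assumes it runs inside cleaned_code/ so that src is importable):

from transformers import HfArgumentParser
from src import (DataTrainingArguments, ModelArguments,
                 CustomTrainingArguments, read_yaml_config)

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CustomTrainingArguments))
model_args, data_args, training_args = parser.parse_dict(
    read_yaml_config('configs/ablation_amzn_1_coil.yml',
                     output_dir='demo_tmp', extra_args={}))

# The DATA / MODEL / TRAINING sections are flattened into the three
# dataclasses; e.g. model_args.coil is False for this particular ablation.
print(model_args.coil, data_args.label_max_seq_length, training_args.num_train_epochs)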
cleaned_code/configs/ablation_amzn_1_descs.yml ADDED
@@ -0,0 +1,89 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 160
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+
+   # validation_file: datasets/Amzn13K/test.jsonl
+   # test_file: datasets/Amzn13K/test.jsonl
+   label_max_seq_length: 64
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   descriptions_file: datasets/Amzn13K/heir_withoutdescriptions_v3_v3_unseen.json
+   test_descriptions_file: datasets/Amzn13K/heir_withoutdescriptions_v3_v3.json
+
+   all_labels : datasets/Amzn13K/all_labels.txt
+   # test_labels : datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   contrastive_learning_samples: 1000
+   cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+   coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   # pretrained_model_path: output/ablation_amzn_1_descs/checkpoint-21000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type : 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: false
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 4
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+   num_train_epochs: 2
+   save_steps: 4900
+   evaluation_strategy: steps
+   eval_steps: 3000000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 16
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   ignore_data_skip: true
+   # one_hour_job: true
+   seed: -1
+
cleaned_code/configs/ablation_amzn_1_hier.yml ADDED
@@ -0,0 +1,85 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 160
+ overwrite_output_dir: true # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/Amzn13K/train_split6500_2.jsonl
+ validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 96
+ # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+ descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+ test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+
+ all_labels : datasets/Amzn13K/all_labels.txt
+ test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ contrastive_learning_samples: 1000
+ cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 2
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 2
+ save_steps: 4900
+ evaluation_strategy: steps
+ eval_steps: 3000000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 16
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ ignore_data_skip: true
+ # one_hour_job: true
+ seed: -1
+
cleaned_code/configs/ablation_amzn_1_relax.yml ADDED
@@ -0,0 +1,86 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 160
+ overwrite_output_dir: false # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/Amzn13K/train_split6500_2.jsonl
+ validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 160
+ # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+ descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_final.json
+ test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+ all_labels : datasets/Amzn13K/all_labels.txt
+ test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ contrastive_learning_samples: 1000
+ cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+ # coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 2
+ save_steps: 4900
+ evaluation_strategy: steps
+ eval_steps: 3000000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 16
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ ignore_data_skip: true
+ # one_hour_job: true
+ seed: -1
+
+
cleaned_code/configs/ablation_amzn_eda.yml ADDED
@@ -0,0 +1,81 @@
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 160
+ overwrite_output_dir: false # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/train_split6500_2.jsonl
+ validation_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 160
+
+ descriptions_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_edaaug.json
+ test_descriptions_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+ all_labels : /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/all_labels.txt
+ test_labels: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ contrastive_learning_samples: 1000
+ cl_min_positive_descs: 1
+ # bm_short_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/ignore_train_split6500_fs5.txt
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 2
+ save_steps: 4900
+ evaluation_strategy: steps
+ eval_steps: 3000000
+ fp16: false
+ # fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 16
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ ignore_data_skip: true
+ # one_hour_job: true
+ seed: -1
+
cleaned_code/configs/ablation_amzn_eda_base.yml ADDED
@@ -0,0 +1,85 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 160
+ overwrite_output_dir: false # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/Amzn13K/train_split6500_2.jsonl
+ validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 160
+ # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+ descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_final.json
+ test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+ all_labels : datasets/Amzn13K/all_labels.txt
+ test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ contrastive_learning_samples: 1000
+ cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 2
+ save_steps: 4900
+ evaluation_strategy: steps
+ eval_steps: 3000000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 16
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ ignore_data_skip: true
+ # one_hour_job: true
+ seed: -1
+
cleaned_code/configs/ablation_amzn_eda_base2.yml ADDED
@@ -0,0 +1,84 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 128
+ overwrite_output_dir: true # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/Amzn13K/train_split6500_2.jsonl
+ validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 96
+ # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+ descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
+ test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+ all_labels : datasets/Amzn13K/all_labels.txt
+ test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ contrastive_learning_samples: 2000
+ cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 3
+ save_steps: 4900
+ evaluation_strategy: steps
+ eval_steps: 3000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 4
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ ignore_data_skip: true
+ # one_hour_job: true
+
cleaned_code/configs/ablation_eurlex_1_base.yml ADDED
@@ -0,0 +1,85 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: eurlex57k
+ dataset_name: eurlex
+ dataset_config_name: null
+ max_seq_length: 512
+ overwrite_output_dir: true
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: ../training/datasets/eurlex4.3k/train_split1057.jsonl
+ # validation_file: ../training/datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ # test_file: ../training/datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ validation_file: ../training/datasets/eurlex4.3k/test.jsonl
+ test_file: ../training/datasets/eurlex4.3k/test.jsonl
+
+ # validation_file: ../training/datasets/eurlex4.3k/test_unseen_hr.jsonl
+ # test_file: ../training/datasets/eurlex4.3k/test_unseen_hr.jsonl
+ label_max_seq_length: 128
+ # descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl_unseen.json
+ # test_descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+ descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl_unseen.json
+ test_descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl.json
+
+
+ all_labels : ../training/datasets/eurlex4.3k/all_labels.txt
+ test_labels : ../training/datasets/eurlex4.3k/all_labels.txt
+ # test_labels: ../training/datasets/eurlex4.3k/unseen_labels_split1057.txt
+ # test_labels: ../training/datasets/eurlex4.3k/unseen_labels.txt
+
+ contrastive_learning_samples: 1500
+ cl_min_positive_descs: 1
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ # pretrained_model_path: /n/fs/scratch/pranjal/seed_experiments/ablation_eurlex_1_base_web_128_seed2/checkpoint-21600/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: false
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ # use_precomputed_embeddings: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 10
+ save_steps: 5400
+ evaluation_strategy: steps
+ eval_steps: 5000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 8
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ seed: -1
+
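The MODEL blocks above pair learning_rate with a separate output_learning_rate under use_custom_optimizer: adamw. A hedged sketch of how such a two-group optimizer can be built; this is an assumption about what the two rates control, not the repo's verified optimizer code:

    import torch

    def build_optimizer(input_encoder, label_encoder, lr=5e-5, output_lr=1e-4):
        # One AdamW with two parameter groups: the input (document) encoder
        # at `learning_rate`, the output/label side at `output_learning_rate`.
        return torch.optim.AdamW([
            {"params": input_encoder.parameters(), "lr": lr},
            {"params": label_encoder.parameters(), "lr": output_lr},
        ])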
cleaned_code/configs/ablation_eurlex_1_coil.yml ADDED
@@ -0,0 +1,88 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: eurlex57k
+ dataset_name: eurlex
+ dataset_config_name: null
+ max_seq_length: 512
+ overwrite_output_dir: true
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/eurlex4.3k/train_split1057.jsonl
+ validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+ # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ label_max_seq_length: 128
+ # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl_unseen.json
+ # test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+ descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl_unseen.json
+ test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl.json
+
+
+ all_labels : datasets/eurlex4.3k/all_labels.txt
+ test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+ # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+ # max_descs_per_label: 5
+ contrastive_learning_samples: 1500
+ cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+ # coil_cluster_mapping_path: bert_coil_map_dict_lemma.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ pretrained_model_path: /n/fs/scratch/pranjal/seed_experiments/ablation_eurlex_1_coil_web_seed2/checkpoint-5400/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ label_model_name_or_path: prajjwal1/bert-small
+ # label_model_name_or_path: bert-base-uncased
+ # label_model_name_or_path: prajjwal1/bert-tiny
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: false
+ normalize_embeddings: false
+ tie_weights: false
+ coil: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ # num_frozen_layers: 9
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: false
+ do_eval: true
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 10
+ save_steps: 5400
+ evaluation_strategy: steps
+ eval_steps: 5000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 8
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ seed: -1
+
cleaned_code/configs/ablation_eurlex_1_descs.yml ADDED
@@ -0,0 +1,91 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: eurlex57k
+ dataset_name: eurlex
+ dataset_config_name: null
+ max_seq_length: 512
+ overwrite_output_dir: true
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/eurlex4.3k/train_split1057.jsonl
+ # validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ # test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+ validation_file: datasets/eurlex4.3k/test.jsonl
+ test_file: datasets/eurlex4.3k/test.jsonl
+
+ # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ label_max_seq_length: 64
+ descriptions_file: datasets/eurlex4.3k/heir_withoutdescriptions_4.3k_web_nl_unseen.json
+ test_descriptions_file: datasets/eurlex4.3k/heir_withoutdescriptions_4.3k_web_nl.json
+ # descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
+ # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+ all_labels : datasets/eurlex4.3k/all_labels.txt
+ test_labels : datasets/eurlex4.3k/all_labels.txt
+
+ # test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+ # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+ # max_descs_per_label: 5
+ contrastive_learning_samples: 1500
+ cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+ # coil_cluster_mapping_path: bert_coil_map_dict_lemma.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ pretrained_model_path: /n/fs/scratch/pranjal/seed_experiments/ablation_eurlex_1_descs_seed3/checkpoint-27000/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ label_model_name_or_path: prajjwal1/bert-small
+ # label_model_name_or_path: bert-base-uncased
+ # label_model_name_or_path: prajjwal1/bert-tiny
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: false
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ # num_frozen_layers: 9
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: false
+ do_eval: true
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 10
+ save_steps: 5400
+ evaluation_strategy: steps
+ eval_steps: 5000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 8
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ seed: -1
+
cleaned_code/configs/ablation_eurlex_1_hier_descs.yml ADDED
@@ -0,0 +1,91 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: eurlex57k
+ dataset_name: eurlex
+ dataset_config_name: null
+ max_seq_length: 512
+ overwrite_output_dir: true
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ # train_file: datasets/eurlex4.3k/train_hr.jsonl
+ # train_file: datasets/eurlex4.3k/train.jsonl
+ # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
+ # test_file: datasets/eurlex4.3k/test_unseen.jsonl
+ # validation_file: datasets/eurlex4.3k/test.jsonl
+ # test_file: datasets/eurlex4.3k/test.jsonl
+ train_file: datasets/eurlex4.3k/train_split1057.jsonl
+ validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+ # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ label_max_seq_length: 8
+ descriptions_file: datasets/eurlex4.3k/all_names.json
+ test_descriptions_file: datasets/eurlex4.3k/all_names.json
+ # descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
+ # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+ all_labels : datasets/eurlex4.3k/all_labels.txt
+ test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+ # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+ # max_descs_per_label: 5
+ # contrastive_learning_samples: 1500
+ # cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+ # coil_cluster_mapping_path: bert_coil_map_dict_lemma.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ label_model_name_or_path: prajjwal1/bert-small
+ # label_model_name_or_path: bert-base-uncased
+ # label_model_name_or_path: prajjwal1/bert-tiny
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: false
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ # num_frozen_layers: 9
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ per_device_train_batch_size: 4
+ gradient_accumulation_steps: 4
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 10
+ save_steps: 10000
+ evaluation_strategy: steps
+ eval_steps: 500
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 8
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+
+
cleaned_code/configs/ablation_eurlex_1_hierarchy.yml ADDED
@@ -0,0 +1,88 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: eurlex57k
+ dataset_name: eurlex
+ dataset_config_name: null
+ max_seq_length: 512
+ overwrite_output_dir: true
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/eurlex4.3k/train_split1057.jsonl
+ validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+ # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ label_max_seq_length: 96
+ # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+ # test_descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+ descriptions_file: datasets/eurlex4.3k/eurlex_descs_refined_v3_v3.json
+ test_descriptions_file: datasets/eurlex4.3k/eurlex_descs_refined_v3_v3.json
+
+
+ all_labels : datasets/eurlex4.3k/all_labels.txt
+ test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+ # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+ max_descs_per_label: 5
+ contrastive_learning_samples: 1500
+ cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+ # coil_cluster_mapping_path: bert_coil_map_dict_lemma.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ pretrained_model_path: seed_experiments/ablation_eurlex_1_hierarchy_web_seed3/checkpoint-5400/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ label_model_name_or_path: prajjwal1/bert-small
+ # label_model_name_or_path: bert-base-uncased
+ # label_model_name_or_path: prajjwal1/bert-tiny
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: false
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ # num_frozen_layers: 9
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: false
+ do_eval: true
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 10
+ save_steps: 5400
+ evaluation_strategy: steps
+ eval_steps: 5000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 8
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ seed: -1
+
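Several of the eurlex ablation configs, this one included, set pretrained_model_path together with do_train: false, i.e. evaluation-only runs over a saved checkpoint. A minimal sketch of restoring such a pytorch_model.bin file; this is illustrative, and the repo's actual loading code may differ:

    import torch

    def load_pretrained(model, path, device="cpu"):
        # checkpoint-*/pytorch_model.bin files are plain state_dicts.
        state_dict = torch.load(path, map_location=device)
        model.load_state_dict(state_dict, strict=False)  # tolerate head mismatches
        return model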
cleaned_code/configs/ablation_eurlex_1_relax.yml ADDED
@@ -0,0 +1,86 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: eurlex57k
+ dataset_name: eurlex
+ dataset_config_name: null
+ max_seq_length: 512
+ overwrite_output_dir: true # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/eurlex4.3k/train_split1057.jsonl
+ validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ label_max_seq_length: 128
+ # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl_unseen.json
+ # test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+ descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl_unseen.json
+ test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl.json
+
+
+ all_labels : datasets/eurlex4.3k/all_labels.txt
+ test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+ # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+ contrastive_learning_samples: 1500
+ cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+ # coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ pretrained_model_path: seed_experiments/ablation_eurlex_1_relax_web_seed3/checkpoint-4900/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: false
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: false
+ do_eval: true
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 10
+ save_steps: 5400
+ evaluation_strategy: steps
+ eval_steps: 5000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 16
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ # max_eval_samples: 15000
+ # ignore_data_skip: true
+ # one_hour_job: true
+ seed: -1
+
cleaned_code/configs/ablation_eurlex_eda.yml ADDED
@@ -0,0 +1,82 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: eurlex57k
+ dataset_name: eurlex
+ dataset_config_name: null
+ max_seq_length: 512
+ overwrite_output_dir: true
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/eurlex4.3k/train_split1057.jsonl
+ validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+ test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+ # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+ label_max_seq_length: 128
+ # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl_unseen.json
+ # test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+ descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl_unseen_edaaug.json
+ test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl.json
+
+
+ all_labels : datasets/eurlex4.3k/all_labels.txt
+ test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+ # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+ contrastive_learning_samples: 1500
+ cl_min_positive_descs: 1
+ coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ pretrained_model_path: seed_experiments/ablation_eurlex_1_eda_web_128_seed3/checkpoint-5400/pytorch_model.bin
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: false
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: false
+ do_eval: true
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 4
+ per_device_eval_batch_size: 2
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 10
+ save_steps: 5400
+ evaluation_strategy: steps
+ eval_steps: 5000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 16
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ seed: -1
+
cleaned_code/configs/amzn13k_active_hfwnet.yml ADDED
@@ -0,0 +1,79 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 128
+ overwrite_output_dir: true # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/Amzn13K/train_split1668_hfwnet.jsonl
+ validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 32
+ # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+ descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+ all_labels : datasets/Amzn13K/all_labels.txt
+ test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ # max_descs_per_label: 10
+ # contrastive_learning_samples: 5000
+ # cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 4
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 20
+ save_steps: 5000
+ evaluation_strategy: steps
+ eval_steps: 2000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 4
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ # ignore_data_skip: true
+ # one_hour_job: true
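The active-learning configs in this group combine tiny per-device batches with gradient accumulation, and some cap the data with max_train_samples. A small illustrative helper (not from the repo) for the resulting effective batch size and optimizer steps per epoch:

    import math

    def train_schedule(num_examples, per_device_bs, accum_steps, num_gpus=1):
        effective_bs = per_device_bs * accum_steps * num_gpus
        steps_per_epoch = math.ceil(num_examples / effective_bs)
        return effective_bs, steps_per_epoch

    # e.g. per_device_train_batch_size: 1, gradient_accumulation_steps: 4,
    # max_train_samples: 30000 on a single GPU:
    print(train_schedule(30000, 1, 4))  # (4, 7500)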
cleaned_code/configs/amzn13k_active_highfreq.yml ADDED
@@ -0,0 +1,87 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 128
+ overwrite_output_dir: true # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ # train_file: datasets/Amzn13K/train_split1668_highfreq_fs50.jsonl
+ # train_file: datasets/Amzn13K/train_split1668_highfreq.jsonl
+ train_file: datasets/Amzn13K/train_split1106_highfreq_bot.jsonl
+
+ validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 96
+ # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+ # descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+ descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
+ test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+ all_labels : datasets/Amzn13K/all_labels.txt
+ test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ # max_descs_per_label: 10
+ # contrastive_learning_samples: 5000
+ # cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split1668_highfreq_fs50.txt
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 4
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 20
+ save_steps: 5000
+ evaluation_strategy: steps
+ eval_steps: 1000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 8
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ max_train_samples: 30000
+
+ # ignore_data_skip: true
+ # one_hour_job: true
cleaned_code/configs/amzn13k_active_random.yml ADDED
@@ -0,0 +1,81 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 128
+ overwrite_output_dir: true # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/Amzn13K/train_split1668_random.jsonl
+ validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 32
+ # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+ descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+ all_labels : datasets/Amzn13K/all_labels.txt
+ test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ # max_descs_per_label: 10
+ # contrastive_learning_samples: 5000
+ # cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split1668_random_fs50.txt
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 4
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 20
+ save_steps: 5000
+ evaluation_strategy: steps
+ eval_steps: 2000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 4
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ max_train_samples: 30000
+
+ # ignore_data_skip: true
+ # one_hour_job: true
cleaned_code/configs/amzn13k_active_wnet.yml ADDED
@@ -0,0 +1,79 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+ task_name: amazon13k
+ dataset_name: amazon13k
+ dataset_config_name: null
+ max_seq_length: 128
+ overwrite_output_dir: true # Set to false, if using one_hour_job
+ overwrite_cache: false
+ pad_to_max_length: true
+ load_from_local: true
+ max_train_samples: null
+ max_eval_samples: null
+ max_predict_samples: null
+ train_file: datasets/Amzn13K/train_split1228_wnet.jsonl
+ validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+ label_max_seq_length: 32
+ # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+ descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+ all_labels : datasets/Amzn13K/all_labels.txt
+ test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+ # max_descs_per_label: 10
+ # contrastive_learning_samples: 5000
+ # cl_min_positive_descs: 1
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+ # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+
+ MODEL:
+ model_name_or_path: bert-base-uncased
+ config_name: null
+ tokenizer_name: null
+ cache_dir: null
+ use_fast_tokenizer: true
+ model_revision: main
+ use_auth_token: false
+ ignore_mismatched_sizes: false
+ negative_sampling: "none"
+ semsup: true
+ # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+ label_model_name_or_path: prajjwal1/bert-small
+ encoder_model_type: bert
+ use_custom_optimizer: adamw
+ output_learning_rate: 1.e-4
+ arch_type : 2
+ add_label_name: true
+ normalize_embeddings: false
+ tie_weights: false
+ coil: true
+ colbert: false
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+ token_dim: 16
+ label_frozen_layers: 2
+
+ TRAINING:
+ do_train: true
+ do_eval: true
+ do_predict: false
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 4
+ per_device_eval_batch_size: 1
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+ num_train_epochs: 20
+ save_steps: 5000
+ evaluation_strategy: steps
+ eval_steps: 2000
+ fp16: true
+ fp16_opt_level: O1
+ lr_scheduler_type: "linear" # defaults to 'linear'
+ dataloader_num_workers: 4
+ label_names: [labels]
+ scenario: "unseen_labels"
+
+ ddp_find_unused_parameters: false
+ max_eval_samples: 15000
+ # ignore_data_skip: true
+ # one_hour_job: true
cleaned_code/configs/amzn13k_active_wnet2.yml ADDED
@@ -0,0 +1,86 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   # train_file: datasets/Amzn13K/train_split2807_wnet2_fs50.jsonl
+   # train_file: datasets/Amzn13K/train_split2807_wnet2.jsonl
+   train_file: datasets/Amzn13K/train_split1106_wnet2_bot_high.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 96
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   # descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+   descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
+   test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   # max_descs_per_label: 10
+   # contrastive_learning_samples: 5000
+   # cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split2807_wnet2_fs50.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   pretrained_label_model_path: label_model_amzn_hier_format.pt
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 8
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 20
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   max_train_samples: 10000
+   # ignore_data_skip: true
+   # one_hour_job: true
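Every config in this commit shares the nested EXP_NAME / EXP_DESC / DATA / MODEL / TRAINING layout shown above. As a minimal sketch of how such a file can be consumed (the repo's actual loader is not part of this diff, so the variable names below are illustrative only):

import yaml

# Minimal sketch: read one of the configs above and split it into its
# three argument groups. Assumes PyYAML; the repo's real entry point is
# not shown in this diff.
with open("cleaned_code/configs/amzn13k_active_wnet2.yml") as f:
    cfg = yaml.safe_load(f)

data_args = cfg["DATA"]          # dataset paths, sequence lengths, label files
model_args = cfg["MODEL"]        # encoder choices and COIL/ColBERT switches
training_args = cfg["TRAINING"]  # HF Trainer-style hyperparameters

print(cfg["EXP_NAME"], "->", training_args["num_train_epochs"], "epochs")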
cleaned_code/configs/amzn13k_baseline.yml ADDED
@@ -0,0 +1,73 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500.jsonl
+   label_max_seq_length: 8
+   descriptions_file: datasets/Amzn13K/names_descriptions.json
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500.txt
+
+   max_descs_per_label: 5
+   contrastive_learning_samples: 6000
+   cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-tiny
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: false
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 4
+   gradient_accumulation_steps: 1
+   per_device_eval_batch_size: 4
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 10000
+   evaluation_strategy: steps
+   eval_steps: 1000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 20000
+
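One point worth keeping in mind when comparing these configs: the effective train batch size is per_device_train_batch_size × gradient_accumulation_steps × number of devices. A quick check for the file above, assuming a single GPU:

# Effective batch size for amzn13k_baseline.yml (single-GPU assumption):
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
n_gpus = 1  # assumption; scale up for multi-GPU DDP runs
print(per_device_train_batch_size * gradient_accumulation_steps * n_gpus)  # -> 4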
cleaned_code/configs/amzn13k_baseline_descs.yml ADDED
@@ -0,0 +1,81 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: false # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 32
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   # descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+   descriptions_file: datasets/Amzn13K/amzn_summ_descs.json
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   max_descs_per_label: 5
+   contrastive_learning_samples: 3500
+   cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 4000
+   evaluation_strategy: steps
+   eval_steps: 30000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   ignore_data_skip: true
+   # one_hour_job: true
+
cleaned_code/configs/amzn13k_baseline_descs_edaaug.yml ADDED
@@ -0,0 +1,75 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 32
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_eda_aug.json
+   test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   max_descs_per_label: 1000
+   contrastive_learning_samples: 3500
+   cl_min_positive_descs: 1
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 2000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
cleaned_code/configs/amzn13k_baseline_descs_fullsup.yml ADDED
@@ -0,0 +1,74 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train.jsonl
+   validation_file: datasets/Amzn13K/test.jsonl
+   test_file: datasets/Amzn13K/test.jsonl
+   label_max_seq_length: 32
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+   # all_labels: datasets/Amzn13K/all_labels.txt
+   # test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   # max_descs_per_label: 10
+   contrastive_learning_samples: 5000
+   cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-tiny
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 4
+   gradient_accumulation_steps: 1
+   per_device_eval_batch_size: 2
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 30000
+   evaluation_strategy: steps
+   eval_steps: 5000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "seen"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+
cleaned_code/configs/amzn13k_baseline_descs_masked_0.0.yml ADDED
@@ -0,0 +1,75 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 32
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.0.json
+   test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   max_descs_per_label: 1000
+   contrastive_learning_samples: 3500
+   cl_min_positive_descs: 1
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 2000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
cleaned_code/configs/amzn13k_baseline_descs_masked_0.2.yml ADDED
@@ -0,0 +1,75 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 32
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.2.json
+   test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   max_descs_per_label: 1000
+   contrastive_learning_samples: 3500
+   cl_min_positive_descs: 1
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 2000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
cleaned_code/configs/amzn13k_baseline_descs_masked_0.5.yml ADDED
@@ -0,0 +1,75 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 32
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.5.json
+   test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   max_descs_per_label: 1000
+   contrastive_learning_samples: 3500
+   cl_min_positive_descs: 1
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 2000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
cleaned_code/configs/amzn13k_baseline_descs_masked_0.9.yml ADDED
@@ -0,0 +1,75 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 32
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.9.json
+   test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   max_descs_per_label: 1000
+   contrastive_learning_samples: 3500
+   cl_min_positive_descs: 1
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 2000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
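The four masked_0.0 / masked_0.2 / masked_0.5 / masked_0.9 configs are identical except for the masking ratio baked into descriptions_file; test_descriptions_file always points at the unmasked descriptions. A hypothetical generator for this ablation family (a sketch, not code from this repo):

import yaml

# Hypothetical sketch: clone a base config and point DATA.descriptions_file
# at each masked variant to reproduce the four files above.
with open("cleaned_code/configs/amzn13k_baseline_descs_masked_0.0.yml") as f:
    base = yaml.safe_load(f)

for ratio in ("0.0", "0.2", "0.5", "0.9"):
    cfg = dict(base)
    cfg["DATA"] = dict(base["DATA"])
    cfg["DATA"]["descriptions_file"] = (
        f"datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_{ratio}.json"
    )
    with open(f"cleaned_code/configs/amzn13k_baseline_descs_masked_{ratio}.yml", "w") as f:
        yaml.safe_dump(cfg, f, sort_keys=False)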
cleaned_code/configs/amzn13k_baseline_descs_merge.yml ADDED
@@ -0,0 +1,76 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 80
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_merge3.json
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   # max_descs_per_label: 10
+   contrastive_learning_samples: 5000
+   cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-tiny
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: true
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 2
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 2
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 10000
+   evaluation_strategy: steps
+   eval_steps: 1000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+
cleaned_code/configs/amzn13k_baseline_fs.yml ADDED
@@ -0,0 +1,80 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_fs100.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 32
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   # max_descs_per_label: 10
+   # contrastive_learning_samples: 5000
+   # cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs100.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 5.e-5
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 8
+   per_device_eval_batch_size: 1
+   learning_rate: 2.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 10
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   # ignore_data_skip: true
+   # one_hour_job: true
cleaned_code/configs/amzn13k_baseline_fs2.yml ADDED
@@ -0,0 +1,80 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: false # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_fs5.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_fs5.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_fs5.jsonl
+   label_max_seq_length: 32
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_fs5.txt
+
+   # max_descs_per_label: 10
+   # contrastive_learning_samples: 5000
+   # cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 5.e-5
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: true
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 2.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 10000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   # ignore_data_skip: true
+   # one_hour_job: true
cleaned_code/configs/amzn13k_baseline_fs5.yml ADDED
@@ -0,0 +1,80 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_fs5.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 32
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   # max_descs_per_label: 10
+   # contrastive_learning_samples: 5000
+   # cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 5.e-5
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 8
+   per_device_eval_batch_size: 1
+   learning_rate: 2.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 10
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   # ignore_data_skip: true
+   # one_hour_job: true
cleaned_code/configs/amzn13k_baseline_hierdescs.yml ADDED
@@ -0,0 +1,84 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 96
+   # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+   descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
+   test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   contrastive_learning_samples: 2000
+   cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+   coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   do_predict: false
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 8
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 4900
+   evaluation_strategy: steps
+   eval_steps: 5000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   ignore_data_skip: true
+   # one_hour_job: true
+
cleaned_code/configs/amzn13k_baseline_hierdescs_seen.yml ADDED
@@ -0,0 +1,82 @@
+
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: amazon13k
+   dataset_name: amazon13k
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true # Set to false, if using one_hour_job
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/Amzn13K/train_split6500_2.jsonl
+   validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+   label_max_seq_length: 96
+   descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+   # descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
+   # test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+   all_labels: datasets/Amzn13K/all_labels.txt
+   test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+   contrastive_learning_samples: 2000
+   cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+   # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-small
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: true
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   colbert: false
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 3
+   save_steps: 5000
+   evaluation_strategy: steps
+   eval_steps: 1000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   max_eval_samples: 15000
+   # ignore_data_skip: true
+   # one_hour_job: true
+
cleaned_code/configs/baseline.yml ADDED
@@ -0,0 +1,52 @@
+ EXP_NAME: "eurlex4k_baseline_128_newds"
+ EXP_DESC: "Eurlex4K Baseline with len=128 on new dataset"
+ # Ideally would contain all the possible keys
+
+ DATA:
+   task_name: eurlex4k
+   dataset_name: eurlex
+   dataset_config_name: null
+   max_seq_length: 128
+   overwrite_output_dir: true
+   overwrite_cache: true
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/eurlex_raw_text_dataset/train.jsonl
+   validation_file: datasets/eurlex_raw_text_dataset/test.jsonl
+   test_file: datasets/eurlex_raw_text_dataset/test.jsonl
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: false
+   encoder_model_type: bert
+   use_custom_optimizer: null
+
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 8
+   gradient_accumulation_steps: 1
+   learning_rate: 1.e-4 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 30
+   save_steps: 20000
+   evaluation_strategy: steps
+   eval_steps: 10000
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "constant_with_warmup" # defaults to 'linear'
+   dataloader_num_workers: 4
+   label_names: [labels]
+
+
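Most keys in these TRAINING blocks mirror Hugging Face TrainingArguments fields, so a block like the one in configs/baseline.yml maps almost one-to-one onto that class. A hedged sketch (output_dir is an assumption; the diff does not show where the repo sets it):

from transformers import TrainingArguments

# Sketch: the TRAINING block of configs/baseline.yml expressed as
# TrainingArguments. output_dir is assumed, not taken from the diff.
args = TrainingArguments(
    output_dir="output/eurlex4k_baseline_128_newds",  # assumption
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    num_train_epochs=30,
    save_steps=20000,
    evaluation_strategy="steps",
    eval_steps=10000,
    fp16=True,
    fp16_opt_level="O1",
    lr_scheduler_type="constant_with_warmup",
    dataloader_num_workers=4,
    label_names=["labels"],
)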
cleaned_code/configs/eurlex4.3k_baseline.yml ADDED
@@ -0,0 +1,87 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: eurlex57k
+   dataset_name: eurlex
+   dataset_config_name: null
+   max_seq_length: 512
+   overwrite_output_dir: true
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   # train_file: ../training/datasets/eurlex4.3k/train_hr.jsonl
+   # train_file: ../training/datasets/eurlex4.3k/train.jsonl
+   # validation_file: ../training/datasets/eurlex4.3k/test_unseen.jsonl
+   # test_file: ../training/datasets/eurlex4.3k/test_unseen.jsonl
+   # validation_file: ../training/datasets/eurlex4.3k/test.jsonl
+   # test_file: ../training/datasets/eurlex4.3k/test.jsonl
+   train_file: ../training/datasets/eurlex4.3k/train_split1057_1000highfreq.jsonl
+   validation_file: ../training/datasets/eurlex4.3k/test_unseen_split1057.jsonl
+   test_file: ../training/datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+   # validation_file: ../training/datasets/eurlex4.3k/test_unseen_hr.jsonl
+   # test_file: ../training/datasets/eurlex4.3k/test_unseen_hr.jsonl
+   label_max_seq_length: 96
+   descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1.json
+   # descriptions_file: ../training/datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
+   # descriptions_file: ../training/datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+   all_labels: ../training/datasets/eurlex4.3k/all_labels.txt
+   test_labels: ../training/datasets/eurlex4.3k/unseen_labels_split1057.txt
+   # test_labels: ../training/datasets/eurlex4.3k/unseen_labels.txt
+
+   max_descs_per_label: 5
+   # contrastive_learning_samples: 1500
+   # cl_min_positive_descs: 1
+   # bm_short_file: ../training/datasets/eurlex4.3k/train_bmshort.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: prajjwal1/bert-small
+   label_model_name_or_path: bert-base-uncased
+   # label_model_name_or_path: prajjwal1/bert-tiny
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: false
+   normalize_embeddings: false
+   tie_weights: true
+   coil: true
+   # use_precomputed_embeddings: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   num_frozen_layers: 9
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 10
+   save_steps: 10000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+
+
cleaned_code/configs/eurlex4.3k_baseline2.yml ADDED
@@ -0,0 +1,84 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: eurlex57k
+   dataset_name: eurlex
+   dataset_config_name: null
+   max_seq_length: 512
+   overwrite_output_dir: true
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   # train_file: datasets/eurlex4.3k/train_hr.jsonl
+   # train_file: datasets/eurlex4.3k/train.jsonl
+   # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
+   # test_file: datasets/eurlex4.3k/test_unseen.jsonl
+   # validation_file: datasets/eurlex4.3k/test.jsonl
+   # test_file: datasets/eurlex4.3k/test.jsonl
+   train_file: datasets/eurlex4.3k/train_split248_root.jsonl
+   validation_file: datasets/eurlex4.3k/test_unseen_split248_root.jsonl
+   test_file: datasets/eurlex4.3k/test_unseen_split248_root.jsonl
+
+   # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+   # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+   label_max_seq_length: 96
+   descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1.json
+   # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+   all_labels: datasets/eurlex4.3k/all_labels.txt
+   test_labels: datasets/eurlex4.3k/unseen_labels_split248_root.txt
+   # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+   max_descs_per_label: 5
+   contrastive_learning_samples: 2500
+   cl_min_positive_descs: 1
+   bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+   label_model_name_or_path: prajjwal1/bert-tiny
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: false
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 8
+   per_device_eval_batch_size: 2
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 10
+   save_steps: 10000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+
+
cleaned_code/configs/eurlex4.3k_baseline_fs.yml ADDED
@@ -0,0 +1,90 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
2
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
3
+
4
+ DATA:
5
+ task_name: eurlex57k
6
+ dataset_name: eurlex
7
+ dataset_config_name: null
8
+ max_seq_length: 512
9
+ overwrite_output_dir: true
10
+ overwrite_cache: false
11
+ pad_to_max_length: true
12
+ load_from_local: true
13
+ max_train_samples: null
14
+ max_eval_samples: null
15
+ max_predict_samples: null
16
+ # train_file: datasets/eurlex4.3k/train_hr.jsonl
17
+ # train_file: datasets/eurlex4.3k/train.jsonl
18
+ # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
19
+ # test_file: datasets/eurlex4.3k/test_unseen.jsonl
20
+ # validation_file: datasets/eurlex4.3k/test.jsonl
21
+ # test_file: datasets/eurlex4.3k/test.jsonl
22
+ train_file: datasets/eurlex4.3k/train_split1057_fs1.jsonl
23
+ validation_file: datasets/eurlex4.3k/test_unseen_split1057_fs1.jsonl
24
+ test_file: datasets/eurlex4.3k/test_unseen_split1057_fs1.jsonl
25
+
26
+ # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
27
+ # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
28
+ label_max_seq_length: 80
29
+ # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1.json
30
+ descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
31
+ # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
32
+ all_labels : datasets/eurlex4.3k/all_labels.txt
33
+ test_labels: datasets/eurlex4.3k/unseen_labels_split1057_fs1.txt
34
+ # test_labels: datasets/eurlex4.3k/unseen_labels.txt
35
+ ignore_pos_labels_file: datasets/eurlex4.3k/ignore_train_split1057_fs1.txt
36
+
37
+ max_descs_per_label: 5
38
+ contrastive_learning_samples: 600
39
+ cl_min_positive_descs: 2
40
+ # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
41
+
42
+ MODEL:
43
+ model_name_or_path: bert-base-uncased
44
+ pretrained_model_path: output/semsup_descs_100ep_4.3k_unseen_coilsmall_hier/checkpoint-20000/pytorch_model.bin
45
+ config_name: null
46
+ tokenizer_name: null
47
+ cache_dir: null
48
+ use_fast_tokenizer: true
49
+ model_revision: main
50
+ use_auth_token: false
51
+ ignore_mismatched_sizes: false
52
+ negative_sampling: "none"
53
+ semsup: true
54
+ label_model_name_or_path: prajjwal1/bert-small
55
+ # label_model_name_or_path: prajjwal1/bert-tiny
56
+ encoder_model_type: bert
57
+ use_custom_optimizer: adamw
58
+ output_learning_rate: 1.e-4
59
+ arch_type : 2
60
+ add_label_name: false
61
+ normalize_embeddings: false
62
+ tie_weights: false
63
+ coil: true
64
+ # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
65
+ token_dim: 16
66
+ label_frozen_layers: 2
67
+
68
+ TRAINING:
69
+ do_train: true
70
+ do_eval: true
71
+ per_device_train_batch_size: 1
72
+ gradient_accumulation_steps: 4
73
+ per_device_eval_batch_size: 1
74
+ learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
75
+ num_train_epochs: 100
76
+ save_steps: 10000
77
+ evaluation_strategy: steps
78
+ eval_steps: 100
79
+ fp16: true
80
+ fp16_opt_level: O1
81
+ lr_scheduler_type: "linear" # defaults to 'linear'
82
+ dataloader_num_workers: 1
83
+ label_names: [labels]
84
+ scenario: "unseen_labels"
85
+
86
+ ddp_find_unused_parameters: false
87
+ # ignore_data_skip: true
88
+ # one_hour_job: true
89
+
90
+
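The few-shot eurlex configs warm-start from pretrained_model_path rather than training from scratch. Since the SemSup model class itself is not part of this diff, the sketch below only loads and inspects the referenced checkpoint using the standard torch pattern:

import torch

# Sketch: inspect the checkpoint referenced by pretrained_model_path before
# warm-starting few-shot training. The saved tensors would normally be fed
# to model.load_state_dict on the repo's SemSup model.
ckpt = "output/semsup_descs_100ep_4.3k_unseen_coilsmall_hier/checkpoint-20000/pytorch_model.bin"
state_dict = torch.load(ckpt, map_location="cpu")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))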
cleaned_code/configs/eurlex4.3k_baseline_fs20.yml ADDED
@@ -0,0 +1,90 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: eurlex57k
+   dataset_name: eurlex
+   dataset_config_name: null
+   max_seq_length: 512
+   overwrite_output_dir: true
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   # train_file: datasets/eurlex4.3k/train_hr.jsonl
+   # train_file: datasets/eurlex4.3k/train.jsonl
+   # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
+   # test_file: datasets/eurlex4.3k/test_unseen.jsonl
+   # validation_file: datasets/eurlex4.3k/test.jsonl
+   # test_file: datasets/eurlex4.3k/test.jsonl
+   train_file: datasets/eurlex4.3k/train_split1057_fs20.jsonl
+   validation_file: datasets/eurlex4.3k/test_unseen_split1057_fs20.jsonl
+   test_file: datasets/eurlex4.3k/test_unseen_split1057_fs20.jsonl
+
+   # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+   # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+   label_max_seq_length: 80
+   # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1.json
+   descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
+   # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+   all_labels: datasets/eurlex4.3k/all_labels.txt
+   test_labels: datasets/eurlex4.3k/unseen_labels_split1057_fs20.txt
+   # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+   ignore_pos_labels_file: datasets/eurlex4.3k/ignore_train_split1057_fs20.txt
+
+   max_descs_per_label: 5
+   contrastive_learning_samples: 600
+   cl_min_positive_descs: 2
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   pretrained_model_path: output/semsup_descs_100ep_4.3k_unseen_coilsmall_hier/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   label_model_name_or_path: prajjwal1/bert-small
+   # label_model_name_or_path: prajjwal1/bert-tiny
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: false
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
+   num_train_epochs: 20
+   save_steps: 10000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 1
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   # ignore_data_skip: true
+   # one_hour_job: true
+
+
cleaned_code/configs/eurlex4.3k_baseline_fs5.yml ADDED
@@ -0,0 +1,78 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: eurlex57k
+   dataset_name: eurlex
+   dataset_config_name: null
+   max_seq_length: 512
+   overwrite_output_dir: true
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   train_file: datasets/eurlex4.3k/train_split1057_fs5.jsonl
+   validation_file: datasets/eurlex4.3k/test_unseen_split1057_fs5.jsonl
+   test_file: datasets/eurlex4.3k/test_unseen_split1057_fs5.jsonl
+
+   label_max_seq_length: 80
+   descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
+   all_labels: datasets/eurlex4.3k/all_labels.txt
+   test_labels: datasets/eurlex4.3k/unseen_labels_split1057_fs5.txt
+   ignore_pos_labels_file: datasets/eurlex4.3k/ignore_train_split1057_fs5.txt
+
+   max_descs_per_label: 5
+   contrastive_learning_samples: 600
+   cl_min_positive_descs: 2
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   pretrained_model_path: output/semsup_descs_100ep_4.3k_unseen_coilsmall_hier/checkpoint-20000/pytorch_model.bin
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   label_model_name_or_path: prajjwal1/bert-small
+   # label_model_name_or_path: prajjwal1/bert-tiny
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: false
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Points to the input encoder lr if use_custom_optimizer is false
+   num_train_epochs: 20
+   save_steps: 10000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 1
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+   # ignore_data_skip: true
+   # one_hour_job: true
+
+
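The fs5 variant above differs from fs20 only in the shot count encoded in its data paths. A small sketch of that naming convention, assuming it holds for other shot counts as well; `fewshot_paths` is a hypothetical helper, not part of this repository:

def fewshot_paths(split_id, shots):
    # Hypothetical sketch: derive the companion dataset paths for a given
    # split id and shot count, following the pattern used in these configs.
    base = f"datasets/eurlex4.3k"
    return {
        "train_file": f"{base}/train_split{split_id}_fs{shots}.jsonl",
        "validation_file": f"{base}/test_unseen_split{split_id}_fs{shots}.jsonl",
        "test_file": f"{base}/test_unseen_split{split_id}_fs{shots}.jsonl",
        "test_labels": f"{base}/unseen_labels_split{split_id}_fs{shots}.txt",
        "ignore_pos_labels_file": f"{base}/ignore_train_split{split_id}_fs{shots}.txt",
    }

# fewshot_paths(1057, 5) reproduces the five paths in the fs5 config above.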
cleaned_code/configs/eurlex4.3k_baseline_nl.yml ADDED
@@ -0,0 +1,88 @@
+ EXP_NAME: "semsup_descs_100ep_newds_cosine"
+ EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+ DATA:
+   task_name: eurlex57k
+   dataset_name: eurlex
+   dataset_config_name: null
+   max_seq_length: 512
+   overwrite_output_dir: true
+   overwrite_cache: false
+   pad_to_max_length: true
+   load_from_local: true
+   max_train_samples: null
+   max_eval_samples: null
+   max_predict_samples: null
+   # train_file: datasets/eurlex4.3k/train_hr.jsonl
+   # train_file: datasets/eurlex4.3k/train.jsonl
+   # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
+   # test_file: datasets/eurlex4.3k/test_unseen.jsonl
+   # validation_file: datasets/eurlex4.3k/test.jsonl
+   # test_file: datasets/eurlex4.3k/test.jsonl
+   train_file: datasets/eurlex4.3k/train_split1057.jsonl
+   validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+   test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+   # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+   # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+   label_max_seq_length: 96
+   descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+   # descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
+   # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+   all_labels: datasets/eurlex4.3k/all_labels.txt
+   test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+   # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+   max_descs_per_label: 5
+   # contrastive_learning_samples: 1500
+   # cl_min_positive_descs: 1
+   # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+
+ MODEL:
+   model_name_or_path: bert-base-uncased
+   config_name: null
+   tokenizer_name: null
+   cache_dir: null
+   use_fast_tokenizer: true
+   model_revision: main
+   use_auth_token: false
+   ignore_mismatched_sizes: false
+   negative_sampling: "none"
+   semsup: true
+   label_model_name_or_path: prajjwal1/bert-small
+   # label_model_name_or_path: bert-base-uncased
+   # label_model_name_or_path: prajjwal1/bert-tiny
+   encoder_model_type: bert
+   use_custom_optimizer: adamw
+   output_learning_rate: 1.e-4
+   arch_type: 2
+   add_label_name: false
+   normalize_embeddings: false
+   tie_weights: false
+   coil: true
+   # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+   token_dim: 16
+   # num_frozen_layers: 9
+   label_frozen_layers: 2
+
+ TRAINING:
+   do_train: true
+   do_eval: true
+   per_device_train_batch_size: 1
+   gradient_accumulation_steps: 4
+   per_device_eval_batch_size: 1
+   learning_rate: 5.e-5 # Points to the input encoder lr if use_custom_optimizer is false
+   num_train_epochs: 10
+   save_steps: 10000
+   evaluation_strategy: steps
+   eval_steps: 500
+   fp16: true
+   fp16_opt_level: O1
+   lr_scheduler_type: "linear" # defaults to 'linear'
+   dataloader_num_workers: 8
+   label_names: [labels]
+   scenario: "unseen_labels"
+
+   ddp_find_unused_parameters: false
+
+
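As in the other configs, descriptions_file maps labels to textual descriptions and max_descs_per_label caps how many are used per label. A minimal sketch of that capping step, assuming the JSON is a label-to-list-of-strings mapping; load_descriptions is a hypothetical helper, not code from this commit:

import json
import random

def load_descriptions(path, max_descs_per_label=5, seed=0):
    # Hypothetical sketch: keep at most max_descs_per_label descriptions per
    # label, sampled without replacement when more are available.
    rng = random.Random(seed)
    with open(path) as f:
        descs = json.load(f)
    return {
        label: rng.sample(texts, min(len(texts), max_descs_per_label))
        for label, texts in descs.items()
    }

# e.g. load_descriptions("datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json")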