tikim
committed on
Commit
•
933313b
1
Parent(s):
71c6c74
Update training.ipynb to v0.2.0
Browse files- training.ipynb +30 -13
training.ipynb
CHANGED
@@ -109,6 +109,8 @@
|
|
109 |
" self.data = [row for row in reader]\n",
|
110 |
"\n",
|
111 |
" def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:\n",
|
|
|
|
|
112 |
" src, trg = self.data[index]\n",
|
113 |
" embeddings = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False)\n",
|
114 |
" embeddings['labels'] = self.trg_tokenizer.build_inputs_with_special_tokens(self.trg_tokenizer(trg, return_attention_mask=False)['input_ids'])\n",
|
@@ -121,13 +123,26 @@
|
|
121 |
"DATA_ROOT = './output'\n",
|
122 |
"FILE_FFAC_FULL = 'ffac_full.csv'\n",
|
123 |
"FILE_FFAC_TEST = 'ffac_test.csv'\n",
|
124 |
-
"
|
125 |
-
"
|
126 |
"\n",
|
127 |
-
"train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_FULL}')\n",
|
128 |
-
"eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_TEST}') \n",
|
129 |
-
"
|
130 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
]
|
132 |
},
|
133 |
{
|
@@ -175,12 +190,12 @@
|
|
175 |
" do_eval=True,\n",
|
176 |
" evaluation_strategy=\"epoch\",\n",
|
177 |
" save_strategy=\"epoch\",\n",
|
178 |
-
"
|
179 |
-
" num_train_epochs=25,\n",
|
180 |
-
"
|
181 |
-
" per_device_train_batch_size=64,\n",
|
182 |
-
"
|
183 |
-
" per_device_eval_batch_size=64,\n",
|
184 |
" warmup_ratio=0.1,\n",
|
185 |
" gradient_accumulation_steps=4,\n",
|
186 |
" save_total_limit=5,\n",
|
@@ -229,7 +244,9 @@
|
|
229 |
"source": [
|
230 |
"trainer.train()\n",
|
231 |
"\n",
|
232 |
-
"model.save_pretrained(\"dump/best_model\")"
|
|
|
|
|
233 |
]
|
234 |
}
|
235 |
],
|
|
|
109 |
" self.data = [row for row in reader]\n",
|
110 |
"\n",
|
111 |
" def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:\n",
|
112 |
+
"# with open('train_log.txt', 'a+') as log_file:\n",
|
113 |
+
"# log_file.write(f'reading data[{index}] {self.data[index]}\\n')\n",
|
114 |
" src, trg = self.data[index]\n",
|
115 |
" embeddings = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False)\n",
|
116 |
" embeddings['labels'] = self.trg_tokenizer.build_inputs_with_special_tokens(self.trg_tokenizer(trg, return_attention_mask=False)['input_ids'])\n",
|
|
|
123 |
"DATA_ROOT = './output'\n",
|
124 |
"FILE_FFAC_FULL = 'ffac_full.csv'\n",
|
125 |
"FILE_FFAC_TEST = 'ffac_test.csv'\n",
|
126 |
+
"FILE_JA_KO_TRAIN = 'ja_ko_train.csv'\n",
|
127 |
+
"FILE_JA_KO_TEST = 'ja_ko_test.csv'\n",
|
128 |
"\n",
|
129 |
+
"# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_FULL}')\n",
|
130 |
+
"# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_TEST}') \n",
|
131 |
+
"train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_JA_KO_TRAIN}')\n",
|
132 |
+
"eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_JA_KO_TEST}') "
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"execution_count": null,
|
138 |
+
"metadata": {},
|
139 |
+
"outputs": [],
|
140 |
+
"source": [
|
141 |
+
"# be sure to check the column count of each dataset if you encounter \"ValueError: too many values to unpack (expected 2)\"\n",
|
142 |
+
"# at the `src, trg = self.data[index]`\n",
|
143 |
+
"# The `cat ffac_full.csv tteb_train.csv > ja_ko_train.csv` command may be the reason.\n",
|
144 |
+
"# the last row of first csv and first row of second csv is merged and that's why 3rd column is created (which arouse ValueError)\n",
|
145 |
+
"# debug_data = train_dataset.data\n"
|
146 |
]
|
147 |
},
|
148 |
{
|
|
|
190 |
" do_eval=True,\n",
|
191 |
" evaluation_strategy=\"epoch\",\n",
|
192 |
" save_strategy=\"epoch\",\n",
|
193 |
+
" num_train_epochs=3,\n",
|
194 |
+
" # num_train_epochs=25,\n",
|
195 |
+
" per_device_train_batch_size=30,\n",
|
196 |
+
" # per_device_train_batch_size=64,\n",
|
197 |
+
" per_device_eval_batch_size=30,\n",
|
198 |
+
" # per_device_eval_batch_size=64,\n",
|
199 |
" warmup_ratio=0.1,\n",
|
200 |
" gradient_accumulation_steps=4,\n",
|
201 |
" save_total_limit=5,\n",
|
|
|
244 |
"source": [
|
245 |
"trainer.train()\n",
|
246 |
"\n",
|
247 |
+
"model.save_pretrained(\"dump/best_model\")\n",
|
248 |
+
"src_tokenizer.save_pretrained(\"dump/best_model/src_tokenizer\")\n",
|
249 |
+
"trg_tokenizer.save_pretrained(\"dump/best_model/trg_tokenizer\")"
|
250 |
]
|
251 |
}
|
252 |
],
|