tikim committed on
Commit
933313b
1 Parent(s): 71c6c74

Update training.ipynb to v0.2.0

Files changed (1)
  1. training.ipynb +30 -13
training.ipynb CHANGED
@@ -109,6 +109,8 @@
 " self.data = [row for row in reader]\n",
 "\n",
 " def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:\n",
+"# with open('train_log.txt', 'a+') as log_file:\n",
+"# log_file.write(f'reading data[{index}] {self.data[index]}\\n')\n",
 " src, trg = self.data[index]\n",
 " embeddings = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False)\n",
 " embeddings['labels'] = self.trg_tokenizer.build_inputs_with_special_tokens(self.trg_tokenizer(trg, return_attention_mask=False)['input_ids'])\n",
@@ -121,13 +123,26 @@
 "DATA_ROOT = './output'\n",
 "FILE_FFAC_FULL = 'ffac_full.csv'\n",
 "FILE_FFAC_TEST = 'ffac_test.csv'\n",
-"# FILE_JA_KO_TRAIN = 'ja_ko_train.csv'\n",
-"# FILE_JA_KO_TEST = 'ja_ko_test.csv'\n",
+"FILE_JA_KO_TRAIN = 'ja_ko_train.csv'\n",
+"FILE_JA_KO_TEST = 'ja_ko_test.csv'\n",
 "\n",
-"train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_FULL}')\n",
-"eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_TEST}') \n",
-"# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_JA_KO_TRAIN}')\n",
-"# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_JA_KO_TEST}') "
+"# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_FULL}')\n",
+"# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_TEST}') \n",
+"train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_JA_KO_TRAIN}')\n",
+"eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_JA_KO_TEST}') "
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# Be sure to check the column count of each dataset if you encounter \"ValueError: too many values to unpack (expected 2)\"\n",
+"# at `src, trg = self.data[index]`.\n",
+"# The `cat ffac_full.csv tteb_train.csv > ja_ko_train.csv` command may be the reason:\n",
+"# the last row of the first CSV and the first row of the second CSV get merged, creating a third column (hence the ValueError).\n",
+"# debug_data = train_dataset.data\n"
 ]
 },
 {
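The debugging cell added above traces a real failure mode: `cat ffac_full.csv tteb_train.csv > ja_ko_train.csv` fuses the last row of the first file with the first row of the second whenever the first file lacks a trailing newline, yielding a three-column row that breaks `src, trg = self.data[index]`. A minimal sketch, not part of this commit (`merge_csvs` is a hypothetical helper name), of merging the files in Python so row boundaries are preserved:

import csv

def merge_csvs(paths, out_path):
    # Re-parse every row and re-write it, so line endings are normalized
    # and a missing trailing newline in one input cannot fuse two rows.
    with open(out_path, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        for path in paths:
            with open(path, newline='', encoding='utf-8') as in_file:
                for row in csv.reader(in_file):
                    if len(row) != 2:  # fail fast here instead of inside __getitem__
                        raise ValueError(f'{path}: expected 2 columns, got {len(row)}')
                    writer.writerow(row)

merge_csvs(['ffac_full.csv', 'tteb_train.csv'], 'ja_ko_train.csv')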
@@ -175,12 +190,12 @@
 " do_eval=True,\n",
 " evaluation_strategy=\"epoch\",\n",
 " save_strategy=\"epoch\",\n",
-"# num_train_epochs=5,\n",
-" num_train_epochs=25,\n",
-"# per_device_train_batch_size=32,\n",
-" per_device_train_batch_size=64,\n",
-"# per_device_eval_batch_size=32,\n",
-" per_device_eval_batch_size=64,\n",
+" num_train_epochs=3,\n",
+" # num_train_epochs=25,\n",
+" per_device_train_batch_size=30,\n",
+" # per_device_train_batch_size=64,\n",
+" per_device_eval_batch_size=30,\n",
+" # per_device_eval_batch_size=64,\n",
 " warmup_ratio=0.1,\n",
 " gradient_accumulation_steps=4,\n",
 " save_total_limit=5,\n",
@@ -229,7 +244,9 @@
 "source": [
 "trainer.train()\n",
 "\n",
-"model.save_pretrained(\"dump/best_model\")"
+"model.save_pretrained(\"dump/best_model\")\n",
+"src_tokenizer.save_pretrained(\"dump/best_model/src_tokenizer\")\n",
+"trg_tokenizer.save_pretrained(\"dump/best_model/trg_tokenizer\")"
 ]
 }
 ],
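The final cell now saves the tokenizers alongside the model, so a fresh session can restore all three artifacts. A minimal sketch, not from the notebook, of reloading them; the concrete model class is an assumption (substitute whatever class the notebook actually trains):

from transformers import AutoTokenizer, EncoderDecoderModel

# Paths match the save_pretrained() calls in this commit; EncoderDecoderModel
# is an assumed architecture -- replace it with the notebook's real one.
model = EncoderDecoderModel.from_pretrained('dump/best_model')
src_tokenizer = AutoTokenizer.from_pretrained('dump/best_model/src_tokenizer')
trg_tokenizer = AutoTokenizer.from_pretrained('dump/best_model/trg_tokenizer')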