MasumBhuiyan committed on
Commit c5dc1d4
1 Parent(s): 6543d58

Seq2Seq model implemented

.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/bn_multi_tribe_mt.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="Black">
+     <option name="sdkName" value="C:\ProgramData\miniconda3" />
+   </component>
+   <component name="ProjectRootManager" version="2" project-jdk-name="C:\ProgramData\miniconda3" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/bn_multi_tribe_mt.iml" filepath="$PROJECT_DIR$/.idea/bn_multi_tribe_mt.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="" vcs="Git" />
+   </component>
+ </project>
src/__init__.py ADDED
File without changes
src/pipes/data.py CHANGED
@@ -1,8 +1,9 @@
  import random
- import const
- import utils
+ from pipes import const
+ from pipes import utils
  import string
- 
+ import tensorflow as tf
+ import numpy as np
  
  class SequenceLoader:
      def __init__(self):
@@ -40,8 +41,9 @@ class SequenceLoader:
  
  
  def serialize(src_seq, tar_seq):
-     tar_seq_in = tar_seq[:, :-1].to_tensor()
-     tar_seq_out = tar_seq[:, 1:].to_tensor()
+     tar_seq_in = tf.convert_to_tensor(tar_seq[:, :-1])
+     tar_seq_out = tf.convert_to_tensor(tar_seq[:, 1:])
+     src_seq = tf.convert_to_tensor(src_seq)
      return (src_seq, tar_seq_in), tar_seq_out
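For context (an editorial note, not part of the commit): the rewritten serialize prepares teacher-forcing pairs. The decoder input drops the last token of each target sequence and the training target drops the first, so at every position the model learns to predict the next token; the old .to_tensor() calls assumed RaggedTensor inputs, while tf.convert_to_tensor accepts the dense arrays that pull() now produces. A minimal sketch of the shift on a toy batch, with made-up token ids:

import tensorflow as tf

# Toy padded target batch: 1 = <start>, 2 = <end>, 0 = padding.
tar_seq = tf.constant([[1, 5, 6, 2, 0],
                       [1, 7, 2, 0, 0]])

tar_seq_in = tar_seq[:, :-1]    # decoder input: keeps <start>, drops the last slot
tar_seq_out = tar_seq[:, 1:]    # training target: the same ids shifted left by one

print(tar_seq_in.numpy())    # [[1 5 6 2] [1 7 2 0]]
print(tar_seq_out.numpy())   # [[5 6 2 0] [7 2 0 0]]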
@@ -165,28 +167,27 @@ class Dataset:
          self.dataset_dict = seq_processor.get_dict()
  
      def pull(self):
-         src_lang_train_seqs = self.dataset_dict[self.langs[0]]["train"]
-         tar_lang_train_seqs = self.dataset_dict[self.langs[1]]["train"]
+         src_lang_train_seqs = np.array(self.dataset_dict[self.langs[0]]["train"])
+         tar_lang_train_seqs = np.array(self.dataset_dict[self.langs[1]]["train"])
  
-         src_lang_val_seqs = self.dataset_dict[self.langs[0]]["val"]
-         tar_lang_val_seqs = self.dataset_dict[self.langs[1]]["val"]
+         src_lang_val_seqs = np.array(self.dataset_dict[self.langs[0]]["val"])
+         tar_lang_val_seqs = np.array(self.dataset_dict[self.langs[1]]["val"])
  
          train_ds = ((tf.data.Dataset
                      .from_tensor_slices((src_lang_train_seqs, tar_lang_train_seqs)))
                      .shuffle(const.BUFFER_SIZE)
                      .batch(const.BATCH_SIZE))
  
-         val_ds = (tf.data.Dataset
-                   .from_tensor_slices(src_lang_val_seqs, tar_lang_val_seqs)
+         val_ds = ((tf.data.Dataset
+                   .from_tensor_slices((src_lang_val_seqs, tar_lang_val_seqs)))
                    .shuffle(const.BUFFER_SIZE)
                    .batch(const.BATCH_SIZE))
  
          train_ds = train_ds.map(serialize, tf.data.AUTOTUNE)
          val_ds = val_ds.map(serialize, tf.data.AUTOTUNE)
  
-         return trainset, valset
+         return train_ds, val_ds
  
-     @staticmethod
      def get_dict(self):
          return self.dataset_dict
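For context (an editorial note, not part of the commit): pull() now wraps the token-id lists in np.array because tf.data.Dataset.from_tensor_slices requires rectangular tensors, and it fixes the old validation-set bug where the two slices were passed as separate positional arguments instead of one tuple. A self-contained sketch of the same pipeline shape, with illustrative sizes standing in for const.BUFFER_SIZE and const.BATCH_SIZE:

import numpy as np
import tensorflow as tf

def serialize(src_seq, tar_seq):
    # Mapped after batching, so tar_seq has shape (batch, seq_len) here.
    return (src_seq, tar_seq[:, :-1]), tar_seq[:, 1:]

src = np.random.randint(1, 100, size=(6, 5))   # 6 padded source sequences
tar = np.random.randint(1, 100, size=(6, 5))   # 6 padded target sequences

ds = (tf.data.Dataset
      .from_tensor_slices((src, tar))   # one (src, tar) pair per element
      .shuffle(6)                       # const.BUFFER_SIZE in the real code
      .batch(2)                         # const.BATCH_SIZE
      .map(serialize, tf.data.AUTOTUNE))

(src_batch, tar_in), tar_out = next(iter(ds))
print(src_batch.shape, tar_in.shape, tar_out.shape)   # (2, 5) (2, 4) (2, 4)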
src/{seq2seqtrainer.py → seq2seq_trainer.py} RENAMED
@@ -1,5 +1,8 @@
- from pipes import models, utils, const
+ from pipes import utils
+ from pipes import const
+ from pipes import models
  from pipes.data import Dataset
+ import tensorflow as tf
  
  if __name__ == "__main__":
      input_lang = 'gr'
@@ -8,17 +11,30 @@ if __name__ == "__main__":
      dataset_object = Dataset([input_lang, output_lang])
      dataset_object.pack()
      dataset_object.process()
+     train_ds, val_ds = dataset_object.pull()
      dataset_dict = dataset_object.get_dict()
  
-     seq2seq = models.Seq2Seq(
+     model_object = models.Seq2Seq(
          input_vocab_size=dataset_dict[input_lang]["vocab_size"],
          output_vocab_size=dataset_dict[output_lang]["vocab_size"],
          embedding_dim=256,
-         hidden_units=64)
-     seq2seq.build()
-     seq2seq.run(
-         encoder_input_data=dataset_dict[input_lang]["train"],
-         decoder_input_data=dataset_dict[output_lang]["train"],
-         val_encoder_input_data=dataset_dict[input_lang]["val"],
-         val_decoder_input_data=dataset_dict[output_lang]["val"],
+         hidden_units=512
+     )
+ 
+     model_object.build()
+     model = model_object.get()
+ 
+     model.compile(
+         optimizer=tf.keras.optimizers.Adam(),
+         loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+         metrics=['accuracy'],
+     )
+ 
+     history = model.fit(
+         train_ds.repeat(),
+         epochs=10,
+         steps_per_epoch=100,
+         validation_steps=20,
+         validation_data=val_ds,
+         callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)]
      )
 
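For context (an editorial note, not part of the commit): train_ds.repeat() turns the finite dataset into an endless stream, so steps_per_epoch is what defines an epoch, and validation_steps caps how many validation batches are drawn each epoch. EarlyStopping monitors val_loss by default and stops after three epochs without improvement; Keras reports val_accuracy on its own whenever validation data is supplied, so only 'accuracy' needs to be listed in metrics. A minimal runnable sketch of the same compile/fit pattern on dummy data, with illustrative shapes in place of the project's model:

import numpy as np
import tensorflow as tf

# Dummy stand-in for the Seq2Seq model: 10 features in, 4 classes out.
x = np.random.rand(64, 10).astype("float32")
y = np.random.randint(0, 4, size=(64,))
train_ds = tf.data.Dataset.from_tensor_slices((x, y)).batch(8)
val_ds = tf.data.Dataset.from_tensor_slices((x, y)).batch(8)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(4, activation="softmax"),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],          # val_accuracy is derived automatically
)

history = model.fit(
    train_ds.repeat(),             # infinite stream, so an epoch needs a step count
    epochs=10,
    steps_per_epoch=8,             # 64 samples / batch size 8
    validation_data=val_ds,
    validation_steps=8,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)],
)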