MasumBhuiyan committed on
Commit
e13f31a
1 Parent(s): dcd73d9

Updated Data Processing module. Seq2Seq model added (unverified)

Browse files
Files changed (3) hide show
  1. src/pipes/data.py +1 -2
  2. src/pipes/models.py +71 -0
  3. src/seq2seqtrainer.py +24 -0
src/pipes/data.py CHANGED
@@ -1,6 +1,5 @@
1
  import random
2
- import utils
3
- import const
4
 
5
 
6
  class Sentence:
 
1
  import random
2
+ from pipes import utils, const
 
3
 
4
 
5
  class Sentence:
src/pipes/models.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+
3
+
4
+ class Seq2Seq:
5
+ def __init__(self,
6
+ input_vocab_size,
7
+ output_vocab_size,
8
+ embedding_dim,
9
+ hidden_units):
10
+ self.epochs = 10
11
+ self.batch_size = 64
12
+ self.metrics = ['accuracy']
13
+ self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
14
+ self.optimizer = tf.keras.optimizers.Adam()
15
+ self.model = None
16
+ self.embedding_dim = embedding_dim
17
+ self.hidden_units = hidden_units
18
+ self.input_vocab_size = input_vocab_size
19
+ self.output_vocab_size = output_vocab_size
20
+ self.encoder_embedding = tf.keras.layers.Embedding(self.input_vocab_size, self.embedding_dim)
21
+ self.encoder = tf.keras.layers.LSTM(self.hidden_units, return_sequences=True, return_state=True)
22
+ self.decoder_embedding = tf.keras.layers.Embedding(self.output_vocab_size, self.embedding_dim)
23
+ self.decoder = tf.keras.layers.LSTM(self.hidden_units, return_sequences=True, return_state=True)
24
+ self.output_layer = tf.keras.layers.Dense(self.output_vocab_size, activation='softmax')
25
+
26
+ def build(self):
27
+ encoder_inputs = tf.keras.Input(shape=(None,))
28
+ encoder_embedding = self.encoder_embedding(encoder_inputs)
29
+ encoder_outputs, state_h, state_c = self.encoder(encoder_embedding)
30
+ encoder_states = [state_h, state_c]
31
+ decoder_inputs = tf.keras.Input(shape=(None,))
32
+ decoder_embedding = self.decoder_embedding(decoder_inputs)
33
+ decoder_outputs, _, _ = self.decoder(decoder_embedding, initial_state=encoder_states)
34
+ outputs = self.output_layer(decoder_outputs)
35
+ self.model = tf.keras.Model([encoder_inputs, decoder_inputs], outputs)
36
+
37
+ def run(self, encoder_input_data, decoder_input_data, val_encoder_input_data, val_decoder_input_data):
38
+ self.model.compile(
39
+ optimizer=self.optimizer,
40
+ loss=self.loss,
41
+ metrics=self.metrics
42
+ )
43
+
44
+ decoder_target_data = [[sentence[1:] + [0]] for sentence in decoder_input_data]
45
+ val_decoder_target_data = [[sentence[1:] + [0]] for sentence in val_decoder_input_data]
46
+
47
+ self.model.fit(
48
+ ([encoder_input_data, decoder_input_data]),
49
+ decoder_target_data,
50
+ batch_size=self.batch_size,
51
+ epochs=self.epochs,
52
+ validation_data=([val_encoder_input_data, val_decoder_input_data], val_decoder_target_data)
53
+ )
54
+
55
+ def get(self):
56
+ return self.model
57
+
58
+ def set_epochs(self, epochs):
59
+ self.epochs = epochs
60
+
61
+ def set_batch_size(self, batch_size):
62
+ self.batch_size = batch_size
63
+
64
+ def set_loss(self, loss):
65
+ self.loss = loss
66
+
67
+ def set_optimizer(self, optimizer):
68
+ self.optimizer = optimizer
69
+
70
+ def set_metric(self, metrics):
71
+ self.metrics = metrics
src/seq2seqtrainer.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pipes import models, utils, const
2
+ from pipes.data import Dataset
3
+
4
+ if __name__ == "__main__":
5
+ input_lang = 'gr'
6
+ output_lang = 'bn'
7
+
8
+ dataset_object = Dataset([input_lang, output_lang])
9
+ dataset_object.pack()
10
+ dataset_object.process()
11
+ dataset_dict = dataset_object.get_dict()
12
+
13
+ seq2seq = models.Seq2Seq(
14
+ input_vocab_size=dataset_dict[input_lang]["vocab_size"],
15
+ output_vocab_size=dataset_dict[output_lang]["vocab_size"],
16
+ embedding_dim=256,
17
+ hidden_units=64)
18
+ seq2seq.build()
19
+ seq2seq.run(
20
+ encoder_input_data=dataset_dict[input_lang]["train"],
21
+ decoder_input_data=dataset_dict[output_lang]["train"],
22
+ val_encoder_input_data=dataset_dict[input_lang]["val"],
23
+ val_decoder_input_data=dataset_dict[output_lang]["val"],
24
+ )