MasumBhuiyan committed
Commit e13f31a
Parent(s): dcd73d9

Updated Data Processing module. Seq2Seq model added (unverified)

Files changed:
- src/pipes/data.py +1 -2
- src/pipes/models.py +71 -0
- src/seq2seqtrainer.py +24 -0
src/pipes/data.py CHANGED
@@ -1,6 +1,5 @@
 import random
-import utils
-import const
+from pipes import utils, const


 class Sentence:
src/pipes/models.py ADDED
@@ -0,0 +1,71 @@
+import tensorflow as tf
+
+
+class Seq2Seq:
+    def __init__(self,
+                 input_vocab_size,
+                 output_vocab_size,
+                 embedding_dim,
+                 hidden_units):
+        self.epochs = 10
+        self.batch_size = 64
+        self.metrics = ['accuracy']
+        self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
+        self.optimizer = tf.keras.optimizers.Adam()
+        self.model = None
+        self.embedding_dim = embedding_dim
+        self.hidden_units = hidden_units
+        self.input_vocab_size = input_vocab_size
+        self.output_vocab_size = output_vocab_size
+        self.encoder_embedding = tf.keras.layers.Embedding(self.input_vocab_size, self.embedding_dim)
+        self.encoder = tf.keras.layers.LSTM(self.hidden_units, return_sequences=True, return_state=True)
+        self.decoder_embedding = tf.keras.layers.Embedding(self.output_vocab_size, self.embedding_dim)
+        self.decoder = tf.keras.layers.LSTM(self.hidden_units, return_sequences=True, return_state=True)
+        self.output_layer = tf.keras.layers.Dense(self.output_vocab_size, activation='softmax')
+
+    def build(self):
+        encoder_inputs = tf.keras.Input(shape=(None,))
+        encoder_embedding = self.encoder_embedding(encoder_inputs)
+        encoder_outputs, state_h, state_c = self.encoder(encoder_embedding)
+        encoder_states = [state_h, state_c]
+        decoder_inputs = tf.keras.Input(shape=(None,))
+        decoder_embedding = self.decoder_embedding(decoder_inputs)
+        decoder_outputs, _, _ = self.decoder(decoder_embedding, initial_state=encoder_states)
+        outputs = self.output_layer(decoder_outputs)
+        self.model = tf.keras.Model([encoder_inputs, decoder_inputs], outputs)
+
+    def run(self, encoder_input_data, decoder_input_data, val_encoder_input_data, val_decoder_input_data):
+        self.model.compile(
+            optimizer=self.optimizer,
+            loss=self.loss,
+            metrics=self.metrics
+        )
+
+        decoder_target_data = [sentence[1:] + [0] for sentence in decoder_input_data]
+        val_decoder_target_data = [sentence[1:] + [0] for sentence in val_decoder_input_data]
+
+        self.model.fit(
+            [encoder_input_data, decoder_input_data],
+            decoder_target_data,
+            batch_size=self.batch_size,
+            epochs=self.epochs,
+            validation_data=([val_encoder_input_data, val_decoder_input_data], val_decoder_target_data)
+        )
+
+    def get(self):
+        return self.model
+
+    def set_epochs(self, epochs):
+        self.epochs = epochs
+
+    def set_batch_size(self, batch_size):
+        self.batch_size = batch_size
+
+    def set_loss(self, loss):
+        self.loss = loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+    def set_metric(self, metrics):
+        self.metrics = metrics
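Note: the commit message marks the model as unverified. A quick sanity check of build() is a shape test on dummy token ids. This is a minimal sketch, assuming TensorFlow 2.x is installed and the pipes package is importable; the vocabulary sizes and sequence lengths below are arbitrary:

    import numpy as np
    from pipes.models import Seq2Seq

    # Tiny dummy configuration; sizes are arbitrary for the shape check.
    seq2seq = Seq2Seq(input_vocab_size=50, output_vocab_size=60,
                      embedding_dim=8, hidden_units=16)
    seq2seq.build()
    model = seq2seq.get()

    # A batch of 2 sequences of token ids: encoder length 7, decoder length 5.
    enc = np.random.randint(0, 50, size=(2, 7))
    dec = np.random.randint(0, 60, size=(2, 5))

    # Expect one softmax over the output vocabulary per decoder time step.
    out = model([enc, dec])
    assert out.shape == (2, 5, 60)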
src/seq2seqtrainer.py ADDED
@@ -0,0 +1,24 @@
+from pipes import models, utils, const
+from pipes.data import Dataset
+
+if __name__ == "__main__":
+    input_lang = 'gr'
+    output_lang = 'bn'
+
+    dataset_object = Dataset([input_lang, output_lang])
+    dataset_object.pack()
+    dataset_object.process()
+    dataset_dict = dataset_object.get_dict()
+
+    seq2seq = models.Seq2Seq(
+        input_vocab_size=dataset_dict[input_lang]["vocab_size"],
+        output_vocab_size=dataset_dict[output_lang]["vocab_size"],
+        embedding_dim=256,
+        hidden_units=64)
+    seq2seq.build()
+    seq2seq.run(
+        encoder_input_data=dataset_dict[input_lang]["train"],
+        decoder_input_data=dataset_dict[output_lang]["train"],
+        val_encoder_input_data=dataset_dict[input_lang]["val"],
+        val_decoder_input_data=dataset_dict[output_lang]["val"],
+    )
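For reference, the target shift inside Seq2Seq.run implements teacher forcing: at each step the decoder consumes the current token and is trained to predict the following one, with 0 assumed to be the padding id. A tiny worked example with hypothetical token ids:

    # One tokenized decoder sentence, e.g. [<sos>, w1, w2, <eos>]:
    decoder_input = [2, 15, 7, 3]
    decoder_target = decoder_input[1:] + [0]   # -> [15, 7, 3, 0]
    # At step t the model sees decoder_input[t] and must predict decoder_target[t].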