saiful9379 committed on
Commit
f52cf42
1 Parent(s): 5cc215b

add module script

Browse files
Files changed (5) hide show
  1. app.py +117 -0
  2. config.py +12 -0
  3. model.py +84 -0
  4. requirements.txt +9 -0
  5. utils.py +25 -0
app.py CHANGED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
import argparse
import sentencepiece as spm
from utils import utils_cls
from model import BanglaTransformer
from config import config as cfg

# Fixed RNG seed so decoding behaviour is reproducible across runs.
torch.manual_seed(0)
# Inference is deliberately forced onto the CPU; the GPU selection line is
# kept for reference only.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
# Shared helper instance (vocab unpickling, subsequent-mask generation).
uobj = utils_cls(device=device)

# Module metadata.
__MODULE__ = "Bangla Language Translation"
__MAIL__ = "saifulbrur79@gmail.com"
__MODIFICAIOTN__ = "28/03/2023"  # NOTE(review): name misspells 'MODIFICATION'
__LICENSE__ = "MIT"
class Bn2EnTranslation:
    """Bangla -> English translation pipeline.

    Bundles tokenizer/vocab/model loading and greedy autoregressive decoding.
    Relies on the module-level `device`, `uobj`, `cfg`, `spm` and
    `BanglaTransformer` names.
    """

    def __init__(self):
        # Artifact paths. NOTE(review): config.py declares the same paths but
        # a different MODEL_PATH ('model_checkpoint.pt') -- confirm which
        # checkpoint name is current and consider reading these from cfg.
        self.bn_tokenizer = './model/bn_model.model'
        self.en_tokenizer = './model/en_model.model'
        self.bn_vocab = './model/bn_vocab.pkl'
        self.en_vocab = './model/en_vocab.pkl'
        self.model = './model/pytorch_model.pt'

    def read_data(self, data_path):
        """Read a tab-separated file of "<src>\\t<tgt>" lines.

        Returns a list of [src, tgt] pairs with trailing newlines stripped.
        encoding='utf-8' is explicit so Bangla text loads correctly regardless
        of the platform's default encoding.
        """
        with open(data_path, "r", encoding="utf-8") as f:
            data = f.readlines()
        data = list(map(lambda x: [x.split("\t")[0], x.split("\t")[1].replace("\n", "")], data))
        return data

    def load_tokenizer(self, tokenizer_path: str = "") -> object:
        """Load a SentencePiece tokenizer model from *tokenizer_path*."""
        _tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
        return _tokenizer

    def get_vocab(self, BN_VOCAL_PATH: str = "", EN_VOCAL_PATH: str = ""):
        """Load the pickled source (bn) and target (en) vocabulary objects."""
        bn_vocal, en_vocal = uobj.load_bn_vocal(BN_VOCAL_PATH), uobj.load_en_vocal(EN_VOCAL_PATH)
        return bn_vocal, en_vocal

    def load_model(self, model_path: str = "", SRC_VOCAB_SIZE: int = 0, TGT_VOCAB_SIZE: int = 0):
        """Build the transformer and restore weights from *model_path*.

        Returns the model in eval() mode on the module-level device.
        """
        model = BanglaTransformer(
            cfg.NUM_ENCODER_LAYERS, cfg.NUM_DECODER_LAYERS, cfg.EMB_SIZE, SRC_VOCAB_SIZE,
            TGT_VOCAB_SIZE, cfg.FFN_HID_DIM, nhead=cfg.NHEAD)
        model.to(device)
        # Bug fix: map_location is required to load a CUDA-trained checkpoint
        # on the CPU-only device configured at module level.
        checkpoint = torch.load(model_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()
        return model

    def greedy_decode(self, model, src, src_mask, max_len, start_symbol, eos_index):
        """Greedily decode up to *max_len* tokens; returns the (seq, 1) id tensor.

        Stops early when *eos_index* is produced.
        """
        src = src.to(device)
        src_mask = src_mask.to(device)
        # Encoder output is loop-invariant: compute and place it once.
        # (Removed a dead per-iteration `memory_mask` computation.)
        memory = model.encode(src, src_mask).to(device)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
        for _ in range(max_len - 1):
            # Causal mask so position i only attends to positions <= i.
            tgt_mask = (uobj.generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            # Append along the sequence (first) dimension.
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == eos_index:
                break
        return ys

    def get_bntoen_model(self):
        """Load tokenizer, vocabularies and model weights; return them in a dict."""
        print("Tokenizer Loading ...... : ", end="", flush=True)
        bn_tokenizer = self.load_tokenizer(tokenizer_path=self.bn_tokenizer)
        print("Done")
        print("Vocab Loading ...... : ", end="", flush=True)
        bn_vocab, en_vocab = self.get_vocab(BN_VOCAL_PATH=self.bn_vocab, EN_VOCAL_PATH=self.en_vocab)
        print("Done")
        print("Model Loading ...... : ", end="", flush=True)
        model = self.load_model(model_path=self.model, SRC_VOCAB_SIZE=len(bn_vocab), TGT_VOCAB_SIZE=len(en_vocab))
        print("Done")

        models = {
            "bn_tokenizer": bn_tokenizer,
            "bn_vocab": bn_vocab,
            "en_vocab": en_vocab,
            "model": model
        }
        return models

    def translate(self, text, models):
        """Translate one Bangla sentence to English using the loaded *models* dict."""
        model = models["model"]
        src_vocab = models["bn_vocab"]
        tgt_vocab = models["en_vocab"]
        src_tokenizer = models["bn_tokenizer"]
        src = text
        PAD_IDX, BOS_IDX, EOS_IDX = src_vocab['<pad>'], src_vocab['<bos>'], src_vocab['<eos>']
        tokens = [BOS_IDX] + [src_vocab.get_stoi()[tok] for tok in src_tokenizer.encode(src, out_type=str)] + [EOS_IDX]
        num_tokens = len(tokens)
        src = torch.LongTensor(tokens).reshape(num_tokens, 1)
        src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
        tgt_tokens = self.greedy_decode(
            model, src, src_mask, max_len=num_tokens + 5,
            start_symbol=BOS_IDX, eos_index=EOS_IDX).flatten()
        p_text = " ".join([tgt_vocab.get_itos()[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
        # SentencePiece marks word starts with U+2581; rebuild surface words.
        pts = " ".join(p_text.replace(" ", "").split("▁"))
        return pts.strip()
103
+
104
+
105
+
106
+
if __name__ == "__main__":
    # Bug fix: torch.cuda.get_device_name(0) raises on CPU-only machines even
    # though this pipeline is forced onto the CPU -- only report a GPU when
    # one is actually present.
    if torch.cuda.is_available():
        print(torch.cuda.get_device_name(0))
    # Sample Bangla sentence used as a smoke test for the full pipeline.
    text = "এই উপজেলায় ১টি সরকারি কলেজ রয়েছে"

    obj = Bn2EnTranslation()
    models = obj.get_bntoen_model()
    pre = obj.translate(text, models)
    print("=" * 20)
    print(f"input : {text}")
    print(f"prediction: {pre}")
config.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
class config:
    """Central hyper-parameters and artifact paths for the bn->en transformer."""
    # --- transformer architecture ---
    EMB_SIZE = 512            # embedding / model width (d_model)
    NHEAD = 8                 # attention heads per layer
    FFN_HID_DIM = 512         # feed-forward hidden size
    NUM_ENCODER_LAYERS = 6
    NUM_DECODER_LAYERS = 6
    # --- training ---
    NUM_EPOCHS = 300
    # --- artifact locations (SentencePiece models, pickled vocabs, weights) ---
    BN_TOKENIZER_PATH = "./model/bn_model.model"
    EN_TOKENIZER_PATH = "./model/en_model.model"
    BN_VOCAL_PATH = "./model/bn_vocab.pkl"
    EN_VOCAL_PATH = "./model/en_vocab.pkl"
    # NOTE(review): app.py loads './model/pytorch_model.pt' instead of this
    # path -- confirm which checkpoint filename is current.
    MODEL_PATH = "./model/model_checkpoint.pt"
model.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import math
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import TransformerEncoder, TransformerDecoder, \
    TransformerEncoderLayer, TransformerDecoderLayer

# Fixed RNG seed so weight initialization is reproducible across runs.
torch.manual_seed(0)
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    Expects sequence-first input -- position is read from dim 0, and the
    cached table is shaped (maxlen, 1, emb_size) to broadcast over the
    batch dimension.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        # Inverse frequencies for the even channels: 10000^(-2i/emb_size).
        inv_freq = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        angles = positions * inv_freq
        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer (not a parameter): follows .to(device) and
        # state_dict, excluded from gradient updates.
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Return dropout(token_embedding + positional table prefix)."""
        seq_len = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_embedding[:seq_len, :])
25
+
class TokenEmbedding(nn.Module):
    """Token-id -> dense embedding lookup, scaled by sqrt(emb_size)."""

    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Embed *tokens* (cast to long) and apply the sqrt(d) scale factor."""
        scale = math.sqrt(self.emb_size)
        return self.embedding(tokens.long()) * scale
33
+
34
+
35
+
36
+
class BanglaTransformer(nn.Module):
    """Encoder-decoder transformer for Bangla -> English translation.

    Uses the sequence-first layout (seq_len, batch) -- consistent with
    PositionalEncoding indexing dim 0 -- with sinusoidal positions and
    sqrt(d)-scaled token embeddings on both sides.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward: int = 512, dropout: float = 0.1, nhead: int = 8):
        super(BanglaTransformer, self).__init__()
        encoder_layer = TransformerEncoderLayer(
            d_model=emb_size,
            nhead=nhead,
            dim_feedforward=dim_feedforward
        )
        self.transformer_encoder = TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers
        )
        decoder_layer = TransformerDecoderLayer(
            d_model=emb_size,
            nhead=nhead,
            dim_feedforward=dim_feedforward
        )
        self.transformer_decoder = TransformerDecoder(
            decoder_layer,
            num_layers=num_decoder_layers
        )

        # Projects decoder output to target-vocabulary logits.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Full encode+decode pass; returns (tgt_len, batch, tgt_vocab) logits."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        # Positional args map to (src, mask, src_key_padding_mask).
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        # Positional args map to (tgt, memory, tgt_mask, memory_mask=None,
        # tgt_key_padding_mask, memory_key_padding_mask).
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Encoder-only pass used at inference time (no padding mask)."""
        return self.transformer_encoder(self.positional_encoding(
            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Decoder-only pass for one greedy-decoding step; returns hidden
        states -- callers apply self.generator for logits."""
        return self.transformer_decoder(self.positional_encoding(
            self.tgt_tok_emb(tgt)), memory,
            tgt_mask)
84
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ numpy==1.19.5
2
+ sentencepiece==0.1.97
3
+ tokenizers==0.8.1rc1
4
+ torch==1.11.0+cu113
5
+ torchsummary==1.5.1
6
+ torchtext==0.12.0
7
+ torchvision==0.12.0+cu113
8
+ transformers==3.0.2
9
+ nltk==3.7
utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import torch
3
+ from config import config as cfg
4
+
class utils_cls:
    """Helper bundle: pickled-vocab loading and causal mask generation."""

    def __init__(self, device):
        # Device on which generated masks are allocated.
        self.device = device

    def _load_pickle(self, path):
        # Shared loader; `with` guarantees the file handle is closed even if
        # unpickling raises (the originals leaked the handle on error).
        # NOTE(review): pickle.load is unsafe on untrusted input -- these are
        # expected to be locally shipped model artifacts.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def load_bn_vocal(self, bn_vocal_path):
        """Load the pickled Bangla (source) vocabulary object."""
        return self._load_pickle(bn_vocal_path)

    def load_en_vocal(self, en_vocal_path):
        """Load the pickled English (target) vocabulary object."""
        return self._load_pickle(en_vocal_path)

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0.0 on/below the diagonal and -inf
        above it, so decoder position i cannot attend to positions > i."""
        mask = (torch.triu(torch.ones((sz, sz), device=self.device)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
25
+