Spaces:
Runtime error
Runtime error
saiful9379
committed on
Commit
•
f52cf42
1
Parent(s):
5cc215b
add module script
Browse files- app.py +117 -0
- config.py +12 -0
- model.py +84 -0
- requirements.txt +9 -0
- utils.py +25 -0
app.py
CHANGED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import argparse
|
3 |
+
import sentencepiece as spm
|
4 |
+
from utils import utils_cls
|
5 |
+
from model import BanglaTransformer
|
6 |
+
from config import config as cfg
|
7 |
+
torch.manual_seed(0)
|
8 |
+
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
9 |
+
device = torch.device('cpu')
|
10 |
+
uobj = utils_cls(device=device)
|
11 |
+
|
12 |
+
__MODULE__ = "Bangla Language Translation"
|
13 |
+
__MAIL__ = "saifulbrur79@gmail.com"
|
14 |
+
__MODIFICAIOTN__ = "28/03/2023"
|
15 |
+
__LICENSE__ = "MIT"
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
class Bn2EnTranslation:
    """Bangla-to-English translation pipeline.

    Loads SentencePiece tokenizers, pickled vocab objects, and a trained
    BanglaTransformer checkpoint, then greedy-decodes input sentences.
    Relies on module-level ``device`` and ``uobj`` (utils_cls instance).
    """

    def __init__(self):
        # Artifact paths. NOTE(review): config.MODEL_PATH points at
        # './model/model_checkpoint.pt' while this uses pytorch_model.pt —
        # confirm which checkpoint is the deployed one.
        self.bn_tokenizer = './model/bn_model.model'
        self.en_tokenizer = './model/en_model.model'
        self.bn_vocab = './model/bn_vocab.pkl'
        self.en_vocab = './model/en_vocab.pkl'
        self.model = './model/pytorch_model.pt'

    def read_data(self, data_path):
        """Read a tab-separated parallel corpus; return [[src, tgt], ...].

        Fix: explicit UTF-8 encoding so Bangla text decodes correctly
        regardless of the host locale; ``with`` guarantees the handle closes.
        """
        with open(data_path, "r", encoding="utf-8") as f:
            data = f.readlines()
        data = list(map(lambda x: [x.split("\t")[0], x.split("\t")[1].replace("\n", "")], data))
        return data

    def load_tokenizer(self, tokenizer_path: str = "") -> object:
        """Load and return a SentencePiece tokenizer from *tokenizer_path*."""
        _tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
        return _tokenizer

    def get_vocab(self, BN_VOCAL_PATH: str = "", EN_VOCAL_PATH: str = ""):
        """Load the pickled Bangla (source) and English (target) vocab objects."""
        bn_vocal, en_vocal = uobj.load_bn_vocal(BN_VOCAL_PATH), uobj.load_en_vocal(EN_VOCAL_PATH)
        return bn_vocal, en_vocal

    def load_model(self, model_path: str = "", SRC_VOCAB_SIZE: int = 0, TGT_VOCAB_SIZE: int = 0):
        """Build a BanglaTransformer, load checkpoint weights, return it in eval mode."""
        model = BanglaTransformer(
            cfg.NUM_ENCODER_LAYERS, cfg.NUM_DECODER_LAYERS, cfg.EMB_SIZE, SRC_VOCAB_SIZE,
            TGT_VOCAB_SIZE, cfg.FFN_HID_DIM, nhead=cfg.NHEAD)
        model.to(device)
        # Fix: map_location is required — a checkpoint saved on GPU would
        # otherwise fail to deserialize on this CPU-only device.
        checkpoint = torch.load(model_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()
        return model

    def greedy_decode(self, model, src, src_mask, max_len, start_symbol, eos_index):
        """Greedy autoregressive decoding.

        Encodes *src* once, then repeatedly appends the argmax next token
        until *eos_index* is produced or *max_len* tokens are emitted.
        Returns the (seq_len, 1) tensor of generated ids, BOS included.
        """
        src = src.to(device)
        src_mask = src_mask.to(device)
        # Hoisted out of the loop: the encoder memory is invariant per step.
        # (Also removed an unused memory_mask the original rebuilt each step.)
        memory = model.encode(src, src_mask).to(device)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
        for _ in range(max_len - 1):
            # Causal mask over the tokens generated so far.
            tgt_mask = (uobj.generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            prob = model.generator(out[:, -1])  # logits for the next position
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == eos_index:
                break
        return ys

    def get_bntoen_model(self):
        """Load tokenizer, vocabs and model once; return them bundled in a dict."""
        print("Tokenizer Loading ...... : ", end="", flush=True)
        bn_tokenizer = self.load_tokenizer(tokenizer_path=self.bn_tokenizer)
        print("Done")
        print("Vocab Loading ...... : ", end="", flush=True)
        bn_vocab, en_vocab = self.get_vocab(BN_VOCAL_PATH=self.bn_vocab, EN_VOCAL_PATH=self.en_vocab)
        print("Done")
        print("Model Loading ...... : ", end="", flush=True)
        model = self.load_model(model_path=self.model, SRC_VOCAB_SIZE=len(bn_vocab), TGT_VOCAB_SIZE=len(en_vocab))
        print("Done")

        models = {
            "bn_tokenizer": bn_tokenizer,
            "bn_vocab": bn_vocab,
            "en_vocab": en_vocab,
            "model": model
        }
        return models

    def translate(self, text, models):
        """Translate Bangla *text* to English using the loaded *models* bundle."""
        model = models["model"]
        src_vocab = models["bn_vocab"]
        tgt_vocab = models["en_vocab"]
        src_tokenizer = models["bn_tokenizer"]
        src = text
        PAD_IDX, BOS_IDX, EOS_IDX = src_vocab['<pad>'], src_vocab['<bos>'], src_vocab['<eos>']
        tokens = [BOS_IDX] + [src_vocab.get_stoi()[tok] for tok in src_tokenizer.encode(src, out_type=str)] + [EOS_IDX]
        num_tokens = len(tokens)
        src = torch.LongTensor(tokens).reshape(num_tokens, 1)
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
        tgt_tokens = self.greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, eos_index=EOS_IDX).flatten()
        p_text = " ".join([tgt_vocab.get_itos()[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
        # SentencePiece marks word boundaries with '▁': drop inner spaces,
        # then rejoin pieces on the boundary marker.
        pts = " ".join(p_text.replace(" ", "").split("▁"))
        return pts.strip()
|
103 |
+
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
if __name__ == "__main__":
    # Fix: the original unconditionally called torch.cuda.get_device_name(0),
    # which raises on CPU-only hosts even though inference runs on CPU.
    if torch.cuda.is_available():
        print(torch.cuda.get_device_name(0))
    text = "এই উপজেলায় ১টি সরকারি কলেজ রয়েছে"

    obj = Bn2EnTranslation()
    models = obj.get_bntoen_model()
    pre = obj.translate(text, models)
    print("=" * 20)
    print(f"input : {text}")
    print(f"prediction: {pre}")
|
117 |
+
|
config.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class config:
    """Central hyper-parameters and artifact paths for the bn→en transformer."""

    # --- model architecture ---
    EMB_SIZE = 512            # embedding / model dimension (d_model)
    NHEAD = 8                 # attention heads
    FFN_HID_DIM = 512         # feed-forward hidden size
    NUM_ENCODER_LAYERS = 6
    NUM_DECODER_LAYERS = 6

    # --- training ---
    NUM_EPOCHS = 300

    # --- artifact locations ---
    BN_TOKENIZER_PATH = "./model/bn_model.model"
    EN_TOKENIZER_PATH = "./model/en_model.model"
    BN_VOCAL_PATH = "./model/bn_vocab.pkl"
    EN_VOCAL_PATH = "./model/en_vocab.pkl"
    MODEL_PATH = "./model/model_checkpoint.pt"
|
model.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch import Tensor
|
5 |
+
from torch.nn import TransformerEncoder, TransformerDecoder, \
|
6 |
+
TransformerEncoderLayer, TransformerDecoderLayer
|
7 |
+
torch.manual_seed(0)
|
8 |
+
|
9 |
+
class PositionalEncoding(nn.Module):
    # Sinusoidal positional encoding (as in "Attention Is All You Need"):
    # a fixed table added to token embeddings, followed by dropout.
    # NOTE: buffer/submodule names ('pos_embedding', 'dropout') appear in the
    # checkpoint's state_dict keys — do not rename.
    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        # Inverse frequencies for the even dimensions: exp(-2i·ln(10000)/d).
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)  # even dims: sine
        pos_embedding[:, 1::2] = torch.cos(pos * den)  # odd dims: cosine
        # Insert a broadcast axis -> (maxlen, 1, emb_size); presumably inputs
        # are sequence-first (seq, batch, emb) — confirm against callers.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        # Buffer (not a Parameter): saved with the model but not trained.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        # Add the first seq_len rows of the table, then apply dropout.
        return self.dropout(token_embedding +
                            self.pos_embedding[:token_embedding.size(0), :])
|
25 |
+
|
26 |
+
class TokenEmbedding(nn.Module):
    """Map token ids to dense vectors, scaled by sqrt(d_model).

    Attribute names ('embedding', 'emb_size') match the original so
    checkpoint state_dict keys are unchanged.
    """

    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        # Cast to long for the lookup, then apply the transformer's
        # conventional sqrt(d_model) scaling.
        scale = math.sqrt(self.emb_size)
        embedded = self.embedding(tokens.long())
        return embedded * scale
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
class BanglaTransformer(nn.Module):
    # Encoder–decoder transformer for bn→en translation, assembled from
    # separate TransformerEncoder / TransformerDecoder stacks plus token and
    # positional embeddings.  NOTE: attribute names define the checkpoint's
    # state_dict keys — do not rename.
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward: int = 512, dropout: float = 0.1, nhead: int = 8):
        super(BanglaTransformer, self).__init__()
        encoder_layer = TransformerEncoderLayer(
            d_model=emb_size,
            nhead=nhead,
            dim_feedforward=dim_feedforward
        )
        self.transformer_encoder = TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers
        )
        decoder_layer = TransformerDecoderLayer(
            d_model=emb_size,
            nhead=nhead,
            dim_feedforward=dim_feedforward
        )
        self.transformer_decoder = TransformerDecoder(
            decoder_layer,
            num_layers=num_decoder_layers
        )

        # Final projection from model dimension to target-vocabulary logits.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        # Dropout inside positional encoding is the only dropout applied here;
        # the encoder/decoder layers use their own default dropout.
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        # Training-time path: embed + position-encode both sides, encode the
        # source, decode the target, project to vocab logits.
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        # Positional args to TransformerDecoder: (tgt, memory, tgt_mask,
        # memory_mask=None, tgt_key_padding_mask, memory_key_padding_mask).
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        # Inference helper: source embeddings -> encoder memory.
        return self.transformer_encoder(self.positional_encoding(
            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        # Inference helper: decode generated-so-far tokens against memory.
        # Returns decoder hidden states; caller applies self.generator.
        return self.transformer_decoder(self.positional_encoding(
            self.tgt_tok_emb(tgt)), memory,
            tgt_mask)
|
84 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy==1.19.5
|
2 |
+
sentencepiece==0.1.97
|
3 |
+
tokenizers==0.8.1rc1
|
4 |
+
torch==1.11.0
|
5 |
+
torchsummary==1.5.1
|
6 |
+
torchtext==0.12.0
|
7 |
+
torchvision==0.12.0
|
8 |
+
transformers==3.0.2
|
9 |
+
nltk==3.7
|
utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import torch
|
3 |
+
from config import config as cfg
|
4 |
+
|
5 |
+
class utils_cls:
    """Shared helpers: pickled-vocab loading and causal-mask generation."""

    def __init__(self, device):
        # torch.device on which masks are allocated.
        self.device = device

    def load_bn_vocal(self, bn_vocal_path):
        """Unpickle and return the Bangla vocab object.

        Fix: ``with`` guarantees the file handle is closed even if
        unpickling raises (the original leaked it on error).
        WARNING: pickle can execute arbitrary code — load trusted files only.
        """
        with open(bn_vocal_path, 'rb') as file:
            bn_vocal = pickle.load(file)
        return bn_vocal

    def load_en_vocal(self, en_vocal_path):
        """Unpickle and return the English vocab object (trusted files only)."""
        with open(en_vocal_path, 'rb') as file:
            en_vocal = pickle.load(file)
        return en_vocal

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) additive causal mask.

        Entries on/below the diagonal are 0.0; entries above are -inf, so a
        position cannot attend to later positions when added to attention
        scores.
        """
        mask = (torch.triu(torch.ones((sz, sz), device=self.device)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
|
25 |
+
|