NirajanBekoju committed on
Commit
464ed03
1 Parent(s): 296f6c4

deployment phase 1

__pycache__/config.cpython-38.pyc ADDED
Binary file (481 Bytes)

__pycache__/model.cpython-38.pyc ADDED
Binary file (2.88 kB)

__pycache__/utils.cpython-38.pyc ADDED
Binary file (965 Bytes)
app.py ADDED
@@ -0,0 +1,149 @@
+ import gradio as gr
+
+ import torch
+ from torch import Tensor, nn
+ from torch.utils.data import dataset
+
+ import os
+
+ from model import TransformerModel
+ from utils import getTokenizer
+ from config import getConfig
+
+
+ model_config, app_config = getConfig()
+ print(model_config)
+ print(app_config)
+
+ bptt = model_config["bptt"]
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(device)
+
+ softmax = nn.Softmax(dim=2)
+
+ tokenizer, vocab = getTokenizer()
+ ntokens = len(vocab)
+
+
+ def get_model(model_config, ntokens):
+     emsize = model_config["emsize"]
+     d_hid = model_config["d_hid"]
+     nlayers = model_config["nlayers"]
+     nhead = model_config["nhead"]
+     dropout = model_config["dropout"]
+     model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout)
+     return model
+
+
+ def loadModel(best_model_path):
+     global model
+     if os.path.exists(best_model_path):
+         print(f"Preloading model {best_model_path}")
+         if torch.cuda.is_available():
+             state = torch.load(best_model_path)
+         else:
+             state = torch.load(best_model_path, map_location=torch.device('cpu'))
+         model.load_state_dict(state['model_state_dict'])
+         return model
+     else:
+         raise Exception("Model Not Found")
+
+
+ def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
+     """Converts raw text into a flat Tensor."""
+     # obtain the data in tensor format for each line
+     data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
+             for item in raw_text_iter]
+     # concatenate all the lines
+     return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
+
+
+ def batchify(data: Tensor, batch_size: int) -> Tensor:
+     """Divides the data into batch_size separate sequences, removing extra
+     elements that wouldn't cleanly fit.
+     Args:
+         data: Tensor, shape [N]
+         batch_size: int, batch size
+     Returns:
+         Tensor of shape [N // batch_size, batch_size]
+     """
+     seq_len = data.size(0) // batch_size
+     data = data[:seq_len * batch_size]
+     data = data.view(batch_size, seq_len).t().contiguous()
+     return data.to(device)
+
+
+ def generate_square_subsequent_mask(sz: int) -> Tensor:
+     """Generates an upper-triangular matrix of -inf, with zeros on diag."""
+     return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
+
+
+ def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words=5, k=50):
+     model.eval()
+     src_mask = generate_square_subsequent_mask(bptt).to(device)
+     pred_text = []
+     for i in range(no_words):
+         batch_size = gen_data.size(0)
+         # crop the causal mask to the current sequence length; reuse the
+         # full mask when the sequence is exactly bptt tokens long
+         if batch_size != bptt:
+             src_mask_ = src_mask[:batch_size, :batch_size]
+         else:
+             src_mask_ = src_mask
+
+         # next-word logits at each position; softmax is applied below
+         output_softmax = model(gen_data, src_mask_)
+         output_softmax_permuted = output_softmax.permute(1, 0, 2)
+
+         # obtain the indices of the "k" most probable words
+         # both indices and values are of size (no. of words, k=50)
+         indices = torch.topk(output_softmax_permuted, k, dim=2).indices.squeeze(0)
+         # obtain the probabilities of the top "k" words, renormalized over k
+         values = torch.topk(softmax(output_softmax_permuted), k, dim=2).values
+         values = values / torch.sum(values, dim=2, keepdim=True)
+         values = values.squeeze(0)
+
+         # build a categorical distribution over the k=50 candidates and draw
+         # one sample per position; resample (up to 10 times) on <unk>
+         for _ in range(10):
+             ind_sampled = torch.distributions.Categorical(values).sample()
+             next_index = indices[-1][ind_sampled[-1]]
+             # if the obtained token is not <unk>, there is no need to sample again
+             if vocab.lookup_token(next_index) != '<unk>':
+                 break
+
+         pred_text.append(vocab.lookup_token(next_index))
+         if batch_size < 15:
+             # context still short: grow it by appending the new token
+             gen_data = torch.cat((gen_data, next_index.unsqueeze(0).unsqueeze(0)), 0)
+         else:
+             # context full: slide the window by dropping the oldest token
+             gen_data = torch.cat((gen_data[1:, :], next_index.unsqueeze(0).unsqueeze(0)), 0)
+
+     return pred_text
+
+
+ def predText(text: str, num_words: int):
+     text = [text]
+     num_words = int(num_words)
+     sample_data = data_process(text)
+     sample_data = batchify(sample_data, 1)
+     pred_text = nonnaive_generator(loaded_model, sample_data[:, -1].unsqueeze(1), no_words=num_words, k=50)
+     whole_text = text[0] + ' ' + ' '.join(pred_text)
+     return whole_text
+
+
+ if __name__ == '__main__':
+     model = get_model(model_config, ntokens).to(device)
+     best_model_path = 'models/best_model.pt'
+     loaded_model = loadModel(best_model_path)
+
+     with gr.Blocks() as demo:
+         input_text_box = gr.Textbox(label="Text", value="म घर", lines=5)
+         input_num_words = gr.Number(label="Number of words to generate", value=5)
+
+         btn = gr.Button(value="Submit")
+
+         btn.click(predText, inputs=[input_text_box, input_num_words], outputs=[input_text_box])
+
+     demo.launch()
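
The sampling step inside nonnaive_generator boils down to ordinary top-k sampling. A minimal standalone sketch of that idea, using dummy logits (all names here are illustrative, not part of the commit):

    import torch

    logits = torch.randn(1, 1, 1000)                    # [batch, seq_len, vocab] dummy model output
    values, indices = torch.topk(logits, k=50, dim=-1)  # keep the 50 most likely tokens
    probs = torch.softmax(values, dim=-1)               # renormalize over the top-k only
    sample = torch.distributions.Categorical(probs).sample()
    next_token = indices[0, 0, sample[0, 0]]            # vocabulary index of the sampled token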
config.py ADDED
@@ -0,0 +1,16 @@
+ def getConfig():
+     model_config = {
+         "emsize": 300,
+         "d_hid": 1024,
+         "nlayers": 6,
+         "nhead": 6,
+         "dropout": 0.2,
+         "bptt": 64
+     }
+
+     app_config = {
+         "logs": "tensorboard_logs",
+         "epochs": 25,
+     }
+
+     return model_config, app_config
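
One constraint worth noting: emsize must be divisible by nhead, because nn.MultiheadAttention splits the embedding dimension across heads (here 300 / 6 = 50 dimensions per head). A quick sanity check, as a sketch:

    from config import getConfig

    model_config, app_config = getConfig()
    assert model_config["emsize"] % model_config["nhead"] == 0  # 300 % 6 == 0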
model.py ADDED
@@ -0,0 +1,64 @@
+ import torch
+ from torch import Tensor, nn
+ from torch.nn import TransformerEncoder, TransformerEncoderLayer
+
+ import math
+
+ class PositionalEncoding(nn.Module):
+
+     def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
+         super().__init__()
+         self.dropout = nn.Dropout(p=dropout)
+
+         position = torch.arange(max_len).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2)
+                              * (-math.log(10000.0) / d_model))
+         pe = torch.zeros(max_len, 1, d_model)
+         pe[:, 0, 0::2] = torch.sin(position * div_term)
+         pe[:, 0, 1::2] = torch.cos(position * div_term)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x: Tensor) -> Tensor:
+         """
+         Args:
+             x: Tensor, shape [seq_len, batch_size, embedding_dim]
+         """
+         x = x + self.pe[:x.size(0)]
+         return self.dropout(x)
+
+ class TransformerModel(nn.Module):
+
+     def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
+                  nlayers: int, dropout: float = 0.5):
+         super().__init__()
+         self.model_type = 'Transformer'
+         self.pos_encoder = PositionalEncoding(d_model, dropout)
+         encoder_layers = TransformerEncoderLayer(
+             d_model, nhead, d_hid, dropout)
+         self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
+         self.embedding = nn.Embedding(ntoken, d_model)
+         self.d_model = d_model
+         self.decoder = nn.Linear(d_model, ntoken)
+
+         self.init_weights()
+
+     def init_weights(self) -> None:
+         initrange = 0.1
+         self.embedding.weight.data.uniform_(-initrange, initrange)
+         self.decoder.bias.data.zero_()
+         self.decoder.weight.data.uniform_(-initrange, initrange)
+
+     def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
+         """
+         Args:
+             src: Tensor, shape [seq_len, batch_size]
+             src_mask: Tensor, shape [seq_len, seq_len]
+         Returns:
+             output Tensor of shape [seq_len, batch_size, ntoken]
+         """
+         src = src.long()
+         src = self.embedding(src) * math.sqrt(self.d_model)
+         src = self.pos_encoder(src)
+         output = self.transformer_encoder(src, src_mask)
+         output = self.decoder(output)
+         return output
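
A minimal smoke test of the model's input/output contract, using a dummy vocabulary size together with the hyperparameters from config.py (illustrative only, not part of the commit):

    import torch
    from model import TransformerModel

    model = TransformerModel(ntoken=1000, d_model=300, nhead=6, d_hid=1024, nlayers=6, dropout=0.2)
    src = torch.randint(0, 1000, (64, 1))                              # [seq_len, batch_size]
    mask = torch.triu(torch.ones(64, 64) * float('-inf'), diagonal=1)  # causal mask
    out = model(src, mask)
    print(out.shape)  # torch.Size([64, 1, 1000]) -> [seq_len, batch_size, ntoken]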
models/best_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f56a4727ca6141d21b9165a906445367b3714d5f53d519e86b963ba703747b7
+ size 174967426
requirements.txt ADDED
@@ -0,0 +1,149 @@
+ absl-py==2.0.0
+ aiofiles==23.2.1
+ altair==5.1.2
+ annotated-types==0.6.0
+ anyio==3.7.1
+ arpa==0.1.0b4
+ asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
+ attrs==23.1.0
+ backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
+ cachetools==5.3.2
+ certifi==2022.12.7
+ charset-normalizer==2.1.1
+ click==8.1.7
+ colorama==0.4.6
+ comm @ file:///croot/comm_1671231121260/work
+ contourpy==1.1.1
+ cycler==0.12.1
+ debugpy @ file:///croot/debugpy_1690905042057/work
+ decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
+ dill==0.3.7
+ einops==0.7.0
+ exceptiongroup==1.2.0
+ executing @ file:///opt/conda/conda-bld/executing_1646925071911/work
+ fastapi==0.104.1
+ ffmpy==0.3.1
+ filelock==3.9.0
+ fonttools==4.43.1
+ fsspec==2023.4.0
+ google-auth==2.23.4
+ google-auth-oauthlib==1.0.0
+ gradio==4.7.1
+ gradio_client==0.7.0
+ grpcio==1.59.2
+ h11==0.14.0
+ httpcore==1.0.2
+ httpx==0.25.1
+ huggingface-hub==0.17.3
+ idna==3.4
+ imageio==2.32.0
+ importlib-metadata @ file:///croot/importlib-metadata_1678997070253/work
+ importlib-resources==6.1.0
+ ipykernel @ file:///croot/ipykernel_1691121631942/work
+ ipython @ file:///croot/ipython_1691532092695/work
+ jedi @ file:///tmp/build/80754af9/jedi_1644315233700/work
+ Jinja2==3.1.2
+ jsonschema==4.20.0
+ jsonschema-specifications==2023.11.1
+ jupyter_client @ file:///croot/jupyter_client_1680171862562/work
+ jupyter_core @ file:///croot/jupyter_core_1679906564508/work
+ kiwisolver==1.4.5
+ lazy_loader==0.3
+ lightning-utilities==0.9.0
+ lxml==4.9.3
+ Markdown==3.5.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.2
+ matplotlib==3.7.3
+ matplotlib-inline @ file:///opt/conda/conda-bld/matplotlib-inline_1662014470464/work
+ mdurl==0.1.2
+ mpmath==1.3.0
+ nest-asyncio @ file:///croot/nest-asyncio_1672387112409/work
+ networkx==3.0
+ numpy==1.24.1
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.18.1
+ nvidia-nvjitlink-cu12==12.3.101
+ nvidia-nvtx-cu12==12.1.105
+ oauthlib==3.2.2
+ opencv-python==4.8.1.78
+ orjson==3.9.10
+ packaging @ file:///croot/packaging_1693575174725/work
+ pandarallel==1.6.5
+ pandas==2.0.3
+ parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
+ pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
+ phoenix-datasets @ git+https://github.com/enhuiz/phoenix-datasets@570481bf03a46555ca219f79ace1a2cfab149f8c
+ pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
+ Pillow==9.3.0
+ pkgutil_resolve_name==1.3.10
+ platformdirs @ file:///croot/platformdirs_1692205439124/work
+ prompt-toolkit @ file:///croot/prompt-toolkit_1672387306916/work
+ protobuf==4.24.4
+ psutil @ file:///opt/conda/conda-bld/psutil_1656431268089/work
+ ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
+ pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
+ pyasn1==0.5.0
+ pyasn1-modules==0.3.0
+ pydantic==2.5.2
+ pydantic_core==2.14.5
+ pydub==0.25.1
+ Pygments @ file:///croot/pygments_1684279966437/work
+ pyparsing==3.1.1
+ python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
+ python-multipart==0.0.6
+ pytz==2023.3.post1
+ PyWavelets==1.4.1
+ PyYAML==6.0.1
+ pyzmq @ file:///croot/pyzmq_1686601365461/work
+ referencing==0.31.0
+ regex==2023.10.3
+ requests==2.28.1
+ requests-oauthlib==1.3.1
+ rich==13.7.0
+ rpds-py==0.13.1
+ rsa==4.9
+ scikit-image==0.21.0
+ scipy==1.10.1
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six @ file:///tmp/build/80754af9/six_1644875935023/work
+ sniffio==1.3.0
+ stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work
+ starlette==0.27.0
+ sympy==1.12
+ tensorboard==2.14.0
+ tensorboard-data-server==0.7.2
+ tifffile==2023.7.10
+ tokenizers==0.14.1
+ tomlkit==0.12.0
+ toolz==0.12.0
+ torch==2.1.1
+ torchaudio==2.1.0+cpu
+ torchdata==0.7.1
+ torchmetrics==1.2.0
+ torchtext==0.16.1
+ torchvision==0.16.0+cpu
+ tornado @ file:///croot/tornado_1696936946304/work
+ tqdm==4.66.1
+ traitlets @ file:///croot/traitlets_1671143879854/work
+ triton==2.1.0
+ typer==0.9.0
+ typing_extensions @ file:///croot/typing_extensions_1690297465030/work
+ tzdata==2023.3
+ urllib3==1.26.13
+ uvicorn==0.24.0.post1
+ vidaug @ git+https://github.com/okankop/vidaug@1c1ddf2640fe4a9171267d64ae5e3bd70c24d54a
+ wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
+ websockets==11.0.3
+ Werkzeug==3.0.1
+ xmltodict==0.13.0
+ zipp @ file:///croot/zipp_1672387121353/work
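
Two portability caveats with this file: the +cpu builds of torchaudio and torchvision live on the PyTorch wheel index rather than PyPI, and the @ file:///... entries point at local conda-build paths that will not resolve on other machines. A hedged install sketch (the exact index URL depends on the build you want):

    pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu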
tokenizer/tokenizer.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a83f8fd1f82b91f8168b359623de56642349fe2282363245dca7c18844e37485
+ size 872
tokenizer/vocab.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2bac2e6c0d80acfe2978b5d4ab014d964389c1120b86493765be2876d69748a
+ size 1510445
utils.py ADDED
@@ -0,0 +1,28 @@
+ import regex as re
+ import torch
+ import pickle
+
+ def preProcessText(text):
+     # put space around the devanagari danda (।) to make it a separate word
+     text = re.sub(r'\s*[\u0964]\s*', r'\u0020\u0964\u0020', text)
+     # put space around the question mark (?) to make it a separate word
+     text = re.sub(r'\s*[\u003f]\s*', r'\u0020\u003f\u0020', text)
+     # put space around the comma (,)
+     text = re.sub(r'\s*[\u002c]\s*', r'\u0020\u002c\u0020', text)
+     # remove space around the newline character
+     text = re.sub(r'\s*\n\s*', '\n', text)
+     # replace any non-devanagari character with a blank
+     text = re.sub(r'[^\u0900-\u097F,?\s+]', '', text)
+     # add space around devanagari numbers and replace each run with the <num> token
+     text = re.sub(r'\s*[\u0966-\u0976]+\s*', '\u0020<num>\u0020', text)
+     return text
+
+ def getTokenizer():
+     tokenizer_dir = "tokenizer"
+     tokenizer_path = tokenizer_dir + "/tokenizer.pth"
+     vocab_path = tokenizer_dir + "/vocab.pkl"
+     loaded_tokenizer = torch.load(tokenizer_path)
+     with open(vocab_path, 'rb') as file:
+         loaded_vocab = pickle.load(file)
+
+     return loaded_tokenizer, loaded_vocab
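
For illustration, preProcessText isolates punctuation and collapses runs of Devanagari digits into a <num> token; roughly (input invented, spacing approximate):

    from utils import preProcessText

    print(preProcessText("म घर गएँ। २० जना थिए?"))
    # -> 'म घर गएँ । <num> जना थिए ? '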