NirajanBekoju
committed on
Commit
•
464ed03
1
Parent(s):
296f6c4
deployment phase 1
Browse files- __pycache__/config.cpython-38.pyc +0 -0
- __pycache__/model.cpython-38.pyc +0 -0
- __pycache__/utils.cpython-38.pyc +0 -0
- app.py +149 -0
- config.py +16 -0
- model.py +64 -0
- models/best_model.pt +3 -0
- requirements.txt +149 -0
- tokenizer/tokenizer.pth +3 -0
- tokenizer/vocab.pkl +3 -0
- utils.py +28 -0
__pycache__/config.cpython-38.pyc
ADDED
Binary file (481 Bytes). View file
|
|
__pycache__/model.cpython-38.pyc
ADDED
Binary file (2.88 kB). View file
|
|
__pycache__/utils.cpython-38.pyc
ADDED
Binary file (965 Bytes). View file
|
|
app.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import Tensor, nn
|
5 |
+
from torch.nn import TransformerEncoder, TransformerEncoderLayer
|
6 |
+
from torchtext.vocab import build_vocab_from_iterator
|
7 |
+
from torch.utils.data import dataset
|
8 |
+
from torch.utils.tensorboard import SummaryWriter
|
9 |
+
|
10 |
+
import regex as re
|
11 |
+
import os
|
12 |
+
import time
|
13 |
+
from tqdm import tqdm
|
14 |
+
import copy
|
15 |
+
import math
|
16 |
+
|
17 |
+
from model import TransformerModel
|
18 |
+
from utils import preProcessText, getTokenizer
|
19 |
+
from config import getConfig
|
20 |
+
|
21 |
+
|
22 |
+
# Hyperparameters and app settings are read once at import time.
model_config, app_config = getConfig()
for _cfg in (model_config, app_config):
    print(_cfg)

# Sequence length taken from the model configuration.
bptt = model_config["bptt"]

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Softmax over the last axis of a 3-D tensor.
softmax = nn.Softmax(dim=2)

# Tokenizer and vocabulary are loaded from disk by the helper in utils.
tokenizer, vocab = getTokenizer()
ntokens = len(vocab)
|
35 |
+
|
36 |
+
|
37 |
+
def get_model(model_config, ntokens):
    """Build a fresh TransformerModel from a hyperparameter mapping.

    Args:
        model_config: mapping providing "emsize", "d_hid", "nlayers",
            "nhead" and "dropout".
        ntokens: vocabulary size passed to the model.

    Returns:
        A newly constructed TransformerModel (no weights are loaded here).
    """
    # Look the keys up in a fixed order, then hand them over positionally.
    wanted = ("emsize", "d_hid", "nlayers", "nhead", "dropout")
    hp = {key: model_config[key] for key in wanted}
    return TransformerModel(
        ntokens,
        hp["emsize"],
        hp["nhead"],
        hp["d_hid"],
        hp["nlayers"],
        hp["dropout"],
    )
|
45 |
+
|
46 |
+
def loadModel(best_model_path):
    """Load checkpoint weights into the module-level ``model``.

    Args:
        best_model_path: path to a checkpoint file that holds a
            ``'model_state_dict'`` entry.

    Returns:
        The module-level ``model`` after its state dict has been loaded.

    Raises:
        FileNotFoundError: if ``best_model_path`` does not exist. This is a
            subclass of ``Exception``, so callers catching ``Exception``
            keep working.
    """
    if not os.path.exists(best_model_path):
        raise FileNotFoundError(f"Model Not Found: {best_model_path}")

    print(f"Preloading model {best_model_path}")
    # Without CUDA the tensors must be remapped onto the CPU; with CUDA the
    # checkpoint is loaded onto whatever device it was saved from.
    map_location = None if torch.cuda.is_available() else torch.device('cpu')
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state = torch.load(best_model_path, map_location=map_location)
    # `model` is the module-level instance; it is only read here, so no
    # `global` declaration is required.
    model.load_state_dict(state['model_state_dict'])
    return model
|
58 |
+
|
59 |
+
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Tokenize every item and join the resulting ids into one 1-D tensor.

    Items that produce no tokens are dropped before concatenation.
    """
    pieces = []
    for line in raw_text_iter:
        # tokenizer -> token strings, vocab -> integer ids
        ids = torch.tensor(vocab(tokenizer(line)), dtype=torch.long)
        if ids.numel() > 0:
            pieces.append(ids)
    return torch.cat(tuple(pieces))
|
66 |
+
|
67 |
+
def batchify(data: Tensor, batch_size: int) -> Tensor:
    """Reshape a flat tensor into ``batch_size`` column sequences.

    Trailing elements that do not fill a complete column are discarded.

    Args:
        data: Tensor, shape [N]
        batch_size: int, number of columns

    Returns:
        Tensor of shape [N // batch_size, batch_size], moved to ``device``.
    """
    rows = data.size(0) // batch_size
    usable = data[:rows * batch_size]
    # view as [batch_size, rows], then transpose so each column is one sequence
    columns = usable.view(batch_size, rows).t().contiguous()
    return columns.to(device)
|
80 |
+
|
81 |
+
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Return an [sz, sz] mask: -inf strictly above the diagonal, 0 elsewhere."""
    neg_inf = torch.full((sz, sz), float('-inf'))
    return neg_inf.triu(diagonal=1)
|
84 |
+
|
85 |
+
def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words=5, k=50):
    """Generate ``no_words`` tokens by top-k sampling from the model.

    Args:
        model: module called as ``model(src, src_mask)``; its output is
            indexed here as [seq_len, batch, ntokens].
        gen_data: seed token ids, shape [seq_len, 1].
        no_words: number of tokens to generate.
        k: number of highest-scoring candidates to sample from (clamped to
            the vocabulary size).

    Returns:
        List of generated token strings, in generation order.
    """
    model.eval()
    # Context stops growing at this length and starts sliding instead
    # (value kept from the original implementation).
    max_context = 15
    # Never feed more than bptt tokens; longer seeds keep their most recent part.
    gen_data = gen_data[-bptt:]
    pred_text = []

    with torch.no_grad():  # inference only, no autograd graph needed
        for _ in range(no_words):
            seq_len = gen_data.size(0)
            # Build the causal mask for the current length on every step so it
            # always matches the input (the previous code left the mask
            # undefined when seq_len happened to equal bptt).
            src_mask = generate_square_subsequent_mask(seq_len).to(gen_data.device)

            # Scores for the token following the last position of the only batch item.
            logits = model(gen_data, src_mask)[-1, 0]
            top = torch.topk(logits, min(k, logits.size(-1)))
            # Softmax over the k kept scores equals the full softmax
            # renormalised over those k entries.
            probs = torch.softmax(top.values, dim=-1)
            sampler = torch.distributions.Categorical(probs)

            # Retry a few times to avoid emitting <unk>; after 10 attempts the
            # last draw is accepted whatever it is.
            for _attempt in range(10):
                next_index = top.indices[sampler.sample()]
                token = vocab.lookup_token(int(next_index))
                if token != '<unk>':
                    break

            pred_text.append(token)
            next_token = next_index.view(1, 1)
            if seq_len < max_context:
                gen_data = torch.cat((gen_data, next_token), 0)
            else:
                # Drop the oldest token so the context length stays constant.
                gen_data = torch.cat((gen_data[1:], next_token), 0)

    return pred_text
|
124 |
+
|
125 |
+
def predText(text : str, num_words : int):
    """Extend ``text`` with ``num_words`` generated tokens.

    Args:
        text: seed text typed by the user.
        num_words: number of tokens to append; converted with ``int``.
            ``None`` is treated as 0.

    Returns:
        The seed text followed by the generated tokens separated by spaces,
        or the seed text unchanged when there is nothing to generate from
        (blank input) or nothing to generate (``num_words`` <= 0).
    """
    # Blank input yields no tokens, and data_process cannot concatenate an
    # empty list of tensors, so return early instead of crashing.
    if not text or not text.strip():
        return text

    num_words = int(num_words) if num_words is not None else 0
    if num_words <= 0:
        return text

    # batchify(..., 1) yields a single column of shape [seq_len, 1], which is
    # the layout nonnaive_generator expects for its seed.
    sample_data = batchify(data_process([text]), 1)
    pred_text = nonnaive_generator(loaded_model, sample_data, no_words=num_words, k=50)
    return text + ' ' + ' '.join(pred_text)
|
133 |
+
|
134 |
+
|
135 |
+
if __name__ == '__main__':
    # `model` and `loaded_model` are module globals: loadModel fills `model`
    # in place and predText reads `loaded_model`.
    model = get_model(model_config, ntokens).to(device)
    best_model_path = 'models/best_model.pt'
    loaded_model = loadModel(best_model_path)

    # All components are created inside the Blocks context. The extra Textbox
    # that used to be built before the context was immediately shadowed and
    # never used, so it has been removed.
    with gr.Blocks() as demo:
        input_text_box = gr.Textbox(label="Text", value="म घर", lines=5)
        input_num_words = gr.Number(label="Number of word to generate", value=5)

        btn = gr.Button(value="Submit")

        # The textbox is both input and output: the generated continuation
        # replaces the text the user typed.
        btn.click(predText, inputs=[input_text_box, input_num_words], outputs=[input_text_box])

    demo.launch()
|
config.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def getConfig():
    """Return the ``(model_config, app_config)`` pair of settings dictionaries."""
    # Transformer hyperparameters.
    model_config = dict(
        emsize=300,
        d_hid=1024,
        nlayers=6,
        nhead=6,
        dropout=0.2,
        bptt=64,
    )
    # Application-level settings.
    app_config = dict(
        logs="tensorboard_logs",
        epochs=25,
    )
    return model_config, app_config
|
model.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import Tensor, nn
|
3 |
+
from torch.nn import TransformerEncoder, TransformerEncoderLayer
|
4 |
+
|
5 |
+
import math
|
6 |
+
|
7 |
+
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout."""

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Args:
            d_model: size of the last (embedding) dimension; odd and even
                values are both supported.
            dropout: dropout probability applied after the addition.
            max_len: number of positions for which encodings are precomputed.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        # Even feature indices carry the sine terms.
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # Odd feature indices carry the cosine terms. For an odd d_model there
        # is one fewer odd index than even index, so trim div_term to match;
        # for an even d_model the slice is a no-op.
        pe[:, 0, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # A buffer follows the module across devices and is stored in the
        # state dict, but it is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape as ``x``.
        """
        # pe has shape [max_len, 1, d_model]; the middle axis broadcasts over the batch.
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
|
28 |
+
|
29 |
+
class TransformerModel(nn.Module):
    """Token embedding + positional encoding + Transformer encoder + linear output layer."""

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        """
        Args:
            ntoken: vocabulary size (embedding rows and output width).
            d_model: embedding / model dimension.
            nhead: number of attention heads per encoder layer.
            d_hid: feed-forward width inside each encoder layer.
            nlayers: number of stacked encoder layers.
            dropout: dropout probability for the positional encoding and
                the encoder layers.
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        layer = TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_hid,
            dropout=dropout,
        )
        self.transformer_encoder = TransformerEncoder(layer, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Uniformly initialise embedding and output weights; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        embedded = self.embedding(src.long()) * math.sqrt(self.d_model)
        hidden = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(hidden)
|
models/best_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f56a4727ca6141d21b9165a906445367b3714d5f53d519e86b963ba703747b7
|
3 |
+
size 174967426
|
requirements.txt
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.0.0
|
2 |
+
aiofiles==23.2.1
|
3 |
+
altair==5.1.2
|
4 |
+
annotated-types==0.6.0
|
5 |
+
anyio==3.7.1
|
6 |
+
arpa==0.1.0b4
|
7 |
+
asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
|
8 |
+
attrs==23.1.0
|
9 |
+
backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
|
10 |
+
cachetools==5.3.2
|
11 |
+
certifi==2022.12.7
|
12 |
+
charset-normalizer==2.1.1
|
13 |
+
click==8.1.7
|
14 |
+
colorama==0.4.6
|
15 |
+
comm @ file:///croot/comm_1671231121260/work
|
16 |
+
contourpy==1.1.1
|
17 |
+
cycler==0.12.1
|
18 |
+
debugpy @ file:///croot/debugpy_1690905042057/work
|
19 |
+
decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
|
20 |
+
dill==0.3.7
|
21 |
+
einops==0.7.0
|
22 |
+
exceptiongroup==1.2.0
|
23 |
+
executing @ file:///opt/conda/conda-bld/executing_1646925071911/work
|
24 |
+
fastapi==0.104.1
|
25 |
+
ffmpy==0.3.1
|
26 |
+
filelock==3.9.0
|
27 |
+
fonttools==4.43.1
|
28 |
+
fsspec==2023.4.0
|
29 |
+
google-auth==2.23.4
|
30 |
+
google-auth-oauthlib==1.0.0
|
31 |
+
gradio==4.7.1
|
32 |
+
gradio_client==0.7.0
|
33 |
+
grpcio==1.59.2
|
34 |
+
h11==0.14.0
|
35 |
+
httpcore==1.0.2
|
36 |
+
httpx==0.25.1
|
37 |
+
huggingface-hub==0.17.3
|
38 |
+
idna==3.4
|
39 |
+
imageio==2.32.0
|
40 |
+
importlib-metadata @ file:///croot/importlib-metadata_1678997070253/work
|
41 |
+
importlib-resources==6.1.0
|
42 |
+
ipykernel @ file:///croot/ipykernel_1691121631942/work
|
43 |
+
ipython @ file:///croot/ipython_1691532092695/work
|
44 |
+
jedi @ file:///tmp/build/80754af9/jedi_1644315233700/work
|
45 |
+
Jinja2==3.1.2
|
46 |
+
jsonschema==4.20.0
|
47 |
+
jsonschema-specifications==2023.11.1
|
48 |
+
jupyter_client @ file:///croot/jupyter_client_1680171862562/work
|
49 |
+
jupyter_core @ file:///croot/jupyter_core_1679906564508/work
|
50 |
+
kiwisolver==1.4.5
|
51 |
+
lazy_loader==0.3
|
52 |
+
lightning-utilities==0.9.0
|
53 |
+
lxml==4.9.3
|
54 |
+
Markdown==3.5.1
|
55 |
+
markdown-it-py==3.0.0
|
56 |
+
MarkupSafe==2.1.2
|
57 |
+
matplotlib==3.7.3
|
58 |
+
matplotlib-inline @ file:///opt/conda/conda-bld/matplotlib-inline_1662014470464/work
|
59 |
+
mdurl==0.1.2
|
60 |
+
mpmath==1.3.0
|
61 |
+
nest-asyncio @ file:///croot/nest-asyncio_1672387112409/work
|
62 |
+
networkx==3.0
|
63 |
+
numpy==1.24.1
|
64 |
+
nvidia-cublas-cu12==12.1.3.1
|
65 |
+
nvidia-cuda-cupti-cu12==12.1.105
|
66 |
+
nvidia-cuda-nvrtc-cu12==12.1.105
|
67 |
+
nvidia-cuda-runtime-cu12==12.1.105
|
68 |
+
nvidia-cudnn-cu12==8.9.2.26
|
69 |
+
nvidia-cufft-cu12==11.0.2.54
|
70 |
+
nvidia-curand-cu12==10.3.2.106
|
71 |
+
nvidia-cusolver-cu12==11.4.5.107
|
72 |
+
nvidia-cusparse-cu12==12.1.0.106
|
73 |
+
nvidia-nccl-cu12==2.18.1
|
74 |
+
nvidia-nvjitlink-cu12==12.3.101
|
75 |
+
nvidia-nvtx-cu12==12.1.105
|
76 |
+
oauthlib==3.2.2
|
77 |
+
opencv-python==4.8.1.78
|
78 |
+
orjson==3.9.10
|
79 |
+
packaging @ file:///croot/packaging_1693575174725/work
|
80 |
+
pandarallel==1.6.5
|
81 |
+
pandas==2.0.3
|
82 |
+
parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
|
83 |
+
pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
|
84 |
+
phoenix-datasets @ git+https://github.com/enhuiz/phoenix-datasets@570481bf03a46555ca219f79ace1a2cfab149f8c
|
85 |
+
pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
|
86 |
+
Pillow==9.3.0
|
87 |
+
pkgutil_resolve_name==1.3.10
|
88 |
+
platformdirs @ file:///croot/platformdirs_1692205439124/work
|
89 |
+
prompt-toolkit @ file:///croot/prompt-toolkit_1672387306916/work
|
90 |
+
protobuf==4.24.4
|
91 |
+
psutil @ file:///opt/conda/conda-bld/psutil_1656431268089/work
|
92 |
+
ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
|
93 |
+
pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
|
94 |
+
pyasn1==0.5.0
|
95 |
+
pyasn1-modules==0.3.0
|
96 |
+
pydantic==2.5.2
|
97 |
+
pydantic_core==2.14.5
|
98 |
+
pydub==0.25.1
|
99 |
+
Pygments @ file:///croot/pygments_1684279966437/work
|
100 |
+
pyparsing==3.1.1
|
101 |
+
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
|
102 |
+
python-multipart==0.0.6
|
103 |
+
pytz==2023.3.post1
|
104 |
+
PyWavelets==1.4.1
|
105 |
+
PyYAML==6.0.1
|
106 |
+
pyzmq @ file:///croot/pyzmq_1686601365461/work
|
107 |
+
referencing==0.31.0
|
108 |
+
regex==2023.10.3
|
109 |
+
requests==2.28.1
|
110 |
+
requests-oauthlib==1.3.1
|
111 |
+
rich==13.7.0
|
112 |
+
rpds-py==0.13.1
|
113 |
+
rsa==4.9
|
114 |
+
scikit-image==0.21.0
|
115 |
+
scipy==1.10.1
|
116 |
+
semantic-version==2.10.0
|
117 |
+
shellingham==1.5.4
|
118 |
+
six @ file:///tmp/build/80754af9/six_1644875935023/work
|
119 |
+
sniffio==1.3.0
|
120 |
+
stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work
|
121 |
+
starlette==0.27.0
|
122 |
+
sympy==1.12
|
123 |
+
tensorboard==2.14.0
|
124 |
+
tensorboard-data-server==0.7.2
|
125 |
+
tifffile==2023.7.10
|
126 |
+
tokenizers==0.14.1
|
127 |
+
tomlkit==0.12.0
|
128 |
+
toolz==0.12.0
|
129 |
+
torch==2.1.1
|
130 |
+
torchaudio==2.1.0+cpu
|
131 |
+
torchdata==0.7.1
|
132 |
+
torchmetrics==1.2.0
|
133 |
+
torchtext==0.16.1
|
134 |
+
torchvision==0.16.0+cpu
|
135 |
+
tornado @ file:///croot/tornado_1696936946304/work
|
136 |
+
tqdm==4.66.1
|
137 |
+
traitlets @ file:///croot/traitlets_1671143879854/work
|
138 |
+
triton==2.1.0
|
139 |
+
typer==0.9.0
|
140 |
+
typing_extensions @ file:///croot/typing_extensions_1690297465030/work
|
141 |
+
tzdata==2023.3
|
142 |
+
urllib3==1.26.13
|
143 |
+
uvicorn==0.24.0.post1
|
144 |
+
vidaug @ git+https://github.com/okankop/vidaug@1c1ddf2640fe4a9171267d64ae5e3bd70c24d54a
|
145 |
+
wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
|
146 |
+
websockets==11.0.3
|
147 |
+
Werkzeug==3.0.1
|
148 |
+
xmltodict==0.13.0
|
149 |
+
zipp @ file:///croot/zipp_1672387121353/work
|
tokenizer/tokenizer.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a83f8fd1f82b91f8168b359623de56642349fe2282363245dca7c18844e37485
|
3 |
+
size 872
|
tokenizer/vocab.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2bac2e6c0d80acfe2978b5d4ab014d964389c1120b86493765be2876d69748a
|
3 |
+
size 1510445
|
utils.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import regex as re
|
2 |
+
import torch
|
3 |
+
import pickle
|
4 |
+
|
5 |
+
def preProcessText(text):
    """Normalise Devanagari text so punctuation and numbers become separate tokens.

    Steps, applied in order:
      1. surround the danda (U+0964) with single spaces;
      2. surround the question mark with single spaces;
      3. surround the comma with single spaces;
      4. strip whitespace around newlines;
      5. delete characters outside the kept set;
      6. replace each run of number characters with `` <num> ``.

    Args:
        text: input string.

    Returns:
        The normalised string.
    """
    # Replacement strings are ordinary (non-raw) literals, so the \u escapes
    # are resolved by Python itself. As raw templates they were only accepted
    # by the third-party `regex` module; the standard `re` module rejects
    # `\u` in a replacement template.

    # put space around the devanagari danda to make it a separate word
    text = re.sub(r'\s*[\u0964]\s*', '\u0020\u0964\u0020', text)
    # put space around the question mark ? to make it a separate word
    text = re.sub(r'\s*[\u003f]\s*', '\u0020\u003f\u0020', text)
    # put space around the comma (,)
    text = re.sub(r'\s*[\u002c]\s*', '\u0020\u002c\u0020', text)
    # remove space around the new line character
    text = re.sub(r'\s*\n\s*', '\n', text)
    # remove every character that is not Devanagari, comma, question mark or whitespace
    # NOTE(review): the '+' inside the character class is a literal plus sign,
    # so '+' characters are kept as well; confirm whether that is intended.
    text = re.sub(r'[^\u0900-\u097F,?\s+]', '', text)
    # replace each run of devanagari numbers by a space-separated <num> token
    # NOTE(review): the Devanagari digits are U+0966-U+096F; this range also
    # covers U+0970-U+0976. Left unchanged in case existing vocabularies were
    # built with this exact rule; confirm before narrowing it.
    text = re.sub(r'\s*[\u0966-\u0976]+\s*', '\u0020<num>\u0020', text)
    return text
|
19 |
+
|
20 |
+
def getTokenizer():
    """Load the saved tokenizer and vocabulary from the ``tokenizer`` directory.

    Returns:
        Tuple ``(tokenizer, vocab)``: the object stored in ``tokenizer.pth``
        (read with ``torch.load``) and the object stored in ``vocab.pkl``
        (read with ``pickle.load``).
    """
    base = "tokenizer"
    tok_file = base + "/tokenizer.pth"
    vocab_file = base + "/vocab.pkl"

    # The tokenizer is read first, then the pickled vocabulary.
    tok = torch.load(tok_file)
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    return tok, vocabulary
|