Spaces:

NirajanBekoju
/

Nepali-Text-Generation-Model

Sleeping

+import gradio as gr
+import torch
+from torch import Tensor, nn
+from torch.nn import TransformerEncoder, TransformerEncoderLayer
+from torchtext.vocab import build_vocab_from_iterator
+from torch.utils.data import dataset
+from torch.utils.tensorboard import SummaryWriter
+import regex as re
+import os
+import time
+from tqdm import tqdm
+import copy
+import math
+from model import TransformerModel
+from utils import preProcessText, getTokenizer
+from config import getConfig
# Read both configuration dictionaries once and echo them to the startup log.
model_config, app_config = getConfig()
print(model_config)
print(app_config)

# Sequence-length setting read by the text-generation code below.
bptt = model_config["bptt"]

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Softmax over the third axis of a 3-D tensor.
softmax = nn.Softmax(dim=2)

# Saved tokenizer and vocabulary; the vocabulary length is the token count
# handed to get_model().
tokenizer, vocab = getTokenizer()
ntokens = len(vocab)
def get_model(model_config, ntokens):
    """Construct a new ``TransformerModel`` from a hyper-parameter mapping.

    Args:
        model_config: mapping that provides the keys "emsize", "d_hid",
            "nlayers", "nhead" and "dropout".
        ntokens: vocabulary size, passed as the model's first argument.

    Returns:
        The newly constructed ``TransformerModel`` (no weights are loaded here).
    """
    # Read the settings in one pass, then hand them over in the positional
    # order the constructor expects: (ntoken, d_model, nhead, d_hid, nlayers, dropout).
    hyper = {
        name: model_config[name]
        for name in ("emsize", "d_hid", "nlayers", "nhead", "dropout")
    }
    return TransformerModel(
        ntokens,
        hyper["emsize"],
        hyper["nhead"],
        hyper["d_hid"],
        hyper["nlayers"],
        hyper["dropout"],
    )
def loadModel(best_model_path):
    """Load checkpoint weights into the module-level ``model`` and return it.

    Args:
        best_model_path: path to a checkpoint file whose "model_state_dict"
            entry holds the weights.

    Returns:
        The module-level ``model`` after ``load_state_dict`` has been applied.

    Raises:
        FileNotFoundError: if ``best_model_path`` does not exist. This is a
            subclass of ``Exception``, so callers catching ``Exception``
            still work.
    """
    if not os.path.exists(best_model_path):
        raise FileNotFoundError("Model Not Found")

    print(f"Preloading model {best_model_path}")
    # Without a GPU, tensors saved from CUDA must be remapped onto the CPU;
    # with a GPU the checkpoint's own device placement is kept (map_location=None).
    map_location = None if torch.cuda.is_available() else torch.device('cpu')
    state = torch.load(best_model_path, map_location=map_location)
    # `model` is only read here, never rebound, so no `global` statement is needed.
    model.load_state_dict(state['model_state_dict'])
    return model
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Turn an iterable of text items into one flat tensor of token ids.

    Each item is tokenized with the module-level ``tokenizer`` and mapped to
    ids with the module-level ``vocab``. Items that produce no tokens are
    dropped before concatenation.

    Args:
        raw_text_iter: iterable yielding text items.

    Returns:
        1-D ``torch.long`` tensor holding the ids of all items back to back.
    """
    encoded = []
    for item in raw_text_iter:
        ids = torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
        # torch.cat cannot mix in zero-length pieces usefully, so skip them.
        if ids.numel() > 0:
            encoded.append(ids)
    return torch.cat(tuple(encoded))
def batchify(data: Tensor, batch_size: int) -> Tensor:
    """Reshape a flat id tensor into ``batch_size`` parallel columns.

    Trailing elements that do not fill a complete row are discarded.

    Args:
        data: Tensor, shape [N]
        batch_size: int, number of columns in the result

    Returns:
        Tensor of shape [N // batch_size, batch_size], moved to the
        module-level ``device``.
    """
    n_rows = data.size(0) // batch_size
    # Keep only the prefix that divides evenly into batch_size sequences.
    usable = data.narrow(0, 0, n_rows * batch_size)
    # Each sequence becomes one column: [batch_size, n_rows] -> [n_rows, batch_size].
    columns = usable.view(batch_size, n_rows).transpose(0, 1).contiguous()
    return columns.to(device)
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build an ``sz`` x ``sz`` mask that is -inf above the diagonal, 0 elsewhere.

    Args:
        sz: side length of the square mask.

    Returns:
        Float tensor of shape [sz, sz]; entry (i, j) is ``-inf`` when j > i
        and ``0`` otherwise.
    """
    neg_inf = torch.full((sz, sz), float('-inf'))
    # triu(diagonal=1) keeps the strictly-upper triangle and zeroes the rest.
    return neg_inf.triu(diagonal=1)
def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words=5, k=50):
    """Generate ``no_words`` tokens by top-k sampling, feeding each one back in.

    At every step the model scores the current context, the ``k`` highest
    scoring vocabulary entries for the last position are kept, and one of
    them is drawn with probability proportional to its softmax weight
    (renormalised over those ``k`` entries).

    Args:
        model: network called as ``model(src, src_mask)``; its result is
            indexed here as ``[seq_len, batch, ntoken]``.
        gen_data: token ids of the prompt, shape ``[seq_len, 1]``.
        no_words: number of tokens to generate.
        k: number of candidates kept at each step (clamped to the number of
            scores the model returns).

    Returns:
        List with the generated token strings, in generation order.

    Raises:
        ValueError: if ``gen_data`` contains no tokens.
    """
    model.eval()
    if gen_data.size(0) == 0:
        raise ValueError("gen_data must contain at least one token")

    grow_until = 15        # the context grows up to this length, then slides
    max_unk_retries = 10   # draws allowed while trying to avoid '<unk>'

    # Never feed more than bptt tokens: keep the most recent part of a long prompt.
    gen_data = gen_data[-bptt:]

    pred_text = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(no_words):
            seq_len = gen_data.size(0)
            # The mask is built for the actual context length, so it always
            # matches the input whatever the prompt size is.
            src_mask = generate_square_subsequent_mask(seq_len).to(gen_data.device)

            # Only the scores of the last position decide the next token.
            scores = model(gen_data, src_mask)[-1, 0]
            top_k = max(1, min(k, scores.size(-1)))
            top_scores, top_ids = torch.topk(scores, top_k)

            # Softmax over the kept scores equals the full softmax
            # renormalised over the top-k entries.
            sampler = torch.distributions.Categorical(
                probs=torch.softmax(top_scores, dim=-1))

            # Resample a few times if '<unk>' is drawn; after the last
            # attempt whatever was drawn is used.
            for _attempt in range(max_unk_retries):
                next_index = top_ids[sampler.sample()]
                token = vocab.lookup_token(int(next_index))
                if token != '<unk>':
                    break
            pred_text.append(token)

            # Append the new token; once the context is long enough, drop
            # the oldest token so the length stays constant.
            context = gen_data if seq_len < grow_until else gen_data[1:]
            gen_data = torch.cat((context, next_index.reshape(1, 1)), 0)
    return pred_text
def predText(text : str, num_words : int):
    """Extend ``text`` with ``num_words`` generated tokens.

    Args:
        text: prompt to continue.
        num_words: number of tokens to generate; converted with ``int``.
            ``None`` (for example a cleared number field) counts as zero.

    Returns:
        ``text`` followed by a space and the generated tokens joined by
        spaces. ``text`` is returned unchanged when it is empty or only
        whitespace, when no tokens are requested, or when nothing was
        generated.
    """
    n_words = 0 if num_words is None else int(num_words)
    # Nothing to do: avoid running the model on an empty prompt, which
    # cannot be turned into a tensor of token ids.
    if n_words <= 0 or not text or not text.strip():
        return text

    # NOTE(review): the prompt is tokenized as typed; preProcessText is not
    # applied here - confirm whether the training text was preprocessed.
    # batchify(..., 1) yields shape [seq_len, 1], which is already the
    # single-column layout the generator expects.
    sample_data = batchify(data_process([text]), 1)
    pred_text = nonnaive_generator(loaded_model, sample_data, no_words=n_words, k=50)
    if not pred_text:
        return text
    return text + ' ' + ' '.join(pred_text)
if __name__ == '__main__':
    # Build the network on the selected device, then load the trained
    # weights into it. predText reads `loaded_model` as a module-level name.
    model = get_model(model_config, ntokens).to(device)
    best_model_path = 'models/best_model.pt'
    loaded_model = loadModel(best_model_path)

    # All components are created inside the Blocks context so they are part
    # of the rendered page. The text box is both input and output of
    # predText, so every click appends to what is already shown.
    with gr.Blocks() as demo:
        input_text_box = gr.Textbox(label="Text", value="म घर", lines=5)
        input_num_words = gr.Number(label="Number of word to generate", value=5)
        btn = gr.Button(value="Submit")
        btn.click(predText, inputs=[input_text_box, input_num_words], outputs=[input_text_box])
    demo.launch()

config.py ADDED Viewed

	@@ -0,0 +1,16 @@

def getConfig():
    """Return the model hyper-parameters and the application settings.

    Returns:
        Tuple ``(model_config, app_config)``. Both are new dictionaries
        created on every call.
    """
    model_config = dict(
        emsize=300,
        d_hid=1024,
        nlayers=6,
        nhead=6,
        dropout=0.2,
        bptt=64,
    )
    app_config = dict(
        logs="tensorboard_logs",
        epochs=25,
    )
    return model_config, app_config

model.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import torch
+from torch import Tensor, nn
+from torch.nn import TransformerEncoder, TransformerEncoderLayer
+import math
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to embeddings, then apply dropout.

    The table is stored in the buffer ``pe`` with shape
    ``[max_len, 1, d_model]``: even feature columns hold sines, odd feature
    columns hold cosines of ``position * div_term``.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Args:
            d_model: size of the feature dimension (even or odd).
            dropout: dropout probability applied in ``forward``.
            max_len: number of positions for which the table is precomputed.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)
                             * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd-indexed column fewer than
        # even-indexed ones, so the cosine part uses only the first
        # d_model // 2 frequencies. For an even d_model this is all of them.
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape: ``x`` plus the first ``seq_len`` rows
            of the position table (broadcast over the batch axis), after
            dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
class TransformerModel(nn.Module):
    """Token embedding, positional encoding, encoder stack and a linear head.

    The linear head maps every position's encoder output to one score per
    vocabulary entry.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        """
        Args:
            ntoken: vocabulary size (embedding rows and output scores).
            d_model: embedding / encoder feature size.
            nhead: number of attention heads per encoder layer.
            d_hid: feed-forward width inside each encoder layer.
            nlayers: number of stacked encoder layers.
            dropout: dropout probability for the positional encoding and
                the encoder layers.
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        layer = TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_hid, dropout=dropout)
        self.transformer_encoder = TransformerEncoder(layer, num_layers=nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)
        self.init_weights()

    def init_weights(self) -> None:
        """Re-initialise embedding and head: uniform(-0.1, 0.1) weights, zero bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        scaled = self.embedding(src.long()) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled), src_mask)
        return self.decoder(encoded)

models/best_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f56a4727ca6141d21b9165a906445367b3714d5f53d519e86b963ba703747b7
+size 174967426

requirements.txt ADDED Viewed

	@@ -0,0 +1,149 @@

+absl-py==2.0.0
+aiofiles==23.2.1
+altair==5.1.2
+annotated-types==0.6.0
+anyio==3.7.1
+arpa==0.1.0b4
+asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
+attrs==23.1.0
+backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
+cachetools==5.3.2
+certifi==2022.12.7
+charset-normalizer==2.1.1
+click==8.1.7
+colorama==0.4.6
+comm @ file:///croot/comm_1671231121260/work
+contourpy==1.1.1
+cycler==0.12.1
+debugpy @ file:///croot/debugpy_1690905042057/work
+decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
+dill==0.3.7
+einops==0.7.0
+exceptiongroup==1.2.0
+executing @ file:///opt/conda/conda-bld/executing_1646925071911/work
+fastapi==0.104.1
+ffmpy==0.3.1
+filelock==3.9.0
+fonttools==4.43.1
+fsspec==2023.4.0
+google-auth==2.23.4
+google-auth-oauthlib==1.0.0
+gradio==4.7.1
+gradio_client==0.7.0
+grpcio==1.59.2
+h11==0.14.0
+httpcore==1.0.2
+httpx==0.25.1
+huggingface-hub==0.17.3
+idna==3.4
+imageio==2.32.0
+importlib-metadata @ file:///croot/importlib-metadata_1678997070253/work
+importlib-resources==6.1.0
+ipykernel @ file:///croot/ipykernel_1691121631942/work
+ipython @ file:///croot/ipython_1691532092695/work
+jedi @ file:///tmp/build/80754af9/jedi_1644315233700/work
+Jinja2==3.1.2
+jsonschema==4.20.0
+jsonschema-specifications==2023.11.1
+jupyter_client @ file:///croot/jupyter_client_1680171862562/work
+jupyter_core @ file:///croot/jupyter_core_1679906564508/work
+kiwisolver==1.4.5
+lazy_loader==0.3
+lightning-utilities==0.9.0
+lxml==4.9.3
+Markdown==3.5.1
+markdown-it-py==3.0.0
+MarkupSafe==2.1.2
+matplotlib==3.7.3
+matplotlib-inline @ file:///opt/conda/conda-bld/matplotlib-inline_1662014470464/work
+mdurl==0.1.2
+mpmath==1.3.0
+nest-asyncio @ file:///croot/nest-asyncio_1672387112409/work
+networkx==3.0
+numpy==1.24.1
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.18.1
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.2
+opencv-python==4.8.1.78
+orjson==3.9.10
+packaging @ file:///croot/packaging_1693575174725/work
+pandarallel==1.6.5
+pandas==2.0.3
+parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
+pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
+phoenix-datasets @ git+https://github.com/enhuiz/phoenix-datasets@570481bf03a46555ca219f79ace1a2cfab149f8c
+pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
+Pillow==9.3.0
+pkgutil_resolve_name==1.3.10
+platformdirs @ file:///croot/platformdirs_1692205439124/work
+prompt-toolkit @ file:///croot/prompt-toolkit_1672387306916/work
+protobuf==4.24.4
+psutil @ file:///opt/conda/conda-bld/psutil_1656431268089/work
+ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
+pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pydantic==2.5.2
+pydantic_core==2.14.5
+pydub==0.25.1
+Pygments @ file:///croot/pygments_1684279966437/work
+pyparsing==3.1.1
+python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
+python-multipart==0.0.6
+pytz==2023.3.post1
+PyWavelets==1.4.1
+PyYAML==6.0.1
+pyzmq @ file:///croot/pyzmq_1686601365461/work
+referencing==0.31.0
+regex==2023.10.3
+requests==2.28.1
+requests-oauthlib==1.3.1
+rich==13.7.0
+rpds-py==0.13.1
+rsa==4.9
+scikit-image==0.21.0
+scipy==1.10.1
+semantic-version==2.10.0
+shellingham==1.5.4
+six @ file:///tmp/build/80754af9/six_1644875935023/work
+sniffio==1.3.0
+stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work
+starlette==0.27.0
+sympy==1.12
+tensorboard==2.14.0
+tensorboard-data-server==0.7.2
+tifffile==2023.7.10
+tokenizers==0.14.1
+tomlkit==0.12.0
+toolz==0.12.0
+torch==2.1.1
+torchaudio==2.1.0+cpu
+torchdata==0.7.1
+torchmetrics==1.2.0
+torchtext==0.16.1
+torchvision==0.16.0+cpu
+tornado @ file:///croot/tornado_1696936946304/work
+tqdm==4.66.1
+traitlets @ file:///croot/traitlets_1671143879854/work
+triton==2.1.0
+typer==0.9.0
+typing_extensions @ file:///croot/typing_extensions_1690297465030/work
+tzdata==2023.3
+urllib3==1.26.13
+uvicorn==0.24.0.post1
+vidaug @ git+https://github.com/okankop/vidaug@1c1ddf2640fe4a9171267d64ae5e3bd70c24d54a
+wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
+websockets==11.0.3
+Werkzeug==3.0.1
+xmltodict==0.13.0
+zipp @ file:///croot/zipp_1672387121353/work

tokenizer/tokenizer.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a83f8fd1f82b91f8168b359623de56642349fe2282363245dca7c18844e37485
+size 872

tokenizer/vocab.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2bac2e6c0d80acfe2978b5d4ab014d964389c1120b86493765be2876d69748a
+size 1510445

utils.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import regex as re
+import torch
+import pickle
def preProcessText(text):
    """Normalise Devanagari text by applying a fixed sequence of substitutions.

    The rules run in the order listed; each one works on the result of the
    previous one.

    Args:
        text: input string.

    Returns:
        The string after all substitutions.
    """
    rules = (
        # Surround the Devanagari danda (U+0964) with single spaces so it
        # becomes a separate word.
        (r'\s*[\u0964]\s*', r'\u0020\u0964\u0020'),
        # Same for the question mark.
        (r'\s*[\u003f]\s*', r'\u0020\u003f\u0020'),
        # Same for the comma.
        (r'\s*[\u002c]\s*', r'\u0020\u002c\u0020'),
        # Strip whitespace around newline characters.
        (r'\s*\n\s*', '\n'),
        # Delete every character outside the Devanagari block; commas,
        # question marks, whitespace and '+' (a literal inside the class)
        # are kept.
        (r'[^\u0900-\u097F,?\s+]', ''),
        # Replace each run of characters in U+0966-U+0976 with a spaced
        # <num> token.
        # NOTE(review): the Devanagari digits are U+0966-U+096F; this range
        # runs on to U+0976 - confirm that is intended.
        (r'\s*[\u0966-\u0976]+\s*', '\u0020<num>\u0020'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text
def getTokenizer():
    """Load the saved tokenizer and vocabulary from the ``tokenizer`` directory.

    Returns:
        Tuple ``(tokenizer, vocab)``: the object stored in
        ``tokenizer/tokenizer.pth`` and the object stored in
        ``tokenizer/vocab.pkl``.
    """
    tokenizer_dir = "tokenizer"
    # Both loaders unpickle arbitrary objects, so these files must come from
    # a trusted source.
    loaded_tokenizer = torch.load(f"{tokenizer_dir}/tokenizer.pth")
    with open(f"{tokenizer_dir}/vocab.pkl", 'rb') as vocab_file:
        loaded_vocab = pickle.load(vocab_file)
    return loaded_tokenizer, loaded_vocab