support for TD2

- .gitignore +2 -1
- PartialDD.png +0 -0
- app.py +45 -14
- config.yaml +27 -2
- dataloader.py +1 -1
- dataloader_plm.py +126 -0
- model_partial.py +10 -6
- model_plm.py +360 -0
- predict.py +16 -7
.gitignore
CHANGED
@@ -2,4 +2,5 @@
 *.pt
 *.vec
 *.pem
-.DS_Store
+.DS_Store
+gradio_cached_examples
PartialDD.png
ADDED
app.py
CHANGED
@@ -1,10 +1,9 @@
 import os
 import yaml
 import gdown
-import time
 import gradio as gr
 from predict import PredictTri
-from
+from huggingface_hub import hf_hub_download
 
 output_path = "tashkeela-d2.pt"
 gdrive_templ = "https://drive.google.com/file/d/{}/view?usp=sharing"
@@ -12,13 +11,14 @@ if not os.path.exists(output_path):
     model_gdrive_id = "1FGelqImFkESbTyRsx_elkKIOZ9VbhRuo"
     gdown.download(gdrive_templ.format(model_gdrive_id), output=output_path, quiet=False, fuzzy=True)
 
-time.sleep(1)
-
 output_path = "vocab.vec"
 if not os.path.exists(output_path):
     vocab_gdrive_id = "1-0muGvcSYEf8RAVRcwXay4MRex6kmCii"
     gdown.download(gdrive_templ.format(vocab_gdrive_id), output=output_path, quiet=False, fuzzy=True)
 
+if not os.path.exists("td2/tashkeela-ashaar-td2.pt"):
+    hf_hub_download(repo_id="munael/Partial-Arabic-Diacritization-TD2", filename="tashkeela-ashaar-td2.pt", local_dir="td2")
+
 with open("config.yaml", 'r', encoding="utf-8") as file:
     config = yaml.load(file, Loader=yaml.FullLoader)
 
@@ -27,16 +27,31 @@ config["train"]["max-token-count"] = config["predictor"]["window"] * 3
 
 predictor = PredictTri(config)
 
-def diacritze_full(text):
+current_model_name = "TD2"
+config["model-name"] = current_model_name
+
+def diacritze_full(text, model_name):
+    global current_model_name, predictor
+    if model_name != current_model_name:
+        config["model-name"] = model_name
+        current_model_name = model_name
+        predictor = PredictTri(config)
+
     do_hard_mask = None
     threshold = None
-    predictor.create_dataloader(text, False, do_hard_mask, threshold)
+    predictor.create_dataloader(text, False, do_hard_mask, threshold, model_name)
     diacritized_lines = predictor.predict_partial(do_partial=False, lines=text.split('\n'))
     return diacritized_lines
 
-def diacritze_partial(text, mask_mode, threshold):
+def diacritze_partial(text, mask_mode, threshold, model_name):
+    global current_model_name, predictor
+    if model_name != current_model_name:
+        config["model-name"] = model_name
+        current_model_name = model_name
+        predictor = PredictTri(config)
+
     do_partial = True
-    predictor.create_dataloader(text, do_partial, mask_mode=="Hard", threshold)
+    predictor.create_dataloader(text, do_partial, mask_mode=="Hard", threshold, model_name)
    diacritized_lines = predictor.predict_partial(do_partial=do_partial, lines=text.split('\n'))
     return diacritized_lines
 
@@ -45,9 +60,19 @@ with gr.Blocks(theme=gr.themes.Default(text_size="lg")) as demo:
    """
    # Partial Diacritization: A Context-Contrastive Inference Approach
    ### Authors: Muhammad ElNokrashy, Badr AlKhamissi
-    ### Paper Link: TBD
+    ### Paper Link: TBD (abstract below)
    """)
 
+    gr.HTML(
+        "<img src='./PartialDD.png' style='float:right'/>"
+    )
+
+    model_choice = gr.Dropdown(
+        choices=["D2", "TD2"],
+        label="Diacritization Model",
+        value=current_model_name
+    )
+
     with gr.Tab(label="Full Diacritization"):
 
         full_input_txt = gr.Textbox(
@@ -69,11 +94,11 @@ with gr.Blocks(theme=gr.themes.Default(text_size="lg")) as demo:
         )
 
         full_btn = gr.Button(value="Shakkel")
-        full_btn.click(diacritze_full, inputs=[full_input_txt], outputs=[full_output_txt])
+        full_btn.click(diacritze_full, inputs=[full_input_txt, model_choice], outputs=[full_output_txt])
 
         gr.Examples(
             examples=[
-                "ولو حمل من مجلس الخيار ، ولم يمنع من الكلام"
+                "ولو حمل من مجلس الخيار ، ولم يمنع من الكلام", "TD2"
             ],
             inputs=full_input_txt,
             outputs=full_output_txt,
@@ -105,11 +130,13 @@ with gr.Blocks(theme=gr.themes.Default(text_size="lg")) as demo:
         )
 
         partial_btn = gr.Button(value="Shakkel")
-        partial_btn.click(diacritze_partial, inputs=[partial_input_txt, masking_mode, threshold_slider], outputs=[partial_output_txt])
+        partial_btn.click(diacritze_partial, inputs=[partial_input_txt, masking_mode, threshold_slider, model_choice], outputs=[partial_output_txt])
 
         gr.Examples(
             examples=[
-                ["ولو حمل من مجلس الخيار ، ولم يمنع من الكلام", "Hard", 0],
+                ["ولو حمل من مجلس الخيار ، ولم يمنع من الكلام", "Hard", 0, "TD2"],
+                ["ولو حمل من مجلس الخيار ، ولم يمنع من الكلام", "Soft", 0.1, "TD2"],
+                ["ولو حمل من مجلس الخيار ، ولم يمنع من الكلام", "Soft", 0.01, "TD2"],
             ],
             inputs=[partial_input_txt, masking_mode, threshold_slider],
             outputs=partial_output_txt,
@@ -117,7 +144,11 @@ with gr.Blocks(theme=gr.themes.Default(text_size="lg")) as demo:
             cache_examples=True,
         )
 
-
+    gr.Markdown(
+        """
+        ### Abstract
+        > Diacritization plays a pivotal role in improving readability and disambiguating the meaning of Arabic texts. Efforts have so far focused on marking every eligible character (Full Diacritization). Comparatively overlooked, Partial Diacritzation (PD) is the selection of a subset of characters to be marked to aid comprehension where needed.Research has indicated that excessive diacritic marks can hinder skilled readers---reducing reading speed and accuracy. We conduct a behavioral experiment and show that partially marked text is often easier to read than fully marked text, and sometimes easier than plain text. In this light, we introduce Context-Contrastive Partial Diacritization (CCPD)---a novel approach to PD which integrates seamlessly with existing Arabic diacritization systems. CCPD processes each word twice, once with context and once without, and diacritizes only the characters with disparities between the two inferences. Further, we introduce novel indicators for measuring partial diacritization quality {SR, PDER, HDER, ERE}, essential for establishing this as a machine learning task. Lastly, we introduce TD2, a Transformer-variant of an established model which offers a markedly different performance profile on our proposed indicators compared to all other known systems.
+        """)
 
 if __name__ == "__main__":
     demo.queue().launch(
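For reference, the abstract shown above summarizes the inference rule behind CCPD: diacritize each word once with its sentence context and once in isolation, and keep a mark only where the two passes disagree. A minimal sketch of that selection rule follows; the two `diacritize_*` callables and the `0 = unmarked` convention are placeholders for illustration, not APIs from this repo.

from typing import Callable, List

def ccpd_select(words: List[str],
                diacritize_with_context: Callable[[List[str]], List[List[int]]],
                diacritize_without_context: Callable[[str], List[int]]) -> List[List[int]]:
    # Sketch of Context-Contrastive Partial Diacritization (CCPD):
    # keep a character's diacritic only when the contextual and context-free
    # inferences disagree; otherwise leave the character unmarked (0 here).
    marks_ctx = diacritize_with_context(words)            # per-word, per-char class ids
    selected = []
    for word, ctx_marks in zip(words, marks_ctx):
        solo_marks = diacritize_without_context(word)     # same word, no sentence context
        selected.append([c if c != s else 0 for c, s in zip(ctx_marks, solo_marks)])
    return selected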
config.yaml
CHANGED
@@ -1,22 +1,47 @@
 run-title: tashkeela-d2
 debug: false
+model-name: TD2
 
 paths:
   base: ./dataset/ashaar
   save: ./models
   load: tashkeela-d2.pt
+  load-td2: td2/tashkeela-ashaar-td2.pt
   resume: ./models/Tashkeela-D2/tashkeela-d2.pt
   constants: ./dataset/helpers/constants
   word-embs: vocab.vec
   test: test
 
+modeling:
+  "checkpoint": munael/Partial-Arabic-Diacritization-TD2
+  "base_model": CAMeL-Lab/bert-base-arabic-camelbert-mix-ner
+  # "base_model": UBC-NLP/MARBERTv2
+  # "base_model": UBC-NLP/ARBERTv2
+  "deep-cls": true
+  "full-finetune": true #< From true
+  "keep-token-model-layers": 2
+  # "num-finetune-last-layers": 2 #
+  "num-chars": 40
+  "char-embed-dim": 128
+  "token_hidden_size": 768
+  "deep-down-proj": true
+  "dropout": 0.2
+  "sentence_dropout": 0.1
+  "diac_model_config": {
+    "vocab_size": 1,
+    "num_hidden_layers": 2,
+    "hidden_size": 768,
+    "intermediate_size": 2304,
+    "num_attention_heads": 8,
+  }
+
 loader:
   wembs-limit: -1
   num-workers: 0
 
 train:
   epochs: 1000
-  batch-size:
+  batch-size: 1
   char-embed-dim: 32
   resume: false
   resume-lr: false
@@ -51,7 +76,7 @@ train:
   stopping-patience: 3
 
 predictor:
-  batch-size:
+  batch-size: 1
   stride: 2
   window: 20
   gt-signal-prob: 0
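The new `modeling:` block carries the TD2 hyper-parameters (base encoder checkpoint, number of kept layers, char-embedding size, and the config of the small diacritic head). A minimal sketch of how it can be read and handed to the new model; this snippet is illustrative and not part of the commit:

import yaml

with open("config.yaml", "r", encoding="utf-8") as fh:
    cfg = yaml.safe_load(fh)

modeling = cfg["modeling"]
print(modeling["base_model"])         # CAMeL-Lab/bert-base-arabic-camelbert-mix-ner
print(modeling["diac_model_config"])  # kwargs unpacked into transformers.BertConfig

# model_plm.Diacritizer accepts either the full config or just its `modeling` block,
# since its __init__ does: `if 'modeling' in config: config = config['modeling']`.
# from model_plm import Diacritizer
# model = Diacritizer(cfg, load_pretrained=False)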
dataloader.py
CHANGED
@@ -24,7 +24,7 @@ class DataRetriever(Dataset):
 
     def __getitem__(self, idx):
         word_x, char_x, diac_x, diac_y = self.create_sentence(idx)
-        return self.preprocess((word_x, char_x, diac_x)), T.tensor(diac_y, dtype=T.long)
+        return self.preprocess((word_x, char_x, diac_x)), T.tensor(diac_y, dtype=T.long), [0]
 
     def create_sentence(self, idx):
         line = self.lines[idx]
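The dummy `[0]` makes the D2 retriever yield a three-element item, matching the `(inputs, labels, subword_lengths)` shape of the new PLM retriever, so the shared prediction loop can unpack both the same way. A small illustrative helper (the `loader` argument is a stand-in for either DataLoader):

def iterate_batches(loader):
    # Works with both dataloader.DataRetriever (dummy lengths: [0]) and
    # dataloader_plm.DataRetriever (real per-word subword lengths).
    for inputs, labels, subword_lengths in loader:
        yield inputs, labels, subword_lengths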
dataloader_plm.py
ADDED
@@ -0,0 +1,126 @@
+from typing import List, Tuple, Any
+
+import os
+from functools import lru_cache
+
+from pyarabic.araby import tokenize, strip_tashkeel
+
+import numpy as np
+import torch as T
+from torch.utils.data import Dataset
+
+try:
+    from transformers import PreTrainedTokenizer
+except:
+    from typing import Any as PreTrainedTokenizer
+
+from data_utils import DatasetUtils
+import diac_utils as du
+
+class DataRetriever(Dataset):
+    def __init__(
+        self,
+        lines,
+        data_utils: DatasetUtils,
+        is_test: bool = False,
+        *,
+        tokenizer: PreTrainedTokenizer,
+        lines_mode: bool = False,
+        **kwargs,
+    ):
+        super(DataRetriever).__init__()
+
+        self.data_utils = data_utils
+        self.is_test = is_test
+        self.tokenizer = tokenizer
+
+        self.stride = data_utils.test_stride
+
+        self.data_points = lines
+
+        self.bos_token_id = int(self.tokenizer.bos_token_id or self.tokenizer.cls_token_id)
+        self.eos_token_id = int(self.tokenizer.eos_token_id or self.tokenizer.sep_token_id)
+
+        self.max_tokens = self.data_utils.max_token_count
+        self.max_slen = self.data_utils.max_sent_len
+        self.max_wlen = self.data_utils.max_word_len
+        # self.p_val = self.data_utils.pad_val
+        self.p_val = self.tokenizer.pad_token_id
+        self.pc_val = self.data_utils.pad_char_id
+        self.pt_val = self.data_utils.pad_target_val
+
+        self.char_x_padding = [self.pc_val] * self.max_wlen
+        self.diac_x_padding = [[self.pc_val]*8] * self.max_wlen
+        self.diac_y_padding = [self.pt_val] * self.max_wlen
+
+    def preprocess(self, data, dtype=T.long):
+        return [T.tensor(np.array(x), dtype=dtype) for x in data]
+
+    def __len__(self):
+        return len(self.data_points)
+
+    @lru_cache(maxsize=1024 * 2)
+    def __getitem__(self, idx: int) -> Tuple[List[T.Tensor], T.Tensor, T.Tensor]:
+        word_x, char_x, diac_x, diac_y, subword_lengths = self.create_sentence(idx)
+        return (
+            self.preprocess([word_x, char_x, diac_x]),
+            T.tensor(diac_y, dtype=T.long),
+            T.tensor(subword_lengths, dtype=T.long)
+        )
+
+    def create_sentence(self, idx):
+        line = self.data_points[idx]
+        # tokens = tokenize(line.strip())
+        words: List[str] = tokenize(line.strip())
+        # words_: List[str] = []
+        # for word in words:
+        #     if len(strip_tashkeel(word)) == 0:
+        #         words_[-1] += word.strip()
+        #     else:
+        #         words_.append(word)
+        # word_tokens_bin = [self.tokenizer(word) for word in words]
+        # tokens_bin = self.tokenizer(line.strip())
+
+        subwords_x = [self.bos_token_id]
+        subword_lengths = []
+
+        char_x = []
+        diac_x = []
+        diac_y = []
+        diac_y_tmp = []
+
+        for i_word, word in enumerate(words):
+            word = du.strip_unknown_tashkeel(word)
+            word_chars = du.split_word_on_characters_with_diacritics(word)
+            cx, cy, cy_3head = du.create_label_for_word(word_chars)
+
+            word_strip = strip_tashkeel(word)
+            #? List[int: "word_index"]
+            #? Strip the BOS/EOS which the tokenizer adds
+            word_sub_ids = self.tokenizer(word_strip)['input_ids'][1:-1]
+            subword_lengths += [len(word_sub_ids)]
+
+            subwords_x += word_sub_ids
+            # word_x += [self.data_utils.w2idx.get(word_strip, self.data_utils.w2idx["<pad>"])]
+
+            char_x += [self.data_utils.pad_and_truncate_sequence(cx, self.max_wlen)]
+
+            diac_y += [self.data_utils.pad_and_truncate_sequence(cy, self.max_wlen, pad=self.data_utils.pad_target_val)]
+            diac_y_tmp += [self.data_utils.pad_and_truncate_sequence(cy_3head, self.max_wlen, pad=[self.data_utils.pad_target_val]*3)]
+
+        assert len(char_x) == len(subword_lengths), f"{char_x=}; {subword_lengths=} ;;"
+        assert len(char_x) == len(words)
+
+        diac_x = self.data_utils.create_decoder_input(diac_y_tmp)
+
+        subwords_x += [self.eos_token_id]
+        # assert len(char_x) + 2 == len(subwords_x), f"{len(char_x)} + 2 != {len(subwords_x)} ;;" # Because of BOS, EOS
+        assert len(subword_lengths) == len(words)
+        subwords_x = self.data_utils.pad_and_truncate_sequence(subwords_x, self.max_tokens, pad=self.p_val)
+        subword_lengths = self.data_utils.pad_and_truncate_sequence(subword_lengths, self.max_slen, pad=0)
+
+        char_x = self.data_utils.pad_and_truncate_sequence(char_x, self.max_slen, pad=self.char_x_padding)
+        diac_x = self.data_utils.pad_and_truncate_sequence(diac_x, self.max_slen, pad=self.diac_x_padding)
+        diac_y = self.data_utils.pad_and_truncate_sequence(diac_y, self.max_slen, pad=self.diac_y_padding)
+
+        return subwords_x, char_x, diac_x, diac_y, subword_lengths
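A hedged usage sketch for the new retriever, mirroring how predict.py wires it up. The `DatasetUtils(config)` construction is an assumption here (its real setup lives in data_utils.py and may take different arguments); the rest follows the signatures added in this commit.

import yaml
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from data_utils import DatasetUtils
from dataloader_plm import DataRetriever as DataRetrieverPLM

with open("config.yaml", "r", encoding="utf-8") as fh:
    config = yaml.safe_load(fh)

tokenizer = AutoTokenizer.from_pretrained(config["modeling"]["base_model"])
data_utils = DatasetUtils(config)   # assumption: the real constructor signature may differ
segments = ["ولو حمل من مجلس الخيار ، ولم يمنع من الكلام"]

loader = DataLoader(
    DataRetrieverPLM(segments, data_utils, is_test=True, tokenizer=tokenizer),
    batch_size=1,
    shuffle=False,
)
for (subwords_x, char_x, diac_x), diac_y, subword_lengths in loader:
    print(subwords_x.shape, char_x.shape, diac_y.shape, subword_lengths.shape)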
model_partial.py
CHANGED
@@ -9,7 +9,7 @@ from torch.nn import functional as F
 from diac_utils import flat_2_3head
 
 from model_dd import DiacritizerD2
-from
+from model_plm import Diacritizer
 
 class Readout(nn.Module):
     def __init__(
@@ -72,8 +72,11 @@ class PartialDD(nn.Module):
         # self.config_d2 = yaml.safe_load(fin)
         # self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
         self.config = config
-        self._use_d2 =
-        self.sentence_diac = DiacritizerD2(self.config)
+        self._use_d2 = config["model-name"] == "D2"
+        if self._use_d2:
+            self.sentence_diac = DiacritizerD2(self.config)
+        else:
+            self.sentence_diac = Diacritizer(self.config, load_pretrained=False)
 
         # self.sentence_diac.to(self.device)
         # self.build()
@@ -90,9 +93,10 @@ class PartialDD(nn.Module):
 
     def load_state_dict(
         self,
-        state_dict: dict
+        state_dict: dict,
+        strict: bool = True,
     ):
-        self.sentence_diac.load_state_dict(state_dict)
+        self.sentence_diac.load_state_dict(state_dict, strict=strict)
 
     def _slim_batch(
         self,
@@ -277,7 +281,7 @@ class PartialDD(nn.Module):
         }
         print("> Predicting...")
         # breakpoint()
-        for i_batch, (inputs, _) in enumerate(tqdm(dataloader)):
+        for i_batch, (inputs, _, subword_lengths) in enumerate(tqdm(dataloader)):
            # if i_batch > 10:
            #     break
            #^ inputs: [toke_ids, char_ids, diac_ids]
model_plm.py
ADDED
@@ -0,0 +1,360 @@
+from typing import List, Iterator, cast
+
+import copy
+import numpy as np
+
+import torch as T
+from torch import nn
+from torch.nn import functional as F
+from transformers import BertConfig, BertModel
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
+
+class Diacritizer(nn.Module):
+    def __init__(
+        self,
+        config,
+        device=None,
+        load_pretrained=True
+    ) -> None:
+        super().__init__()
+        self._dummy = nn.Parameter(T.ones(1))
+
+        if 'modeling' in config:
+            config = config['modeling']
+        self.config = config
+
+        model_name = config.get('base_model', "CAMeL-Lab/bert-base-arabic-camelbert-mix-ner")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        if load_pretrained:
+            self.token_model: BertModel = AutoModel.from_pretrained(model_name)
+        else:
+            marbert_config = AutoConfig.from_pretrained(model_name)
+            self.token_model = AutoModel.from_config(marbert_config)
+
+        self.num_classes = 15
+        self.diac_model_config = BertConfig(**config['diac_model_config'])
+        self.token_model_config: BertConfig = self.token_model.config
+
+        self.char_embs = nn.Embedding(config["num-chars"], embedding_dim=config["char-embed-dim"])
+        self.diac_emb_model = self.build_diac_model(self.token_model)
+
+        self.down_project_token_embeds_deep = None
+        self.down_project_token_embeds = None
+        if 'token_hidden_size' in config:
+            if config['token_hidden_size'] == 'auto':
+                down_proj_size = self.diac_emb_model.config.hidden_size
+            else:
+                down_proj_size = config['token_hidden_size']
+            if config.get('deep-down-proj', False):
+                self.down_project_token_embeds_deep = nn.Sequential(
+                    nn.Linear(
+                        self.token_model_config.hidden_size + config["char-embed-dim"],
+                        down_proj_size * 4,
+                        bias=False,
+                    ),
+                    nn.Tanh(),
+                    nn.Linear(
+                        down_proj_size * 4,
+                        down_proj_size,
+                        bias=False,
+                    )
+                )
+            # else:
+            self.down_project_token_embeds = nn.Linear(
+                self.token_model_config.hidden_size + config["char-embed-dim"],
+                down_proj_size,
+                bias=False,
+            )
+
+        # assert self.down_project_token_embeds_deep is None or self.down_project_token_embeds is None
+        classifier_feature_size = self.diac_model_config.hidden_size
+        if config.get('deep-cls', False):
+            # classifier_feature_size = 512
+            self.final_feature_transform = nn.Linear(
+                self.diac_model_config.hidden_size
+                + self.token_model_config.hidden_size,
+                #^ diac_features + [residual from token_model]
+                out_features=classifier_feature_size,
+                bias=False
+            )
+        else:
+            self.final_feature_transform = None
+
+        self.feature_layer_norm = nn.LayerNorm(classifier_feature_size)
+        self.classifier = nn.Linear(classifier_feature_size, self.num_classes, bias=True)
+
+        self.trim_model_(config)
+
+        self.dropout = nn.Dropout(config['dropout'])
+        self.sent_dropout_p = config['sentence_dropout']
+        self.closs = F.cross_entropy
+
+    def build_diac_model(self, token_model=None):
+        if self.config.get('pre-init-diac-model', False):
+            model = copy.deepcopy(self.token_model)
+            model.pooler = None
+            model.embeddings.word_embeddings = None
+
+            num_layers = self.config.get('keep-token-model-layers', None)
+            model.encoder.layer = nn.ModuleList(
+                list(model.encoder.layer[num_layers:num_layers*2])
+            )
+
+            model.encoder.config.num_hidden_layers = num_layers
+        else:
+            model = BertModel(self.diac_model_config)
+        return model
+
+    def trim_model_(self, config):
+        self.token_model.pooler = None
+        self.diac_emb_model.pooler = None
+        # self.diac_emb_model.embeddings = None
+        self.diac_emb_model.embeddings.word_embeddings = None
+
+        num_token_model_kept_layers = config.get('keep-token-model-layers', None)
+        if num_token_model_kept_layers is not None:
+            self.token_model.encoder.layer = nn.ModuleList(
+                list(self.token_model.encoder.layer[:num_token_model_kept_layers])
+            )
+            self.token_model.encoder.config.num_hidden_layers = num_token_model_kept_layers
+
+        if not config.get('full-finetune', False):
+            for param in self.token_model.parameters():
+                param.requires_grad = False
+            finetune_last_layers = config.get('num-finetune-last-layers', 4)
+            if finetune_last_layers > 0:
+                unfrozen_layers = self.token_model.encoder.layer[-finetune_last_layers:]
+                for layer in unfrozen_layers:
+                    for param in layer.parameters():
+                        param.requires_grad = True
+
+    def get_grouped_params(self):
+        downstream_params: Iterator[nn.Parameter] = cast(
+            Iterator,
+            (param
+             for module in (self.diac_emb_model, self.classifier, self.char_embs)
+             for param in module.parameters())
+        )
+        pg = {
+            'pretrained': self.token_model.parameters(),
+            'downstream': downstream_params,
+        }
+        return pg
+
+    @property
+    def device(self):
+        return self._dummy.device
+
+    def step(self, xt, yt, mask=None, subword_lengths: T.Tensor=None):
+        # ^ word_x, char_x, diac_x are Indices
+        # ^ xt : self.preprocess((word_x, char_x, diac_x)),
+        # ^ yt : T.tensor(diac_y, dtype=T.long),
+        # ^ subword_lengths: T.tensor(subword_lengths, dtype=T.long)
+        #< Move char_x, diac_x to device because they're small and trainable
+        xt[0], xt[1], yt, subword_lengths = self._slim_batch_size(xt[0], xt[1], yt, subword_lengths)
+        xt[0] = xt[0].to(self.device)
+        xt[1] = xt[1].to(self.device)
+        # xt[2] = xt[2].to(self.device)
+
+        yt = yt.to(self.device)
+        #^ yt: [b tw tc]
+
+        Nb, Tword, Tchar = xt[1].shape
+        if Tword * Tchar < 500:
+            diac = self(*xt, subword_lengths)
+            loss = self.closs(diac.view(-1, self.num_classes), yt.view(-1), reduction='sum')
+        else:
+            num_chunks = Tword * Tchar / 300
+            loss = 0
+            for i in range(round(num_chunks+0.5)):
+                _slice = slice(i*300, (i+1)*300)
+                chunk = self._slice_batch(xt, _slice)
+                diac = self(*chunk, subword_lengths[_slice])
+                chunk_loss = self.closs(diac.view(-1, self.num_classes), yt.view(-1), reduction='sum')
+                loss = loss + chunk_loss
+
+        return loss
+
+    def _slice_batch(self, xt: List[T.Tensor], _slice):
+        return [xt[0][_slice], xt[1][_slice], xt[2][_slice]]
+
+    def _slim_batch_size(
+        self,
+        tx: T.Tensor,
+        cx: T.Tensor,
+        yt: T.Tensor,
+        subword_lengths: T.Tensor
+    ):
+        #^ tx : [b tt]
+        #^ cx : [b tw tc]
+        #^ yt : [b tw tc]
+        token_nonpad_mask = tx.ne(self.tokenizer.pad_token_id)
+        Ttoken = token_nonpad_mask.sum(1).max()
+        tx = tx[:, :Ttoken]
+
+        char_nonpad_mask = cx.ne(0)
+        Tword = char_nonpad_mask.any(2).sum(1).max()
+        Tchar = char_nonpad_mask.sum(2).max()
+        cx = cx[:, :Tword, :Tchar]
+        yt = yt[:, :Tword, :Tchar]
+        subword_lengths = subword_lengths[:, :Tword]
+
+        return tx, cx, yt, subword_lengths
+
+    def token_dropout(self, toke_x):
+        #^ toke_x : [b tw]
+        if self.training:
+            q = 1.0 - self.sent_dropout_p
+            sdo = T.bernoulli(T.full(toke_x.shape, q))
+            toke_x[sdo == 0] = self.tokenizer.pad_token_id
+        return toke_x
+
+    def sentence_dropout(self, word_embs: T.Tensor):
+        #^ word_embs : [b tw dwe]
+        if self.training:
+            q = 1.0 - self.sent_dropout_p
+            sdo = T.bernoulli(T.full(word_embs.shape[:2], q))
+            sdo = sdo.detach().unsqueeze(-1).to(word_embs)
+            word_embs = word_embs * sdo
+            # toke_x[sdo == 0] = self.tokenizer.pad_token_id
+        return word_embs
+
+    def embed_tokens(self, input_ids: T.Tensor, attention_mask: T.Tensor):
+        y: BaseModelOutputWithPoolingAndCrossAttentions
+        y = self.token_model(input_ids, attention_mask=attention_mask)
+        z = y.last_hidden_state
+        return z
+
+    def forward(
+        self,
+        toke_x : T.Tensor,
+        char_x : T.Tensor,
+        diac_x : T.Tensor,
+        subword_lengths : T.Tensor,
+    ):
+        #^ toke_x : [b tt]
+        #^ char_x : [b tw tc]
+        #^ diac_x/labels : [b tw tc]
+        #^ subword_lengths : [b, tw]
+        # !TODO Use `subword_lengths` to aggregate subword embeddings first before ...
+        # ... passing concatenated contextual embedding to chars in diac_model
+
+        token_nonpad_mask = toke_x.ne(self.tokenizer.pad_token_id)
+        char_nonpad_mask = char_x.ne(0)
+
+        Nb, Tw, Tc = char_x.shape
+        # assert Tw == Tw_0 and Tc == Tc_0, f"{Tw=} {Tw_0=}, {Tc=} {Tc_0=}"
+
+        # toke_x = self.token_dropout(toke_x)
+        token_embs = self.embed_tokens(toke_x, attention_mask=token_nonpad_mask)
+        # token_embs = self.sentence_dropout(token_embs)
+        #? Strip BOS,EOS
+        token_embs = token_embs[:, 1:-1, ...]
+
+        sent_word_strides = subword_lengths.cumsum(1)
+        sent_enc: T.Tensor = T.zeros(Nb, Tw, token_embs.shape[-1]).to(token_embs)
+        for i_b in range(Nb):
+            token_embs_ib = token_embs[i_b]
+            start_iw = 0
+            for i_word, end_iw in enumerate(sent_word_strides[i_b]):
+                if end_iw == start_iw: break
+                word_emb = token_embs_ib[start_iw : end_iw].sum(0) / (end_iw - start_iw)
+                sent_enc[i_b, i_word] = word_emb
+                start_iw = end_iw
+        #^ sent_enc: [b tw dwe]
+
+        char_x_flat = char_x.reshape(Nb*Tw, Tc)
+        char_nonpad_mask = char_x_flat.gt(0)
+        # ^ char_nonpad_mask [b*tw tc]
+
+        char_x_flat = char_x_flat * char_nonpad_mask
+
+        cembs = self.char_embs(char_x_flat)
+
+        #^ cembs: [b*tw tc dce]
+        wembs = sent_enc.unsqueeze(-2).expand(Nb, Tw, Tc, -1).view(Nb*Tw, Tc, -1)
+        #^ wembs: [b tw dwe] => [b tw _ dwe] => [b*tw tc dwe]
+        cw_embs = T.cat([cembs, wembs], dim=-1)
+        #^ char_embs : [b*tw tc dcw] ; dcw = dc + dwe
+        cw_embs = self.dropout(cw_embs)
+
+        cw_embs_ = cw_embs
+        if self.down_project_token_embeds is not None:
+            cw_embs_ = self.down_project_token_embeds(cw_embs)
+        if self.down_project_token_embeds_deep is not None:
+            cw_embs_ = cw_embs_ + self.down_project_token_embeds_deep(cw_embs)
+        cw_embs = cw_embs_
+
+        diac_enc: BaseModelOutputWithPoolingAndCrossAttentions
+        diac_enc = self.diac_emb_model(inputs_embeds=cw_embs, attention_mask=char_nonpad_mask)
+        diac_emb = diac_enc.last_hidden_state
+        diac_emb = self.dropout(diac_emb)
+        #^ diac_emb: [b*tw tc dce]
+        diac_emb = diac_emb.view(Nb, Tw, Tc, -1)
+
+        sent_residual = sent_enc.unsqueeze(2).expand(-1, -1, Tc, -1)
+        final_feature = T.cat([sent_residual, diac_emb], dim=-1)
+        if self.final_feature_transform is not None:
+            final_feature = self.final_feature_transform(final_feature)
+            final_feature = F.tanh(final_feature)
+            final_feature = self.dropout(final_feature)
+        else:
+            final_feature = diac_emb
+
+        # final_feature = self.feature_layer_norm(final_feature)
+        diac_out = self.classifier(final_feature)
+        # if T.isnan(diac_out).any():
+        #     breakpoint()
+        return diac_out
+
+    def predict(self, dataloader):
+        from tqdm import tqdm
+        import diac_utils as du
+        training = self.training
+        self.eval()
+
+        preds = {'haraka': [], 'shadda': [], 'tanween': []}
+        print("> Predicting...")
+        for inputs, _, subword_lengths in tqdm(dataloader, total=len(dataloader)):
+            inputs[0] = inputs[0].to(self.device)
+            inputs[1] = inputs[1].to(self.device)
+            output = self(*inputs, subword_lengths).detach()
+
+            marks = np.argmax(T.softmax(output, dim=-1).cpu().numpy(), axis=-1)
+            #^ [b ts tw]
+
+            haraka, tanween, shadda = du.flat_2_3head(marks)
+
+            preds['haraka'].extend(haraka)
+            preds['tanween'].extend(tanween)
+            preds['shadda'].extend(shadda)
+
+        self.train(training)
+        return (
+            np.array(preds['haraka']),
+            np.array(preds["tanween"]),
+            np.array(preds["shadda"]),
+        )
+
+if __name__ == "__main__":
+    model = Diacritizer({
+        "num-chars": 36,
+        "hidden_size": 768,
+        "char-embed-dim": 32,
+        "dropout": 0.25,
+        "sentence_dropout": 0.2,
+        "diac_model_config": {
+            "num_layers": 4,
+            "hidden_size": 768 + 32,
+            "intermediate_size": (768 + 32) * 4,
+        },
+    }, load_pretrained=False)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    print(model)
+    print(f"{trainable_params:,}/{total_params:,} Trainable Parameters")
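One step worth spelling out is how `Diacritizer.forward` turns contextual subword embeddings into per-word vectors: it mean-pools each word's subwords using `subword_lengths` (the loop over `sent_word_strides`). A standalone toy version of that pooling, with made-up shapes, not taken from the repo:

import torch as T

token_embs = T.arange(2 * 5 * 4, dtype=T.float32).view(2, 5, 4)   # [b=2, subwords=5, d=4]
subword_lengths = T.tensor([[2, 1, 2, 0], [1, 3, 1, 0]])           # [b, tw] (0 = padding)

Nb, Tw = subword_lengths.shape
sent_enc = T.zeros(Nb, Tw, token_embs.shape[-1])
strides = subword_lengths.cumsum(1)
for i_b in range(Nb):
    start = 0
    for i_word, end in enumerate(strides[i_b]):
        if end == start:
            break                                  # remaining entries are padding
        sent_enc[i_b, i_word] = token_embs[i_b, start:end].mean(0)
        start = end
print(sent_enc.shape)   # torch.Size([2, 4, 4]); the model does the same with BOS/EOS stripped first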
predict.py
CHANGED
@@ -14,9 +14,9 @@ from torch.utils.data import DataLoader
 
 from diac_utils import HARAKAT_MAP, shakkel_char, flat2_3head
 from model_partial import PartialDD
-from model_dd import DiacritizerD2
 from data_utils import DatasetUtils
 from dataloader import DataRetriever
+from dataloader_plm import DataRetriever as DataRetrieverPLM
 from segment import segment
 
 from partial_dd_metrics import (
@@ -105,15 +105,19 @@ class Predictor:
             config['predictor'].get('device', 'cuda:0')
             if T.cuda.is_available() else 'cpu'
         )
-
+
         self.model = PartialDD(config)
-        self.model.sentence_diac.build(word_embeddings, vocab_size)
-        state_dict = T.load(config["paths"]["load"], map_location=T.device(self.device))['state_dict']
-        self.model.load_state_dict(state_dict)
+        if config["model-name"] == "D2":
+            self.model.sentence_diac.build(word_embeddings, vocab_size)
+            state_dict = T.load(config["paths"]["load"], map_location=T.device(self.device))['state_dict']
+        else:
+            state_dict = T.load(config["paths"]["load-td2"], map_location=T.device(self.device))['state_dict']
+
+        self.model.load_state_dict(state_dict, strict=False)
         self.model.to(self.device)
         self.model.eval()
 
-    def create_dataloader(self, text, do_partial, do_hard_mask, threshold):
+    def create_dataloader(self, text, do_partial, do_hard_mask, threshold, model_name):
         self.threshold = threshold
         self.do_hard_mask = do_hard_mask
 
@@ -137,7 +141,12 @@ class Predictor:
         self.original_lines = text.split('\n')
 
         self.data_loader = DataLoader(
-            DataRetriever(self.data_utils, segments),
+            DataRetriever(self.data_utils, segments)
+            if model_name == "D2"
+            else DataRetrieverPLM(segments, self.data_utils,
+                is_test=True,
+                tokenizer=self.model.tokenizer
+            ),
             batch_size=self.config["predictor"].get("batch-size", 32),
             shuffle=False,
             num_workers=self.config['loader'].get('num-workers', 0),
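End to end, app.py drives this predictor roughly as follows (a recap sketch of the calls shown above; it assumes the checkpoints downloaded at the top of app.py are already in place):

import yaml
from predict import PredictTri

with open("config.yaml", "r", encoding="utf-8") as fh:
    config = yaml.safe_load(fh)
config["train"]["max-token-count"] = config["predictor"]["window"] * 3
config["model-name"] = "TD2"

predictor = PredictTri(config)

text = "ولو حمل من مجلس الخيار ، ولم يمنع من الكلام"
# args: text, do_partial, do_hard_mask, threshold, model_name (soft mask, threshold 0.1 here)
predictor.create_dataloader(text, True, False, 0.1, "TD2")
lines = predictor.predict_partial(do_partial=True, lines=text.split("\n"))
print(lines)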