Spaces:

bkhmsi
/

Partial-Arabic-Diacritization

Running

App Files Files Community

bkhmsi committed on Jan 11, 2024

Commit

bb42b73

1 Parent(s): f81acf7

restructured space

Browse files

Files changed (7) hide show

README.md +3 -3
app.py +86 -28
data_utils.py +1 -1
diac_utils.py +35 -4
gradio_cached_examples/16/log.csv +2 -0
gradio_cached_examples/6/log.csv +2 -0
predict.py +53 -28

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
 ---
-title: Partial Tashkeel
-emoji: 🐠
 colorFrom: blue
 colorTo: gray
 sdk: gradio
 sdk_version: 4.1.2
 app_file: app.py
-pinned: false
 license: cc-by-sa-3.0
 ---

 ---
+title: Partial Arabic Diacritization
+emoji: 🖋️
 colorFrom: blue
 colorTo: gray
 sdk: gradio
 sdk_version: 4.1.2
 app_file: app.py
+pinned: true
 license: cc-by-sa-3.0
 ---

app.py CHANGED Viewed

@@ -9,12 +9,12 @@ output_path = "tashkeela-d2.pt"
 gdrive_templ = "https://drive.google.com/file/d/{}/view?usp=sharing"
 if not os.path.exists(output_path):
     model_gdrive_id = "1FGelqImFkESbTyRsx_elkKIOZ9VbhRuo"
-    gdown.download(gdrive_templ.format(model_gdrive_id), output=output_path, quiet=False)
 output_path = "vocab.vec"
 if not os.path.exists(output_path):
     vocab_gdrive_id = "1-0muGvcSYEf8RAVRcwXay4MRex6kmCii"
-    gdown.download(gdrive_templ.format(vocab_gdrive_id), output=output_path, quiet=False)
 with open("config.yaml", 'r', encoding="utf-8") as file:
     config = yaml.load(file, Loader=yaml.FullLoader)
@@ -22,41 +22,99 @@ with open("config.yaml", 'r', encoding="utf-8") as file:
 config["train"]["max-sent-len"] = config["predictor"]["window"]
 config["train"]["max-token-count"] = config["predictor"]["window"] * 3
-def diacritze(text, do_partial):
-    predictor = PredictTri(config, text)
-    diacritized_lines = predictor.predict_partial(do_partial=do_partial)
     return diacritized_lines
-with gr.Blocks() as demo:
     gr.Markdown(
     """
     # Partial Diacritization: A Context-Contrastive Inference Approach
-    ## Authors: Muhammad ElNokrashy, Badr AlKhamissi
     """)
-    with gr.Row():
-        check_box = gr.Checkbox(label="Partial", info="Apply Partial Diacritics or Full Diacritics")
-        threshold_txt = gr.Textbox("")
-    input_txt = gr.Textbox(
-        placeholder="اكتب هنا",
-        lines=5,
-        label="Input",
-        type='text',
-        rtl=True,
-        text_align='right',
-    )
-    output_txt = gr.Textbox(
-        lines=5,
-        label="Output",
-        type='text',
-        rtl=True,
-        text_align='right',
-    )
-    btn = gr.Button(value="Shakkel")
-    btn.click(diacritze, inputs=[input_txt, check_box], outputs=[output_txt])
 if __name__ == "__main__":
     demo.queue().launch(

 gdrive_templ = "https://drive.google.com/file/d/{}/view?usp=sharing"
 if not os.path.exists(output_path):
     model_gdrive_id = "1FGelqImFkESbTyRsx_elkKIOZ9VbhRuo"
+    gdown.download(gdrive_templ.format(model_gdrive_id), output=output_path, quiet=False, fuzzy=True)
 output_path = "vocab.vec"
 if not os.path.exists(output_path):
     vocab_gdrive_id = "1-0muGvcSYEf8RAVRcwXay4MRex6kmCii"
+    gdown.download(gdrive_templ.format(vocab_gdrive_id), output=output_path, quiet=False, fuzzy=True)
 with open("config.yaml", 'r', encoding="utf-8") as file:
     config = yaml.load(file, Loader=yaml.FullLoader)
 config["train"]["max-sent-len"] = config["predictor"]["window"]
 config["train"]["max-token-count"] = config["predictor"]["window"] * 3
+predictor = PredictTri(config)
def diacritze_full(text):
    """Fully diacritize ``text`` using the module-level ``predictor``.

    The hard-mask flag and the threshold are both passed as ``None`` because
    this entry point always runs with ``do_partial=False``.

    Returns the value produced by ``predictor.predict_partial`` for the
    newline-split input.
    """
    # Rebuild the predictor's data loader for this request (non-partial mode).
    predictor.create_dataloader(text, False, None, None)
    return predictor.predict_partial(do_partial=False, lines=text.split('\n'))
def diacritze_partial(text, mask_mode, threshold):
    """Partially diacritize ``text`` using the module-level ``predictor``.

    ``mask_mode`` enables hard masking only when it equals ``"Hard"``; any
    other value runs without it. ``threshold`` is forwarded unchanged to the
    predictor.

    Returns the value produced by ``predictor.predict_partial`` for the
    newline-split input.
    """
    use_hard_mask = mask_mode == "Hard"
    # Rebuild the predictor's data loader for this request (partial mode).
    predictor.create_dataloader(text, True, use_hard_mask, threshold)
    return predictor.predict_partial(do_partial=True, lines=text.split('\n'))
+with gr.Blocks(theme=gr.themes.Default(text_size="lg")) as demo:
     gr.Markdown(
     """
     # Partial Diacritization: A Context-Contrastive Inference Approach
+    ### Authors: Muhammad ElNokrashy, Badr AlKhamissi
+    ### Paper Link: TBD
     """)
+    with gr.Tab(label="Full Diacritization"):
+        full_input_txt = gr.Textbox(
+            placeholder="اكتب هنا",
+            lines=5,
+            label="Input",
+            type='text',
+            rtl=True,
+            text_align='right',
+        )
+        full_output_txt = gr.Textbox(
+            lines=5,
+            label="Output",
+            type='text',
+            rtl=True,
+            text_align='right',
+            show_copy_button=True,
+        )
+        full_btn = gr.Button(value="Shakkel")
+        full_btn.click(diacritze_full, inputs=[full_input_txt], outputs=[full_output_txt])
+        gr.Examples(
+            examples=[
+                "ولو حمل من مجلس الخيار ، ولم يمنع من الكلام"
+            ],
+            inputs=full_input_txt,
+            outputs=full_output_txt,
+            fn=diacritze_full,
+            cache_examples=True,
+        )
+    with gr.Tab(label="Partial Diacritization") as partial_settings:
+        with gr.Row():
+            masking_mode = gr.Radio(choices=["Hard", "Soft"], value="Hard", label="Masking Mode")
+            threshold_slider = gr.Slider(label="Soft Masking Threshold", minimum=0, maximum=1, value=0.1)
+        partial_input_txt = gr.Textbox(
+            placeholder="اكتب هنا",
+            lines=5,
+            label="Input",
+            type='text',
+            rtl=True,
+            text_align='right',
+        )
+        partial_output_txt = gr.Textbox(
+            lines=5,
+            label="Output",
+            type='text',
+            rtl=True,
+            text_align='right',
+            show_copy_button=True,
+        )
+        partial_btn = gr.Button(value="Shakkel")
+        partial_btn.click(diacritze_partial, inputs=[partial_input_txt, masking_mode, threshold_slider], outputs=[partial_output_txt])
+        gr.Examples(
+            examples=[
+                ["ولو حمل من مجلس الخيار ، ولم يمنع من الكلام", "Hard", 0],
+            ],
+            inputs=[partial_input_txt, masking_mode, threshold_slider],
+            outputs=partial_output_txt,
+            fn=diacritze_partial,
+            cache_examples=True,
+        )
 if __name__ == "__main__":
     demo.queue().launch(

data_utils.py CHANGED Viewed

@@ -26,7 +26,7 @@ class DatasetUtils:
         self.max_sent_len = config["train"]["max-sent-len"]
         self.max_token_count = config["train"]["max-token-count"]
         self.pad_target_val = -100
-        self.pad_char_id = du.LETTER_LIST.index('<pad>')
         self.markov_signal = config['train'].get('markov-signal', False)
         self.batch_first = config['train'].get('batch-first', True)

         self.max_sent_len = config["train"]["max-sent-len"]
         self.max_token_count = config["train"]["max-token-count"]
         self.pad_target_val = -100
+        self.pad_char_id = du.DIAC_PAD_IDX #LETTER_LIST.index('<pad>')
         self.markov_signal = config['train'].get('markov-signal', False)
         self.batch_first = config['train'].get('batch-first', True)

diac_utils.py CHANGED Viewed

@@ -37,6 +37,8 @@ HARAKAT_MAP = [
     (0,0,0), #< Padding == -1 (also for spaces)
 ]
 SPECIAL_TOKENS = ['<pad>', '<unk>', '<num>', '<punc>']
 LETTER_LIST = SPECIAL_TOKENS + list("ءآأؤإئابةتثجحخدذرزسشصضطظعغفقكلمنهوىي")
 CLASSES_LIST = [' ', 'َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ْ', 'ّ', 'َّ', 'ًّ', 'ُّ', 'ٌّ', 'ِّ', 'ٍّ']
@@ -63,13 +65,13 @@ def shakkel_char(diac: int, tanween: bool, shadda: bool) -> str:
     return returned_text
 def diac_ids_of_line(line: str):
-    words = tokenize(line)
     diacs = []
     for word in words:
         word_chars = split_word_on_characters_with_diacritics(word)
-        cx, cy, cy_3head = create_label_for_word(word_chars)
         diacs.extend(cy)
-        diacs.append(-1)
     return np.array(diacs[:-1])
 def strip_unknown_tashkeel(word: str):
@@ -77,6 +79,23 @@ def strip_unknown_tashkeel(word: str):
     return word
     return ''.join(c for c in word if c not in UNKNOWN_DIACRITICS)
 def split_word_on_characters_with_diacritics(word: str):
     '''
     TODO! Make faster without deque and looping
@@ -100,6 +119,18 @@ def split_word_on_characters_with_diacritics(word: str):
     return chars_w_diac
 def char_type(char: str):
     if char in LETTER_LIST:
         return LETTER_LIST.index(char)
@@ -220,4 +251,4 @@ def flat2_3head(diac_idx):
         tanween += [c_out[1]]
         shadda += [c_out[2]]
-    return np.array(haraka), np.array(tanween), np.array(shadda)

     (0,0,0), #< Padding == -1 (also for spaces)
 ]
+DIAC_PAD_IDX = -1
 SPECIAL_TOKENS = ['<pad>', '<unk>', '<num>', '<punc>']
 LETTER_LIST = SPECIAL_TOKENS + list("ءآأؤإئابةتثجحخدذرزسشصضطظعغفقكلمنهوىي")
 CLASSES_LIST = [' ', 'َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ْ', 'ّ', 'َّ', 'ًّ', 'ُّ', 'ٌّ', 'ِّ', 'ٍّ']
     return returned_text
 def diac_ids_of_line(line: str):
     diacs = []
+    words = tokenize(line)
     for word in words:
         word_chars = split_word_on_characters_with_diacritics(word)
+        _cx, cy, _cy_3head = create_label_for_word(word_chars)
         diacs.extend(cy)
+        diacs.append(DIAC_PAD_IDX)
     return np.array(diacs[:-1])
 def strip_unknown_tashkeel(word: str):
     return word
     return ''.join(c for c in word if c not in UNKNOWN_DIACRITICS)
def create_gt_labels(lines):
    """Return the ground-truth diacritic ids for every line in ``lines``.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list holding, for each input line in order, the result of
        ``diac_ids_of_line`` for that line.
    """
    return [diac_ids_of_line(line) for line in lines]
 def split_word_on_characters_with_diacritics(word: str):
     '''
     TODO! Make faster without deque and looping
     return chars_w_diac
def load_lines(path: str, *, strip: bool):
    """Read a UTF-8 text file and return its lines with spaces normalized.

    Args:
        path: Path of the file to read.
        strip: When true, every normalized line is additionally passed
            through ``strip_tashkeel``.

    Returns:
        A list with one processed string per line of the file.
    """
    # newline='\n' turns off newline translation: only '\n' terminates a line.
    with open(path, 'r', encoding="utf-8", newline='\n') as fin:
        # Iterate the file lazily instead of materialising it with readlines();
        # the generator is fully consumed before the file is closed.
        normalized = (normalize_spaces(line) for line in fin)
        if strip:
            return [strip_tashkeel(line) for line in normalized]
        return list(normalized)
def normalize_spaces(line: str):
    """Trim ``line``, tokenize it, and rejoin the tokens with single spaces."""
    tokens = tokenize(line.strip())
    return ' '.join(tokens)
 def char_type(char: str):
     if char in LETTER_LIST:
         return LETTER_LIST.index(char)
         tanween += [c_out[1]]
         shadda += [c_out[2]]
+    return np.array(haraka), np.array(tanween), np.array(shadda)

gradio_cached_examples/16/log.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Output,flag,username,timestamp
2	+ ولو حمَل من مجلسِ الخيارِ ، ولم يُمنعْ من الكلام,,,2024-01-11 01:33:39.114395

gradio_cached_examples/6/log.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Output,flag,username,timestamp
2	+ وَلَوْ حَمَلَ مِنْ مَجْلِسِ الْخِيَارِ ، وَلَمْ يُمْنَعْ مِنْ الْكَلَامِ,,,2024-01-11 01:30:56.446393

predict.py CHANGED Viewed

@@ -12,7 +12,7 @@ import numpy as np
 import torch as T
 from torch.utils.data import DataLoader
-from diac_utils import HARAKAT_MAP, shakkel_char, diac_ids_of_line
 from model_partial import PartialDD
 from model_dd import DiacritizerD2
 from data_utils import DatasetUtils
@@ -31,10 +31,21 @@ def apply_tashkeel(
         diacs: Union[np.ndarray, T.Tensor]
 ):
     line_w_diacs = ""
-    diacs_h3 = DatasetUtils.flat2_3head(diacs)
-    for ch, tashkeel in zip(line, zip(*diacs_h3)):
         line_w_diacs += ch
-        line_w_diacs += DatasetUtils.shakkel_char(*tashkeel)
     return line_w_diacs
 def diac_text(data, model_output_base, model_output_ctxt, selection_mode='contrastive-hard', threshold=0.1):
@@ -80,29 +91,16 @@ def diac_text(data, model_output_base, model_output_ctxt, selection_mode='contra
         line = apply_tashkeel(line, line_diacs)
         output.append(line)
-    return '\n'.join(output)
 class Predictor:
-    def __init__(self, config, text):
         self.data_utils = DatasetUtils(config)
         vocab_size = len(self.data_utils.letter_list)
         word_embeddings = self.data_utils.embeddings
-        stride = config["segment"]["stride"]
-        window = config["segment"]["window"]
-        min_window = config["segment"]["min-window"]
-        segments, mapping = segment([text], stride, window, min_window)
-        mapping_lines = []
-        for sent_idx, seg_idx, word_idx, char_idx in mapping:
-            mapping_lines += [f"{sent_idx}, {seg_idx}, {word_idx}, {char_idx}"]
-        self.mapping = self.data_utils.load_mapping_v3_from_list(mapping_lines)
-        self.original_lines = [text]
-        self.segments = segments
         self.device = T.device(
             config['predictor'].get('device', 'cuda:0')
             if T.cuda.is_available() else 'cpu'
@@ -115,16 +113,39 @@ class Predictor:
         self.model.to(self.device)
         self.model.eval()
         self.data_loader = DataLoader(
             DataRetriever(self.data_utils, segments),
-            batch_size=config["predictor"].get("batch-size", 32),
             shuffle=False,
-            num_workers=config['loader'].get('num-workers', 0),
         )
 class PredictTri(Predictor):
-    def __init__(self, config, text):
-        super().__init__(config, text)
         self.diacritics = {
             "FATHA": 1,
             "KASRA": 2,
@@ -146,11 +167,15 @@ class PredictTri(Predictor):
         diacritized_lines, _ = self.coalesce_votes_by_majority(y_gen_diac, y_gen_tanween, y_gen_shadda)
         return diacritized_lines
-    def predict_partial(self, do_partial):
         outputs = self.model.predict_partial(self.data_loader, return_extra=True, eval_only='both', do_partial=do_partial)
-        y_gen_diac, y_gen_tanween, y_gen_shadda = outputs['diacritics']
-        diac_lines, _ = self.coalesce_votes_by_majority(y_gen_diac, y_gen_tanween, y_gen_shadda)
         return '\n'.join(diac_lines)
     def predict_majority_vote_context_contrastive(self, overwrite_cache=False):

 import torch as T
 from torch.utils.data import DataLoader
+from diac_utils import HARAKAT_MAP, shakkel_char, flat2_3head
 from model_partial import PartialDD
 from model_dd import DiacritizerD2
 from data_utils import DatasetUtils
         diacs: Union[np.ndarray, T.Tensor]
 ):
     line_w_diacs = ""
+    ts, tw = diacs.shape
+    diacs = diacs.flatten()
+    diacs_h3 = flat2_3head(diacs)
+    diacs_h3 = tuple(x.reshape(ts, tw) for x in diacs_h3)
+    diac_char_idx = 0
+    diac_word_idx = 0
+    for ch in line:
         line_w_diacs += ch
+        if ch == " ":
+            diac_char_idx = 0
+            diac_word_idx += 1
+        else:
+            tashkeel = (diacs_h3[0][diac_word_idx][diac_char_idx], diacs_h3[1][diac_word_idx][diac_char_idx], diacs_h3[2][diac_word_idx][diac_char_idx])
+            diac_char_idx += 1
+            line_w_diacs += shakkel_char(*tashkeel)
     return line_w_diacs
 def diac_text(data, model_output_base, model_output_ctxt, selection_mode='contrastive-hard', threshold=0.1):
         line = apply_tashkeel(line, line_diacs)
         output.append(line)
+    return output
 class Predictor:
+    def __init__(self, config):
         self.data_utils = DatasetUtils(config)
         vocab_size = len(self.data_utils.letter_list)
         word_embeddings = self.data_utils.embeddings
+        self.config = config
         self.device = T.device(
             config['predictor'].get('device', 'cuda:0')
             if T.cuda.is_available() else 'cpu'
         self.model.to(self.device)
         self.model.eval()
+    def create_dataloader(self, text, do_partial, do_hard_mask, threshold):
+        self.threshold = threshold
+        self.do_hard_mask = do_hard_mask
+        stride = self.config["segment"]["stride"]
+        window = self.config["segment"]["window"]
+        min_window = self.config["segment"]["min-window"]
+        if self.do_hard_mask or not do_partial:
+            segments, mapping = segment([text], stride, window, min_window)
+            mapping_lines = []
+            for sent_idx, seg_idx, word_idx, char_idx in mapping:
+                mapping_lines += [f"{sent_idx}, {seg_idx}, {word_idx}, {char_idx}"]
+            self.mapping = self.data_utils.load_mapping_v3_from_list(mapping_lines)
+            self.original_lines = [text]
+            self.segments = segments
+        else:
+            segments = text.split('\n')
+        self.segments = segments
+        self.original_lines = text.split('\n')
         self.data_loader = DataLoader(
             DataRetriever(self.data_utils, segments),
+            batch_size=self.config["predictor"].get("batch-size", 32),
             shuffle=False,
+            num_workers=self.config['loader'].get('num-workers', 0),
         )
 class PredictTri(Predictor):
+    def __init__(self, config):
+        super().__init__(config)
         self.diacritics = {
             "FATHA": 1,
             "KASRA": 2,
         diacritized_lines, _ = self.coalesce_votes_by_majority(y_gen_diac, y_gen_tanween, y_gen_shadda)
         return diacritized_lines
+    def predict_partial(self, do_partial, lines):
         outputs = self.model.predict_partial(self.data_loader, return_extra=True, eval_only='both', do_partial=do_partial)
+        if self.do_hard_mask or not do_partial:
+            y_gen_diac, y_gen_tanween, y_gen_shadda = outputs['diacritics']
+            diac_lines, _ = self.coalesce_votes_by_majority(y_gen_diac, y_gen_tanween, y_gen_shadda)
+        else:
+            diac_lines = diac_text(lines, outputs["other"][1], outputs["other"][0], selection_mode='1', threshold=self.threshold)
         return '\n'.join(diac_lines)
     def predict_majority_vote_context_contrastive(self, overwrite_cache=False):