#%%
import torch
from transformers import RobertaForTokenClassification, AutoTokenizer

# Character-level multi-label token classifier: for each Arabic letter it
# predicts which diacritics (harakat) should be inserted after it.
model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")

#%%
# Model label index -> Unicode combining diacritic character.
label2diacritic = {0: 'ّ',  # SHADDA
                   1: 'َ',  # FATHA
                   2: 'ِ',  # KASRA
                   3: 'ُ',  # DAMMA
                   4: 'ْ'}  # SUKKUN

# Hoisted once so per-character membership tests are O(1) instead of
# rebuilding the dict-values view for every character of the input.
_DIACRITICS = frozenset(label2diacritic.values())


def arabic2diacritics(text, model, tokenizer):
    """Return *text* with model-predicted diacritics inserted after each character.

    The classifier is multi-label: a single character may receive several
    diacritics (e.g. shadda combined with a vowel), hence the independent
    sigmoid threshold per label rather than an argmax.

    NOTE(review): assumes the tokenizer yields exactly one token per input
    character (plus BOS/EOS), so ``zip(preds, text)`` aligns — confirm
    against the model card.

    Parameters
    ----------
    text : str
        Undiacritized Arabic text.
    model : RobertaForTokenClassification
        Loaded diacritization model.
    tokenizer : tokenizer
        Tokenizer matching *model*.

    Returns
    -------
    str
        *text* with predicted diacritics interleaved.
    """
    tokens = tokenizer(text, return_tensors="pt")
    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**tokens).logits
    # Per-label sigmoid threshold; [1:-1] drops predictions for BOS and EOS.
    preds = (logits.sigmoid() > 0.5)[0][1:-1]
    pieces = []
    for p, c in zip(preds, text):
        pieces.append(c)
        # Vowel/sukun labels (1-4) first; shadda (label 0) is appended last.
        for i in range(1, 5):
            if p[i]:
                pieces.append(label2diacritic[i])
        # check shadda last
        if p[0]:
            pieces.append(label2diacritic[0])
    return "".join(pieces)


def diacritize(text):
    """Diacritize *text* using the module-level model and tokenizer."""
    return arabic2diacritics(text, model, tokenizer)


def diacritize_if_not_already(text):
    """Return *text* unchanged if it already contains any known diacritic,
    otherwise diacritize it."""
    if any(c in _DIACRITICS for c in text):
        return text
    return arabic2diacritics(text, model, tokenizer)


#%%
# text = "بديش اروح عالمدرسة بكرا"
# arabic2diacritics(text, model, tokenizer)
# %%