script-normalization-kurdish-persian-arabic

Sleeping

script-normalization-kurdish-persian-arabic

File size: 8,631 Bytes

75b9522
23ca1e9
75b9522
23ca1e9
 
75b9522
 
 
 
 
 
 
23ca1e9
75b9522
 
 
4f3ec12
 
 
 
 
 
 
 
 
 
 
 
23ca1e9
75b9522
 
23ca1e9
e6a3e60
23ca1e9
 
 
4f3ec12
23ca1e9
 
4f3ec12
 
 
23ca1e9
4f3ec12
 
23ca1e9
4f3ec12
23ca1e9
4f3ec12
23ca1e9
4f3ec12
 
 
23ca1e9
4f3ec12
 
23ca1e9
4f3ec12
 
 
 
 
23ca1e9
4f3ec12
 
23ca1e9
4f3ec12
 
 
 
 
 
 
 
 
59dfad3
4f3ec12
23ca1e9
75b9522
 
 
 
 
 
 
 
 
 
 
23ca1e9
75b9522
 
23ca1e9
0af0f95
7a9e350
 
0b6e630
 
 
7a9e350
bf3e769
 
 
 
 
d85a29c
0af0f95
75b9522
fd9da3b
bf3e769
3800655
 
 
bf3e769
 
15ca953
bf3e769
3800655
bf3e769
3800655
bf3e769
3800655
fd9da3b
75b9522
96cfc8d
23ca1e9
 
 
 
96cfc8d
4f3ec12
 
23ca1e9
 
 
 
 
 
 
 
96cfc8d
 
4f3ec12
23ca1e9
dc989f7
3800655
dc989f7
 
 
704e535
4f3ec12
75b9522
fd9da3b
 
75b9522
23ca1e9
4f3ec12
0379612
96cfc8d
4f3ec12
704e535
d85a29c
23ca1e9
75b9522

from functools import partial
from pathlib import Path

import gradio as gr
from joeynmt.datasets import build_dataset
from joeynmt.helpers import (
    load_checkpoint,
    load_config,
    parse_train_args,
    resolve_ckpt_path,
)
from joeynmt.model import build_model
from joeynmt.prediction import predict
from joeynmt.tokenizers import build_tokenizer
from joeynmt.vocabulary import build_vocab

# Maps the human-readable label offered in the UI dropdown (key) to the name of
# the model directory under ./models/ (value) that normalize() loads its
# config.yaml and best.ckpt from.
languages_scripts = {
    "Azeri Turkish in Persian": "AzeriTurkish-Persian",
    "Central Kurdish in Arabic": "Sorani-Arabic",
    "Central Kurdish in Persian": "Sorani-Persian",
    "Gilaki in Persian": "Gilaki-Persian",
    "Gorani in Arabic": "Gorani-Arabic",
    "Gorani in Central Kurdish": "Gorani-Sorani",
    "Gorani in Persian": "Gorani-Persian",
    "Kashmiri in Urdu": "Kashmiri-Urdu",
    "Mazandarani in Persian": "Mazandarani-Persian",
    "Northern Kurdish in Arabic": "Kurmanji-Arabic",
    "Northern Kurdish in Persian": "Kurmanji-Persian",
    "Sindhi in Urdu": "Sindhi-Urdu",
}


def normalize(text, language_script):
    """Normalize one sentence written in an unconventional script.

    Loads the JoeyNMT config and checkpoint stored under
    ``./models/<model name>/`` for the selected language/script pair, builds
    the model, and runs prediction on the single input sentence.

    Args:
        text: The noisy input sentence. Leading/trailing whitespace is
            stripped before prediction.
        language_script: A key of ``languages_scripts`` (the label selected
            in the UI dropdown).

    Returns:
        The first hypothesis produced by ``predict``, or an empty string when
        ``text`` is ``None`` or contains only whitespace.

    Raises:
        ValueError: If ``language_script`` is not a key of
            ``languages_scripts`` (e.g. nothing was selected in the dropdown).
    """
    # Nothing to normalize: return early instead of feeding an empty sentence
    # to the dataset/model.
    if text is None or not text.strip():
        return ""
    sentence = text.strip()

    model_name = languages_scripts.get(language_script)
    if model_name is None:
        raise ValueError(
            f"Unknown language/script selection: {language_script!r}. "
            f"Expected one of: {', '.join(sorted(languages_scripts))}"
        )

    model_root = Path("./models") / model_name

    cfg = load_config(model_root / "config.yaml")
    # parse and validate cfg; the checkpoint path from the config is ignored
    # because the bundled best.ckpt is always used.
    model_dir, _, device, n_gpu, num_workers, _, fp16 = parse_train_args(
        cfg["training"], mode="prediction"
    )
    test_cfg = cfg["testing"]
    src_cfg = cfg["data"]["src"]
    trg_cfg = cfg["data"]["trg"]

    ckpt = resolve_ckpt_path(model_root / "best.ckpt", model_dir)

    src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)

    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, device=device)
    model.load_state_dict(model_checkpoint["model_state"])

    if device.type == "cuda":
        model.to(device)

    tokenizer = build_tokenizer(cfg["data"])
    # Only the source side is encoded; no target sentence is supplied.
    sequence_encoder = {
        src_cfg["lang"]: partial(src_vocab.sentences_to_ids, bos=False, eos=True),
        trg_cfg["lang"]: None,
    }

    # A single sentence is processed per call.
    test_cfg["batch_size"] = 1  # CAUTION: this will raise an error if n_gpus > 1
    test_cfg["batch_type"] = "sentence"

    test_data = build_dataset(
        dataset_type="stream",
        path=None,
        src_lang=src_cfg["lang"],
        trg_lang=trg_cfg["lang"],
        split="test",
        tokenizer=tokenizer,
        sequence_encoder=sequence_encoder,
    )
    test_data.set_item(sentence)

    # Only the hypotheses are needed; the other return values are discarded.
    _, _, hypotheses, _, _, _ = predict(
        model=model,
        data=test_data,
        compute_loss=False,
        device=device,
        n_gpu=n_gpu,
        normalization="none",
        num_workers=num_workers,
        cfg=test_cfg,
        fp16=fp16,
    )

    return hypotheses[0]


# HTML header passed to gr.Interface as `title`: heading, illustration, and
# links to the paper, slides, repository and presentation.
title = """
<center><strong><font size='8'>Script Normalization for Unconventional Writing</font></strong></center>

<div align="center">
    <img src="https://raw.githubusercontent.com/sinaahmadi/ScriptNormalization/b80b8fd9e3b77d0e58443ebd506c42173486f9a6/Perso-Arabic_scripts.jpg" alt="Perso-Arabic scripts used by the target languages in our paper" width="400">
</div>

<h3 style="font-weight: 450; font-size: 1rem; margin: 0rem"> 
    [<a href="https://sinaahmadi.github.io/docs/articles/ahmadi2023acl.pdf" style="color:blue;">Paper (ACL 2023)</a>] 
    [<a href="https://sinaahmadi.github.io/docs/slides/ahmadi2023acl_slides.pdf" style="color:blue;">Slides</a>]
    [<a href="https://github.com/sinaahmadi/ScriptNormalization" style="color:blue;">GitHub</a>]
    [<a href="https://s3.amazonaws.com/pf-user-files-01/u-59356/uploads/2023-06-04/rw32pwp/ACL2023.mp4" style="color:blue;">Presentation</a>]
</h3>
    """

# HTML introduction passed to gr.Interface as `description`: motivating
# examples of unconventional writing, a pointer to the paper, and a note on
# what the demo's models can and cannot do.
description = """
<ul>
    <li style="font-size:120%;">&quot;<em>mar7aba!</em>&quot;</li>
    <li style="font-size:120%;">&quot;<em>هاو ئار یوو؟</em>&quot;</li>
    <li style="font-size:120%;">&quot;<em>Μπιάνβενου α σετ ντεμό!</em>&quot;</li>
</ul>

<p style="font-size:120%;">What do all these sentences have in common?  Being greeted in Arabic with &quot;<em>mar7aba</em>&quot; written in the Latin script, then asked how you are (&quot;<em>هاو ئار یوو؟</em>&quot;) in English using the Perso-Arabic script of Kurdish and then, welcomed to this demo in French (&quot;<em>Μπιάνβενου α σετ ντεμό!</em>&quot;) written in Greek script. All these sentences are written in an <strong>unconventional</strong> script.</p>

<p style="font-size:120%;">Although you may find these sentences risible, unconventional writing is a common practice among millions of speakers in bilingual communities. In our paper entitled &quot;<a href="https://sinaahmadi.github.io/docs/articles/ahmadi2023acl.pdf" target="_blank"><strong>Script Normalization for Unconventional Writing of Under-Resourced Languages in Bilingual Communities</strong></a>&quot;, we shed light on this problem and propose an approach to normalize noisy text written in unconventional writing.</p>

<p style="font-size:120%;">This demo deploys a few models that are trained for <strong>the normalization of unconventional writing</strong>. Please note that this tool is not a spell-checker and cannot correct errors beyond character normalization. For better performance, you can apply hard-coded rules on the input and then pass it to the models, hence a hybrid system.</p>

<p style="font-size:120%;">For more information, you can check out the project on GitHub too: <a href="https://github.com/sinaahmadi/ScriptNormalization" target="_blank"><strong>https://github.com/sinaahmadi/ScriptNormalization</strong></a></p>
"""

# Sample inputs passed to gr.Interface as `examples`. Each entry is
# [noisy text, language label], where the label is a key of languages_scripts.
# NOTE(review): where an entry is followed by a comment, it presumably holds
# the expected normalized form of that example — confirm with the author.
examples = [
    [
        "بو شهرین نوفوسو ، 2014 نجی ایلين نوفوس ساییمی اساسيندا 41 نفر ایمیش .",
        "Azeri Turkish in Persian",
    ],  # "بۇ شهرین نۆفوسو ، 2014 نجی ایلين نۆفوس ساییمی اساسيندا 41 نفر ایمیش ."
    ["ياخوا تةمةن دريژبيت بوئةم ميللةتة", "Central Kurdish in Arabic"],
    ["یکیک له جوانیکانی ام شاره جوانه", "Central Kurdish in Persian"],
    ["نمک درهٰ مردوم گيلک ايسن ؤ اوشان زوان ني گيلکي ايسه .", "Gilaki in Persian"],
    ["شؤنةو اانةيةرة گةشت و گلي ناجارانةو اؤجالاني دةستش پنةكةرد", "Gorani in Arabic"],  # شۆنەو ئانەیەرە گەشت و گێڵی ناچارانەو ئۆجالانی دەستش پنەکەرد
    ["ڕوٙو زوانی ئەذایی چەنی پەیذابی ؟", "Gorani in Central Kurdish"],  # ڕوٙو زوانی ئەڎایی چەنی پەیڎابی ؟
    ["هنگامکان ظميٛ ر چمان ، بپا کريٛلي بيشان :", "Gorani in Persian"],  # هەنگامەکان وزمیٛ وەرو چەمان ، بەپاو کریٛڵی بیەشان :
    ["ربعی بن افکل اُسے اَکھ صُحابی .", "Kashmiri in Urdu"],  # ربعی بن افکل ٲسؠ اَکھ صُحابی .
    ["اینتا زون گنشکرون 85 میلیون نفر هسن", "Mazandarani in Persian"],  # اینتا زوون گِنِشکَرون 85 میلیون نفر هسنه
    ["بة رطكا هة صطئن ژ دل هاطة  بة لافكرن", "Northern Kurdish in Arabic"],  # پەرتوکا هەستێن ژ دل هاتە بەلافکرن
    ["ثرکى همرنگ نرميني دويت هندک قوناغين دي ببريت", "Northern Kurdish in Persian"],  # سەرەکی هەمەرەنگ نەرمینێ دڤێت هندەک قوناغێن دی ببڕیت
    ["ہتی کجھ اپ ۽ تمام دائون ترینون بیھندیون آھن .", "Sindhi in Urdu"],  # هتي ڪجھ اپ ۽ تمام ڊائون ٽرينون بيھنديون آھن .
]


# HTML footer passed to gr.Interface as `article`: author credit.
article = """
<div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
    <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        <b>Created and deployed by Sina Ahmadi <a href="https://sinaahmadi.github.io/">(https://sinaahmadi.github.io/)</a>.</b>
    </h3>
</div>
    """

# Build the Gradio UI: a text box and a language dropdown feed normalize(),
# whose return value is shown in the output text box.
demo = gr.Interface(
    title=title,
    description=description,
    fn=normalize,
    inputs=[
        # gr.Textbox replaces the deprecated gr.inputs.Textbox namespace and
        # matches the top-level gr.Dropdown component used alongside it.
        gr.Textbox(lines=4, label="Noisy Text \U0001F974"),
        gr.Dropdown(
            label="Language in unconventional script",
            # Iterating a dict yields its keys; sorted() already returns a list.
            choices=sorted(languages_scripts),
        ),
    ],
    outputs=gr.Textbox(label="Normalized Text \U0001F642"),
    examples=examples,
    article=article,
    examples_per_page=20,
)

# Start the web server when the module is executed.
demo.launch()