# app.py -- last change: "Update app.py" by SinaAhmadi (commit 69a0c81).
# File-viewer residue preserved as a comment: raw / history blame / 5.17 kB.
from pathlib import Path
from functools import partial
from joeynmt.prediction import predict
from joeynmt.helpers import (
check_version,
load_checkpoint,
load_config,
parse_train_args,
resolve_ckpt_path,
)
from joeynmt.model import build_model
from joeynmt.tokenizers import build_tokenizer
from joeynmt.vocabulary import build_vocab
from joeynmt.datasets import build_dataset
import gradio as gr
# One-time start-up: read the config, restore the checkpoint and prepare an
# (initially empty) streaming dataset that requests are pushed into later.
# Sample sentence for a manual smoke test: "سلاو لە ناو گلی کرد"

# --- configuration -----------------------------------------------------------
cfg_file = 'config.yaml'
ckpt = './models/Sorani-Arabic/best.ckpt'
cfg = load_config(Path(cfg_file))

# Parse and validate the "training" section in prediction mode; the sixth
# returned value is not needed here.
(
    model_dir,
    load_model,
    device,
    n_gpu,
    num_workers,
    _,
    fp16,
) = parse_train_args(cfg["training"], mode="prediction")

test_cfg = cfg["testing"]
src_cfg = cfg["data"]["src"]
trg_cfg = cfg["data"]["trg"]

# --- model -------------------------------------------------------------------
# An explicitly configured checkpoint path wins over the one from the config.
load_model = Path(ckpt) if ckpt is not None else load_model
ckpt = resolve_ckpt_path(load_model, model_dir)

src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)
model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

# Restore the trained weights from disk.
model_checkpoint = load_checkpoint(ckpt, device=device)
model.load_state_dict(model_checkpoint["model_state"])
if device.type == "cuda":
    model.to(device)

# --- data pipeline -----------------------------------------------------------
tokenizer = build_tokenizer(cfg["data"])

# Only the source side is converted to ids (EOS appended, no BOS); the target
# side has no encoder.
sequence_encoder = {}
sequence_encoder[src_cfg["lang"]] = partial(
    src_vocab.sentences_to_ids, bos=False, eos=True
)
sequence_encoder[trg_cfg["lang"]] = None

# One sentence per batch.
# CAUTION: this will raise an error if n_gpus > 1
test_cfg.update(batch_size=1, batch_type="sentence")

# Streaming dataset without a backing file; sentences are added one at a time
# through test_data.set_item(...), e.g. test_data.set_item(INPUT.rstrip()).
test_data = build_dataset(
    dataset_type="stream",
    path=None,
    src_lang=src_cfg["lang"],
    trg_lang=trg_cfg["lang"],
    split="test",
    tokenizer=tokenizer,
    sequence_encoder=sequence_encoder,
)
def _translate_data(test_data, cfg=test_cfg):
    """Run the loaded model over ``test_data`` and return the first hypothesis.

    The model, device, GPU count, worker count and fp16 flag are taken from
    module scope; only the dataset and the testing config are parameters.

    Args:
        test_data: dataset handed to ``predict``.
        cfg: testing configuration; defaults to the module-level ``test_cfg``.

    Returns:
        The first element of the hypotheses returned by ``predict``.
    """
    predict_kwargs = dict(
        model=model,
        data=test_data,
        compute_loss=False,
        device=device,
        n_gpu=n_gpu,
        normalization="none",
        num_workers=num_workers,
        cfg=cfg,
        fp16=fp16,
    )
    # predict returns six values; only the third (the hypotheses) is used.
    _, _, hyps, _tokens, _scores, _ = predict(**predict_kwargs)
    return hyps[0]
def normalize(text, language_script):
    """Normalize one piece of noisy text with the loaded model.

    Args:
        text: the user's input string; ``None`` or whitespace-only input is
            answered with an empty string without calling the model.
        language_script: label selected in the UI dropdown.
            NOTE(review): this value is not used -- the module loads a single
            fixed checkpoint at start-up, so every request goes through that
            model regardless of the selection.

    Returns:
        The normalized text produced by ``_translate_data``, or ``""`` for
        empty input.
    """
    # Nothing to translate: skip the model instead of feeding it an empty line.
    if text is None or not text.strip():
        return ""

    # Trailing whitespace/newlines carry no content (mirrors the rstrip() in
    # the sample usage next to the dataset construction).
    test_data.set_item(text.rstrip())
    try:
        return _translate_data(test_data)
    finally:
        # The stream dataset keeps every submitted item until it is cleared
        # (JoeyNMT's own interactive loop resets the cache after each
        # sentence). Without this, _translate_data's "first hypothesis" would
        # keep being the first sentence ever submitted.
        # NOTE(review): confirm against the installed JoeyNMT version.
        reset = getattr(test_data, "reset_cache", None)
        if callable(reset):
            reset()
        else:
            test_data.cache = {}
# Heading shown at the top of the Gradio page (passed as Interface(title=...)).
title = "Script Normalization for Unconventional Writing"
# HTML blurb passed as Interface(description=...): three sample sentences in
# unconventional scripts, a short motivation with a link to the paper, and a
# link to the project repository. This is user-facing text emitted verbatim.
description = """
<ul>
<li>&quot;<em>mar7aba!</em>&quot;</li>
<li>&quot;<em>هاو ئار یوو؟</em>&quot;</li>
<li>&quot;<em>Μπιάνβενου α σετ ντεμό!</em>&quot;</li>
</ul>
<p>What all these sentences are in common? Being greeted in Arabic with &quot;<em>mar7aba</em>&quot; written in the Latin script, then asked how you are (&quot;<em>هاو ئار یوو؟</em>&quot;) in English using the Perso-Arabic script of Kurdish and then, welcomed to this demo in French (&quot;<em>Μπιάνβενου α σετ ντεμό!</em>&quot;) written in Greek script. All these sentences are written in an <strong>unconventional</strong> script.</p>
<p>Although you may find these sentences risible, unconventional writing is a common practice among millions of speakers in bilingual communities. In our paper entitled &quot;<a href="https://sinaahmadi.github.io/docs/articles/ahmadi2023acl.pdf" target="_blank"><strong>Script Normalization for Unconventional Writing of Under-Resourced Languages in Bilingual Communities</strong></a>&quot;, we shed light on this problem and propose an approach to normalize noisy text written in unconventional writing.</p>
<p>This demo deploys a few models that are trained for <strong>the normalization of unconventional writing</strong>. Please note that this tool is not a spell-checker and cannot correct errors beyond character normalization.</p>
For more information, you can check out the project on GitHub too: <a href="https://github.com/sinaahmadi/ScriptNormalization" target="_blank"><strong>https://github.com/sinaahmadi/ScriptNormalization</strong></a>
"""
# Maps the human-readable dropdown label ("<language> in <script>") to a
# short "<Language>-<Script>" identifier.
# Only the keys are used in this file (as dropdown choices); the values
# presumably name per-language model directories, cf. the "Sorani-Arabic"
# component of the checkpoint path loaded at start-up -- TODO confirm.
languages_scripts = {
    "Azeri Turkish in Persian": "AzeriTurkish-Persian",
    "Central Kurdish in Arabic": "Sorani-Arabic",
    "Central Kurdish in Persian": "Sorani-Persian",
    "Gilaki in Persian": "Gilaki-Persian",
    "Gorani in Arabic": "Gorani-Arabic",
    "Gorani in Central Kurdish": "Gorani-Sorani",
    "Gorani in Persian": "Gorani-Persian",
    "Kashmiri in Urdu": "Kashmiri-Urdu",
    "Mazandarani in Persian": "Mazandarani-Persian",
    "Northern Kurdish in Arabic": "Kurmanji-Arabic",
    "Northern Kurdish in Persian": "Kurmanji-Persian",
    "Sindhi in Urdu": "Sindhi-Urdu"
}
# Pre-filled examples offered in the UI. Each entry is
# [noisy text, dropdown label], in the same order as the Interface inputs;
# the label must be one of the keys of languages_scripts.
examples = [
    ["ياخوا تةمةن دريژبيت بوئةم ميللةتة", "Central Kurdish in Arabic"],
    ["سلاو برا جونی؟", "Central Kurdish in Arabic"],
]
# Gradio UI: a multi-line text box plus a language/script dropdown feed
# normalize(); its return value is shown in a single output text box.
# The text boxes use the top-level gr.Textbox component, matching the
# gr.Dropdown already used here, instead of the deprecated gr.inputs /
# gr.outputs namespaces.
demo = gr.Interface(
    fn=normalize,
    inputs=[
        gr.Textbox(lines=4, label="Noisy Text"),
        gr.Dropdown(
            label="Language in unconventional script",
            # iterating a dict yields its keys; sorted() returns a new list
            choices=sorted(languages_scripts),
        ),
    ],
    outputs=gr.Textbox(label="Normalized Text"),
    title=title,
    description=description,
    examples=examples,
)
# Start the web server (blocks when run as a script).
demo.launch()