TwentyNine's picture
Added an example of an incorrect result to the demo code.
19c3ccf verified
metadata
language:
  - ain
pipeline_tag: translation
license: cc-by-nc-4.0

Disclaimer

This model is only a preliminary experimental result; its capabilities are limited and unreliable at best.

Acknowledgements

I am indebted to Michal Ptaszynski for his guidance and encouragement, to Karol Nowakowski for his work compiling an expansive parallel corpus, and to David Dale for his Medium article, which helped me take my first steps quickly and smoothly.

How to use this model

The following is adapted from slone/nllb-rus-tyv-v1.

# the version of transformers is important!
!pip install sentencepiece transformers==4.33 > /dev/null
import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM

def fix_tokenizer(tokenizer, new_lang):
    """Register ``new_lang`` as a language code on the tokenizer, in place.

    This has to be repeated every time the tokenizer is freshly initialised.
    Nothing is returned; the tokenizer's lookup tables are mutated directly.
    """
    # Leave new_lang out of the count if it is currently tracked as an added token.
    already_added = new_lang in tokenizer.added_tokens_encoder
    base_size = len(tokenizer) - int(already_added)
    lang_id = base_size - 1
    tokenizer.lang_code_to_id[new_lang] = lang_id
    tokenizer.id_to_lang_code[lang_id] = new_lang

    # "<mask>" must always sit in the very last position
    vocab_span = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id)
    tokenizer.fairseq_tokens_to_ids["<mask>"] = vocab_span + tokenizer.fairseq_offset

    # Merge the language codes in, then rebuild the id -> token table from scratch.
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    inverse = {}
    for token, index in tokenizer.fairseq_tokens_to_ids.items():
        inverse[index] = token
    tokenizer.fairseq_ids_to_tokens = inverse

    special_tokens = tokenizer._additional_special_tokens
    if new_lang not in special_tokens:
        special_tokens.append(new_lang)

    # Empty the added-token tables so a new token cannot land there by mistake.
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}

# Repository identifier passed to both from_pretrained() calls below.
MODEL_URL = "TwentyNine/nllb-ain-kana-latin-converter-v1"
tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)
# Register the source code first, then the target code, on the fresh tokenizer.
fix_tokenizer(tokenizer, 'ain_Japn')
fix_tokenizer(tokenizer, 'ain_Latn')

def convert(
    text,
    model=model,
    tokenizer=tokenizer,
    src_lang='ain_Japn',
    tgt_lang='ain_Latn',
    max_length='auto',
    num_beams=4,
    n_out=None,
    **kwargs
):
    """Convert ``text`` from ``src_lang`` to ``tgt_lang`` with the seq2seq model.

    Args:
        text: A single string or a list of strings to convert.
        model: Model whose ``generate`` method produces the output tokens.
        tokenizer: Tokenizer used for encoding the input and decoding the output.
        src_lang: Language code assigned to ``tokenizer.src_lang`` before encoding.
        tgt_lang: Language code whose id is forced as the first generated token.
        max_length: Generation length limit. ``'auto'`` derives it from the
            encoded input length as ``32 + 2 * input_length``.
        num_beams: Beam count passed to ``model.generate``.
        n_out: Number of sequences to return per input; ``None`` means one.
        **kwargs: Forwarded unchanged to ``model.generate``.

    Returns:
        The decoded string when ``text`` is a single string and ``n_out`` is
        ``None``; otherwise the list of all decoded strings.
    """
    tokenizer.src_lang = src_lang
    # padding=True lets a list of unequal-length strings be stacked into one
    # tensor; for a single string it changes nothing.
    encoded = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    if max_length == 'auto':
        max_length = int(32 + 2.0 * encoded.input_ids.shape[1])
    model.eval()
    generated_tokens = model.generate(
        **encoded.to(model.device),
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=n_out or 1,
        **kwargs
    )
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    if isinstance(text, str) and n_out is None:
        return out[0]
    # Batch input or an explicit n_out: hand back every decoded sequence
    # (previously a bare ``return`` dropped them and yielded None).
    return out

# Demo conversions from kana to Latin script. The comment under each call
# records the output observed from the model and, where that output is not
# ideal, the desired result.
convert("ポイ セタ クコン ルスイ")
# GOOD: 'pon seta ku=kor rusuy'

# Mixed input: contains a hiragana word and a full-width (U+3000) space.
convert("タント がっこう　オルン パイェ")
# OK:    'tanto がっこう or un paye'
# IDEAL: 'tanto GAKKO or un paye' or  'tanto GAKKOU or un paye'

convert("セコロ ハウェアン コロ イシレニネ")
# WRONG: 'sekor hawean korsiren hine'
# IDEAL: 'sekor hawean kor i=siren hine'