Disclaimer

This model is only a preliminary experimental result and is not suitable for any sort of serious use. This model's capability is at best extremely limited and unreliable.

That said, please look forward to good things to come. This is my debut in the field of Ainu NLP.

Acknowledgements

I am indebted to Michal Ptaszynski for his guidance and encouragement, to Karol Nowakowski for his work compiling an expansive parallel corpus, and to David Dale for his Medium article, which helped me take this first step quickly and smoothly.

How to use this model

The following is adapted from slone/nllb-rus-tyv-v1.

# the version of transformers is important!
!pip install sentencepiece transformers==4.33
import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM

def fix_tokenizer(tokenizer, new_lang='ain_Latn'):
    """
    Register `new_lang` as a language code on the tokenizer, modifying it in place.

    This has to be repeated every time the tokenizer is initialized.
    Returns nothing.
    """
    # Size of the vocabulary, not counting `new_lang` when it is currently
    # listed among the added tokens.
    base_size = len(tokenizer)
    if new_lang in tokenizer.added_tokens_encoder:
        base_size -= 1
    lang_id = base_size - 1

    # Two-way mapping between the language code and its id.
    tokenizer.lang_code_to_id[new_lang] = lang_id
    tokenizer.id_to_lang_code[lang_id] = new_lang

    # "<mask>" is always moved to the last position; this is computed only
    # after the new language has been counted in lang_code_to_id.
    mask_id = (
        len(tokenizer.sp_model)
        + len(tokenizer.lang_code_to_id)
        + tokenizer.fairseq_offset
    )
    tokenizer.fairseq_tokens_to_ids["<mask>"] = mask_id

    # Mirror every language code into the fairseq tables, then rebuild the
    # reverse (id -> token) table from scratch.
    for code, idx in tokenizer.lang_code_to_id.items():
        tokenizer.fairseq_tokens_to_ids[code] = idx
    tokenizer.fairseq_ids_to_tokens = dict(
        (idx, token) for token, idx in tokenizer.fairseq_tokens_to_ids.items()
    )

    special_tokens = tokenizer._additional_special_tokens
    if new_lang not in special_tokens:
        special_tokens.append(new_lang)

    # Empty the added-token tables; otherwise a new token may end up there by mistake.
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}

# Model repository identifier passed to `from_pretrained` for both the model and the tokenizer.
MODEL_URL = "TwentyNine/nllb-jpn-ain-v1"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)
tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
# Register the 'ain_Latn' language code; this must be redone after every tokenizer initialization.
fix_tokenizer(tokenizer)

def translate(
    text,
    model,
    tokenizer,
    src_lang='jpn_Jpan',
    tgt_lang='ain_Latn',
    max_length='auto',
    num_beams=4,
    n_out=None,
    **kwargs
):
    """
    Translate `text` from `src_lang` to `tgt_lang` with a seq2seq model.

    Args:
        text: A single string or a list of strings to translate.
        model: The model; it must provide `eval()`, `device` and `generate()`.
        tokenizer: The tokenizer matching the model; its `src_lang` attribute
            is overwritten and `tgt_lang` must be a key of its `lang_code_to_id`.
        src_lang: Language code of the input text.
        tgt_lang: Language code to translate into.
        max_length: Maximum generated length, or 'auto' to derive it from the
            encoded input length (32 + 2 * number of input tokens).
        num_beams: Beam width passed to `generate`.
        n_out: Number of candidate translations to return per input. `None`
            means a single candidate.
        **kwargs: Extra keyword arguments forwarded to `model.generate`.

    Returns:
        A single string when `text` is a string and `n_out` is None;
        otherwise the list of all decoded outputs.

    Raises:
        KeyError: If `tgt_lang` is not registered in `tokenizer.lang_code_to_id`.
    """
    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    if max_length == 'auto':
        # Budget: a fixed allowance plus twice the source length in tokens.
        max_length = int(32 + 2.0 * encoded.input_ids.shape[1])
    model.eval()
    generated_tokens = model.generate(
        **encoded.to(model.device),
        # Forcing the first generated token to the language code selects the target language.
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=n_out or 1,
        **kwargs
    )
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    if isinstance(text, str) and n_out is None:
        return out[0]
    # Batched input or explicit n_out: return every decoded candidate
    # (previously a bare `return` dropped them and yielded None).
    return out

# Example: translate one Japanese sentence using the default language pair (jpn_Jpan -> ain_Latn).
translate("肉が食べたいな。", model=model, tokenizer=tokenizer)
# 'kam c=e rusuy na.'