---
language:
- ja
- ain
pipeline_tag: translation
license: cc-by-nc-4.0
---

# Disclaimer
This model is only a preliminary experimental result and is not suitable for any sort of serious use. This model's capability is at best extremely limited and unreliable.

That said, look forward to good things to come. This is my debut in the field of Ainu NLP.

# Acknowledgements
I am indebted to [Michal Ptaszynski](https://huggingface.co/ptaszynski) for his guidance and encouragement, [Karol Nowakowski](https://huggingface.co/karolnowakowski) for his work compiling an expansive parallel corpus, and [David Dale](https://huggingface.co/cointegrated) for his [Medium article](https://cointegrated.medium.com/how-to-fine-tune-a-nllb-200-model-for-translating-a-new-language-a37fc706b865), which helped me take this first step quickly and smoothly.

# How to use this model
The following is adapted from [slone/nllb-rus-tyv-v1](https://huggingface.co/slone/nllb-rus-tyv-v1).

```Python
# the version of transformers is important!
!pip install sentencepiece transformers==4.33 import torch from transformers import NllbTokenizer, AutoModelForSeq2SeqLM def fix_tokenizer(tokenizer, new_lang='ain_Latn'): """ Add a new language token to the tokenizer vocabulary (this should be done each time after its initialization) """ old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder) tokenizer.lang_code_to_id[new_lang] = old_len-1 tokenizer.id_to_lang_code[old_len-1] = new_lang # always move "mask" to the last position tokenizer.fairseq_tokens_to_ids[""] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id) tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()} if new_lang not in tokenizer._additional_special_tokens: tokenizer._additional_special_tokens.append(new_lang) # clear the added token encoder; otherwise a new token may end up there by mistake tokenizer.added_tokens_encoder = {} tokenizer.added_tokens_decoder = {} MODEL_URL = "TwentyNine/nllb-jpn-ain-v1" model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL) tokenizer = NllbTokenizer.from_pretrained(MODEL_URL) fix_tokenizer(tokenizer) def translate( text, model, tokenizer, src_lang='jpn_Jpan', tgt_lang='ain_Latn', max_length='auto', num_beams=4, n_out=None, **kwargs ): tokenizer.src_lang = src_lang encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512) if max_length == 'auto': max_length = int(32 + 2.0 * encoded.input_ids.shape[1]) model.eval() generated_tokens = model.generate( **encoded.to(model.device), forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang], max_length=max_length, num_beams=num_beams, num_return_sequences=n_out or 1, **kwargs ) out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) if isinstance(text, str) and n_out is None: return out[0] return translate("肉が食べたいな。", model=model, tokenizer=tokenizer) # 'kam c=e rusuy na.' ```