TwentyNine committed
Commit 6d0cacf
Parent: 8d7d094

Create README.md

Files changed (1): README.md (+75, -0)
---
language:
- ain
pipeline_tag: translation
license: cc-by-nc-4.0
---

# Disclaimer
This model is only a preliminary experimental result. Its capabilities are limited and unreliable at best.

# Acknowledgements
I am indebted to [Michal Ptaszynski](https://huggingface.co/ptaszynski) for his guidance and encouragement, to [Karol Nowakowski](https://huggingface.co/karolnowakowski) for his work compiling an expansive parallel corpus, and to [David Dale](https://huggingface.co/cointegrated) for his [Medium article](https://cointegrated.medium.com/how-to-fine-tune-a-nllb-200-model-for-translating-a-new-language-a37fc706b865), which helped me take my first steps quickly and smoothly.

# How to use this model
The following is adapted from [slone/nllb-rus-tyv-v1](https://huggingface.co/slone/nllb-rus-tyv-v1).

```python
# the version of transformers is important!
!pip install sentencepiece transformers==4.33
import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM

def fix_tokenizer(tokenizer, new_lang='ain_Latn'):
    """ Add a new language token to the tokenizer vocabulary (this should be done each time after its initialization) """
    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    tokenizer.lang_code_to_id[new_lang] = old_len-1
    tokenizer.id_to_lang_code[old_len-1] = new_lang
    # always move "mask" to the last position
    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)
    # clear the added token encoder; otherwise a new token may end up there by mistake
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}

MODEL_URL = "TwentyNine/nllb-jpn-ain-v1"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)
tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
fix_tokenizer(tokenizer)

def translate(
    text,
    model,
    tokenizer,
    src_lang='ain_Jpan',
    tgt_lang='ain_Latn',
    max_length='auto',
    num_beams=4,
    n_out=None,
    **kwargs
):
    tokenizer.src_lang = src_lang
    # padding=True so that a list of sentences can be batched together
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    if max_length == 'auto':
        max_length = int(32 + 2.0 * encoded.input_ids.shape[1])
    model.eval()
    generated_tokens = model.generate(
        **encoded.to(model.device),
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=n_out or 1,
        **kwargs
    )
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    if isinstance(text, str) and n_out is None:
        return out[0]
    return out

translate("ポイ セタ クコン ルスイ", model=model, tokenizer=tokenizer)
# 'pon seta ku=kor rusuy'
```
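
The function can also translate several sentences at once: passing a list returns a list of translations (this relies on the `padding=True` tokenizer argument noted in the code above). A minimal sketch, reusing the card's own example sentence:

```python
# Batch translation: pass a list instead of a single string.
batch = [
    "ポイ セタ クコン ルスイ",
    # ... add more sentences here
]
translations = translate(batch, model=model, tokenizer=tokenizer)
for src, hyp in zip(batch, translations):
    print(src, "->", hyp)  # ポイ セタ クコン ルスイ -> pon seta ku=kor rusuy
```

If a GPU is available, moving the model there first (e.g. `model.cuda()`) speeds up generation; `translate` already sends its inputs to `model.device`.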