Update README.md
Browse files
README.md
CHANGED
@@ -39,26 +39,6 @@ model = AutoModelForSeq2SeqLM.from_pretrained('cifope/nllb-200-wo-fr-distilled-6
|
|
39 |
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
|
40 |
```
|
41 |
|
42 |
-
## Tokenizer Customization
|
43 |
-
|
44 |
-
To integrate specific features like new language codes into the tokenizer, you can use the `fix_tokenizer` function:
|
45 |
-
|
46 |
-
```python
|
47 |
-
def fix_tokenizer(tokenizer, new_lang='wol_Wol'):
|
48 |
-
old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
|
49 |
-
tokenizer.lang_code_to_id[new_lang] = old_len-1
|
50 |
-
tokenizer.id_to_lang_code[old_len-1] = new_lang
|
51 |
-
tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
|
52 |
-
tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
|
53 |
-
tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
|
54 |
-
if new_lang not in tokenizer._additional_special_tokens:
|
55 |
-
tokenizer._additional_special_tokens.append(new_lang)
|
56 |
-
tokenizer.added_tokens_encoder = {}
|
57 |
-
tokenizer.added_tokens_decoder = {}
|
58 |
-
|
59 |
-
fix_tokenizer(tokenizer)
|
60 |
-
```
|
61 |
-
|
62 |
## Translation Functions
|
63 |
|
64 |
### Translate from French to Wolof
|
@@ -66,7 +46,7 @@ fix_tokenizer(tokenizer)
|
|
66 |
The `translate` function translates text from French to Wolof:
|
67 |
|
68 |
```python
|
69 |
-
def translate(text, src_lang='fra_Latn', tgt_lang='wol_Wol', a=16, b=1.5, max_input_length=1024, **kwargs):
|
70 |
tokenizer.src_lang = src_lang
|
71 |
tokenizer.tgt_lang = tgt_lang
|
72 |
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
|
@@ -84,7 +64,7 @@ def translate(text, src_lang='fra_Latn', tgt_lang='wol_Wol', a=16, b=1.5, max_in
|
|
84 |
The `reversed_translate` function translates text from Wolof to French:
|
85 |
|
86 |
```python
|
87 |
-
def reversed_translate(text, src_lang='wol_Wol', tgt_lang='fra_Latn', a=16, b=1.5, max_input_length=1024, **kwargs):
|
88 |
tokenizer.src_lang = src_lang
|
89 |
tokenizer.tgt_lang = tgt_lang
|
90 |
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
|
@@ -109,5 +89,8 @@ print(wolof_translation)
|
|
109 |
wolof_text = "alkaati yi tàmbali nañu xàll léegi kilifa gi ñów"
|
110 |
french_translation = reversed_translate(wolof_text)
|
111 |
print(french_translation)
|
112 |
-
```
|
113 |
|
|
|
|
|
|
|
|
|
|
39 |
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
|
40 |
```
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
## Translation Functions
|
43 |
|
44 |
### Translate from French to Wolof
|
|
|
46 |
The `translate` function translates text from French to Wolof:
|
47 |
|
48 |
```python
|
49 |
+
def translate(text, src_lang='fra_Latn', tgt_lang='wol_Latn', a=16, b=1.5, max_input_length=1024, **kwargs):
|
50 |
tokenizer.src_lang = src_lang
|
51 |
tokenizer.tgt_lang = tgt_lang
|
52 |
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
|
|
|
64 |
The `reversed_translate` function translates text from Wolof to French:
|
65 |
|
66 |
```python
|
67 |
+
def reversed_translate(text, src_lang='wol_Latn', tgt_lang='fra_Latn', a=16, b=1.5, max_input_length=1024, **kwargs):
|
68 |
tokenizer.src_lang = src_lang
|
69 |
tokenizer.tgt_lang = tgt_lang
|
70 |
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
|
|
|
89 |
wolof_text = "alkaati yi tàmbali nañu xàll léegi kilifa gi ñów"
|
90 |
french_translation = reversed_translate(wolof_text)
|
91 |
print(french_translation)
|
|
|
92 |
|
93 |
+
wolof_text = "alkaati yi tàmbali nañu xàll léegi kilifa gi ñów"
|
94 |
+
english_translation = reversed_translate(wolof_text,tgt_lang="eng_Latn")
|
95 |
+
print(english_translation)
|
96 |
+
```
|