cifope committed on
Commit
03b4420
1 Parent(s): a1ed08a

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +6 -23
README.md CHANGED
@@ -39,26 +39,6 @@ model = AutoModelForSeq2SeqLM.from_pretrained('cifope/nllb-200-wo-fr-distilled-6
39
  tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
40
  ```
41
 
42
- ## Tokenizer Customization
43
-
44
- To integrate specific features like new language codes into the tokenizer, you can use the `fix_tokenizer` function:
45
-
46
- ```python
47
- def fix_tokenizer(tokenizer, new_lang='wol_Wol'):
48
- old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
49
- tokenizer.lang_code_to_id[new_lang] = old_len-1
50
- tokenizer.id_to_lang_code[old_len-1] = new_lang
51
- tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
52
- tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
53
- tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
54
- if new_lang not in tokenizer._additional_special_tokens:
55
- tokenizer._additional_special_tokens.append(new_lang)
56
- tokenizer.added_tokens_encoder = {}
57
- tokenizer.added_tokens_decoder = {}
58
-
59
- fix_tokenizer(tokenizer)
60
- ```
61
-
62
  ## Translation Functions
63
 
64
  ### Translate from French to Wolof
@@ -66,7 +46,7 @@ fix_tokenizer(tokenizer)
66
  The `translate` function translates text from French to Wolof:
67
 
68
  ```python
69
- def translate(text, src_lang='fra_Latn', tgt_lang='wol_Wol', a=16, b=1.5, max_input_length=1024, **kwargs):
70
  tokenizer.src_lang = src_lang
71
  tokenizer.tgt_lang = tgt_lang
72
  inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
@@ -84,7 +64,7 @@ def translate(text, src_lang='fra_Latn', tgt_lang='wol_Wol', a=16, b=1.5, max_in
84
  The `reversed_translate` function translates text from Wolof to French:
85
 
86
  ```python
87
- def reversed_translate(text, src_lang='wol_Wol', tgt_lang='fra_Latn', a=16, b=1.5, max_input_length=1024, **kwargs):
88
  tokenizer.src_lang = src_lang
89
  tokenizer.tgt_lang = tgt_lang
90
  inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
@@ -109,5 +89,8 @@ print(wolof_translation)
109
  wolof_text = "alkaati yi tàmbali nañu xàll léegi kilifa gi ñów"
110
  french_translation = reversed_translate(wolof_text)
111
  print(french_translation)
112
- ```
113
 
 
 
 
 
 
39
  tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
40
  ```
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  ## Translation Functions
43
 
44
  ### Translate from French to Wolof
 
46
  The `translate` function translates text from French to Wolof:
47
 
48
  ```python
49
+ def translate(text, src_lang='fra_Latn', tgt_lang='wol_Latn', a=16, b=1.5, max_input_length=1024, **kwargs):
50
  tokenizer.src_lang = src_lang
51
  tokenizer.tgt_lang = tgt_lang
52
  inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
 
64
  The `reversed_translate` function translates text from Wolof to French:
65
 
66
  ```python
67
+ def reversed_translate(text, src_lang='wol_Latn', tgt_lang='fra_Latn', a=16, b=1.5, max_input_length=1024, **kwargs):
68
  tokenizer.src_lang = src_lang
69
  tokenizer.tgt_lang = tgt_lang
70
  inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
 
89
  wolof_text = "alkaati yi tàmbali nañu xàll léegi kilifa gi ñów"
90
  french_translation = reversed_translate(wolof_text)
91
  print(french_translation)
 
92
 
93
+ wolof_text = "alkaati yi tàmbali nañu xàll léegi kilifa gi ñów"
94
+ english_translation = reversed_translate(wolof_text,tgt_lang="eng_Latn")
95
+ print(english_translation)
96
+ ```