chtran committed on
Commit d444583
1 Parent(s): 131f4db

Update README.md

Files changed (1)
  1. README.md +5 -2
README.md CHANGED
@@ -17,6 +17,9 @@ To force the target language id as the first generated token, pass the `forced_b
  *Note: `M2M100Tokenizer` depends on `sentencepiece`, so make sure to install it before running the example.*
  To install `sentencepiece` run `pip install sentencepiece`
 
+ Since the model was trained with domain tags, you should prepend the appropriate tag to the input as well.
+ * "wmtdata newsdomain": Use for sentences in the news domain
+ * "wmtdata otherdomain": Use for sentences in all other domains
 
  ```python
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
@@ -26,14 +29,14 @@ tokenizer = AutoTokenizer.from_pretrained("facebook/wmt21-dense-24-wide-x-en")
 
  # translate German to English
  tokenizer.src_lang = "de"
- inputs = tokenizer("Ein Modell für viele Sprachen", return_tensors="pt")
+ inputs = tokenizer("wmtdata newsdomain Ein Modell für viele Sprachen", return_tensors="pt")
  generated_tokens = model.generate(**inputs)
  tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
  # => "A model for many languages"
 
  # translate Icelandic to English
  tokenizer.src_lang = "is"
- inputs = tokenizer("Ein fyrirmynd fyrir mörg tungumál", return_tensors="pt")
+ inputs = tokenizer("wmtdata newsdomain Ein fyrirmynd fyrir mörg tungumál", return_tensors="pt")
  generated_tokens = model.generate(**inputs)
  tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
  # => "One model for many languages"