cointegrated committed
Commit
193923d
1 Parent(s): 3c7eb3b

evolve from the myv-rus demo

Files changed (4):
  1. README.md +4 -6
  2. app.py +1 -1
  3. requirements.txt +1 -1
  4. translation.py +14 -30
README.md CHANGED
@@ -1,12 +1,10 @@
 ---
-title: NLLB Erzya translation demo
-emoji: 🦊
-colorFrom: yellow
-colorTo: red
+title: NLLB-extended translation demo
+emoji: 🐘
+colorFrom: cyan
+colorTo: blue
 sdk: gradio
 sdk_version: 3.46.1
 app_file: app.py
 pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py CHANGED
@@ -23,7 +23,7 @@ def translate_wrapper(text, src, trg, by_sentence=True, preprocess=True, random=
 
 
 article = f"""
-This is the demo for a NLLB-200-600M model fine-tuned for translation between Russian and Erzya languages.
+This is the demo for a NLLB-200-600M model fine-tuned for a few (mostly new) languages.
 
 The model itself is available at https://huggingface.co/{MODEL_URL}
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-transformers==4.33
+transformers==4.39
 sentencepiece
 gradio>=3.18.0
 torch
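
(A note on the bump: transformers 4.38 reworked NllbTokenizer and dropped the internal lang_code_to_id / fairseq_tokens_to_ids mappings, so the fix_tokenizer() monkey-patch deleted from translation.py below would no longer apply cleanly; presumably that is why the pin moves from 4.33 to 4.39, where an extended vocabulary's language codes are handled as ordinary special tokens.)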
translation.py CHANGED
@@ -8,13 +8,23 @@ from sacremoses import MosesPunctNormalizer
 from sentence_splitter import SentenceSplitter
 from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
 
-MODEL_URL = "slone/nllb-rus-myv-v1-extvoc"
+MODEL_URL = "slone/nllb-210-v1"
 LANGUAGES = {
-    "Рузонь | Русский | Russian": "rus_Cyrl",
-    "Эрзянь | Эрзянский | Erzya": "myv_Cyrl",
+    "Русский | Russian": "rus_Cyrl",
+    "English | Английский": "eng_Latn",
+    "Azərbaycan | Azerbaijani | Азербайджанский": "azj_Latn",
+    "Башҡорт | Bashkir | Башкирский": "bak_Cyrl",
+    "Буряад | Buryat | Бурятский": "bxr_Cyrl",
+    "Чӑваш | Chuvash | Чувашский": "chv_Cyrl",
+    "Хакас | Khakas | Хакасский": "kjh_Cyrl",
+    "Къарачай-малкъар | Karachay-Balkar | Карачаево-балкарский": "krc_Cyrl",
+    "Марий | Meadow Mari | Марийский": "mhr_Cyrl",
+    "Эрзянь | Erzya | Эрзянский": "myv_Cyrl",
+    "Татар | Tatar | Татарский": "tat_Cyrl",
+    "Тыва | Тувинский | Tuvan ": "tyv_Cyrl",
 }
 L1 = "rus_Cyrl"
-L2 = "myv_Cyrl"
+L2 = "eng_Latn"
 
 
 def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
@@ -54,31 +64,6 @@ class TextPreprocessor:
         return clean
 
 
-def fix_tokenizer(tokenizer, new_lang=L2):
-    """Add a new language token to the tokenizer vocabulary
-    (this should be done each time after its initialization)
-    """
-    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
-    tokenizer.lang_code_to_id[new_lang] = old_len - 1
-    tokenizer.id_to_lang_code[old_len - 1] = new_lang
-    # always move "mask" to the last position
-    tokenizer.fairseq_tokens_to_ids["<mask>"] = (
-        len(tokenizer.sp_model)
-        + len(tokenizer.lang_code_to_id)
-        + tokenizer.fairseq_offset
-    )
-
-    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
-    tokenizer.fairseq_ids_to_tokens = {
-        v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()
-    }
-    if new_lang not in tokenizer._additional_special_tokens:
-        tokenizer._additional_special_tokens.append(new_lang)
-    # clear the added token encoder; otherwise a new token may end up there by mistake
-    tokenizer.added_tokens_encoder = {}
-    tokenizer.added_tokens_decoder = {}
-
-
 def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
     """Apply a sentence splitter and return the sentences and all separators before and after them"""
     if fix_double_space:
@@ -104,7 +89,6 @@ class Translator:
         if torch.cuda.is_available():
             self.model.cuda()
         self.tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
-        fix_tokenizer(self.tokenizer)
 
         self.splitter = SentenceSplitter("ru")
         self.preprocessor = TextPreprocessor()
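
For reference, a minimal sketch of how the simplified loading path can be used under transformers >= 4.38, where language codes are ordinary special tokens and no fix_tokenizer() patch is required. Only MODEL_URL comes from the diff above; the translate() helper and the example sentence are illustrative assumptions, not part of this commit.

from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

MODEL_URL = "slone/nllb-210-v1"  # checkpoint name taken from the diff above

# No fix_tokenizer() needed: with transformers >= 4.38 the extended language
# codes saved with the checkpoint are loaded as regular special tokens.
tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)

def translate(text, src_lang="rus_Cyrl", tgt_lang="eng_Latn", max_new_tokens=256):
    """Illustrative helper: translate one string between two supported codes."""
    tokenizer.src_lang = src_lang  # prepends the source language token
    inputs = tokenizer(text, return_tensors="pt")
    generated = model.generate(
        **inputs,
        # force the first decoded token to be the target language code
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=max_new_tokens,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

print(translate("Привет, мир!", "rus_Cyrl", "eng_Latn"))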