cointegrated committed on
Commit
fc74260
1 Parent(s): 6a07f61

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +211 -207
README.md CHANGED
@@ -1,208 +1,208 @@
1
  ---
2
  license: cc-by-nc-4.0
3
  language:
4
- - ace # ace_Arab Acehnese (Arabic script)
5
- - ace # ace_Latn Acehnese (Latin script)
6
- - acm # acm_Arab Mesopotamian Arabic
7
- - acq # acq_Arab Ta’izzi-Adeni Arabic
8
- - aeb # aeb_Arab Tunisian Arabic
9
- - af # afr_Latn Afrikaans
10
- - ajp # ajp_Arab South Levantine Arabic
11
- - ak # aka_Latn Akan
12
- - am # amh_Ethi Amharic
13
- - apc # apc_Arab North Levantine Arabic
14
- - ar # arb_Arab Modern Standard Arabic
15
- - ars # ars_Arab Najdi Arabic
16
- - ary # ary_Arab Moroccan Arabic
17
- - arz # arz_Arab Egyptian Arabic
18
- - as # asm_Beng Assamese
19
- - ast # ast_Latn Asturian
20
- - awa # awa_Deva Awadhi
21
- - ay # ayr_Latn Central Aymara
22
- - azb # azb_Arab South Azerbaijani
23
- - azj # azj_Latn North Azerbaijani
24
- - ba # bak_Cyrl Bashkir
25
- - bm # bam_Latn Bambara
26
- - ban # ban_Latn Balinese
27
- - be # bel_Cyrl Belarusian
28
- - bem # bem_Latn Bemba
29
- - bn # ben_Beng Bengali
30
- - bho # bho_Deva Bhojpuri
31
- - bjn # bjn_Arab Banjar (Arabic script)
32
- - bjn # bjn_Latn Banjar (Latin script)
33
- - bo # bod_Tibt Standard Tibetan
34
- - bs # bos_Latn Bosnian
35
- - bug # bug_Latn Buginese
36
- - bg # bul_Cyrl Bulgarian
37
- - ca # cat_Latn Catalan
38
- - ceb # ceb_Latn Cebuano
39
- - cs # ces_Latn Czech
40
- - cjk # cjk_Latn Chokwe
41
- - ckb # ckb_Arab Central Kurdish
42
- - crh # crh_Latn Crimean Tatar
43
- - cy # cym_Latn Welsh
44
- - da # dan_Latn Danish
45
- - de # deu_Latn German
46
- - dik # dik_Latn Southwestern Dinka
47
- - dyu # dyu_Latn Dyula
48
- - dz # dzo_Tibt Dzongkha
49
- - el # ell_Grek Greek
50
- - en # eng_Latn English
51
- - eo # epo_Latn Esperanto
52
- - et # est_Latn Estonian
53
- - eu # eus_Latn Basque
54
- - ee # ewe_Latn Ewe
55
- - fo # fao_Latn Faroese
56
- - fa # pes_Arab Western Persian
57
- - fj # fij_Latn Fijian
58
- - fi # fin_Latn Finnish
59
- - fon # fon_Latn Fon
60
- - fr # fra_Latn French
61
- - fur # fur_Latn Friulian
62
- - ff # fuv_Latn Nigerian Fulfulde
63
- - gd # gla_Latn Scottish Gaelic
64
- - ga # gle_Latn Irish
65
- - gl # glg_Latn Galician
66
- - gn # grn_Latn Guarani
67
- - gu # guj_Gujr Gujarati
68
- - ht # hat_Latn Haitian Creole
69
- - ha # hau_Latn Hausa
70
- - he # heb_Hebr Hebrew
71
- - hi # hin_Deva Hindi
72
- - hne # hne_Deva Chhattisgarhi
73
- - hr # hrv_Latn Croatian
74
- - hu # hun_Latn Hungarian
75
- - hy # hye_Armn Armenian
76
- - ig # ibo_Latn Igbo
77
- - ilo # ilo_Latn Ilocano
78
- - id # ind_Latn Indonesian
79
- - is # isl_Latn Icelandic
80
- - it # ita_Latn Italian
81
- - jv # jav_Latn Javanese
82
- - ja # jpn_Jpan Japanese
83
- - kab # kab_Latn Kabyle
84
- - kac # kac_Latn Jingpho
85
- - kam # kam_Latn Kamba
86
- - kn # kan_Knda Kannada
87
- - ks # kas_Arab Kashmiri (Arabic script)
88
- - ks # kas_Deva Kashmiri (Devanagari script)
89
- - ka # kat_Geor Georgian
90
- - kr # knc_Arab Central Kanuri (Arabic script)
91
- - kr # knc_Latn Central Kanuri (Latin script)
92
- - kk # kaz_Cyrl Kazakh
93
- - kbp # kbp_Latn Kabiyè
94
- - kea # kea_Latn Kabuverdianu
95
- - km # khm_Khmr Khmer
96
- - ki # kik_Latn Kikuyu
97
- - rw # kin_Latn Kinyarwanda
98
- - ky # kir_Cyrl Kyrgyz
99
- - kmb # kmb_Latn Kimbundu
100
- - kg # kon_Latn Kikongo
101
- - ko # kor_Hang Korean
102
- - kmr # kmr_Latn Northern Kurdish
103
- - lo # lao_Laoo Lao
104
- - lv # lvs_Latn Standard Latvian
105
- - lij # lij_Latn Ligurian
106
- - li # lim_Latn Limburgish
107
- - ln # lin_Latn Lingala
108
- - lt # lit_Latn Lithuanian
109
- - lmo # lmo_Latn Lombard
110
- - ltg # ltg_Latn Latgalian
111
- - lb # ltz_Latn Luxembourgish
112
- - lua # lua_Latn Luba-Kasai
113
- - lg # lug_Latn Ganda
114
- - luo # luo_Latn Luo
115
- - lus # lus_Latn Mizo
116
- - mag # mag_Deva Magahi
117
- - mai # mai_Deva Maithili
118
- - ml # mal_Mlym Malayalam
119
- - mr # mar_Deva Marathi
120
- - min # min_Latn Minangkabau (Latin script)
121
- - mk # mkd_Cyrl Macedonian
122
- - plt # plt_Latn Plateau Malagasy
123
- - mt # mlt_Latn Maltese
124
- - mni # mni_Beng Meitei (Bengali script)
125
- - mn # khk_Cyrl Halh Mongolian
126
- - mos # mos_Latn Mossi
127
- - mi # mri_Latn Maori
128
- - ms # zsm_Latn Standard Malay
129
- - my # mya_Mymr Burmese
130
- - nl # nld_Latn Dutch
131
- - nn # nno_Latn Norwegian Nynorsk
132
- - nb # nob_Latn Norwegian Bokmål
133
- - ne # npi_Deva Nepali
134
- - nso # nso_Latn Northern Sotho
135
- - nus # nus_Latn Nuer
136
- - ny # nya_Latn Nyanja
137
- - oc # oci_Latn Occitan
138
- - gaz # gaz_Latn West Central Oromo
139
- - ory # ory_Orya Odia
140
- - pag # pag_Latn Pangasinan
141
- - pa # pan_Guru Eastern Panjabi
142
- - pap # pap_Latn Papiamento
143
- - pl # pol_Latn Polish
144
- - pt # por_Latn Portuguese
145
- - prs # prs_Arab Dari
146
- - pbt # pbt_Arab Southern Pashto
147
- - qu # quy_Latn Ayacucho Quechua
148
- - ro # ron_Latn Romanian
149
- - rn # run_Latn Rundi
150
- - ru # rus_Cyrl Russian
151
- - sg # sag_Latn Sango
152
- - sa # san_Deva Sanskrit
153
- - sat # sat_Beng Santali (Bengali script)
154
- - scn # scn_Latn Sicilian
155
- - shn # shn_Mymr Shan
156
- - si # sin_Sinh Sinhala
157
- - sk # slk_Latn Slovak
158
- - sl # slv_Latn Slovenian
159
- - sm # smo_Latn Samoan
160
- - sn # sna_Latn Shona
161
- - sd # snd_Arab Sindhi
162
- - so # som_Latn Somali
163
- - st # sot_Latn Southern Sotho
164
- - es # spa_Latn Spanish
165
- - als # als_Latn Tosk Albanian
166
- - sc # srd_Latn Sardinian
167
- - sr # srp_Cyrl Serbian
168
- - ss # ssw_Latn Swati
169
- - su # sun_Latn Sundanese
170
- - sv # swe_Latn Swedish
171
- - sw # swh_Latn Swahili
172
- - szl # szl_Latn Silesian
173
- - ta # tam_Taml Tamil
174
- - tt # tat_Cyrl Tatar
175
- - te # tel_Telu Telugu
176
- - tg # tgk_Cyrl Tajik
177
- - tl # tgl_Latn Tagalog
178
- - th # tha_Thai Thai
179
- - ti # tir_Ethi Tigrinya
180
- - taq # taq_Latn Tamasheq (Latin script)
181
- - taq # taq_Tfng Tamasheq (Tifinagh script)
182
- - tpi # tpi_Latn Tok Pisin
183
- - tn # tsn_Latn Tswana
184
- - ts # tso_Latn Tsonga
185
- - tk # tuk_Latn Turkmen
186
- - tum # tum_Latn Tumbuka
187
- - tr # tur_Latn Turkish
188
- - tw # twi_Latn Twi
189
- - tzm # tzm_Tfng Central Atlas Tamazight
190
- - ug # uig_Arab Uyghur
191
- - uk # ukr_Cyrl Ukrainian
192
- - umb # umb_Latn Umbundu
193
- - ur # urd_Arab Urdu
194
- - uz # uzn_Latn Northern Uzbek
195
- - vec # vec_Latn Venetian
196
- - vi # vie_Latn Vietnamese
197
- - war # war_Latn Waray
198
- - wo # wol_Latn Wolof
199
- - xh # xho_Latn Xhosa
200
- - yi # ydd_Hebr Eastern Yiddish
201
- - yo # yor_Latn Yoruba
202
- - yue # yue_Hant Yue Chinese
203
- - zh # zho_Hans Chinese (Simplified)
204
- - zh # zho_Hant Chinese (Traditional)
205
- - zu # zul_Latn Zulu
206
  language_details: >-
207
  ace_Arab, ace_Latn, acm_Arab, acq_Arab, aeb_Arab, afr_Latn, ajp_Arab,
208
  aka_Latn, amh_Ethi, apc_Arab, arb_Arab, ars_Arab, ary_Arab, arz_Arab,
@@ -233,16 +233,18 @@ language_details: >-
233
  tur_Latn, twi_Latn, tzm_Tfng, uig_Arab, ukr_Cyrl, umb_Latn, urd_Arab,
234
  uzn_Latn, vec_Latn, vie_Latn, war_Latn, wol_Latn, xho_Latn, ydd_Hebr,
235
  yor_Latn, yue_Hant, zho_Hans, zho_Hant, zul_Latn
 
236
  ---
237
- This is a port of the multilingual text encoder from https://huggingface.co/facebook/SONAR to `transformers` format from `fairseq2`.
238
 
239
- It supports the same 202 languages as [NLLB-200](https://huggingface.co/facebook/nllb-200-distilled-600M)
 
 
240
  (see also [the source model card](https://github.com/facebookresearch/SONAR/blob/main/sonar/store/cards/text_sonar_basic_encoder.yaml#L14)
241
  and [FLORES-200 lang code mapping](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)).
242
 
243
- For advanced examples of usage, please take a look at https://github.com/facebookresearch/SONAR.
244
 
245
- How to use:
246
  ```Python
247
  # !pip install transformers sentencepiece -q
248
 
@@ -272,4 +274,6 @@ print(embs.shape)
272
  print(embs)
273
  # tensor([[-0.0053, 0.0020, -0.0006, ..., 0.0094, -0.0009, 0.0070],
274
  # [-0.0003, -0.0071, 0.0076, ..., 0.0055, 0.0022, -0.0083]])
275
- ```
 
 
 
1
  ---
2
  license: cc-by-nc-4.0
3
  language:
4
+ - ace
5
+ - ace
6
+ - acm
7
+ - acq
8
+ - aeb
9
+ - af
10
+ - ajp
11
+ - ak
12
+ - am
13
+ - apc
14
+ - ar
15
+ - ars
16
+ - ary
17
+ - arz
18
+ - as
19
+ - ast
20
+ - awa
21
+ - ay
22
+ - azb
23
+ - azj
24
+ - ba
25
+ - bm
26
+ - ban
27
+ - be
28
+ - bem
29
+ - bn
30
+ - bho
31
+ - bjn
32
+ - bjn
33
+ - bo
34
+ - bs
35
+ - bug
36
+ - bg
37
+ - ca
38
+ - ceb
39
+ - cs
40
+ - cjk
41
+ - ckb
42
+ - crh
43
+ - cy
44
+ - da
45
+ - de
46
+ - dik
47
+ - dyu
48
+ - dz
49
+ - el
50
+ - en
51
+ - eo
52
+ - et
53
+ - eu
54
+ - ee
55
+ - fo
56
+ - fa
57
+ - fj
58
+ - fi
59
+ - fon
60
+ - fr
61
+ - fur
62
+ - ff
63
+ - gd
64
+ - ga
65
+ - gl
66
+ - gn
67
+ - gu
68
+ - ht
69
+ - ha
70
+ - he
71
+ - hi
72
+ - hne
73
+ - hr
74
+ - hu
75
+ - hy
76
+ - ig
77
+ - ilo
78
+ - id
79
+ - is
80
+ - it
81
+ - jv
82
+ - ja
83
+ - kab
84
+ - kac
85
+ - kam
86
+ - kn
87
+ - ks
88
+ - ks
89
+ - ka
90
+ - kr
91
+ - kr
92
+ - kk
93
+ - kbp
94
+ - kea
95
+ - km
96
+ - ki
97
+ - rw
98
+ - ky
99
+ - kmb
100
+ - kg
101
+ - ko
102
+ - kmr
103
+ - lo
104
+ - lv
105
+ - lij
106
+ - li
107
+ - ln
108
+ - lt
109
+ - lmo
110
+ - ltg
111
+ - lb
112
+ - lua
113
+ - lg
114
+ - luo
115
+ - lus
116
+ - mag
117
+ - mai
118
+ - ml
119
+ - mr
120
+ - min
121
+ - mk
122
+ - plt
123
+ - mt
124
+ - mni
125
+ - mn
126
+ - mos
127
+ - mi
128
+ - ms
129
+ - my
130
+ - nl
131
+ - nn
132
+ - nb
133
+ - ne
134
+ - nso
135
+ - nus
136
+ - ny
137
+ - oc
138
+ - gaz
139
+ - ory
140
+ - pag
141
+ - pa
142
+ - pap
143
+ - pl
144
+ - pt
145
+ - prs
146
+ - pbt
147
+ - qu
148
+ - ro
149
+ - rn
150
+ - ru
151
+ - sg
152
+ - sa
153
+ - sat
154
+ - scn
155
+ - shn
156
+ - si
157
+ - sk
158
+ - sl
159
+ - sm
160
+ - sn
161
+ - sd
162
+ - so
163
+ - st
164
+ - es
165
+ - als
166
+ - sc
167
+ - sr
168
+ - ss
169
+ - su
170
+ - sv
171
+ - sw
172
+ - szl
173
+ - ta
174
+ - tt
175
+ - te
176
+ - tg
177
+ - tl
178
+ - th
179
+ - ti
180
+ - taq
181
+ - taq
182
+ - tpi
183
+ - tn
184
+ - ts
185
+ - tk
186
+ - tum
187
+ - tr
188
+ - tw
189
+ - tzm
190
+ - ug
191
+ - uk
192
+ - umb
193
+ - ur
194
+ - uz
195
+ - vec
196
+ - vi
197
+ - war
198
+ - wo
199
+ - xh
200
+ - yi
201
+ - yo
202
+ - yue
203
+ - zh
204
+ - zh
205
+ - zu
206
  language_details: >-
207
  ace_Arab, ace_Latn, acm_Arab, acq_Arab, aeb_Arab, afr_Latn, ajp_Arab,
208
  aka_Latn, amh_Ethi, apc_Arab, arb_Arab, ars_Arab, ary_Arab, arz_Arab,
 
233
  tur_Latn, twi_Latn, tzm_Tfng, uig_Arab, ukr_Cyrl, umb_Latn, urd_Arab,
234
  uzn_Latn, vec_Latn, vie_Latn, war_Latn, wol_Latn, xho_Latn, ydd_Hebr,
235
  yor_Latn, yue_Hant, zho_Hans, zho_Hant, zul_Latn
236
+ pipeline_tag: sentence-similarity
237
  ---
238
+ This is a port of the multilingual SONAR text encoder (https://huggingface.co/facebook/SONAR) to the `transformers` format from `fairseq2`.
239
 
240
+ Its embeddings are expected to be equal to those of the official implementation (https://github.com/facebookresearch/SONAR), but the latter remains the source of truth.
241
+
242
+ The encoder supports the same 202 languages as [NLLB-200](https://huggingface.co/facebook/nllb-200-distilled-600M)
243
  (see also [the source model card](https://github.com/facebookresearch/SONAR/blob/main/sonar/store/cards/text_sonar_basic_encoder.yaml#L14)
244
  and [FLORES-200 lang code mapping](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)).
245
 
 
246
 
247
+ How to compute embeddings:
248
  ```Python
249
  # !pip install transformers sentencepiece -q
250
 
 
274
  print(embs)
275
  # tensor([[-0.0053, 0.0020, -0.0006, ..., 0.0094, -0.0009, 0.0070],
276
  # [-0.0003, -0.0071, 0.0076, ..., 0.0055, 0.0022, -0.0083]])
277
+ ```
278
+
279
+ For advanced examples of usage, please take a look at the readme in https://github.com/facebookresearch/SONAR.