tiedeman committed on
Commit
d627084
1 Parent(s): 60df009

Initial commit

.gitattributes CHANGED
@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,851 @@
+ ---
+ language:
+ - be
+ - ca
+ - es
+ - fr
+ - gl
+ - it
+ - itc
+ - pt
+ - ro
+ - ru
+ - rue
+ - uk
+ - xx
+ - zle
+ language_bcp47:
+ - be
+ - ca
+ - es
+ - fr
+ - gl
+ - it
+ - itc
+ - pt
+ - ro
+ - ru
+ - rue
+ - uk
+ - zle
+
+ tags:
+ - translation
+ - opus-mt-tc
+
+ license: cc-by-4.0
+ model-index:
+ - name: opus-mt-tc-big-zle-itc
+   results:
+   - task:
+       name: Translation bel-cat
+       type: translation
+       args: bel-cat
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: bel cat devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 16.8
+     - name: chr-F
+       type: chrf
+       value: 0.48374
+   - task:
+       name: Translation bel-fra
+       type: translation
+       args: bel-fra
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: bel fra devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 19.4
+     - name: chr-F
+       type: chrf
+       value: 0.51278
+   - task:
+       name: Translation bel-glg
+       type: translation
+       args: bel-glg
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: bel glg devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 15.3
+     - name: chr-F
+       type: chrf
+       value: 0.45665
+   - task:
+       name: Translation bel-ita
+       type: translation
+       args: bel-ita
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: bel ita devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 14.6
+     - name: chr-F
+       type: chrf
+       value: 0.47204
+   - task:
+       name: Translation bel-por
+       type: translation
+       args: bel-por
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: bel por devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 17.3
+     - name: chr-F
+       type: chrf
+       value: 0.49561
+   - task:
+       name: Translation bel-ron
+       type: translation
+       args: bel-ron
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: bel ron devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 14.9
+     - name: chr-F
+       type: chrf
+       value: 0.46315
+   - task:
+       name: Translation bel-spa
+       type: translation
+       args: bel-spa
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: bel spa devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 15.3
+     - name: chr-F
+       type: chrf
+       value: 0.46011
+   - task:
+       name: Translation rus-ast
+       type: translation
+       args: rus-ast
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus ast devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 13.6
+     - name: chr-F
+       type: chrf
+       value: 0.45411
+   - task:
+       name: Translation rus-cat
+       type: translation
+       args: rus-cat
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus cat devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 28.3
+     - name: chr-F
+       type: chrf
+       value: 0.55262
+   - task:
+       name: Translation rus-fra
+       type: translation
+       args: rus-fra
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus fra devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 32.9
+     - name: chr-F
+       type: chrf
+       value: 0.59498
+   - task:
+       name: Translation rus-glg
+       type: translation
+       args: rus-glg
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus glg devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 23.5
+     - name: chr-F
+       type: chrf
+       value: 0.51668
+   - task:
+       name: Translation rus-ita
+       type: translation
+       args: rus-ita
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus ita devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 22.7
+     - name: chr-F
+       type: chrf
+       value: 0.52402
+   - task:
+       name: Translation rus-oci
+       type: translation
+       args: rus-oci
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus oci devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 12.9
+     - name: chr-F
+       type: chrf
+       value: 0.42301
+   - task:
+       name: Translation rus-por
+       type: translation
+       args: rus-por
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus por devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 31.4
+     - name: chr-F
+       type: chrf
+       value: 0.58045
+   - task:
+       name: Translation rus-ron
+       type: translation
+       args: rus-ron
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus ron devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 24.7
+     - name: chr-F
+       type: chrf
+       value: 0.52560
+   - task:
+       name: Translation rus-spa
+       type: translation
+       args: rus-spa
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: rus spa devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 21.8
+     - name: chr-F
+       type: chrf
+       value: 0.50622
+   - task:
+       name: Translation ukr-ast
+       type: translation
+       args: ukr-ast
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr ast devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 14.1
+     - name: chr-F
+       type: chrf
+       value: 0.45629
+   - task:
+       name: Translation ukr-cat
+       type: translation
+       args: ukr-cat
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr cat devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 29.5
+     - name: chr-F
+       type: chrf
+       value: 0.56383
+   - task:
+       name: Translation ukr-fra
+       type: translation
+       args: ukr-fra
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr fra devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 34.5
+     - name: chr-F
+       type: chrf
+       value: 0.60596
+   - task:
+       name: Translation ukr-glg
+       type: translation
+       args: ukr-glg
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr glg devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 24.2
+     - name: chr-F
+       type: chrf
+       value: 0.52217
+   - task:
+       name: Translation ukr-ita
+       type: translation
+       args: ukr-ita
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr ita devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 23.0
+     - name: chr-F
+       type: chrf
+       value: 0.52610
+   - task:
+       name: Translation ukr-oci
+       type: translation
+       args: ukr-oci
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr oci devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 13.7
+     - name: chr-F
+       type: chrf
+       value: 0.42937
+   - task:
+       name: Translation ukr-por
+       type: translation
+       args: ukr-por
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr por devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 32.5
+     - name: chr-F
+       type: chrf
+       value: 0.59036
+   - task:
+       name: Translation ukr-ron
+       type: translation
+       args: ukr-ron
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr ron devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 26.0
+     - name: chr-F
+       type: chrf
+       value: 0.53883
+   - task:
+       name: Translation ukr-spa
+       type: translation
+       args: ukr-spa
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: ukr spa devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 22.5
+     - name: chr-F
+       type: chrf
+       value: 0.51018
+   - task:
+       name: Translation bel-fra
+       type: translation
+       args: bel-fra
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: bel-fra
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 49.1
+     - name: chr-F
+       type: chrf
+       value: 0.66784
+   - task:
+       name: Translation bel-ita
+       type: translation
+       args: bel-ita
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: bel-ita
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 47.6
+     - name: chr-F
+       type: chrf
+       value: 0.64145
+   - task:
+       name: Translation bel-spa
+       type: translation
+       args: bel-spa
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: bel-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 46.9
+     - name: chr-F
+       type: chrf
+       value: 0.65485
+   - task:
+       name: Translation rus-fra
+       type: translation
+       args: rus-fra
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: rus-fra
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 52.1
+     - name: chr-F
+       type: chrf
+       value: 0.68174
+   - task:
+       name: Translation rus-ita
+       type: translation
+       args: rus-ita
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: rus-ita
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 42.7
+     - name: chr-F
+       type: chrf
+       value: 0.63277
+   - task:
+       name: Translation rus-por
+       type: translation
+       args: rus-por
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: rus-por
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 42.6
+     - name: chr-F
+       type: chrf
+       value: 0.63606
+   - task:
+       name: Translation rus-ron
+       type: translation
+       args: rus-ron
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: rus-ron
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 37.5
+     - name: chr-F
+       type: chrf
+       value: 0.60796
+   - task:
+       name: Translation rus-spa
+       type: translation
+       args: rus-spa
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: rus-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 51.3
+     - name: chr-F
+       type: chrf
+       value: 0.69108
+   - task:
+       name: Translation ukr-cat
+       type: translation
+       args: ukr-cat
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: ukr-cat
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 52.9
+     - name: chr-F
+       type: chrf
+       value: 0.69275
+   - task:
+       name: Translation ukr-fra
+       type: translation
+       args: ukr-fra
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: ukr-fra
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 51.3
+     - name: chr-F
+       type: chrf
+       value: 0.67392
+   - task:
+       name: Translation ukr-ita
+       type: translation
+       args: ukr-ita
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: ukr-ita
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 49.6
+     - name: chr-F
+       type: chrf
+       value: 0.69157
+   - task:
+       name: Translation ukr-por
+       type: translation
+       args: ukr-por
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: ukr-por
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 45.0
+     - name: chr-F
+       type: chrf
+       value: 0.64722
+   - task:
+       name: Translation ukr-spa
+       type: translation
+       args: ukr-spa
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: ukr-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 50.7
+     - name: chr-F
+       type: chrf
+       value: 0.68409
+   - task:
+       name: Translation rus-fra
+       type: translation
+       args: rus-fra
+     dataset:
+       name: newstest2012
+       type: wmt-2012-news
+       args: rus-fra
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 25.0
+     - name: chr-F
+       type: chrf
+       value: 0.53481
+   - task:
+       name: Translation rus-spa
+       type: translation
+       args: rus-spa
+     dataset:
+       name: newstest2012
+       type: wmt-2012-news
+       args: rus-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 28.7
+     - name: chr-F
+       type: chrf
+       value: 0.54814
+   - task:
+       name: Translation rus-fra
+       type: translation
+       args: rus-fra
+     dataset:
+       name: newstest2013
+       type: wmt-2013-news
+       args: rus-fra
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 29.0
+     - name: chr-F
+       type: chrf
+       value: 0.55745
+   - task:
+       name: Translation rus-spa
+       type: translation
+       args: rus-spa
+     dataset:
+       name: newstest2013
+       type: wmt-2013-news
+       args: rus-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 31.5
+     - name: chr-F
+       type: chrf
+       value: 0.56582
+ ---
+ # opus-mt-tc-big-zle-itc
+
+ ## Table of Contents
+ - [Model Details](#model-details)
+ - [Uses](#uses)
+ - [Risks, Limitations and Biases](#risks-limitations-and-biases)
+ - [How to Get Started With the Model](#how-to-get-started-with-the-model)
+ - [Training](#training)
+ - [Evaluation](#evaluation)
+ - [Citation Information](#citation-information)
+ - [Acknowledgements](#acknowledgements)
+
+ ## Model Details
+
+ Neural machine translation model for translating from East Slavic languages (zle) to Italic languages (itc).
+
+ This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models are originally trained with [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++, and have been converted to PyTorch using the Hugging Face transformers library. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines use the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).
+ **Model Description:**
+ - **Developed by:** Language Technology Research Group at the University of Helsinki
+ - **Model Type:** Translation (transformer-big)
+ - **Release:** 2022-08-03
+ - **License:** CC-BY-4.0
+ - **Language(s):**
+   - Source Language(s): bel rue rus ukr
+   - Target Language(s): cat fra glg ita lad_Latn por ron spa
+   - Valid Target Language Labels: >>cat<< >>fra<< >>glg<< >>ita<< >>lad_Latn<< >>por<< >>ron<< >>spa<<
+ - **Original Model:** [opusTCv20210807_transformer-big_2022-08-03.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/zle-itc/opusTCv20210807_transformer-big_2022-08-03.zip)
+ - **Resources for more information:**
+   - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
+   - More information about released models for this language pair: [OPUS-MT zle-itc README](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/zle-itc/README.md)
+   - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
+   - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
+
+ This is a multilingual translation model with multiple target languages. A sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID), e.g. `>>cat<<`.
+
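+ The token simply selects the output language, so the same source sentence can be routed to any supported target by swapping the label. A minimal sketch (plain Python; the helper name is ours, not part of the model):
+
+ ```python
+ # Hypothetical helper: prepend the sentence-initial target-language token
+ # that this multilingual model expects.
+ def add_target_token(text: str, target_lang: str) -> str:
+     # target_lang must be a valid label, e.g. "cat", "fra", "por", "spa"
+     return f">>{target_lang}<< {text}"
+
+ print(add_target_token("Мне нужно в школу.", "por"))
+ # >>por<< Мне нужно в школу.
+ ```
+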
+ ## Uses
+
+ This model can be used for translation and text-to-text generation.
+
+ ## Risks, Limitations and Biases
+
+ **CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.**
+
+ Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).
+
+ ## How to Get Started With the Model
+
+ A short code example:
+
+ ```python
+ from transformers import MarianMTModel, MarianTokenizer
+
+ # Each source sentence starts with the target-language token.
+ src_text = [
+     ">>fra<< Вони не мої справжні батьки.",
+     ">>por<< Мне нужно в школу."
+ ]
+
+ model_name = "Helsinki-NLP/opus-mt-tc-big-zle-itc"
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
+ model = MarianMTModel.from_pretrained(model_name)
+ translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
+
+ for t in translated:
+     print(tokenizer.decode(t, skip_special_tokens=True))
+
+ # expected output:
+ # Ce ne sont pas mes vrais parents.
+ # Tenho de ir para a escola.
+ ```
+
+ You can also use OPUS-MT models with the transformers pipelines, for example:
+
+ ```python
+ from transformers import pipeline
+ pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-zle-itc")
+ print(pipe(">>fra<< Вони не мої справжні батьки."))
+
+ # expected output: Ce ne sont pas mes vrais parents.
+ ```
+
+ ## Training
+
+ - **Data**: opusTCv20210807 ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
+ - **Pre-processing**: SentencePiece (spm32k,spm32k); a segmentation sketch follows this list
+ - **Model Type:** transformer-big
+ - **Original MarianNMT Model**: [opusTCv20210807_transformer-big_2022-08-03.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/zle-itc/opusTCv20210807_transformer-big_2022-08-03.zip)
+ - **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
+
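+ A minimal sketch (assuming the `sentencepiece` package and a locally downloaded copy of this repository's `source.spm`; the local path is an assumption) of how the spm32k source model segments input before translation:
+
+ ```python
+ import sentencepiece as spm
+
+ # Load the source-side SentencePiece model shipped with this repository.
+ sp = spm.SentencePieceProcessor(model_file="source.spm")
+
+ # Inspect the subword segmentation of a Russian sentence.
+ print(sp.encode("Мне нужно в школу.", out_type=str))
+ ```
+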
+ ## Evaluation
+
+ * test set translations: [opusTCv20210807_transformer-big_2022-08-03.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/zle-itc/opusTCv20210807_transformer-big_2022-08-03.test.txt)
+ * test set scores: [opusTCv20210807_transformer-big_2022-08-03.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/zle-itc/opusTCv20210807_transformer-big_2022-08-03.eval.txt)
+ * benchmark results: [benchmark_results.txt](benchmark_results.txt)
+ * benchmark output: [benchmark_translations.zip](benchmark_translations.zip)
+
+ | langpair | testset | chr-F | BLEU | #sent | #words |
+ |----------|---------|-------|-------|-------|--------|
+ | bel-fra | tatoeba-test-v2021-08-07 | 0.66784 | 49.1 | 283 | 2005 |
+ | bel-ita | tatoeba-test-v2021-08-07 | 0.64145 | 47.6 | 264 | 1681 |
+ | bel-spa | tatoeba-test-v2021-08-07 | 0.65485 | 46.9 | 205 | 1412 |
+ | rus-fra | tatoeba-test-v2021-08-07 | 0.68174 | 52.1 | 11490 | 80579 |
+ | rus-ita | tatoeba-test-v2021-08-07 | 0.63277 | 42.7 | 10045 | 71584 |
+ | rus-por | tatoeba-test-v2021-08-07 | 0.63606 | 42.6 | 10000 | 74713 |
+ | rus-ron | tatoeba-test-v2021-08-07 | 0.60796 | 37.5 | 782 | 4772 |
+ | rus-spa | tatoeba-test-v2021-08-07 | 0.69108 | 51.3 | 10506 | 75246 |
+ | ukr-cat | tatoeba-test-v2021-08-07 | 0.69275 | 52.9 | 456 | 2675 |
+ | ukr-fra | tatoeba-test-v2021-08-07 | 0.67392 | 51.3 | 10035 | 63227 |
+ | ukr-ita | tatoeba-test-v2021-08-07 | 0.69157 | 49.6 | 5000 | 27846 |
+ | ukr-por | tatoeba-test-v2021-08-07 | 0.64722 | 45.0 | 3372 | 21315 |
+ | ukr-spa | tatoeba-test-v2021-08-07 | 0.68409 | 50.7 | 10115 | 59284 |
+ | bel-ast | flores101-devtest | 0.40942 | 8.7 | 1012 | 24572 |
+ | bel-cat | flores101-devtest | 0.48374 | 16.8 | 1012 | 27304 |
+ | bel-fra | flores101-devtest | 0.51278 | 19.4 | 1012 | 28343 |
+ | bel-glg | flores101-devtest | 0.45665 | 15.3 | 1012 | 26582 |
+ | bel-ita | flores101-devtest | 0.47204 | 14.6 | 1012 | 27306 |
+ | bel-por | flores101-devtest | 0.49561 | 17.3 | 1012 | 26519 |
+ | bel-ron | flores101-devtest | 0.46315 | 14.9 | 1012 | 26799 |
+ | bel-spa | flores101-devtest | 0.46011 | 15.3 | 1012 | 29199 |
+ | rus-ast | flores101-devtest | 0.45411 | 13.6 | 1012 | 24572 |
+ | rus-cat | flores101-devtest | 0.55262 | 28.3 | 1012 | 27304 |
+ | rus-fra | flores101-devtest | 0.59498 | 32.9 | 1012 | 28343 |
+ | rus-glg | flores101-devtest | 0.51668 | 23.5 | 1012 | 26582 |
+ | rus-ita | flores101-devtest | 0.52402 | 22.7 | 1012 | 27306 |
+ | rus-oci | flores101-devtest | 0.42301 | 12.9 | 1012 | 27305 |
+ | rus-por | flores101-devtest | 0.58045 | 31.4 | 1012 | 26519 |
+ | rus-ron | flores101-devtest | 0.52560 | 24.7 | 1012 | 26799 |
+ | rus-spa | flores101-devtest | 0.50622 | 21.8 | 1012 | 29199 |
+ | ukr-ast | flores101-devtest | 0.45629 | 14.1 | 1012 | 24572 |
+ | ukr-cat | flores101-devtest | 0.56383 | 29.5 | 1012 | 27304 |
+ | ukr-fra | flores101-devtest | 0.60596 | 34.5 | 1012 | 28343 |
+ | ukr-glg | flores101-devtest | 0.52217 | 24.2 | 1012 | 26582 |
+ | ukr-ita | flores101-devtest | 0.52610 | 23.0 | 1012 | 27306 |
+ | ukr-oci | flores101-devtest | 0.42937 | 13.7 | 1012 | 27305 |
+ | ukr-por | flores101-devtest | 0.59036 | 32.5 | 1012 | 26519 |
+ | ukr-ron | flores101-devtest | 0.53883 | 26.0 | 1012 | 26799 |
+ | ukr-spa | flores101-devtest | 0.51018 | 22.5 | 1012 | 29199 |
+ | rus-fra | newstest2012 | 0.53481 | 25.0 | 3003 | 78011 |
+ | rus-spa | newstest2012 | 0.54814 | 28.7 | 3003 | 79006 |
+ | rus-fra | newstest2013 | 0.55745 | 29.0 | 3000 | 70037 |
+ | rus-spa | newstest2013 | 0.56582 | 31.5 | 3000 | 70528 |
+
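+ Scores of this kind can be reproduced with standard tooling; a minimal sketch (assuming the `sacrebleu` package; the hypothesis and reference sentences here are placeholders) of computing corpus-level BLEU and chrF:
+
+ ```python
+ from sacrebleu.metrics import BLEU, CHRF
+
+ # Placeholder system output and one reference stream (one segment each).
+ hyps = ["Ce ne sont pas mes vrais parents."]
+ refs = [["Ce ne sont pas mes vrais parents."]]
+
+ print(BLEU().corpus_score(hyps, refs))  # corpus BLEU
+ print(CHRF().corpus_score(hyps, refs))  # corpus chrF
+ ```
+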
+ ## Citation Information
+
+ * Publications: [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (Please cite if you use this model.)
+
+ ```
+ @inproceedings{tiedemann-thottingal-2020-opus,
+     title = "{OPUS}-{MT} {--} Building open translation services for the World",
+     author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
+     booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
+     month = nov,
+     year = "2020",
+     address = "Lisboa, Portugal",
+     publisher = "European Association for Machine Translation",
+     url = "https://aclanthology.org/2020.eamt-1.61",
+     pages = "479--480",
+ }
+
+ @inproceedings{tiedemann-2020-tatoeba,
+     title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
+     author = {Tiedemann, J{\"o}rg},
+     booktitle = "Proceedings of the Fifth Conference on Machine Translation",
+     month = nov,
+     year = "2020",
+     address = "Online",
+     publisher = "Association for Computational Linguistics",
+     url = "https://aclanthology.org/2020.wmt-1.139",
+     pages = "1174--1182",
+ }
+ ```
+
+ ## Acknowledgements
+
+ The work is supported by the [European Language Grid](https://www.european-language-grid.eu/) as [pilot project 2866](https://live.european-language-grid.eu/catalogue/#/resource/projects/2866), by the [FoTran project](https://www.helsinki.fi/en/researchgroups/natural-language-understanding-with-cross-lingual-grounding), funded by the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement No 771113), and the [MeMAD project](https://memad.eu/), funded by the European Union’s Horizon 2020 Research and Innovation Programme under grant agreement No 780069. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland.
+
+ ## Model conversion info
+
+ * transformers version: 4.16.2
+ * OPUS-MT git hash: 8b9f0b0
+ * port time: Fri Aug 12 15:10:16 EEST 2022
+ * port machine: LM0-400-22516.local
benchmark_results.txt ADDED
@@ -0,0 +1,88 @@
+ bel-ast	flores101-dev	0.40365	8.6	997	23232
+ bel-cat	flores101-dev	0.48240	16.6	997	25962
+ bel-fra	flores101-dev	0.51049	18.9	997	26706
+ bel-glg	flores101-dev	0.45486	14.8	997	25265
+ bel-ita	flores101-dev	0.47154	14.4	997	25840
+ bel-oci	flores101-dev	0.36399	8.0	997	25819
+ bel-por	flores101-dev	0.48961	16.7	997	25287
+ bel-ron	flores101-dev	0.46522	15.0	997	25616
+ bel-spa	flores101-dev	0.45442	15.1	997	27793
+ rus-ast	flores101-dev	0.45122	13.9	997	23232
+ rus-cat	flores101-dev	0.55398	27.5	997	25962
+ rus-fra	flores101-dev	0.59981	33.5	997	26706
+ rus-glg	flores101-dev	0.51495	23.4	997	25265
+ rus-ita	flores101-dev	0.52531	22.8	997	25840
+ rus-oci	flores101-dev	0.42042	13.0	997	25819
+ rus-por	flores101-dev	0.57848	30.7	997	25287
+ rus-ron	flores101-dev	0.53105	25.5	997	25616
+ rus-spa	flores101-dev	0.50041	21.7	997	27793
+ bel-ast	flores101-devtest	0.40942	8.7	1012	24572
+ bel-cat	flores101-devtest	0.48374	16.8	1012	27304
+ bel-fra	flores101-devtest	0.51278	19.4	1012	28343
+ bel-glg	flores101-devtest	0.45665	15.3	1012	26582
+ bel-ita	flores101-devtest	0.47204	14.6	1012	27306
+ bel-oci	flores101-devtest	0.36646	8.1	1012	27305
+ bel-por	flores101-devtest	0.49561	17.3	1012	26519
+ bel-ron	flores101-devtest	0.46315	14.9	1012	26799
+ bel-spa	flores101-devtest	0.46011	15.3	1012	29199
+ rus-ast	flores101-devtest	0.45411	13.6	1012	24572
+ rus-cat	flores101-devtest	0.55262	28.3	1012	27304
+ rus-fra	flores101-devtest	0.59498	32.9	1012	28343
+ rus-glg	flores101-devtest	0.51668	23.5	1012	26582
+ rus-ita	flores101-devtest	0.52402	22.7	1012	27306
+ rus-oci	flores101-devtest	0.42301	12.9	1012	27305
+ rus-por	flores101-devtest	0.58045	31.4	1012	26519
+ rus-ron	flores101-devtest	0.52560	24.7	1012	26799
+ rus-spa	flores101-devtest	0.50622	21.8	1012	29199
+ ukr-ast	flores101-devtest	0.45629	14.1	1012	24572
+ ukr-cat	flores101-devtest	0.56383	29.5	1012	27304
+ ukr-fra	flores101-devtest	0.60596	34.5	1012	28343
+ ukr-glg	flores101-devtest	0.52217	24.2	1012	26582
+ ukr-ita	flores101-devtest	0.52610	23.0	1012	27306
+ ukr-oci	flores101-devtest	0.42937	13.7	1012	27305
+ ukr-por	flores101-devtest	0.59036	32.5	1012	26519
+ ukr-ron	flores101-devtest	0.53883	26.0	1012	26799
+ ukr-spa	flores101-devtest	0.51018	22.5	1012	29199
+ ukr-ast	flores101-dev	0.45607	14.4	997	23232
+ ukr-cat	flores101-dev	0.55788	29.0	997	25962
+ ukr-fra	flores101-dev	0.60418	35.1	997	26706
+ ukr-glg	flores101-dev	0.51728	23.9	997	25265
+ ukr-ita	flores101-dev	0.51952	21.8	997	25840
+ ukr-oci	flores101-dev	0.42508	13.8	997	25819
+ ukr-por	flores101-dev	0.58233	32.1	997	25287
+ ukr-ron	flores101-dev	0.54171	26.5	997	25616
+ ukr-spa	flores101-dev	0.50349	22.3	997	27793
+ rus-fra	newstest2012	0.53481	25.0	3003	78011
+ rus-spa	newstest2012	0.54814	28.7	3003	79006
+ rus-fra	newstest2013	0.55745	29.0	3000	70037
+ rus-spa	newstest2013	0.56582	31.5	3000	70528
+ rus-fra	tatoeba-test-v2020-07-28	0.68104	52.3	10000	70132
+ rus-ita	tatoeba-test-v2020-07-28	0.63269	42.7	10000	71254
+ rus-ron	tatoeba-test-v2020-07-28	0.60630	37.4	782	4768
+ rus-spa	tatoeba-test-v2020-07-28	0.69114	51.3	10000	71496
+ ukr-cat	tatoeba-test-v2020-07-28	0.69272	52.9	455	2670
+ ukr-fra	tatoeba-test-v2020-07-28	0.67370	51.3	10000	62877
+ ukr-spa	tatoeba-test-v2020-07-28	0.68381	50.7	10000	58486
+ bel-fra	tatoeba-test-v2021-03-30	0.66694	48.9	285	2017
+ bel-ita	tatoeba-test-v2021-03-30	0.64279	47.8	265	1687
+ bel-spa	tatoeba-test-v2021-03-30	0.65638	46.9	207	1432
+ rus-fra	tatoeba-test-v2021-03-30	0.68151	52.3	10633	74451
+ rus-ita	tatoeba-test-v2021-03-30	0.63285	42.7	10010	71323
+ rus-ron	tatoeba-test-v2021-03-30	0.60882	37.5	794	4840
+ rus-spa	tatoeba-test-v2021-03-30	0.69084	51.3	10272	73506
+ ukr-cat	tatoeba-test-v2021-03-30	0.69260	52.9	457	2682
+ ukr-fra	tatoeba-test-v2021-03-30	0.67395	51.4	10035	63101
+ ukr-spa	tatoeba-test-v2021-03-30	0.68382	50.7	10027	58644
+ bel-fra	tatoeba-test-v2021-08-07	0.66784	49.1	283	2005
+ bel-ita	tatoeba-test-v2021-08-07	0.64145	47.6	264	1681
+ bel-spa	tatoeba-test-v2021-08-07	0.65485	46.9	205	1412
+ rus-fra	tatoeba-test-v2021-08-07	0.68174	52.1	11490	80579
+ rus-ita	tatoeba-test-v2021-08-07	0.63277	42.7	10045	71584
+ rus-por	tatoeba-test-v2021-08-07	0.63606	42.6	10000	74713
+ rus-ron	tatoeba-test-v2021-08-07	0.60796	37.5	782	4772
+ rus-spa	tatoeba-test-v2021-08-07	0.69108	51.3	10506	75246
+ ukr-cat	tatoeba-test-v2021-08-07	0.69275	52.9	456	2675
+ ukr-fra	tatoeba-test-v2021-08-07	0.67392	51.3	10035	63227
+ ukr-ita	tatoeba-test-v2021-08-07	0.69157	49.6	5000	27846
+ ukr-por	tatoeba-test-v2021-08-07	0.64722	45.0	3372	21315
+ ukr-spa	tatoeba-test-v2021-08-07	0.68409	50.7	10115	59284
benchmark_translations.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59dfd9c7656714371a8b5bc21af1bf8e83a868af77d193c8aaec7de973cb9edb
+ size 18874591
config.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "activation_dropout": 0.0,
+   "activation_function": "relu",
+   "architectures": [
+     "MarianMTModel"
+   ],
+   "attention_dropout": 0.0,
+   "bad_words_ids": [
+     [
+       61465
+     ]
+   ],
+   "bos_token_id": 0,
+   "classifier_dropout": 0.0,
+   "d_model": 1024,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 61465,
+   "decoder_vocab_size": 61466,
+   "dropout": 0.1,
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 26818,
+   "forced_eos_token_id": 26818,
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "max_length": 512,
+   "max_position_embeddings": 1024,
+   "model_type": "marian",
+   "normalize_embedding": false,
+   "num_beams": 4,
+   "num_hidden_layers": 6,
+   "pad_token_id": 61465,
+   "scale_embedding": true,
+   "share_encoder_decoder_embeddings": true,
+   "static_position_embeddings": true,
+   "torch_dtype": "float16",
+   "transformers_version": "4.18.0.dev0",
+   "use_cache": true,
+   "vocab_size": 61466
+ }
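These are the architecture hyperparameters of the converted checkpoint. A minimal sketch (assuming the transformers package) for loading and inspecting this configuration from the published model:

```python
from transformers import AutoConfig

# Load the configuration of the published checkpoint and print a few
# of the architecture parameters listed above.
cfg = AutoConfig.from_pretrained("Helsinki-NLP/opus-mt-tc-big-zle-itc")
print(cfg.model_type, cfg.d_model, cfg.encoder_layers, cfg.decoder_layers)
```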
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ec9359673a61cb478e71d2c9ce4bbb9984d524a1bbe49b4c34930d02172cf5e
+ size 604694083
source.spm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:627be4d20619494b4a628a7083b0cadf5eaa6612d19964b08597e6c43162337f
+ size 1011490
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ae198e752ec69954b5cad0d2bcae687e2e899960424fb985778a3f11f4ab681
+ size 814394
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"source_lang": "zle", "target_lang": "itc", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20210807_transformer-big_2022-08-03/zle-itc", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff