tiedeman committed on
Commit
aafd105
1 Parent(s): 0ff93e4

Initial commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,616 @@
---
library_name: transformers
language:
- de
- en
- es
- fr
- lt
- lv
- prg
- pt
- sgs

tags:
- translation
- opus-mt-tc-bible

license: apache-2.0
model-index:
- name: opus-mt-tc-bible-big-bat-deu_eng_fra_por_spa
  results:
  - task:
      name: Translation lit-deu
      type: translation
      args: lit-deu
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: lit-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 23.7
    - name: chr-F
      type: chrf
      value: 0.53223
  - task:
      name: Translation lit-eng
      type: translation
      args: lit-eng
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: lit-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 32.6
    - name: chr-F
      type: chrf
      value: 0.59361
  - task:
      name: Translation lit-fra
      type: translation
      args: lit-fra
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: lit-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 30.0
    - name: chr-F
      type: chrf
      value: 0.56786
  - task:
      name: Translation lit-por
      type: translation
      args: lit-por
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: lit-por
    metrics:
    - name: BLEU
      type: bleu
      value: 28.2
    - name: chr-F
      type: chrf
      value: 0.55393
  - task:
      name: Translation lit-spa
      type: translation
      args: lit-spa
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: lit-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 20.9
    - name: chr-F
      type: chrf
      value: 0.49041
  - task:
      name: Translation lav-deu
      type: translation
      args: lav-deu
    dataset:
      name: flores101-devtest
      type: flores_101
      args: lav deu devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 23.8
    - name: chr-F
      type: chrf
      value: 0.54001
  - task:
      name: Translation lav-fra
      type: translation
      args: lav-fra
    dataset:
      name: flores101-devtest
      type: flores_101
      args: lav fra devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 29.4
    - name: chr-F
      type: chrf
      value: 0.57002
  - task:
      name: Translation lav-por
      type: translation
      args: lav-por
    dataset:
      name: flores101-devtest
      type: flores_101
      args: lav por devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 26.7
    - name: chr-F
      type: chrf
      value: 0.55155
  - task:
      name: Translation lav-spa
      type: translation
      args: lav-spa
    dataset:
      name: flores101-devtest
      type: flores_101
      args: lav spa devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 20.8
    - name: chr-F
      type: chrf
      value: 0.49259
  - task:
      name: Translation lit-eng
      type: translation
      args: lit-eng
    dataset:
      name: flores101-devtest
      type: flores_101
      args: lit eng devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 32.1
    - name: chr-F
      type: chrf
      value: 0.59073
  - task:
      name: Translation lit-por
      type: translation
      args: lit-por
    dataset:
      name: flores101-devtest
      type: flores_101
      args: lit por devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 27.8
    - name: chr-F
      type: chrf
      value: 0.55106
  - task:
      name: Translation lav-deu
      type: translation
      args: lav-deu
    dataset:
      name: ntrex128
      type: ntrex128
      args: lav-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 18.5
    - name: chr-F
      type: chrf
      value: 0.47317
  - task:
      name: Translation lav-eng
      type: translation
      args: lav-eng
    dataset:
      name: ntrex128
      type: ntrex128
      args: lav-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 19.7
    - name: chr-F
      type: chrf
      value: 0.53734
  - task:
      name: Translation lav-fra
      type: translation
      args: lav-fra
    dataset:
      name: ntrex128
      type: ntrex128
      args: lav-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 19.6
    - name: chr-F
      type: chrf
      value: 0.47843
  - task:
      name: Translation lav-por
      type: translation
      args: lav-por
    dataset:
      name: ntrex128
      type: ntrex128
      args: lav-por
    metrics:
    - name: BLEU
      type: bleu
      value: 19.3
    - name: chr-F
      type: chrf
      value: 0.47027
  - task:
      name: Translation lav-spa
      type: translation
      args: lav-spa
    dataset:
      name: ntrex128
      type: ntrex128
      args: lav-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 22.7
    - name: chr-F
      type: chrf
      value: 0.49428
  - task:
      name: Translation lit-deu
      type: translation
      args: lit-deu
    dataset:
      name: ntrex128
      type: ntrex128
      args: lit-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 19.4
    - name: chr-F
      type: chrf
      value: 0.50279
  - task:
      name: Translation lit-eng
      type: translation
      args: lit-eng
    dataset:
      name: ntrex128
      type: ntrex128
      args: lit-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 28.1
    - name: chr-F
      type: chrf
      value: 0.56642
  - task:
      name: Translation lit-fra
      type: translation
      args: lit-fra
    dataset:
      name: ntrex128
      type: ntrex128
      args: lit-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 22.6
    - name: chr-F
      type: chrf
      value: 0.51276
  - task:
      name: Translation lit-por
      type: translation
      args: lit-por
    dataset:
      name: ntrex128
      type: ntrex128
      args: lit-por
    metrics:
    - name: BLEU
      type: bleu
      value: 22.6
    - name: chr-F
      type: chrf
      value: 0.50864
  - task:
      name: Translation lit-spa
      type: translation
      args: lit-spa
    dataset:
      name: ntrex128
      type: ntrex128
      args: lit-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 25.9
    - name: chr-F
      type: chrf
      value: 0.53105
  - task:
      name: Translation lav-eng
      type: translation
      args: lav-eng
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: lav-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 21.5
    - name: chr-F
      type: chrf
      value: 0.63015
  - task:
      name: Translation lit-deu
      type: translation
      args: lit-deu
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: lit-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 47.5
    - name: chr-F
      type: chrf
      value: 0.66527
  - task:
      name: Translation lit-eng
      type: translation
      args: lit-eng
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: lit-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 58.9
    - name: chr-F
      type: chrf
      value: 0.72975
  - task:
      name: Translation lit-spa
      type: translation
      args: lit-spa
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: lit-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 49.9
    - name: chr-F
      type: chrf
      value: 0.67956
  - task:
      name: Translation multi-multi
      type: translation
      args: multi-multi
    dataset:
      name: tatoeba-test-v2020-07-28-v2023-09-26
      type: tatoeba_mt
      args: multi-multi
    metrics:
    - name: BLEU
      type: bleu
      value: 55.5
    - name: chr-F
      type: chrf
      value: 0.71003
  - task:
      name: Translation lav-eng
      type: translation
      args: lav-eng
    dataset:
      name: newstest2017
      type: wmt-2017-news
      args: lav-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 22.0
    - name: chr-F
      type: chrf
      value: 0.49729
  - task:
      name: Translation lit-eng
      type: translation
      args: lit-eng
    dataset:
      name: newstest2019
      type: wmt-2019-news
      args: lit-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 31.2
    - name: chr-F
      type: chrf
      value: 0.59971
---
# opus-mt-tc-bible-big-bat-deu_eng_fra_por_spa

## Table of Contents
- [Model Details](#model-details)
- [Uses](#uses)
- [Risks, Limitations and Biases](#risks-limitations-and-biases)
- [How to Get Started With the Model](#how-to-get-started-with-the-model)
- [Training](#training)
- [Evaluation](#evaluation)
- [Citation Information](#citation-information)
- [Acknowledgements](#acknowledgements)

## Model Details

Neural machine translation model for translating from Baltic languages (bat) to German, English, French, Portuguese and Spanish (deu+eng+fra+por+spa).

This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models are originally trained with the [Marian NMT](https://marian-nmt.github.io/) framework, an efficient NMT implementation written in pure C++, and have been converted to PyTorch using the transformers library by Hugging Face. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines follow the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).

**Model Description:**
- **Developed by:** Language Technology Research Group at the University of Helsinki
- **Model Type:** Translation (transformer-big)
- **Release:** 2024-05-30
- **License:** Apache-2.0
- **Language(s):**
  - Source Language(s): lav lit prg sgs
  - Target Language(s): deu eng fra por spa
  - Valid Target Language Labels: >>deu<< >>eng<< >>fra<< >>por<< >>spa<< >>xxx<<
- **Original Model:** [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/bat-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
- **Resources for more information:**
  - [OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/bat-deu%2Beng%2Bfra%2Bpor%2Bspa/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
  - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
  - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
  - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
  - [HPLT bilingual data v1 (as part of the Tatoeba Translation Challenge dataset)](https://hplt-project.org/datasets/v1)
  - [A massively parallel Bible corpus](https://aclanthology.org/L14-1215/)

This is a multilingual translation model with multiple target languages. A sentence-initial language token is required in the form `>>id<<` (id = a valid target language ID), e.g. `>>deu<<`.

480
+ ## Uses
481
+
482
+ This model can be used for translation and text-to-text generation.
483
+
484
+ ## Risks, Limitations and Biases
485
+
486
+ **CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.**
487
+
488
+ Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).
489
+
490
+ ## How to Get Started With the Model
491
+
492
+ A short example code:
493
+
```python
from transformers import MarianMTModel, MarianTokenizer

# Each source sentence starts with the required target-language token (>>deu<<, >>spa<<, ...).
src_text = [
    ">>deu<< Replace this with text in an accepted source language.",
    ">>spa<< This is the second sentence."
]

model_name = "Helsinki-NLP/opus-mt-tc-bible-big-bat-deu_eng_fra_por_spa"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

for t in translated:
    print(tokenizer.decode(t, skip_special_tokens=True))
```

You can also use OPUS-MT models with the transformers pipelines, for example:

```python
from transformers import pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-bible-big-bat-deu_eng_fra_por_spa")
print(pipe(">>deu<< Replace this with text in an accepted source language."))
```

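To check which `>>id<<` target-language tokens this particular checkpoint accepts, you can inspect the tokenizer vocabulary. A minimal sketch; it assumes the `supported_language_codes` attribute of `MarianTokenizer` and falls back to scanning the vocabulary if that attribute is unavailable:

```python
from transformers import MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-bible-big-bat-deu_eng_fra_por_spa")

# Language-code tokens are stored as ordinary vocabulary entries of the form >>id<<.
codes = getattr(tokenizer, "supported_language_codes", None)
if not codes:
    codes = [t for t in tokenizer.get_vocab() if t.startswith(">>") and t.endswith("<<")]
print(sorted(codes))  # expected to include >>deu<<, >>eng<<, >>fra<<, >>por<< and >>spa<<
```
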
## Training

- **Data**: opusTCv20230926max50+bt+jhubc ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
- **Pre-processing**: SentencePiece (spm32k,spm32k); see the sketch after this list
- **Model Type:** transformer-big
- **Original MarianNMT Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/bat-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
- **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)

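The `source.spm` and `target.spm` files in this repository are the SentencePiece models used for that pre-processing step. A minimal sketch for inspecting how a source sentence is segmented; it assumes the `sentencepiece` and `huggingface_hub` packages and the Hub repository id used in the examples above:

```python
import sentencepiece as spm
from huggingface_hub import hf_hub_download

# Fetch the source-side SentencePiece model from the model repository.
spm_path = hf_hub_download(
    repo_id="Helsinki-NLP/opus-mt-tc-bible-big-bat-deu_eng_fra_por_spa",
    filename="source.spm",
)

sp = spm.SentencePieceProcessor(model_file=spm_path)
# Show the subword pieces for an example Lithuanian sentence.
print(sp.encode("Labas rytas, kaip sekasi?", out_type=str))
```
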
## Evaluation

* [Model scores at the OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/bat-deu%2Beng%2Bfra%2Bpor%2Bspa/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
* test set translations: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/bat-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt)
* test set scores: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/bat-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt)
* benchmark results: [benchmark_results.txt](benchmark_results.txt)
* benchmark output: [benchmark_translations.zip](benchmark_translations.zip)

| langpair | testset | chr-F | BLEU | #sent | #words |
|----------|---------|-------|------|-------|--------|
| lav-eng | tatoeba-test-v2021-08-07 | 0.63015 | 21.5 | 1631 | 11213 |
| lit-deu | tatoeba-test-v2021-08-07 | 0.66527 | 47.5 | 1115 | 8531 |
| lit-eng | tatoeba-test-v2021-08-07 | 0.72975 | 58.9 | 2528 | 17855 |
| lit-spa | tatoeba-test-v2021-08-07 | 0.67956 | 49.9 | 454 | 2751 |
| lav-deu | flores101-devtest | 0.54001 | 23.8 | 1012 | 25094 |
| lav-fra | flores101-devtest | 0.57002 | 29.4 | 1012 | 28343 |
| lav-por | flores101-devtest | 0.55155 | 26.7 | 1012 | 26519 |
| lav-spa | flores101-devtest | 0.49259 | 20.8 | 1012 | 29199 |
| lit-eng | flores101-devtest | 0.59073 | 32.1 | 1012 | 24721 |
| lit-por | flores101-devtest | 0.55106 | 27.8 | 1012 | 26519 |
| lit-deu | flores200-devtest | 0.53223 | 23.7 | 1012 | 25094 |
| lit-eng | flores200-devtest | 0.59361 | 32.6 | 1012 | 24721 |
| lit-fra | flores200-devtest | 0.56786 | 30.0 | 1012 | 28343 |
| lit-por | flores200-devtest | 0.55393 | 28.2 | 1012 | 26519 |
| lit-spa | flores200-devtest | 0.49041 | 20.9 | 1012 | 29199 |
| lav-eng | newstest2017 | 0.49729 | 22.0 | 2001 | 47511 |
| lit-eng | newstest2019 | 0.59971 | 31.2 | 1000 | 25878 |
| lav-deu | ntrex128 | 0.47317 | 18.5 | 1997 | 48761 |
| lav-eng | ntrex128 | 0.53734 | 19.7 | 1997 | 47673 |
| lav-fra | ntrex128 | 0.47843 | 19.6 | 1997 | 53481 |
| lav-por | ntrex128 | 0.47027 | 19.3 | 1997 | 51631 |
| lav-spa | ntrex128 | 0.49428 | 22.7 | 1997 | 54107 |
| lit-deu | ntrex128 | 0.50279 | 19.4 | 1997 | 48761 |
| lit-eng | ntrex128 | 0.56642 | 28.1 | 1997 | 47673 |
| lit-fra | ntrex128 | 0.51276 | 22.6 | 1997 | 53481 |
| lit-por | ntrex128 | 0.50864 | 22.6 | 1997 | 51631 |
| lit-spa | ntrex128 | 0.53105 | 25.9 | 1997 | 54107 |

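The chr-F and BLEU figures above are standard sacrebleu-style corpus scores (chr-F reported on a 0-1 scale). As a rough guide, a system output could be scored against a reference file as sketched below; `hyps.txt` and `refs.txt` are placeholder file names, and the exact metric configuration used for the published scores may differ:

```python
import sacrebleu  # pip install sacrebleu

# Placeholder files: one system translation per line and the matching reference per line.
with open("hyps.txt", encoding="utf-8") as f:
    hyps = [line.rstrip("\n") for line in f]
with open("refs.txt", encoding="utf-8") as f:
    refs = [line.rstrip("\n") for line in f]

bleu = sacrebleu.corpus_bleu(hyps, [refs])
chrf = sacrebleu.corpus_chrf(hyps, [refs])
# sacrebleu reports chrF on a 0-100 scale; divide by 100 to match the table above.
print(f"BLEU = {bleu.score:.1f}  chr-F = {chrf.score / 100:.5f}")
```
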
## Citation Information

* Publications: [Democratizing neural machine translation with OPUS-MT](https://doi.org/10.1007/s10579-023-09704-w), [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (Please cite if you use this model.)

```bibtex
@article{tiedemann2023democratizing,
  title={Democratizing neural machine translation with {OPUS-MT}},
  author={Tiedemann, J{\"o}rg and Aulamo, Mikko and Bakshandaeva, Daria and Boggia, Michele and Gr{\"o}nroos, Stig-Arne and Nieminen, Tommi and Raganato, Alessandro and Scherrer, Yves and Vazquez, Raul and Virpioja, Sami},
  journal={Language Resources and Evaluation},
  number={58},
  pages={713--755},
  year={2023},
  publisher={Springer Nature},
  issn={1574-0218},
  doi={10.1007/s10579-023-09704-w}
}

@inproceedings{tiedemann-thottingal-2020-opus,
  title = "{OPUS}-{MT} {--} Building open translation services for the World",
  author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
  booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
  month = nov,
  year = "2020",
  address = "Lisboa, Portugal",
  publisher = "European Association for Machine Translation",
  url = "https://aclanthology.org/2020.eamt-1.61",
  pages = "479--480",
}

@inproceedings{tiedemann-2020-tatoeba,
  title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
  author = {Tiedemann, J{\"o}rg},
  booktitle = "Proceedings of the Fifth Conference on Machine Translation",
  month = nov,
  year = "2020",
  address = "Online",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2020.wmt-1.139",
  pages = "1174--1182",
}
```

## Acknowledgements

The work is supported by the [HPLT project](https://hplt-project.org/), funded by the European Union’s Horizon Europe research and innovation programme under grant agreement No 101070350. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland, and the [EuroHPC supercomputer LUMI](https://www.lumi-supercomputer.eu/).

## Model conversion info

* transformers version: 4.45.1
* OPUS-MT git hash: a0ea3b3
* port time: Mon Oct 7 17:27:51 EEST 2024
* port machine: LM0-400-22516.local
benchmark_results.txt ADDED
@@ -0,0 +1,31 @@
multi-multi tatoeba-test-v2020-07-28-v2023-09-26 0.71003 55.5 6367 44692
lav-deu flores101-devtest 0.54001 23.8 1012 25094
lav-fra flores101-devtest 0.57002 29.4 1012 28343
lav-por flores101-devtest 0.55155 26.7 1012 26519
lav-spa flores101-devtest 0.49259 20.8 1012 29199
lit-eng flores101-devtest 0.59073 32.1 1012 24721
lit-por flores101-devtest 0.55106 27.8 1012 26519
lit-deu flores200-devtest 0.53223 23.7 1012 25094
lit-eng flores200-devtest 0.59361 32.6 1012 24721
lit-fra flores200-devtest 0.56786 30.0 1012 28343
lit-por flores200-devtest 0.55393 28.2 1012 26519
lit-spa flores200-devtest 0.49041 20.9 1012 29199
lav-eng newstest2017 0.49729 22.0 2001 47511
lit-eng newstest2019 0.59971 31.2 1000 25878
lav-deu ntrex128 0.47317 18.5 1997 48761
lav-eng ntrex128 0.53734 19.7 1997 47673
lav-fra ntrex128 0.47843 19.6 1997 53481
lav-por ntrex128 0.47027 19.3 1997 51631
lav-spa ntrex128 0.49428 22.7 1997 54107
lit-deu ntrex128 0.50279 19.4 1997 48761
lit-eng ntrex128 0.56642 28.1 1997 47673
lit-fra ntrex128 0.51276 22.6 1997 53481
lit-por ntrex128 0.50864 22.6 1997 51631
lit-spa ntrex128 0.53105 25.9 1997 54107
lit-eng tatoeba-test-v2020-07-28 0.72798 58.7 2500 17688
lit-eng tatoeba-test-v2021-03-30 0.72790 58.7 5003 35368
lit-spa tatoeba-test-v2021-03-30 0.68382 49.9 457 2763
lav-eng tatoeba-test-v2021-08-07 0.63015 21.5 1631 11213
lit-deu tatoeba-test-v2021-08-07 0.66527 47.5 1115 8531
lit-eng tatoeba-test-v2021-08-07 0.72975 58.9 2528 17855
lit-spa tatoeba-test-v2021-08-07 0.67956 49.9 454 2751
benchmark_translations.zip ADDED
File without changes
config.json ADDED
@@ -0,0 +1,41 @@
{
  "_name_or_path": "pytorch-models/opus-mt-tc-bible-big-bat-deu_eng_fra_por_spa",
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59466,
  "decoder_vocab_size": 59467,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 788,
  "forced_eos_token_id": null,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": null,
  "max_position_embeddings": 1024,
  "model_type": "marian",
  "normalize_embedding": false,
  "num_beams": null,
  "num_hidden_layers": 6,
  "pad_token_id": 59466,
  "scale_embedding": true,
  "share_encoder_decoder_embeddings": true,
  "static_position_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 59467
}
generation_config.json ADDED
@@ -0,0 +1,16 @@
{
  "_from_model_config": true,
  "bad_words_ids": [
    [
      59466
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59466,
  "eos_token_id": 788,
  "forced_eos_token_id": 788,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59466,
  "transformers_version": "4.45.1"
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8b4667803a7a2836fb089a866617aa57d6a5a793e8ff97d4b5755850f24e914d
size 949273820
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3525e5635298f9c67cf944e9ec615e51112397cf60e1b725425fb032d3f9095
size 949325061
source.spm ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1c92f9062726b6d6b01ba134f6ddcf9fbe07c1343c251335bd0902a965479dd9
size 834375
special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4f30006b2bbd5c46ca39b1ecb0cc9009163828293d2dd2a2ff06931a2aa312a2
size 819232
tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"source_lang": "bat", "target_lang": "deu+eng+fra+por+spa", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30/bat-deu+eng+fra+por+spa", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff