lbourdois's picture
Add multilingual to the language tag
7fc2adf
metadata
# Language tags: da, is, nb, sv line up with the source languages listed in the
# card (dan, isl, nob, swe); ca, es, fr, gl, it, pt, ro line up with the target
# languages (cat, spa, fra, glg, ita, por, ron). `multilingual` is a generic tag.
language:
  - ca
  - da
  - es
  - fr
  - gl
  - is
  - it
  - nb
  - pt
  - ro
  - sv
  - multilingual
# Same license as stated in the Model Description section (CC-BY-4.0).
license: cc-by-4.0
tags:
  - translation
  - opus-mt-tc
# Benchmark scores reported for this model, grouped by test set.
model-index:
  - name: opus-mt-tc-big-gmq-itc
    results:
      # NOTE(review): the task name and args below mention only dan-cat, but the
      # metrics list that follows holds 28 BLEU/chr-F pairs whose values match,
      # row for row, the flores101-devtest rows of the Evaluation table
      # (dan-cat through swe-spa). Presumably the per-pair results were merged
      # into this single entry -- confirm against the upstream model card.
      - task:
          type: translation
          name: Translation dan-cat
        dataset:
          name: flores101-devtest
          type: flores_101
          args: dan cat devtest
        metrics:
          - type: bleu
            value: 33.4
            name: BLEU
          - type: chrf
            value: 0.59224
            name: chr-F
          - type: bleu
            value: 38.3
            name: BLEU
          - type: chrf
            value: 0.63387
            name: chr-F
          - type: bleu
            value: 26.4
            name: BLEU
          - type: chrf
            value: 0.54446
            name: chr-F
          - type: bleu
            value: 25.7
            name: BLEU
          - type: chrf
            value: 0.55237
            name: chr-F
          - type: bleu
            value: 36.9
            name: BLEU
          - type: chrf
            value: 0.62233
            name: chr-F
          - type: bleu
            value: 31.8
            name: BLEU
          - type: chrf
            value: 0.58235
            name: chr-F
          - type: bleu
            value: 24.3
            name: BLEU
          - type: chrf
            value: 0.52453
            name: chr-F
          - type: bleu
            value: 22.7
            name: BLEU
          - type: chrf
            value: 0.4893
            name: chr-F
          - type: bleu
            value: 26.2
            name: BLEU
          - type: chrf
            value: 0.52704
            name: chr-F
          - type: bleu
            value: 18
            name: BLEU
          - type: chrf
            value: 0.45387
            name: chr-F
          - type: bleu
            value: 18.6
            name: BLEU
          - type: chrf
            value: 0.47303
            name: chr-F
          - type: bleu
            value: 24.9
            name: BLEU
          - type: chrf
            value: 0.51381
            name: chr-F
          - type: bleu
            value: 21.6
            name: BLEU
          - type: chrf
            value: 0.48224
            name: chr-F
          - type: bleu
            value: 18.1
            name: BLEU
          - type: chrf
            value: 0.45786
            name: chr-F
          - type: bleu
            value: 28.9
            name: BLEU
          - type: chrf
            value: 0.55984
            name: chr-F
          - type: bleu
            value: 33.8
            name: BLEU
          - type: chrf
            value: 0.60102
            name: chr-F
          - type: bleu
            value: 23.4
            name: BLEU
          - type: chrf
            value: 0.52145
            name: chr-F
          - type: bleu
            value: 22.2
            name: BLEU
          - type: chrf
            value: 0.52619
            name: chr-F
          - type: bleu
            value: 32.2
            name: BLEU
          - type: chrf
            value: 0.58836
            name: chr-F
          - type: bleu
            value: 27.6
            name: BLEU
          - type: chrf
            value: 0.54845
            name: chr-F
          - type: bleu
            value: 21.8
            name: BLEU
          - type: chrf
            value: 0.50661
            name: chr-F
          - type: bleu
            value: 32.4
            name: BLEU
          - type: chrf
            value: 0.58542
            name: chr-F
          - type: bleu
            value: 39.3
            name: BLEU
          - type: chrf
            value: 0.63688
            name: chr-F
          - type: bleu
            value: 26
            name: BLEU
          - type: chrf
            value: 0.53989
            name: chr-F
          - type: bleu
            value: 25.9
            name: BLEU
          - type: chrf
            value: 0.55232
            name: chr-F
          - type: bleu
            value: 36.5
            name: BLEU
          - type: chrf
            value: 0.61882
            name: chr-F
          - type: bleu
            value: 31
            name: BLEU
          - type: chrf
            value: 0.57419
            name: chr-F
          - type: bleu
            value: 23.8
            name: BLEU
          - type: chrf
            value: 0.52175
            name: chr-F
      # NOTE(review): the task name and args below mention only dan-fra, but the
      # metrics list that follows holds 12 BLEU/chr-F pairs whose values match,
      # row for row, the tatoeba-test-v2021-08-07 rows of the Evaluation table
      # (dan-fra through swe-spa). Presumably the per-pair results were merged
      # into this single entry -- confirm against the upstream model card.
      - task:
          type: translation
          name: Translation dan-fra
        dataset:
          name: tatoeba-test-v2021-08-07
          type: tatoeba_mt
          args: dan-fra
        metrics:
          - type: bleu
            value: 63.8
            name: BLEU
          - type: chrf
            value: 0.76671
            name: chr-F
          - type: bleu
            value: 56.2
            name: BLEU
          - type: chrf
            value: 0.74658
            name: chr-F
          - type: bleu
            value: 57.8
            name: BLEU
          - type: chrf
            value: 0.74944
            name: chr-F
          - type: bleu
            value: 54.8
            name: BLEU
          - type: chrf
            value: 0.72328
            name: chr-F
          - type: bleu
            value: 51
            name: BLEU
          - type: chrf
            value: 0.69354
            name: chr-F
          - type: bleu
            value: 49.2
            name: BLEU
          - type: chrf
            value: 0.66008
            name: chr-F
          - type: bleu
            value: 54.4
            name: BLEU
          - type: chrf
            value: 0.70854
            name: chr-F
          - type: bleu
            value: 55.9
            name: BLEU
          - type: chrf
            value: 0.73672
            name: chr-F
          - type: bleu
            value: 59.2
            name: BLEU
          - type: chrf
            value: 0.73014
            name: chr-F
          - type: bleu
            value: 56.6
            name: BLEU
          - type: chrf
            value: 0.73211
            name: chr-F
          - type: bleu
            value: 48.7
            name: BLEU
          - type: chrf
            value: 0.68146
            name: chr-F
          - type: bleu
            value: 55.3
            name: BLEU
          - type: chrf
            value: 0.71373
            name: chr-F

opus-mt-tc-big-gmq-itc

Table of Contents

Model Details

Neural machine translation model for translating from North Germanic languages (gmq) to Italic languages (itc).

This model is part of the OPUS-MT project, an effort to make neural machine translation models widely available and accessible for many languages in the world. All models are originally trained using the amazing framework of Marian NMT, an efficient NMT implementation written in pure C++. The models have been converted to PyTorch using the transformers library by Hugging Face. Training data is taken from OPUS and training pipelines use the procedures of OPUS-MT-train.

Model Description:

  • Developed by: Language Technology Research Group at the University of Helsinki
  • Model Type: Translation (transformer-big)
  • Release: 2022-08-09
  • License: CC-BY-4.0
  • Language(s):
    • Source Language(s): dan isl nno nob nor swe
    • Target Language(s): cat fra glg ita lat por ron spa
    • Language Pair(s): dan-cat dan-fra dan-glg dan-ita dan-por dan-ron dan-spa isl-cat isl-fra isl-ita isl-por isl-ron isl-spa nob-cat nob-fra nob-glg nob-ita nob-por nob-ron nob-spa swe-cat swe-fra swe-glg swe-ita swe-por swe-ron swe-spa
    • Valid Target Language Labels: >>acf<< >>aoa<< >>arg<< >>ast<< >>cat<< >>cbk<< >>ccd<< >>cks<< >>cos<< >>cri<< >>crs<< >>dlm<< >>drc<< >>egl<< >>ext<< >>fab<< >>fax<< >>fra<< >>frc<< >>frm<< >>fro<< >>frp<< >>fur<< >>gcf<< >>gcr<< >>glg<< >>hat<< >>idb<< >>ist<< >>ita<< >>itk<< >>kea<< >>kmv<< >>lad<< >>lad_Latn<< >>lat<< >>lat_Latn<< >>lij<< >>lld<< >>lmo<< >>lou<< >>mcm<< >>mfe<< >>mol<< >>mwl<< >>mxi<< >>mzs<< >>nap<< >>nrf<< >>oci<< >>osc<< >>osp<< >>osp_Latn<< >>pap<< >>pcd<< >>pln<< >>pms<< >>pob<< >>por<< >>pov<< >>pre<< >>pro<< >>qbb<< >>qhr<< >>rcf<< >>rgn<< >>roh<< >>ron<< >>ruo<< >>rup<< >>ruq<< >>scf<< >>scn<< >>sdc<< >>sdn<< >>spa<< >>spq<< >>spx<< >>src<< >>srd<< >>sro<< >>tmg<< >>tvy<< >>vec<< >>vkp<< >>wln<< >>xfa<< >>xum<<
  • Original Model: opusTCv20210807_transformer-big_2022-08-09.zip
  • Resources for more information:

This is a multilingual translation model with multiple target languages. A sentence-initial language token is required in the form of >>id<< (id = valid target language ID), e.g. >>fra<<.

Uses

This model can be used for translation and text-to-text generation.

Risks, Limitations and Biases

CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.

Significant research has explored bias and fairness issues with language models (see, e.g., Sheng et al. (2021) and Bender et al. (2021)).

How to Get Started With the Model

A short example code:

from transformers import MarianMTModel, MarianTokenizer

# Each source sentence starts with the >>id<< token that names its target
# language (Swedish -> Spanish, Norwegian -> Portuguese).
src_text = [
    ">>spa<< Jag är inte religiös.",
    ">>por<< Livet er for kort til å lære seg tysk."
]

# Hub repository ID, the same one the pipeline example in this card uses.
model_name = "Helsinki-NLP/opus-mt-tc-big-gmq-itc"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Pad the batch so both sentences can be translated in a single generate() call.
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

for t in translated:
    print( tokenizer.decode(t, skip_special_tokens=True) )

# expected output:
#     No soy religioso.
#     A vida é muito curta para aprender alemão.

You can also use OPUS-MT models with the transformers pipelines, for example:

from transformers import pipeline

# Build a translation pipeline from the Hub model; the >>spa<< prefix on the
# input selects Spanish as the target language.
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-gmq-itc")
print(pipe(">>spa<< Jag är inte religiös."))

# expected output: No soy religioso.

Training

Evaluation

langpair testset chr-F BLEU #sent #words
dan-fra tatoeba-test-v2021-08-07 0.76671 63.8 1731 11882
dan-ita tatoeba-test-v2021-08-07 0.74658 56.2 284 2226
dan-por tatoeba-test-v2021-08-07 0.74944 57.8 873 5360
dan-spa tatoeba-test-v2021-08-07 0.72328 54.8 5000 35528
isl-ita tatoeba-test-v2021-08-07 0.69354 51.0 236 1450
isl-spa tatoeba-test-v2021-08-07 0.66008 49.2 238 1229
nob-fra tatoeba-test-v2021-08-07 0.70854 54.4 323 2269
nob-spa tatoeba-test-v2021-08-07 0.73672 55.9 885 6866
swe-fra tatoeba-test-v2021-08-07 0.73014 59.2 1407 9580
swe-ita tatoeba-test-v2021-08-07 0.73211 56.6 715 4711
swe-por tatoeba-test-v2021-08-07 0.68146 48.7 320 2032
swe-spa tatoeba-test-v2021-08-07 0.71373 55.3 1351 8235
dan-cat flores101-devtest 0.59224 33.4 1012 27304
dan-fra flores101-devtest 0.63387 38.3 1012 28343
dan-glg flores101-devtest 0.54446 26.4 1012 26582
dan-ita flores101-devtest 0.55237 25.7 1012 27306
dan-por flores101-devtest 0.62233 36.9 1012 26519
dan-ron flores101-devtest 0.58235 31.8 1012 26799
dan-spa flores101-devtest 0.52453 24.3 1012 29199
isl-cat flores101-devtest 0.48930 22.7 1012 27304
isl-fra flores101-devtest 0.52704 26.2 1012 28343
isl-glg flores101-devtest 0.45387 18.0 1012 26582
isl-ita flores101-devtest 0.47303 18.6 1012 27306
isl-por flores101-devtest 0.51381 24.9 1012 26519
isl-ron flores101-devtest 0.48224 21.6 1012 26799
isl-spa flores101-devtest 0.45786 18.1 1012 29199
nob-cat flores101-devtest 0.55984 28.9 1012 27304
nob-fra flores101-devtest 0.60102 33.8 1012 28343
nob-glg flores101-devtest 0.52145 23.4 1012 26582
nob-ita flores101-devtest 0.52619 22.2 1012 27306
nob-por flores101-devtest 0.58836 32.2 1012 26519
nob-ron flores101-devtest 0.54845 27.6 1012 26799
nob-spa flores101-devtest 0.50661 21.8 1012 29199
swe-cat flores101-devtest 0.58542 32.4 1012 27304
swe-fra flores101-devtest 0.63688 39.3 1012 28343
swe-glg flores101-devtest 0.53989 26.0 1012 26582
swe-ita flores101-devtest 0.55232 25.9 1012 27306
swe-por flores101-devtest 0.61882 36.5 1012 26519
swe-ron flores101-devtest 0.57419 31.0 1012 26799
swe-spa flores101-devtest 0.52175 23.8 1012 29199

Citation Information

@inproceedings{tiedemann-thottingal-2020-opus,
    title = "{OPUS}-{MT} {--} Building open translation services for the World",
    author = {Tiedemann, J{\"o}rg  and Thottingal, Santhosh},
    booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
    month = nov,
    year = "2020",
    address = "Lisboa, Portugal",
    publisher = "European Association for Machine Translation",
    url = "https://aclanthology.org/2020.eamt-1.61",
    pages = "479--480",
}

@inproceedings{tiedemann-2020-tatoeba,
    title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
    author = {Tiedemann, J{\"o}rg},
    booktitle = "Proceedings of the Fifth Conference on Machine Translation",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.wmt-1.139",
    pages = "1174--1182",
}

Acknowledgements

The work is supported by the European Language Grid as pilot project 2866, by the FoTran project, funded by the European Research Council (ERC) under the European Union's Horizon 2020 research and innovation programme (grant agreement No 771113), and the MeMAD project, funded by the European Union's Horizon 2020 Research and Innovation Programme under grant agreement No 780069. We are also grateful for the generous computational resources and IT infrastructure provided by CSC -- IT Center for Science, Finland.

Model conversion info

  • transformers version: 4.16.2
  • OPUS-MT git hash: 8b9f0b0
  • port time: Sat Aug 13 00:00:00 EEST 2022
  • port machine: LM0-400-22516.local