tiedeman commited on
Commit
98b60ee
1 Parent(s): ee3bb87

Initial commit

Browse files
Files changed (8) hide show
  1. README.md +110 -0
  2. config.json +43 -0
  3. pytorch_model.bin +3 -0
  4. source.spm +0 -0
  5. special_tokens_map.json +1 -0
  6. target.spm +0 -0
  7. tokenizer_config.json +1 -0
  8. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - gmw
4
+ - gmw
5
+
6
+ tags:
7
+ - translation
8
+
9
+ license: cc-by-4.0
10
+ ---
11
+ # opus-mt-tc-base-gmw-gmw
12
+
13
+ Neural machine translation model for translating from West Germanic languages to West Germanic languages.
14
+
15
+ This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models are originally trained using the amazing framework of [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++. The models have been converted to PyTorch using the transformers library by Hugging Face. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines use the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).
16
+
17
+ * Publications: [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) , [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/)
18
+
19
+ ## Model info
20
+
21
+ * Release: 2021-02-23
22
+ * source language(s): afr deu eng fry gos hrx ltz nds nld pdc yid
23
+ * target language(s): afr deu eng fry nds nld
24
+ * valid target language labels: >>afr<< >>ang_Latn<< >>deu<< >>eng<< >>fry<< >>ltz<< >>nds<< >>nld<< >>sco<< >>yid<<
25
+ * model: transformer
26
+ * data: opus
27
+ * tokenization: SentencePiece (spm32k,spm32k)
28
+ * original model: [opus-2021-02-23.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/gmw-gmw/opus-2021-02-23.zip)
29
+
30
+ This is a multilingual translation model with multiple target languages. A sentence initial language token is required in the form of `>>id<<` (id = valid target language ID), e.g. `>>afr<<`
31
+
32
+ ## Usage
33
+
34
+ You can use OPUS-MT models with the transformers pipelines, for example:
35
+
36
+ ```python
37
+ from transformers import pipeline
38
+ pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-base-gmw-gmw")
39
+ print(pipe(">>afr<< Replace this with text in an accepted source language."))
40
+ ```
41
+
42
+ ## Benchmarks
43
+
44
+ | langpair | testset | BLEU | chr-F | #sent | #words | BP |
45
+ |----------|---------|-------|-------|-------|--------|----|
46
+ | afr-deu | Tatoeba-test | 48.5 | 0.677 | 1583 | 9105 | 1.000 |
47
+ | afr-eng | Tatoeba-test | 58.7 | 0.727 | 1374 | 9622 | 0.995 |
48
+ | afr-nld | Tatoeba-test | 54.7 | 0.713 | 1056 | 6710 | 0.989 |
49
+ | deu-afr | Tatoeba-test | 52.4 | 0.697 | 1583 | 9507 | 1.000 |
50
+ | deu-eng | newssyscomb2009 | 25.4 | 0.527 | 502 | 11821 | 0.986 |
51
+ | deu-eng | news-test2008 | 23.9 | 0.519 | 2051 | 49380 | 0.992 |
52
+ | deu-eng | newstest2009 | 23.5 | 0.517 | 2525 | 65402 | 0.978 |
53
+ | deu-eng | newstest2010 | 26.1 | 0.548 | 2489 | 61724 | 1.000 |
54
+ | deu-eng | newstest2011 | 23.9 | 0.525 | 3003 | 74681 | 1.000 |
55
+ | deu-eng | newstest2012 | 25.0 | 0.533 | 3003 | 72812 | 1.000 |
56
+ | deu-eng | newstest2013 | 27.7 | 0.549 | 3000 | 64505 | 1.000 |
57
+ | deu-eng | newstest2014-deen | 27.4 | 0.549 | 3003 | 67337 | 0.977 |
58
+ | deu-eng | newstest2015-ende | 28.8 | 0.554 | 2169 | 46443 | 0.973 |
59
+ | deu-eng | newstest2016-ende | 33.7 | 0.598 | 2999 | 64126 | 1.000 |
60
+ | deu-eng | newstest2017-ende | 29.6 | 0.562 | 3004 | 64399 | 0.979 |
61
+ | deu-eng | newstest2018-ende | 36.3 | 0.611 | 2998 | 67013 | 0.977 |
62
+ | deu-eng | newstest2019-deen | 32.7 | 0.585 | 2000 | 39282 | 0.984 |
63
+ | deu-eng | Tatoeba-test | 44.7 | 0.629 | 10000 | 81233 | 0.975 |
64
+ | deu-nds | Tatoeba-test | 18.7 | 0.444 | 10000 | 76144 | 0.988 |
65
+ | deu-nld | Tatoeba-test | 48.7 | 0.672 | 10000 | 73546 | 0.969 |
66
+ | eng-afr | Tatoeba-test | 56.5 | 0.735 | 1374 | 10317 | 0.984 |
67
+ | eng-deu | newssyscomb2009 | 19.4 | 0.503 | 502 | 11271 | 0.991 |
68
+ | eng-deu | news-test2008 | 19.5 | 0.493 | 2051 | 47427 | 0.996 |
69
+ | eng-deu | newstest2009 | 18.8 | 0.499 | 2525 | 62816 | 0.993 |
70
+ | eng-deu | newstest2010 | 20.8 | 0.509 | 2489 | 61511 | 0.958 |
71
+ | eng-deu | newstest2011 | 19.2 | 0.493 | 3003 | 72981 | 0.980 |
72
+ | eng-deu | newstest2012 | 19.6 | 0.494 | 3003 | 72886 | 0.960 |
73
+ | eng-deu | newstest2013 | 22.8 | 0.518 | 3000 | 63737 | 0.974 |
74
+ | eng-deu | newstest2015-ende | 25.8 | 0.545 | 2169 | 44260 | 1.000 |
75
+ | eng-deu | newstest2016-ende | 30.3 | 0.581 | 2999 | 62670 | 0.989 |
76
+ | eng-deu | newstest2017-ende | 24.2 | 0.537 | 3004 | 61291 | 1.000 |
77
+ | eng-deu | newstest2018-ende | 35.5 | 0.616 | 2998 | 64276 | 1.000 |
78
+ | eng-deu | newstest2019-ende | 31.6 | 0.586 | 1997 | 48969 | 0.973 |
79
+ | eng-deu | Tatoeba-test | 37.8 | 0.591 | 10000 | 83347 | 0.991 |
80
+ | eng-nds | Tatoeba-test | 16.5 | 0.411 | 2500 | 18264 | 0.992 |
81
+ | eng-nld | Tatoeba-test | 50.3 | 0.677 | 10000 | 71436 | 0.979 |
82
+ | fry-deu | Tatoeba-test | 28.7 | 0.545 | 66 | 432 | 1.000 |
83
+ | fry-eng | Tatoeba-test | 31.9 | 0.496 | 205 | 1500 | 1.000 |
84
+ | fry-nld | Tatoeba-test | 43.0 | 0.634 | 233 | 1672 | 1.000 |
85
+ | gos-nld | Tatoeba-test | 15.9 | 0.409 | 1852 | 9903 | 0.959 |
86
+ | hrx-deu | Tatoeba-test | 24.7 | 0.487 | 471 | 2805 | 0.984 |
87
+ | ltz-deu | Tatoeba-test | 36.6 | 0.552 | 337 | 2144 | 1.000 |
88
+ | ltz-eng | Tatoeba-test | 31.4 | 0.477 | 283 | 1751 | 1.000 |
89
+ | ltz-nld | Tatoeba-test | 37.5 | 0.523 | 273 | 1567 | 1.000 |
90
+ | multi-multi | Tatoeba-test | 37.1 | 0.569 | 10000 | 73153 | 1.000 |
91
+ | nds-deu | Tatoeba-test | 34.5 | 0.572 | 10000 | 74571 | 1.000 |
92
+ | nds-eng | Tatoeba-test | 29.6 | 0.492 | 2500 | 17589 | 1.000 |
93
+ | nds-nld | Tatoeba-test | 42.2 | 0.621 | 1657 | 11490 | 0.994 |
94
+ | nld-afr | Tatoeba-test | 59.0 | 0.756 | 1056 | 6823 | 1.000 |
95
+ | nld-deu | Tatoeba-test | 50.6 | 0.688 | 10000 | 72438 | 1.000 |
96
+ | nld-eng | Tatoeba-test | 54.5 | 0.702 | 10000 | 69848 | 0.975 |
97
+ | nld-fry | Tatoeba-test | 23.3 | 0.462 | 233 | 1679 | 1.000 |
98
+ | nld-nds | Tatoeba-test | 21.7 | 0.462 | 1657 | 11711 | 0.998 |
99
+ | pdc-eng | Tatoeba-test | 24.3 | 0.402 | 53 | 399 | 1.000 |
100
+ | yid-nld | Tatoeba-test | 21.3 | 0.402 | 55 | 323 | 1.000 |
101
+
102
+ * test set translations: [opus-2021-02-23.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/gmw-gmw/opus-2021-02-23.test.txt)
103
+ * test set scores: [opus-2021-02-23.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/gmw-gmw/opus-2021-02-23.eval.txt)
104
+
105
+ ## Model conversion info
106
+
107
+ * transformers version: 4.12.3
108
+ * OPUS-MT git hash: fc19512
109
+ * port time: Thu Jan 27 18:04:00 EET 2022
110
+ * port machine: LM0-400-22516.local
config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "swish",
4
+ "architectures": [
5
+ "MarianMTModel"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "bad_words_ids": [
9
+ [
10
+ 35451
11
+ ]
12
+ ],
13
+ "bos_token_id": 0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 512,
16
+ "decoder_attention_heads": 8,
17
+ "decoder_ffn_dim": 2048,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_start_token_id": 35451,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "forced_eos_token_id": 0,
28
+ "init_std": 0.02,
29
+ "is_encoder_decoder": true,
30
+ "max_length": 512,
31
+ "max_position_embeddings": 512,
32
+ "model_type": "marian",
33
+ "normalize_embedding": false,
34
+ "num_beams": 6,
35
+ "num_hidden_layers": 6,
36
+ "pad_token_id": 35451,
37
+ "scale_embedding": true,
38
+ "static_position_embeddings": true,
39
+ "torch_dtype": "float16",
40
+ "transformers_version": "4.12.3",
41
+ "use_cache": true,
42
+ "vocab_size": 35452
43
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4586f60a829f1bd8331e4f877b3ddf493c333c821afe8bac9b84847681df2c7
3
+ size 161034627
source.spm ADDED
Binary file (802 kB). View file
 
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
Binary file (802 kB). View file
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"source_lang": "gmw", "target_lang": "gmw", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "marian-models/opus-2021-02-23/gmw-gmw", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff