hardwick committed on
Commit
420cb4a
1 Parent(s): 64c1167

Initial submission of a model with the new conversion procedure.

Browse files
README.md ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - dsb
4
+ - cs
5
+ - csb_Latn
6
+ - hsb
7
+ - pl
8
+ - zlw
9
+ - hu
10
+ - vro
11
+ - fi
12
+ - liv_Latn
13
+ - mdf
14
+ - krl
15
+ - fkv_Latn
16
+ - mhr
17
+ - et
18
+ - sma
19
+ - udm
20
+ - vep
21
+ - myv
22
+ - kpv
23
+ - se
24
+ - izh
25
+ - fiu
26
+
27
+ tags:
28
+ - translation
29
+
30
+ license: apache-2.0
31
+ ---
32
+ ### zlw-fiu
33
+ * source language name: West Slavic languages
34
+ * target language name: Finno-Ugrian languages
35
+ * OPUS readme: [README.md](https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/README.md)
36
+ * model: transformer
37
+ * source language codes: dsb, cs, csb_Latn, hsb, pl, zlw
38
+ * target language codes: hu, vro, fi, liv_Latn, mdf, krl, fkv_Latn, mhr, et, sma, udm, vep, myv, kpv, se, izh, fiu
39
+ * dataset: opus
40
+ * release date: 2021-02-18
41
+ * pre-processing: normalization + SentencePiece (spm32k,spm32k)
42
+ * download original weights: [opus-2021-02-18.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.zip)
43
+ * a sentence-initial language token is required in the form of `>>id<<` (id = valid, usually three-letter target language ID)
44
+ * Training data:
45
+ * ces-fin: Tatoeba-train (1000000)
46
+ * ces-hun: Tatoeba-train (1000000)
47
+ * pol-est: Tatoeba-train (1000000)
48
+ * pol-fin: Tatoeba-train (1000000)
49
+ * pol-hun: Tatoeba-train (1000000)
50
+ * Validation data:
51
+ * ces-fin: Tatoeba-dev, 1000
52
+ * ces-hun: Tatoeba-dev, 1000
53
+ * est-pol: Tatoeba-dev, 1000
54
+ * fin-pol: Tatoeba-dev, 1000
55
+ * hun-pol: Tatoeba-dev, 1000
56
+ * mhr-pol: Tatoeba-dev, 461
57
+ * total-size-shuffled: 5426
58
+ * devset-selected: top 5000 lines of Tatoeba-dev.src.shuffled!
59
+ * Test data:
60
+ * newssyscomb2009.ces-hun: 502/9733
61
+ * newstest2009.ces-hun: 2525/54965
62
+ * Tatoeba-test.ces-fin: 88/408
63
+ * Tatoeba-test.ces-hun: 1911/10336
64
+ * Tatoeba-test.multi-multi: 4562/25497
65
+ * Tatoeba-test.pol-chm: 5/36
66
+ * Tatoeba-test.pol-est: 15/98
67
+ * Tatoeba-test.pol-fin: 609/3293
68
+ * Tatoeba-test.pol-hun: 1934/11285
69
+ * test set translations file: [test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.test.txt)
70
+ * test set scores file: [eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.eval.txt)
71
+ * BLEU-scores
72
+ |Test set|score|
73
+ |---|---|
74
+ |Tatoeba-test.ces-fin|57.2|
75
+ |Tatoeba-test.ces-hun|42.6|
76
+ |Tatoeba-test.multi-multi|39.4|
77
+ |Tatoeba-test.pol-hun|36.6|
78
+ |Tatoeba-test.pol-fin|36.1|
79
+ |Tatoeba-test.pol-est|20.9|
80
+ |newssyscomb2009.ces-hun|13.9|
81
+ |newstest2009.ces-hun|13.9|
82
+ |Tatoeba-test.pol-chm|2.0|
83
+ * chr-F-scores
84
+ |Test set|score|
85
+ |---|---|
86
+ |Tatoeba-test.ces-fin|0.71|
87
+ |Tatoeba-test.ces-hun|0.637|
88
+ |Tatoeba-test.multi-multi|0.616|
89
+ |Tatoeba-test.pol-hun|0.605|
90
+ |Tatoeba-test.pol-fin|0.592|
91
+ |newssyscomb2009.ces-hun|0.449|
92
+ |newstest2009.ces-hun|0.443|
93
+ |Tatoeba-test.pol-est|0.372|
94
+ |Tatoeba-test.pol-chm|0.007|
95
+
96
+ ### System Info:
97
+ * hf_name: zlw-fiu
98
+ * source_languages: dsb,cs,csb_Latn,hsb,pl,zlw
99
+ * target_languages: hu,vro,fi,liv_Latn,mdf,krl,fkv_Latn,mhr,et,sma,udm,vep,myv,kpv,se,izh,fiu
100
+ * opus_readme_url: https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.zip/README.md
101
+ * original_repo: Tatoeba-Challenge
102
+ * tags: ['translation']
103
+ * languages: ['dsb', 'cs', 'csb_Latn', 'hsb', 'pl', 'zlw', 'hu', 'vro', 'fi', 'liv_Latn', 'mdf', 'krl', 'fkv_Latn', 'mhr', 'et', 'sma', 'udm', 'vep', 'myv', 'kpv', 'se', 'izh', 'fiu']
104
+ * src_constituents: ['dsb', 'ces', 'csb_Latn', 'hsb', 'pol']
105
+ * tgt_constituents: ['hun', 'vro', 'fin', 'liv_Latn', 'mdf', 'krl', 'fkv_Latn', 'mhr', 'est', 'sma', 'udm', 'vep', 'myv', 'kpv', 'sme', 'izh']
106
+ * src_multilingual: True
107
+ * tgt_multilingual: True
108
+ * helsinki_git_sha: a0966db6db0ae616a28471ff0faf461b36fec07d
109
+ * transformers_git_sha: 3857f2b4e34912c942694489c2b667d9476e55f5
110
+ * port_machine: bungle
111
+ * port_time: 2021-06-29-15:24
config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "swish",
4
+ "architectures": [
5
+ "MarianMTModel"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "bad_words_ids": [
9
+ [
10
+ 59746
11
+ ]
12
+ ],
13
+ "bos_token_id": 0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 512,
16
+ "decoder_attention_heads": 8,
17
+ "decoder_ffn_dim": 2048,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_start_token_id": 59746,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "forced_eos_token_id": 0,
28
+ "gradient_checkpointing": false,
29
+ "init_std": 0.02,
30
+ "is_encoder_decoder": true,
31
+ "max_length": 512,
32
+ "max_position_embeddings": 512,
33
+ "model_type": "marian",
34
+ "normalize_embedding": false,
35
+ "num_beams": 6,
36
+ "num_hidden_layers": 6,
37
+ "pad_token_id": 59746,
38
+ "scale_embedding": true,
39
+ "static_position_embeddings": true,
40
+ "transformers_version": "4.7.0.dev0",
41
+ "use_cache": true,
42
+ "vocab_size": 59747
43
+ }
metadata.json ADDED
@@ -0,0 +1 @@
 
1
+ {"hf_name": "zlw-fiu", "source_languages": "dsb,cs,csb_Latn,hsb,pl,zlw", "target_languages": "hu,vro,fi,liv_Latn,mdf,krl,fkv_Latn,mhr,et,sma,udm,vep,myv,kpv,se,izh,fiu", "opus_readme_url": "https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.zip/README.md", "original_repo": "Tatoeba-Challenge", "tags": ["translation"], "languages": ["dsb", "cs", "csb_Latn", "hsb", "pl", "zlw", "hu", "vro", "fi", "liv_Latn", "mdf", "krl", "fkv_Latn", "mhr", "et", "sma", "udm", "vep", "myv", "kpv", "se", "izh", "fiu"], "src_constituents": ["dsb", "ces", "csb_Latn", "hsb", "pol"], "tgt_constituents": ["hun", "vro", "fin", "liv_Latn", "mdf", "krl", "fkv_Latn", "mhr", "est", "sma", "udm", "vep", "myv", "kpv", "sme", "izh"], "src_multilingual": true, "tgt_multilingual": true, "helsinki_git_sha": "a0966db6db0ae616a28471ff0faf461b36fec07d", "transformers_git_sha": "3857f2b4e34912c942694489c2b667d9476e55f5", "port_machine": "bungle", "port_time": "2021-06-29-15:24"}
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821f51c216a291c58c2cb8eb892709b247de79549c61e73ce60e9b02bf7d447c
3
+ size 210842993
source.spm ADDED
Binary file (824 kB). View file
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
Binary file (813 kB). View file
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"source_lang": "zlw", "target_lang": "fiu", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "marian_ckpt/zlw-fiu"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff