michaelfeil commited on
Commit
76ec296
1 Parent(s): a789499

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,34 +1,9 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
- *.tgz filter=lfs diff=lfs merge=lfs -text
30
- *.wasm filter=lfs diff=lfs merge=lfs -text
31
- *.xz filter=lfs diff=lfs merge=lfs -text
32
- *.zip filter=lfs diff=lfs merge=lfs -text
33
- *.zst filter=lfs diff=lfs merge=lfs -text
34
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
4
  *.h5 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
README.md ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - es
5
+
6
+ tags:
7
+ - ctranslate2
8
+ - translation
9
+
10
+ license: apache-2.0
11
+ ---
12
+ # Fast-Inference with Ctranslate2
13
+ Speed up inference by 2x-8x using int8 inference in C++
14
+
15
+ quantized version of [Helsinki-NLP/opus-mt-en-es](https://huggingface.co/Helsinki-NLP/opus-mt-en-es)
16
+ ```bash
17
+ pip install hf-hub-ctranslate2>=1.0.0 ctranslate2>=3.13.0
18
+ ```
19
+ Converted using
20
+ ```
21
+ ct2-transformers-converter --model Helsinki-NLP/opus-mt-en-es --output_dir /home/michael/tmp-ct2fast-opus-mt-en-es --force --copy_files README.md generation_config.json tokenizer_config.json vocab.json source.spm .gitattributes target.spm --quantization float16
22
+ ```
23
+
24
+ Checkpoint compatible to [ctranslate2](https://github.com/OpenNMT/CTranslate2) and [hf-hub-ctranslate2](https://github.com/michaelfeil/hf-hub-ctranslate2)
25
+ - `compute_type=int8_float16` for `device="cuda"`
26
+ - `compute_type=int8` for `device="cpu"`
27
+
28
+ ```python
29
+ from hf_hub_ctranslate2 import TranslatorCT2fromHfHub, GeneratorCT2fromHfHub
30
+ from transformers import AutoTokenizer
31
+
32
+ model_name = "michaelfeil/ct2fast-opus-mt-en-es"
33
+ # use either TranslatorCT2fromHfHub or GeneratorCT2fromHfHub here, depending on model.
34
+ model = TranslatorCT2fromHfHub(
35
+ # load in int8 on CUDA
36
+ model_name_or_path=model_name,
37
+ device="cuda",
38
+ compute_type="int8_float16",
39
+ tokenizer=AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")
40
+ )
41
+ outputs = model.generate(
42
+ text=["How do you call a fast Flan-ingo?", "User: How are you doing?"],
43
+ )
44
+ print(outputs)
45
+ ```
46
+
47
+ # Licence and other remarks:
48
+ This is just a quantized version. Licence conditions are intended to be identical to the original Hugging Face repo.
49
+
50
+ # Original description
51
+
52
+
53
+ ### eng-spa
54
+
55
+ * source group: English
56
+ * target group: Spanish
57
+ * OPUS readme: [eng-spa](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/eng-spa/README.md)
58
+
59
+ * model: transformer
60
+ * source language(s): eng
61
+ * target language(s): spa
62
+ * model: transformer
63
+ * pre-processing: normalization + SentencePiece (spm32k,spm32k)
64
+ * download original weights: [opus-2020-08-18.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.zip)
65
+ * test set translations: [opus-2020-08-18.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.test.txt)
66
+ * test set scores: [opus-2020-08-18.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.eval.txt)
67
+
68
+ ## Benchmarks
69
+
70
+ | testset | BLEU | chr-F |
71
+ |-----------------------|-------|-------|
72
+ | newssyscomb2009-engspa.eng.spa | 31.0 | 0.583 |
73
+ | news-test2008-engspa.eng.spa | 29.7 | 0.564 |
74
+ | newstest2009-engspa.eng.spa | 30.2 | 0.578 |
75
+ | newstest2010-engspa.eng.spa | 36.9 | 0.620 |
76
+ | newstest2011-engspa.eng.spa | 38.2 | 0.619 |
77
+ | newstest2012-engspa.eng.spa | 39.0 | 0.625 |
78
+ | newstest2013-engspa.eng.spa | 35.0 | 0.598 |
79
+ | Tatoeba-test.eng.spa | 54.9 | 0.721 |
80
+
81
+
82
+ ### System Info:
83
+ - hf_name: eng-spa
84
+
85
+ - source_languages: eng
86
+
87
+ - target_languages: spa
88
+
89
+ - opus_readme_url: https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/eng-spa/README.md
90
+
91
+ - original_repo: Tatoeba-Challenge
92
+
93
+ - tags:
94
+ - ctranslate2 ['translation']
95
+
96
+ - languages: ['en', 'es']
97
+
98
+ - src_constituents: {'eng'}
99
+
100
+ - tgt_constituents: {'spa'}
101
+
102
+ - src_multilingual: False
103
+
104
+ - tgt_multilingual: False
105
+
106
+ - prepro: normalization + SentencePiece (spm32k,spm32k)
107
+
108
+ - url_model: https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.zip
109
+
110
+ - url_test_set: https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.test.txt
111
+
112
+ - src_alpha3: eng
113
+
114
+ - tgt_alpha3: spa
115
+
116
+ - short_pair: en-es
117
+
118
+ - chrF2_score: 0.721
119
+
120
+ - bleu: 54.9
121
+
122
+ - brevity_penalty: 0.978
123
+
124
+ - ref_len: 77311.0
125
+
126
+ - src_name: English
127
+
128
+ - tgt_name: Spanish
129
+
130
+ - train_date: 2020-08-18 00:00:00
131
+
132
+ - src_alpha2: en
133
+
134
+ - tgt_alpha2: es
135
+
136
+ - prefer_old: False
137
+
138
+ - long_pair: eng-spa
139
+
140
+ - helsinki_git_sha: d2f0910c89026c34a44e331e785dec1e0faa7b82
141
+
142
+ - transformers_git_sha: f7af09b4524b784d67ae8526f0e2fcc6f5ed0de9
143
+
144
+ - port_machine: brutasse
145
+
146
+ - port_time: 2020-08-24-18:20
config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_source_bos": false,
3
+ "add_source_eos": false,
4
+ "bos_token": "<s>",
5
+ "decoder_start_token": "</s>",
6
+ "eos_token": "</s>",
7
+ "unk_token": "<unk>"
8
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bad_words_ids": [
4
+ [
5
+ 65000
6
+ ]
7
+ ],
8
+ "bos_token_id": 0,
9
+ "decoder_start_token_id": 65000,
10
+ "eos_token_id": 0,
11
+ "forced_eos_token_id": 0,
12
+ "max_length": 512,
13
+ "num_beams": 4,
14
+ "pad_token_id": 65000,
15
+ "transformers_version": "4.27.0.dev0"
16
+ }
model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36cd9bcb181fc6d5832deeaf770ce183ff4edbbc5e4fe0f86cec92da4379f3b7
3
+ size 155502501
shared_vocabulary.txt ADDED
The diff for this file is too large to render. See raw diff
 
source.spm ADDED
Binary file (802 kB). View file
 
target.spm ADDED
Binary file (826 kB). View file
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"target_lang": "spa", "source_lang": "eng"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff