Add model files
Browse files- README.md +26 -0
- config.json +10 -0
- generation_config.json +16 -0
- model.bin +3 -0
- shared_vocabulary.json +0 -0
- source.spm +0 -0
- special_tokens_map.json +5 -0
- target.spm +0 -0
- tokenizer_config.json +1 -0
- vocab.json +0 -0
README.md
CHANGED
@@ -1,3 +1,29 @@
|
|
1 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
license: cc-by-sa-4.0
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
language:
|
3 |
+
- ja
|
4 |
+
- en
|
5 |
+
tags:
|
6 |
+
- translation
|
7 |
+
- ctranslate2
|
8 |
license: cc-by-sa-4.0
|
9 |
+
base_model: staka/fugumt-en-ja
|
10 |
---
|
11 |
+
# fugumt-en-ja-ct2
|
12 |
+
|
13 |
+
This is a version of [`staka/fugumt-en-ja`](https://huggingface.co/staka/fugumt-en-ja)
|
14 |
+
converted for use with [CTranslate2](https://github.com/OpenNMT/CTranslate2).
|
15 |
+
|
16 |
+
The conversion was performed using the following command:
|
17 |
+
|
18 |
+
```bash
|
19 |
+
ct2-transformers-converter --model staka/fugumt-en-ja --output_dir fugumt-en-ja-ct2 \
|
20 |
+
--copy_files generation_config.json source.spm target.spm special_tokens_map.json \
|
21 |
+
tokenizer_config.json vocab.json
|
22 |
+
```
|
23 |
+
|
24 |
+
## License
|
25 |
+
This adaptation is based on [`staka/fugumt-en-ja`](https://huggingface.co/staka/fugumt-en-ja),
|
26 |
+
which is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License
|
27 |
+
([CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)).
|
28 |
+
The model was converted to the CTranslate2 format for compatibility with [CTranslate2](https://github.com/OpenNMT/CTranslate2).
|
29 |
+
This modified version is also distributed under the CC BY-SA 4.0 license.
|
config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_source_bos": false,
|
3 |
+
"add_source_eos": false,
|
4 |
+
"bos_token": "<s>",
|
5 |
+
"decoder_start_token": "</s>",
|
6 |
+
"eos_token": "</s>",
|
7 |
+
"layer_norm_epsilon": null,
|
8 |
+
"multi_query_attention": false,
|
9 |
+
"unk_token": "<unk>"
|
10 |
+
}
|
generation_config.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bad_words_ids": [
|
4 |
+
[
|
5 |
+
32000
|
6 |
+
]
|
7 |
+
],
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"decoder_start_token_id": 32000,
|
10 |
+
"eos_token_id": 0,
|
11 |
+
"forced_eos_token_id": 0,
|
12 |
+
"max_length": 512,
|
13 |
+
"num_beams": 12,
|
14 |
+
"pad_token_id": 32000,
|
15 |
+
"transformers_version": "4.30.0"
|
16 |
+
}
|
model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c05943b9c91597e52600e4dbd8ece8388ee804da97a782bd1ea91aef93e2104
|
3 |
+
size 121644654
|
shared_vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
source.spm
ADDED
Binary file (768 kB). View file
|
|
special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eos_token": "</s>",
|
3 |
+
"pad_token": "<pad>",
|
4 |
+
"unk_token": "<unk>"
|
5 |
+
}
|
target.spm
ADDED
Binary file (768 kB). View file
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"target_lang": "ja", "source_lang": "en"}
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|