Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

.gitattributes +3 -0
README.md +57 -0
config.json +10 -0
generation_config.json +15 -0
model.bin +3 -0
shared_vocabulary.json +0 -0
source.spm +3 -0
target.spm +3 -0
tokenizer_config.json +48 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+source.spm filter=lfs diff=lfs merge=lfs -text
+target.spm filter=lfs diff=lfs merge=lfs -text
+vocab.spm filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+license: apache-2.0
+datasets:
+- Helsinki-NLP/tatoeba
+- openlanguagedata/flores_plus
+language:
+- es
+- ca
+metrics:
+- bleu
+- comet
+- chrf
+pipeline_tag: translation
+---
+# OPUS-MT-tiny-cat-spa
+Distilled model from a Tatoeba-MT Teacher: [Tatoeba-MT-models/itc-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30](https://object.pouta.csc.fi/Tatoeba-MT-models/itc-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip), which has been trained on the [Tatoeba](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data) dataset.
+We used the [OpusDistillery](https://github.com/Helsinki-NLP/OpusDistillery) to train a new student with the tiny architecture, with a regular transformer decoder.
+For training data, we used [Tatoeba](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data).
+The configuration file fed into OpusDistillery can be found [here](https://github.com/Helsinki-NLP/OpusDistillery/blob/main/configs/opustranslate_hf/config.op.ca-es.yml).
+## How to run
+```python
+from transformers import MarianMTModel, MarianTokenizer
+model_name = "Helsinki-NLP/opus-mt_tiny_cat-spa"
+tokenizer = MarianTokenizer.from_pretrained(model_name)
+model = MarianMTModel.from_pretrained(model_name)
+tok = tokenizer("El concepte prové de la Xina, on la flor del cirerer era la més apreciada.", return_tensors="pt").input_ids
+output = model.generate(tok)[0]
+tokenizer.decode(output, skip_special_tokens=True)
+```
+## Benchmarks
+### Teacher
+| testset               | BLEU  | chr-F | COMET |
+|-----------------------|-------|-------|-------|
+| Flores+ 	| 24.7	| 53.4 | 0.8264 |
+### Student
+| testset               | BLEU  | chr-F | COMET |
+|-----------------------|-------|-------|-------|
+| Flores+ 	| 24.2 	| 53.2 | 0.8484 |
+## Marian models
+We also provide Marian-compatible versions of this model. To use them, compile [Marian](https://marian-nmt.github.io/quickstart/) and run decoding with `marian-decoder`, for example:
+```bash
+marian-decoder \
+  -i input.txt \
+  -c final.model.npz.best-perplexity.npz.decoder.yml \
+  -m final.model.npz.best-perplexity.npz \
+  -v vocab.spm vocab.spm
+```

config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "add_source_bos": false,
+  "add_source_eos": false,
+  "bos_token": "<s>",
+  "decoder_start_token": "</s>",
+  "eos_token": "</s>",
+  "layer_norm_epsilon": null,
+  "multi_query_attention": false,
+  "unk_token": "<unk>"
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "_from_model_config": true,
+  "bad_words_ids": [
+    [
+      32000
+    ]
+  ],
+  "bos_token_id": 0,
+  "decoder_start_token_id": 32000,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "max_length": 512,
+  "pad_token_id": 32000,
+  "transformers_version": "4.57.6"
+}

model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e7b01fc49820b37996e4b1353539be92e2218ee4abf5e89ca56c3ff4ad303bd
+size 34479419

shared_vocabulary.json ADDED Viewed

The diff for this file is too large to render. See raw diff

source.spm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31568642c69528bcc70c8e2a30c4f43a28ff39ab82e69cd70431a251f8692fcb
+size 831666

target.spm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31568642c69528bcc70c8e2a30c4f43a28ff39ab82e69cd70431a251f8692fcb
+size 831666

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "separate_vocabs": false,
+  "source_lang": null,
+  "sp_model_kwargs": {},
+  "target_lang": null,
+  "tokenizer_class": "MarianTokenizer",
+  "unk_token": "<unk>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff