update model

Browse files

Files changed (5) hide show

README.md +19 -11
config.json +19 -4
preprocessor_config.json +1 -0
pytorch_model.bin +2 -2
vocab.json +1 -1

README.md CHANGED Viewed

@@ -24,10 +24,10 @@ model-index:
     metrics:
        - name: Test WER
          type: wer
-         value: 11.85
        - name: Test CER
          type: cer
-         value: 3.17
 ---
 # Wav2Vec2-Large-XLSR-53-German
@@ -49,7 +49,7 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 LANG_ID = "de"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-german"
-SAMPLES = 5
 test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
@@ -82,10 +82,15 @@ for i, predicted_sentence in enumerate(predicted_sentences):
 | Reference  | Prediction |
 | ------------- | ------------- |
 | ZIEHT EUCH BITTE DRAUSSEN DIE SCHUHE AUS. | ZIEHT EUCH BITTE DRAUSSEN DIE SCHUHE AUS |
-| ES KOMMT ZUM SHOWDOWN IN GSTAAD. | ES GRONTEHILSCHONDEBAR ENBESTACDEN |
-| IHRE FOTOSTRECKEN ERSCHIENEN IN MODEMAGAZINEN WIE DER VOGUE, HARPER’S BAZAAR UND MARIE CLAIRE. | IHRE FROTESTRECKEN ERSCHIENEN IN MODEMAGAZINEN WIE DER VOLKE-APERS BASAR VAREQER |
-| FELIPE HAT EINE AUCH FÜR MONARCHEN UNGEWÖHNLICH LANGE TITELLISTE. | FIELIPPE HATE EINE AUCH FÜR MONACHEN UNGEWÖHNLICH LANGE TITELLISTE |
-| ER WURDE ZU EHREN DES REICHSKANZLERS OTTO VON BISMARCK ERRICHTET. | ER WURDE ZU EHREN DES REICHSKANZLERS OTTO VON BISMARK ERRICHTET |
 ## Evaluation
@@ -102,9 +107,11 @@ LANG_ID = "de"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-german"
 DEVICE = "cuda"
-CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                    "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
-                   "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。"]
 test_dataset = load_dataset("common_voice", LANG_ID, split="test")
@@ -152,11 +159,12 @@ print(f"CER: {cer.compute(predictions=predictions, references=references, chunk_
 **Test Result**:
-In the table below I report the Word Error Rate (WER) and the Character Error Rate (CER) of the model. I ran the evaluation script described above on other models as well (on 2021-04-22). Note that the table below may show different results from those already reported, this may have been caused due to some specificity of the other evaluation scripts used.
 | Model | WER | CER |
 | ------------- | ------------- | ------------- |
-| jonatasgrosman/wav2vec2-large-xlsr-53-german | **11.85%** | **3.17%** |
 | maxidl/wav2vec2-large-xlsr-german | 13.10% | 3.64% |
 | marcel/wav2vec2-large-xlsr-53-german | 15.97% | 4.37% |
 | flozi00/wav2vec-xlsr-german | 16.13% | 4.33% |

     metrics:
        - name: Test WER
          type: wer
+         value: 10.55
        - name: Test CER
          type: cer
+         value: 2.81
 ---
 # Wav2Vec2-Large-XLSR-53-German
 LANG_ID = "de"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+SAMPLES = 10
 test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
 | Reference  | Prediction |
 | ------------- | ------------- |
 | ZIEHT EUCH BITTE DRAUSSEN DIE SCHUHE AUS. | ZIEHT EUCH BITTE DRAUSSEN DIE SCHUHE AUS |
+| ES KOMMT ZUM SHOWDOWN IN GSTAAD. | ES KOMMT ZUG STUNDEDAUTENESTERKT |
+| IHRE FOTOSTRECKEN ERSCHIENEN IN MODEMAGAZINEN WIE DER VOGUE, HARPER’S BAZAAR UND MARIE CLAIRE. | IHRE FOTELSTRECKEN ERSCHIENEN MIT MODEMAGAZINEN WIE DER VALG AT DAS BASIN MA RIQUAIR |
+| FELIPE HAT EINE AUCH FÜR MONARCHEN UNGEWÖHNLICH LANGE TITELLISTE. | FELIPPE HAT EINE AUCH FÜR MONACHEN UNGEWÖHNLICH LANGE TITELLISTE |
+| ER WURDE ZU EHREN DES REICHSKANZLERS OTTO VON BISMARCK ERRICHTET. | ER WURDE ZU EHREN DES REICHSKANZLERS OTTO VON BISMARCK ERRICHTET   M |
+| WAS SOLLS, ICH BIN BEREIT. | WAS SOLL'S ICH BIN BEREIT |
+| DAS INTERNET BESTEHT AUS VIELEN COMPUTERN, DIE MITEINANDER VERBUNDEN SIND. | DAS INTERNET BESTEHT AUS VIELEN COMPUTERN DIE MITEINANDER VERBUNDEN SIND |
+| DER URANUS IST DER SIEBENTE PLANET IN UNSEREM SONNENSYSTEM. | DER URANUS IST DER SIEBENTE PLANET IN UNSEREM SONNENSYSTEM |
+| DIE WAGEN ERHIELTEN EIN EINHEITLICHES ERSCHEINUNGSBILD IN WEISS MIT ROTEM FENSTERBAND. | DIE WAGEN ERHIELTEN EIN EINHEITLICHES ERSCHEINUNGSBILD IN WEISS MIT ROTEM FENSTERBAND |
+| SIE WAR DIE COUSINE VON CARL MARIA VON WEBER. | SIE WAR DIE COUSINE VON KARL-MARIA VON WEBER |
 ## Evaluation
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-german"
 DEVICE = "cuda"
+CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", "；", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                    "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
+                   "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
+                   "、", "﹂", "﹁", "‧", "～", "﹏", "，", "｛", "｝", "（", "）", "［", "］", "【", "】", "‥", "〽",
+                   "『", "』", "〝", "〟", "⟨", "⟩", "〜", "：", "！", "？", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"]
 test_dataset = load_dataset("common_voice", LANG_ID, split="test")
 **Test Result**:
+In the table below I report the Word Error Rate (WER) and the Character Error Rate (CER) of the model. I ran the evaluation script described above on other models as well (on 2021-06-17). Note that the results in the table below may differ from those already reported elsewhere; this is likely due to differences in the other evaluation scripts used.
 | Model | WER | CER |
 | ------------- | ------------- | ------------- |
+| jonatasgrosman/wav2vec2-large-xlsr-53-german | **10.55%** | **2.81%** |
+| Noricum/wav2vec2-large-xlsr-53-german | 11.06% | 2.99% |
 | maxidl/wav2vec2-large-xlsr-german | 13.10% | 3.64% |
 | marcel/wav2vec2-large-xlsr-53-german | 15.97% | 4.37% |
 | flozi00/wav2vec-xlsr-german | 16.13% | 4.33% |

config.json CHANGED Viewed

@@ -7,6 +7,8 @@
   ],
   "attention_dropout": 0.1,
   "bos_token_id": 1,
   "conv_bias": true,
   "conv_dim": [
     512,
@@ -37,33 +39,46 @@
   ],
   "ctc_loss_reduction": "mean",
   "ctc_zero_infinity": true,
   "do_stable_layer_norm": true,
   "eos_token_id": 2,
   "feat_extract_activation": "gelu",
   "feat_extract_dropout": 0.0,
   "feat_extract_norm": "layer",
   "feat_proj_dropout": 0.05,
-  "final_dropout": 0.1,
   "gradient_checkpointing": true,
   "hidden_act": "gelu",
   "hidden_dropout": 0.05,
-  "hidden_dropout_prob": 0.1,
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
   "layerdrop": 0.05,
   "mask_feature_length": 10,
   "mask_feature_prob": 0.0,
   "mask_time_length": 10,
   "mask_time_prob": 0.05,
   "model_type": "wav2vec2",
   "num_attention_heads": 16,
   "num_conv_pos_embedding_groups": 16,
   "num_conv_pos_embeddings": 128,
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
   "pad_token_id": 0,
-  "transformers_version": "4.5.0.dev0",
-  "vocab_size": 36
 }

   ],
   "attention_dropout": 0.1,
   "bos_token_id": 1,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
   "conv_bias": true,
   "conv_dim": [
     512,
   ],
   "ctc_loss_reduction": "mean",
   "ctc_zero_infinity": true,
+  "diversity_loss_weight": 0.1,
   "do_stable_layer_norm": true,
   "eos_token_id": 2,
   "feat_extract_activation": "gelu",
   "feat_extract_dropout": 0.0,
   "feat_extract_norm": "layer",
   "feat_proj_dropout": 0.05,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.0,
   "gradient_checkpointing": true,
   "hidden_act": "gelu",
   "hidden_dropout": 0.05,
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
   "layerdrop": 0.05,
+  "mask_channel_length": 10,
+  "mask_channel_min_space": 1,
+  "mask_channel_other": 0.0,
+  "mask_channel_prob": 0.0,
+  "mask_channel_selection": "static",
   "mask_feature_length": 10,
   "mask_feature_prob": 0.0,
   "mask_time_length": 10,
+  "mask_time_min_space": 1,
+  "mask_time_other": 0.0,
   "mask_time_prob": 0.05,
+  "mask_time_selection": "static",
   "model_type": "wav2vec2",
   "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
   "num_conv_pos_embedding_groups": 16,
   "num_conv_pos_embeddings": 128,
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
+  "num_negatives": 100,
   "pad_token_id": 0,
+  "proj_codevector_dim": 768,
+  "transformers_version": "4.7.0.dev0",
+  "vocab_size": 38
 }

preprocessor_config.json CHANGED Viewed

@@ -1,5 +1,6 @@
 {
   "do_normalize": true,
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,

 {
   "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:230f7682c6576a1c855a884b6faf1d52e21ca70f86e426a7c2c1744cd0100b08
-size 1262081431

 version https://git-lfs.github.com/spec/v1
+oid sha256:bff6d75ab89d8ca9cd103df9beb9c10f547501cf5a34aeabea1c8d736c1b81cb
+size 1262089623

vocab.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "N": 6, "I": 7, "S": 8, "R": 9, "T": 10, "A": 11, "H": 12, "D": 13, "U": 14, "L": 15, "C": 16, "G": 17, "M": 18, "O": 19, "B": 20, "W": 21, "F": 22, "K": 23, "Z": 24, "V": 25, "Ü": 26, "P": 27, "Ä": 28, "Ö": 29, "J": 30, "Y": 31, "'": 32, "X": 33, "Q": 34, "-": 35}


1	+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "'": 5, "-": 6, "A": 7, "B": 8, "C": 9, "D": 10, "E": 11, "F": 12, "G": 13, "H": 14, "I": 15, "J": 16, "K": 17, "L": 18, "M": 19, "N": 20, "O": 21, "P": 22, "Q": 23, "R": 24, "S": 25, "T": 26, "U": 27, "V": 28, "W": 29, "X": 30, "Y": 31, "Z": 32, "Ä": 33, "Í": 34, "Ó": 35, "Ö": 36, "Ü": 37}