jonatasgrosman committed on
Commit
1271f95
1 Parent(s): 80689ba

update model

Browse files
Files changed (3) hide show
  1. README.md +32 -12
  2. pytorch_model.bin +2 -2
  3. vocab.json +1 -1
README.md CHANGED
@@ -4,6 +4,7 @@ datasets:
4
  - common_voice
5
  metrics:
6
  - wer
 
7
  tags:
8
  - audio
9
  - automatic-speech-recognition
@@ -23,12 +24,15 @@ model-index:
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
- value: 34.49
 
 
 
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-Hungarian
30
 
31
- Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Hungarian using the [Common Voice](https://huggingface.co/datasets/common_voice).
32
  When using this model, make sure that your speech input is sampled at 16kHz.
33
 
34
  The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint
@@ -45,8 +49,9 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
45
 
46
  LANG_ID = "hu"
47
  MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian"
 
48
 
49
- test_dataset = load_dataset("common_voice", LANG_ID, split="test[:2%]")
50
 
51
  processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
52
  model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
@@ -60,21 +65,31 @@ def speech_file_to_array_fn(batch):
60
  return batch
61
 
62
  test_dataset = test_dataset.map(speech_file_to_array_fn)
63
- inputs = processor(test_dataset[:2]["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
64
 
65
  with torch.no_grad():
66
  logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
67
 
68
  predicted_ids = torch.argmax(logits, dim=-1)
 
69
 
70
- print("Prediction:", processor.batch_decode(predicted_ids))
71
- print("Reference:", test_dataset[:2]["sentence"])
 
 
72
  ```
73
 
 
 
 
 
 
 
 
74
 
75
  ## Evaluation
76
 
77
- The model can be evaluated as follows on the hungarian test data of Common Voice.
78
 
79
  ```python
80
  import torch
@@ -87,12 +102,13 @@ LANG_ID = "hu"
87
  MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian"
88
  DEVICE = "cuda"
89
 
90
- CHARS_TO_IGNORE = [",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�", "ʿ", "·", "჻", "¿", "¡", "~", "՞",
91
  "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
92
- "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ"]
93
 
94
  test_dataset = load_dataset("common_voice", LANG_ID, split="test")
95
- wer = load_metric("wer")
 
96
 
97
  chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"
98
 
@@ -124,7 +140,11 @@ def evaluate(batch):
124
 
125
  result = test_dataset.map(evaluate, batched=True, batch_size=32)
126
 
127
- print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 
128
  ```
129
 
130
- **Test Result**: 34.49%
 
 
 
 
4
  - common_voice
5
  metrics:
6
  - wer
7
+ - cer
8
  tags:
9
  - audio
10
  - automatic-speech-recognition
 
24
  metrics:
25
  - name: Test WER
26
  type: wer
27
+ value: 31.40
28
+ - name: Test CER
29
+ type: cer
30
+ value: 10.49
31
  ---
32
 
33
  # Wav2Vec2-Large-XLSR-53-Hungarian
34
 
35
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Hungarian using the [Common Voice](https://huggingface.co/datasets/common_voice) and [CSS10](https://github.com/Kyubyong/css10).
36
  When using this model, make sure that your speech input is sampled at 16kHz.
37
 
38
  The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint
 
49
 
50
  LANG_ID = "hu"
51
  MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian"
52
+ SAMPLES = 5
53
 
54
+ test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
55
 
56
  processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
57
  model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
 
65
  return batch
66
 
67
  test_dataset = test_dataset.map(speech_file_to_array_fn)
68
+ inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
69
 
70
  with torch.no_grad():
71
  logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
72
 
73
  predicted_ids = torch.argmax(logits, dim=-1)
74
+ predicted_sentences = processor.batch_decode(predicted_ids)
75
 
76
+ for i, predicted_sentence in enumerate(predicted_sentences):
77
+ print("-" * 100)
78
+ print("Reference:", test_dataset[i]["sentence"])
79
+ print("Prediction:", predicted_sentence)
80
  ```
81
 
82
+ | Reference | Prediction |
83
+ | ------------- | ------------- |
84
+ | BÜSZKÉK VAGYUNK A MAGYAR EMBEREK NAGYSZERŰ SZELLEMI ALKOTÁSAIRA. | BÜSZKÉK VAGYUNK A MAGYAR EMBEREK NAGYSZERŰ SZELLEMI ALKOTÁSAIRE |
85
+ | A NEMZETSÉG TAGJAI KÖZÜL EZT TERMESZTIK A LEGSZÉLESEBB KÖRBEN ÍZLETES TERMÉSÉÉRT. | A NEMZETSÉG TAGJAI KÖZÜL ESZSZERMESZTIK A LEGSZELESEBB KÖRBEN IZLETES TERMÉSSÉÉRT |
86
+ | A VÁROSBA VÁGYÓDOTT A LEGJOBBAN, ÉPPEN MERT ODA NEM JUTHATOTT EL SOHA. | A VÁROSBA VÁGYÓDOTT A LEGJOBBAN ÉPPEN MERT ODA NEM JUTHATOTT EL SOHA |
87
+ | SÍRJA MÁRA MEGSEMMISÜLT. | SIMGI A MANDO MEG SEMMICSEN |
88
+ | MINDEN ZENESZÁMOT DRÁGAKŐNEK NEVEZETT. | MINDEN ZENA SZÁMODRAGAKŐNEK NEVEZETT |
89
 
90
  ## Evaluation
91
 
92
+ The model can be evaluated as follows on the Hungarian test data of Common Voice.
93
 
94
  ```python
95
  import torch
 
102
  MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian"
103
  DEVICE = "cuda"
104
 
105
+ CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
106
  "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
107
+ "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。"]
108
 
109
  test_dataset = load_dataset("common_voice", LANG_ID, split="test")
110
+ wer = load_metric("wer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
111
+ cer = load_metric("cer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py
112
 
113
  chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"
114
 
 
140
 
141
  result = test_dataset.map(evaluate, batched=True, batch_size=32)
142
 
143
+ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=8000)))
144
+ print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=8000)))
145
  ```
146
 
147
+ **Test Result**:
148
+
149
+ - WER: 31.40%
150
+ - CER: 10.49%
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:387a71ad34db3306482b4a56141da584923a251b72d05f8844c32eca14d3340a
3
- size 1262097815
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ab6510ff7c1c59ff751c63047eba527956df889284321f87a73cbf322012932
3
+ size 1262101911
vocab.json CHANGED
@@ -1 +1 @@
1
- {"Q": 0, "C": 1, "\u00cd": 2, "\u00c9": 3, "\u00da": 4, "X": 5, "P": 6, "S": 7, "M": 8, "G": 9, "\u00dc": 10, "\u00c1": 11, "\u00d6": 12, "Y": 13, "J": 14, "O": 15, "H": 16, "Z": 17, "V": 18, "L": 19, "W": 20, "I": 21, "\u00d3": 22, "E": 23, "K": 24, "B": 25, "F": 26, "A": 27, "N": 28, "\u0170": 29, "R": 30, "D": 31, "\u0150": 32, "U": 34, "T": 35, "|": 33, "<unk>": 36, "<pad>": 37, "<s>": 38, "</s>": 39}
 
1
+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "J": 5, "Ű": 6, "G": 7, "Y": 8, "Á": 9, "L": 10, "Ü": 11, "H": 12, "V": 13, "É": 14, "A": 15, "P": 16, "C": 17, "M": 18, "Q": 19, "-": 20, "Ú": 21, "K": 22, "D": 23, "Ő": 24, "Ó": 25, "R": 26, "W": 27, "N": 28, "B": 29, "X": 30, "Í": 31, "S": 32, "O": 33, "F": 34, "T": 35, "Z": 36, "U": 37, "E": 38, "I": 39, "Ö": 40}