arampacha committed on
Commit
a652e11
1 Parent(s): 164bfa1

upd README.md

Browse files
Files changed (1) hide show
  1. README.md +24 -26
README.md CHANGED
@@ -1,9 +1,7 @@
1
  ---
2
  language: cs
3
- datasets:
4
- - common_voice
5
- metrics:
6
- - wer
7
  tags:
8
  - audio
9
  - automatic-speech-recognition
@@ -11,7 +9,7 @@ tags:
11
  - xlsr-fine-tuning-week
12
  license: apache-2.0
13
  model-index:
14
- - name: `Czech XLSR Wav2Vec2 Large 53`
15
  results:
16
  - task:
17
  name: Speech Recognition
@@ -26,7 +24,7 @@ model-index:
26
  value: 24.93
27
  ---
28
 
29
- # Wav2Vec2-Large-XLSR-53-{language} #TODO: replace language with your {language}, *e.g.* French
30
 
31
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Czech using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
32
  When using this model, make sure that your speech input is sampled at 16kHz.
@@ -50,15 +48,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
50
  # Preprocessing the datasets.
51
  # We need to read the audio files as arrays
52
  def speech_file_to_array_fn(batch):
53
- speech_array, sampling_rate = torchaudio.load(batch["path"])
54
- batch["speech"] = resampler(speech_array).squeeze().numpy()
55
- return batch
56
 
57
  test_dataset = test_dataset.map(speech_file_to_array_fn)
58
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
59
 
60
  with torch.no_grad():
61
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
62
 
63
  predicted_ids = torch.argmax(logits, dim=-1)
64
 
@@ -94,30 +92,30 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
94
  # We need to read the audio files as arrays
95
  # Note: this model is trained ignoring accents on letters as below
96
  def speech_file_to_array_fn(batch):
97
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().strip()
98
- batch["sentence"] = re.sub(re.compile('[äá]'), 'a', batch['sentence'])
99
- batch["sentence"] = re.sub(re.compile('[öó]'), 'o', batch['sentence'])
100
- batch["sentence"] = re.sub(re.compile('[èé]'), 'e', batch['sentence'])
101
- batch["sentence"] = re.sub(re.compile("[ïí]"), 'i', batch['sentence'])
102
- batch["sentence"] = re.sub(re.compile("[üů]"), 'u', batch['sentence'])
103
- batch['sentence'] = re.sub(' ', ' ', batch['sentence'])
104
- speech_array, sampling_rate = torchaudio.load(batch["path"])
105
- batch["speech"] = resampler(speech_array).squeeze().numpy()
106
- return batch
107
 
108
  test_dataset = test_dataset.map(speech_file_to_array_fn)
109
 
110
  # Preprocessing the datasets.
111
  # We need to read the audio files as arrays
112
  def evaluate(batch):
113
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
114
 
115
- with torch.no_grad():
116
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
117
 
118
- pred_ids = torch.argmax(logits, dim=-1)
119
- batch["pred_strings"] = processor.batch_decode(pred_ids)
120
- return batch
121
 
122
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
123
 
 
1
  ---
2
  language: cs
3
+ dataset: common_voice
4
+ metrics: wer
 
 
5
  tags:
6
  - audio
7
  - automatic-speech-recognition
 
9
  - xlsr-fine-tuning-week
10
  license: apache-2.0
11
  model-index:
12
+ - name: Czech XLSR Wav2Vec2 Large 53
13
  results:
14
  - task:
15
  name: Speech Recognition
 
24
  value: 24.93
25
  ---
26
 
27
+ # Wav2Vec2-Large-XLSR-53-Czech
28
 
29
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Czech using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
30
  When using this model, make sure that your speech input is sampled at 16kHz.
 
48
  # Preprocessing the datasets.
49
  # We need to read the audio files as arrays
50
  def speech_file_to_array_fn(batch):
51
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
52
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
53
+ \treturn batch
54
 
55
  test_dataset = test_dataset.map(speech_file_to_array_fn)
56
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
57
 
58
  with torch.no_grad():
59
+ \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
60
 
61
  predicted_ids = torch.argmax(logits, dim=-1)
62
 
 
92
  # We need to read the audio files as arrays
93
  # Note: this model is trained ignoring accents on letters as below
94
  def speech_file_to_array_fn(batch):
95
+ \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().strip()
96
+ \tbatch["sentence"] = re.sub(re.compile('[äá]'), 'a', batch['sentence'])
97
+ \tbatch["sentence"] = re.sub(re.compile('[öó]'), 'o', batch['sentence'])
98
+ \tbatch["sentence"] = re.sub(re.compile('[èé]'), 'e', batch['sentence'])
99
+ \tbatch["sentence"] = re.sub(re.compile("[ïí]"), 'i', batch['sentence'])
100
+ \tbatch["sentence"] = re.sub(re.compile("[üů]"), 'u', batch['sentence'])
101
+ \tbatch['sentence'] = re.sub(' ', ' ', batch['sentence'])
102
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
103
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
104
+ \treturn batch
105
 
106
  test_dataset = test_dataset.map(speech_file_to_array_fn)
107
 
108
  # Preprocessing the datasets.
109
  # We need to read the audio files as arrays
110
  def evaluate(batch):
111
+ \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
112
 
113
+ \twith torch.no_grad():
114
+ \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
115
 
116
+ \tpred_ids = torch.argmax(logits, dim=-1)
117
+ \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
118
+ \treturn batch
119
 
120
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
121