kresnik committed on
Commit
d058e04
1 Parent(s): b4cd658

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +47 -0
README.md CHANGED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Evaluation on Zeroth-Korean ASR corpus
2
+
3
+ [Google Colab notebook (Korean)](https://colab.research.google.com/github/indra622/tutorials/blob/master/wav2vec2_korean_tutorial.ipynb)
4
+
5
+ ```
6
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
7
+ from datasets import load_dataset
8
+ import soundfile as sf
9
+ import torch
10
+ from jiwer import wer
11
+
12
+ processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
13
+
14
+ model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean").to('cuda')
15
+
16
+ ds = load_dataset("kresnik/zeroth_korean", "clean")
17
+
18
+ test_ds = ds['test']
19
+
20
+ def map_to_array(batch):
21
+ speech, _ = sf.read(batch["file"])
22
+ batch["speech"] = speech
23
+ return batch
24
+
25
+ test_ds = test_ds.map(map_to_array)
26
+
27
+ def map_to_pred(batch):
28
+ inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")
29
+ input_values = inputs.input_values.to("cuda")
30
+ #attention_mask = inputs.attention_mask.to("cuda")
31
+
32
+ with torch.no_grad():
33
+ #logits = model(input_values, attention_mask=attention_mask).logits
34
+ logits = model(input_values).logits
35
+
36
+ predicted_ids = torch.argmax(logits, dim=-1)
37
+ transcription = processor.batch_decode(predicted_ids)
38
+ batch["transcription"] = transcription
39
+ return batch
40
+
41
+ result = test_ds.map(map_to_pred, batched=True, batch_size=16, remove_columns=["speech"])
42
+
43
+ print("WER:", wer(result["text"], result["transcription"]))
44
+
45
+ ```
46
+
47
+ ### Expected WER: 7.43%