m3hrdadfi
/

wav2vec2-large-xlsr-georgian

Automatic Speech Recognition Transformers PyTorch JAX

Georgian wav2vec2 audio speech xlsr-fine-tuning-week Eval Results Inference Endpoints

Model card Files Files and versions Community

m3hrdadfi committed on Mar 25, 2021

Commit

2f546f1

•

1 Parent(s): acdefb8

Update readme for more info

Browse files

Files changed (1) hide show

README.md +123 -34

README.md CHANGED Viewed

@@ -30,14 +30,16 @@ model-index:
 ---
-# Wav2Vec2-Large-XLSR-53 Georgian
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Georgian using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset. When using this model, make sure that your speech input is sampled at 16 kHz.
 ## Usage
 The model can be used directly (without a language model) as follows:
 ```bash
 !pip install git+https://github.com/huggingface/datasets.git
 !pip install git+https://github.com/huggingface/transformers.git
 !pip install torchaudio
@@ -45,52 +47,49 @@ The model can be used directly (without a language model) as follows:
 !pip install jiwer
 ```
 ```python
 import torch
 import torchaudio
-from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-import librosa
-import pandas as pd
 import numpy as np
-import random
-import os
-import string
-import six
 import re
 import IPython.display as ipd
-# Loading the datasets
-dataset = load_dataset("common_voice", "ka", split="test")
-print(dataset)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian")
-model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian").to(device)
-# Preprocessing the datasets.
-chars_to_ignore_regex = f"""[{"".join([
     ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
     "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
     "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
-])}]"""
-def remove_special_characters(text, chars_to_ignore):
-    text = re.sub(chars_to_ignore, '', text).lower() + " "
     return text
-def normalizer(batch, chars_to_ignore):
-    text = batch["sentence"]
-    text = remove_special_characters(text, chars_to_ignore)
     batch["sentence"] = text
     return batch
-# We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
     speech_array, sampling_rate = torchaudio.load(batch["path"])
     speech_array = speech_array.squeeze().numpy()
@@ -99,6 +98,7 @@ def speech_file_to_array_fn(batch):
     batch["speech"] = speech_array
     return batch
 def predict(batch):
     features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
@@ -113,14 +113,21 @@ def predict(batch):
     batch["predicted"] = processor.batch_decode(pred_ids)[0]
     return batch
-dataset = dataset.map(normalizer, fn_kwargs={"chars_to_ignore": chars_to_ignore_regex})
-dataset = dataset.map(speech_file_to_array_fn, remove_columns=list(set(dataset.column_names) - set(['sentence', 'path'])))
-result = dataset.map(predict)
-```
-## Prediction
-```python
 max_items = np.random.randint(0, len(result), 20).tolist()
 for i in max_items:
     reference, predicted =  result["sentence"][i], result["predicted"][i]
@@ -129,6 +136,7 @@ for i in max_items:
     print('---')
 ```
 ```text
 reference: ადმინისტრაციული ცენტრი ქალაქი იმიშლი
 predicted: ადმინისტრაციული ცენტრი ქალაქი იმიშლი
@@ -162,14 +170,95 @@ predicted: იგი მდებარეობს ქალაქის ჩრ
 ---
 ```
 ## Evaluation
 ```python
 wer = load_metric("wer")
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
 ```
 **Test Result**:
 - WER: 54.00%

 ---
+# Wav2Vec2-Large-XLSR-53-Georgian
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Georgian using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset. When using this model, make sure that your speech input is sampled at 16 kHz.
 ## Usage
 The model can be used directly (without a language model) as follows:
+**Requirements**
 ```bash
+# required packages
 !pip install git+https://github.com/huggingface/datasets.git
 !pip install git+https://github.com/huggingface/transformers.git
 !pip install torchaudio
 !pip install jiwer
 ```
+**Prediction**
 ```python
+import librosa
 import torch
 import torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from datasets import load_dataset
 import numpy as np
 import re
+import string
 import IPython.display as ipd
+chars_to_ignore = [
     ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
     "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
     "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
+]
+chars_to_mapping = {
+"\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
+}
+def multiple_replace(text, chars_to_mapping):
+    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
+    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
+def remove_special_characters(text, chars_to_ignore_regex):
+    text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
     return text
+def normalizer(batch, chars_to_ignore, chars_to_mapping):
+    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
+    text = batch["sentence"].lower().strip()
+    text = multiple_replace(text, chars_to_mapping)
+    text = remove_special_characters(text, chars_to_ignore_regex)
     batch["sentence"] = text
     return batch
 def speech_file_to_array_fn(batch):
     speech_array, sampling_rate = torchaudio.load(batch["path"])
     speech_array = speech_array.squeeze().numpy()
     batch["speech"] = speech_array
     return batch
 def predict(batch):
     features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
     batch["predicted"] = processor.batch_decode(pred_ids)[0]
     return batch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian")
+model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian").to(device)
+dataset = load_dataset("common_voice", "ka", split="test[:1%]")
+dataset = dataset.map(
+    normalizer,
+    fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
+    remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
+)
+dataset = dataset.map(speech_file_to_array_fn)
+result = dataset.map(predict)
 max_items = np.random.randint(0, len(result), 20).tolist()
 for i in max_items:
     reference, predicted =  result["sentence"][i], result["predicted"][i]
     print('---')
 ```
+**Output:**
 ```text
 reference: ადმინისტრაციული ცენტრი ქალაქი იმიშლი
 predicted: ადმინისტრაციული ცენტრი ქალაქი იმიშლი
 ---
 ```
 ## Evaluation
+The model can be evaluated as follows on the Georgian test data of Common Voice.
 ```python
+import librosa
+import torch
+import torchaudio
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from datasets import load_dataset, load_metric
+import numpy as np
+import re
+import string
+chars_to_ignore = [
+    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
+    "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
+    "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
+]
+chars_to_mapping = {
+    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
+}
+def multiple_replace(text, chars_to_mapping):
+    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
+    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
+def remove_special_characters(text, chars_to_ignore_regex):
+    text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
+    return text
+def normalizer(batch, chars_to_ignore, chars_to_mapping):
+    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
+    text = batch["sentence"].lower().strip()
+    text = multiple_replace(text, chars_to_mapping)
+    text = remove_special_characters(text, chars_to_ignore_regex)
+    batch["sentence"] = text
+    return batch
+def speech_file_to_array_fn(batch):
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    speech_array = speech_array.squeeze().numpy()
+    speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
+    batch["speech"] = speech_array
+    return batch
+def predict(batch):
+    features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+    input_values = features.input_values.to(device)
+    attention_mask = features.attention_mask.to(device)
+    with torch.no_grad():
+        logits = model(input_values, attention_mask=attention_mask).logits
+    pred_ids = torch.argmax(logits, dim=-1)
+    batch["predicted"] = processor.batch_decode(pred_ids)[0]
+    return batch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian")
+model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian").to(device)
+dataset = load_dataset("common_voice", "ka", split="test[:1%]")
+dataset = dataset.map(
+    normalizer,
+    fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
+    remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
+)
+dataset = dataset.map(speech_file_to_array_fn)
+result = dataset.map(predict)
 wer = load_metric("wer")
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
 ```
 **Test Result**:
 - WER: 54.00%