yavuzkomecoglu committed
Commit 8c25af7
1 Parent(s): 108e46c

added Turkish Automatic Speech Recognition demo
app.py ADDED
@@ -0,0 +1,34 @@
+
+ import gradio as gr
+ from utils import SpeechRecognition
+
+ sp = SpeechRecognition()
+
+ # Load the wav2vec2 model and processor once at startup.
+ sp.load_model()
+
+ #sample_file = "assets/samples/sample1378.flac"
+
+ def recognition(audio_file):
+     print("audio_file", audio_file.name)
+     # Read the uploaded file and resample it to the 16 kHz rate the model expects.
+     speech, rate = sp.load_speech_with_file(audio_file.name)
+
+     result = sp.predict_audio_file(speech)
+     print(result)
+
+     return result
+
+ inputs = gr.inputs.Audio(label="Input Audio", type="file")
+
+ outputs = "text"
+ title = "Turkish Automatic Speech Recognition"
+ description = "Demo for Turkish Automatic Speech Recognition with the Hugging Face wav2vec2 Turkish model. To use it, simply upload your audio or click one of the examples to load it."
+ article = "<p style='text-align: center'>This is the demo for <a href='https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-turkish' target='_blank'>m3hrdadfi/wav2vec2-large-xlsr-turkish</a>, a <a href='https://huggingface.co/facebook/wav2vec2-large-xlsr-53' target='_blank'>facebook/wav2vec2-large-xlsr-53</a> model fine-tuned on the <a href='https://commonvoice.mozilla.org/en/datasets' target='_blank'>Turkish Common Voice dataset</a>.<br/>When using this model, make sure that your speech input is sampled at 16 kHz.</p>"
+ examples = [
+     ['assets/samples/common_voice_sample_1378.flac'],
+     ['assets/samples/common_voice_sample_1589.flac'],
+     ['assets/samples/baris_ozcan_sample_1.m4a'],
+     ['assets/samples/baris_ozcan_sample_2.wav'],
+     ['assets/samples/baris_ozcan_sample_3.m4a']
+ ]
+
+ gr.Interface(recognition, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
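Note: the Space pins gradio==2.2.6, whose legacy `gr.inputs.Audio(type="file")` passes the callback a temporary file object (hence `audio_file.name` above). On a current Gradio release the same interface would look roughly like the sketch below; this is an assumed port, not part of the commit, and `gr.Audio(type="filepath")` hands the callback a plain path string instead.

```python
import gradio as gr
from utils import SpeechRecognition

sp = SpeechRecognition().load_model()

def recognition(audio_path):
    # type="filepath" delivers a path string, so no .name attribute is needed
    speech, rate = sp.load_speech_with_file(audio_path)
    return sp.predict_audio_file(speech)

gr.Interface(fn=recognition, inputs=gr.Audio(type="filepath"), outputs="text").launch()
```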
assets/samples/baris_ozcan_sample_1.m4a ADDED
Binary file (83.4 kB)
assets/samples/baris_ozcan_sample_2.wav ADDED
Binary file (812 kB)
assets/samples/baris_ozcan_sample_3.m4a ADDED
Binary file (67.2 kB)
assets/samples/common_voice_sample_1378.flac ADDED
Binary file (70 kB)
assets/samples/common_voice_sample_1589.flac ADDED
Binary file (57.3 kB)
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio==2.2.6
+ transformers
+ datasets
+ torchaudio
+ librosa
+ jiwer
+ numpy==1.20
utils.py ADDED
@@ -0,0 +1,97 @@
+
+ import librosa
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from datasets import load_dataset
+
+ import numpy as np
+ import re
+
+ # Punctuation and symbols stripped from transcriptions.
+ chars_to_ignore = [
+     ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
+     "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
+     "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
+ ]
+
+ # Zero-width and directional marks replaced with plain spaces.
+ chars_to_mapping = {
+     "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
+ }
+
+
+ class SpeechRecognition:
+     def __init__(self):
+         print("init SpeechRecognition")
+
+     def load_model(self):
+         # Load the fine-tuned Turkish wav2vec2 model and its processor once.
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish")
+         self.model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish").to(self.device)
+
+         return self
+
+     def multiple_replace(self, text, chars_to_mapping):
+         pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
+         return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
+
+     def remove_special_characters(self, text, chars_to_ignore_regex):
+         text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
+         return text
+
+     def normalizer(self, batch, chars_to_ignore, chars_to_mapping):
+         chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
+         text = batch["sentence"].lower().strip()
+
+         text = text.replace("\u0307", " ").strip()
+         text = self.multiple_replace(text, chars_to_mapping)
+         text = self.remove_special_characters(text, chars_to_ignore_regex)
+
+         batch["sentence"] = text
+         return batch
+
+     def speech_file_to_array_fn(self, batch):
+         speech_array, sampling_rate = torchaudio.load(batch["path"])
+         speech_array = speech_array.squeeze().numpy()
+         # Keyword arguments keep this call compatible across librosa versions.
+         speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16_000)
+
+         batch["speech"] = speech_array
+         return batch
+
+     def predict(self, batch):
+         features = self.processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+         input_values = features.input_values.to(self.device)
+         attention_mask = features.attention_mask.to(self.device)
+
+         with torch.no_grad():
+             logits = self.model(input_values, attention_mask=attention_mask).logits
+
+         # Greedy CTC decoding: take the most likely token at each frame.
+         pred_ids = torch.argmax(logits, dim=-1)
+
+         batch["predicted"] = self.processor.batch_decode(pred_ids)[0]
+         return batch
+
+     def predict_audio_file(self, speech):
+         features = self.processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
+
+         input_values = features.input_values.to(self.device)
+         attention_mask = features.attention_mask.to(self.device)
+
+         with torch.no_grad():
+             logits = self.model(input_values, attention_mask=attention_mask).logits
+
+         pred_ids = torch.argmax(logits, dim=-1)
+
+         transcriptions = self.processor.decode(pred_ids[0])
+         return transcriptions
+
+     def load_speech_with_file(self, audio_file):
+         # librosa resamples the file to the 16 kHz rate the model expects.
+         speech, rate = librosa.load(audio_file, sr=16000)
+
+         return speech, rate
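For reference, a minimal sketch of exercising `SpeechRecognition` outside the Gradio app, assuming it is run from the repository root so the bundled sample path resolves:

```python
from utils import SpeechRecognition

# Downloads m3hrdadfi/wav2vec2-large-xlsr-turkish from the Hub on first use.
sp = SpeechRecognition().load_model()

# load_speech_with_file resamples the audio to the 16 kHz rate the model expects.
speech, rate = sp.load_speech_with_file("assets/samples/common_voice_sample_1378.flac")
print(sp.predict_audio_file(speech))
```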