import re

import librosa
import numpy as np
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Punctuation and stray symbols stripped during normalization (duplicates removed).
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", "%", "'", '"', "�", "#",
    "«", "»", "(", ")", "؛", "“", "‘", "–", "…", "_", "”", "„",
]

# Zero-width joiners, directional marks, and the BOM are replaced by plain spaces.
chars_to_mapping = {
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}



class SpeechRecognition:
    """Turkish speech-to-text wrapper around a fine-tuned Wav2Vec2 CTC model."""

    def __init__(self):
        print("init SpeechRecognition")

    def load_model(self):
        """Load the processor and model onto GPU if available; returns self for chaining."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Alternative checkpoint: "m3hrdadfi/wav2vec2-large-xlsr-turkish"
        self.processor = Wav2Vec2Processor.from_pretrained("patrickvonplaten/wav2vec2-common_voice-tr-demo")
        self.model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-common_voice-tr-demo").to(self.device)
        return self

    def multiple_replace(self, text, chars_to_mapping):
        """Apply every key -> value substitution in chars_to_mapping in one pass."""
        pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
        return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))

    def remove_special_characters(self, text, chars_to_ignore_regex):
        """Strip ignorable characters and lowercase; the trailing space keeps words separated."""
        return re.sub(chars_to_ignore_regex, "", text).lower() + " "

    def normalizer(self, batch, chars_to_ignore, chars_to_mapping):
        """Normalize batch["sentence"]: lowercase, map control chars, drop punctuation."""
        # re.escape prevents characters such as "-" from forming an accidental
        # range inside the character class.
        chars_to_ignore_regex = f"[{re.escape(''.join(chars_to_ignore))}]"
        text = batch["sentence"].lower().strip()
        text = text.replace("\u0307", " ").strip()  # combining dot above (Turkish dotted-I artifact)
        text = self.multiple_replace(text, chars_to_mapping)
        text = self.remove_special_characters(text, chars_to_ignore_regex)
        batch["sentence"] = text
        return batch
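
    # Sketch (not in the original file): the batch-style helpers in this class
    # follow the Hugging Face datasets.map convention, e.g.
    #   ds = ds.map(lambda b: asr.normalizer(b, chars_to_ignore, chars_to_mapping))
    #   ds = ds.map(asr.speech_file_to_array_fn)
    #   ds = ds.map(asr.predict)
    # where "ds" is a hypothetical dataset with "sentence" and "path" columns.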


    def speech_file_to_array_fn(self, batch):
        """Load batch["path"] with torchaudio and resample it to 16 kHz."""
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        speech_array = speech_array.squeeze().numpy()
        # librosa >= 0.10 requires keyword arguments for resample.
        speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16_000)
        batch["speech"] = speech_array
        return batch


    def predict(self, batch):
        """Greedy CTC decoding for a batch dict produced by speech_file_to_array_fn."""
        features = self.processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
        input_values = features.input_values.to(self.device)
        attention_mask = features.attention_mask.to(self.device)

        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits

        # Pick the most probable token per frame; the processor collapses
        # repeats and blanks during decoding.
        pred_ids = torch.argmax(logits, dim=-1)
        batch["predicted"] = self.processor.batch_decode(pred_ids)[0]
        return batch

    def predict_audio_file(self, speech):
        """Transcribe a raw 16 kHz waveform (1-D numpy array) and return the text."""
        features = self.processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
        input_values = features.input_values.to(self.device)
        attention_mask = features.attention_mask.to(self.device)

        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits

        pred_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.decode(pred_ids[0])
        return transcription


    def load_speech_with_file(self, audio_file):
        """Read an audio file with librosa, resampled to 16 kHz."""
        speech, rate = librosa.load(audio_file, sr=16_000)
        return speech, rate
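

# Usage sketch (assumption: not part of the original file; "sample.wav" is a
# placeholder path for any audio file librosa can read).
if __name__ == "__main__":
    asr = SpeechRecognition().load_model()
    speech, _rate = asr.load_speech_with_file("sample.wav")  # hypothetical file
    print(asr.predict_audio_file(speech))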