# Non-code header left over from the hosting page: "yavuzkomecoglu's picture" / "update model" / "3268a18"
import librosa
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoTokenizer
from datasets import load_dataset
import numpy as np
import re
# Characters offered as a "strip these" list for transcript normalisation
# (the shape expected by the `chars_to_ignore` argument of
# SpeechRecognition.normalizer).  Repeated entries and the two-character
# entry '""' are kept exactly as they were.
# NOTE(review): "�" is U+FFFD (replacement character) -- possibly a symbol
# that was mis-decoded upstream; confirm before relying on it.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""',
    "%", "'", '"', "�",
    "#", "!", "?", "«", "»", "(", ")", "؛",
    ",", "?", ".", "!", "-", ";", ":", '"',
    "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„',
]

# Invisible formatting code points (ZWNJ, ZWJ, LRM, RLM, BOM), each mapped
# to a single space.
chars_to_mapping = dict.fromkeys(
    ("\u200c", "\u200d", "\u200e", "\u200f", "\ufeff"),
    " ",
)
class SpeechRecognition:
    """Speech-to-text helper built on a pretrained Wav2Vec2 CTC checkpoint.

    The constructor does not load anything; call :meth:`load_model` first.
    It sets ``self.device``, ``self.processor`` and ``self.model``, which the
    ``predict*`` methods read.
    """

    def __init__(self):
        print("init SpeechRecognition")

    def load_model(self):
        """Load processor and model onto CUDA when available, else CPU.

        Returns:
            ``self``, so the call can be chained after construction.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # An earlier revision of this file pointed at
        # "m3hrdadfi/wav2vec2-large-xlsr-turkish" instead.
        checkpoint = "patrickvonplaten/wav2vec2-common_voice-tr-demo"
        self.processor = Wav2Vec2Processor.from_pretrained(checkpoint)
        self.model = Wav2Vec2ForCTC.from_pretrained(checkpoint).to(self.device)
        return self

    def multiple_replace(self, text, chars_to_mapping):
        """Replace every key of ``chars_to_mapping`` found in ``text``.

        Args:
            text: Value to process; converted with ``str()`` first.
            chars_to_mapping: Mapping of literal substring -> replacement.

        Returns:
            The converted text with all replacements applied in one pass.
        """
        text = str(text)
        if not chars_to_mapping:
            # An empty alternation would match the empty string at every
            # position and then fail the dictionary lookup with KeyError('').
            return text
        pattern = "|".join(map(re.escape, chars_to_mapping))
        return re.sub(pattern, lambda match: chars_to_mapping[match.group()], text)

    def remove_special_characters(self, text, chars_to_ignore_regex):
        """Delete regex matches, lower-case, and append one trailing space.

        Args:
            text: Input string.
            chars_to_ignore_regex: Regular expression whose matches are removed.

        Returns:
            The cleaned, lower-cased string followed by a single space.
        """
        text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
        return text

    def normalizer(self, batch, chars_to_ignore, chars_to_mapping):
        """Normalise ``batch["sentence"]`` in place and return ``batch``.

        Steps: lower-case and strip, replace U+0307 with a space, apply
        ``chars_to_mapping``, then remove every character that occurs in any
        entry of ``chars_to_ignore``.

        Args:
            batch: Mapping with a ``"sentence"`` string; it is mutated.
            chars_to_ignore: Iterable of strings; each individual character
                is treated as a literal to delete.
            chars_to_mapping: Mapping passed to :meth:`multiple_replace`.

        Returns:
            The same ``batch`` object with ``"sentence"`` rewritten.
        """
        # Escape every character so that "-", "^", "]" and "\\" are taken
        # literally.  Unescaped, a run such as "!", "-", ";" becomes the
        # range "!-;" and silently deletes digits and other symbols.
        ignored_chars = dict.fromkeys("".join(chars_to_ignore))
        if ignored_chars:
            chars_to_ignore_regex = "[" + "".join(map(re.escape, ignored_chars)) + "]"
        else:
            # "[]" is not a valid pattern; "(?!)" never matches anything.
            chars_to_ignore_regex = r"(?!)"

        text = batch["sentence"].lower().strip()
        # NOTE(review): str.lower() turns "İ" into "i" + U+0307, so replacing
        # the combining dot with a space splits such words ("i stanbul").
        # Kept as-is; confirm this matches how the reference text should look.
        text = text.replace("\u0307", " ").strip()
        text = self.multiple_replace(text, chars_to_mapping)
        text = self.remove_special_characters(text, chars_to_ignore_regex)
        batch["sentence"] = text
        return batch

    def speech_file_to_array_fn(self, batch):
        """Load ``batch["path"]``, resample to 16 kHz, store in ``batch["speech"]``.

        Args:
            batch: Mapping with a ``"path"`` entry readable by torchaudio.

        Returns:
            The same ``batch`` with a NumPy array under ``"speech"``.
        """
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        # assumes single-channel audio so that squeeze() yields a 1-D signal
        # -- TODO confirm for multi-channel files
        speech_array = speech_array.squeeze().numpy()
        # Keyword arguments: librosa >= 0.10 rejects positional sample rates,
        # and older releases accept the same keyword names.
        speech_array = librosa.resample(
            np.asarray(speech_array),
            orig_sr=sampling_rate,
            target_sr=16_000,
        )
        batch["speech"] = speech_array
        return batch

    def _greedy_token_ids(self, speech):
        """Run the model on ``speech`` and return the arg-max token ids."""
        features = self.processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
        input_values = features.input_values.to(self.device)
        attention_mask = features.attention_mask.to(self.device)
        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits
        return torch.argmax(logits, dim=-1)

    def predict(self, batch):
        """Transcribe ``batch["speech"]`` and store it in ``batch["predicted"]``.

        Only the first decoded sequence is kept.

        Returns:
            The same ``batch`` with the ``"predicted"`` entry added.
        """
        pred_ids = self._greedy_token_ids(batch["speech"])
        batch["predicted"] = self.processor.batch_decode(pred_ids)[0]
        return batch

    def predict_audio_file(self, speech):
        """Transcribe an already-loaded 16 kHz signal.

        Args:
            speech: Audio samples accepted by the processor, e.g. the first
                value returned by :meth:`load_speech_with_file`.

        Returns:
            The decoded transcription of the first sequence.
        """
        pred_ids = self._greedy_token_ids(speech)
        transcriptions = self.processor.decode(pred_ids[0])
        return transcriptions

    def load_speech_with_file(self, audio_file):
        """Load ``audio_file`` with librosa at 16 kHz.

        Returns:
            Tuple ``(speech, rate)`` exactly as returned by ``librosa.load``.
        """
        speech, rate = librosa.load(audio_file, sr=16000)
        return speech, rate