import re

import librosa
import numpy as np
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Punctuation and symbols stripped from transcripts during normalization
# (deduplicated; the original list repeated several entries).
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '"', "%", "'", "�", "#",
    "«", "»", "(", ")", "؛", "“", "”", "‘", "„", "–", "…", "_",
]

# Zero-width and directional marks mapped to plain spaces.
chars_to_mapping = {
    "\u200c": " ",
    "\u200d": " ",
    "\u200e": " ",
    "\u200f": " ",
    "\ufeff": " ",
}


class SpeechRecognition:
    def __init__(self):
        print("init SpeechRecognition")

    def load_model(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Alternative checkpoint:
        # self.processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish")
        # self.model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish").to(self.device)
        self.processor = Wav2Vec2Processor.from_pretrained(
            "patrickvonplaten/wav2vec2-common_voice-tr-demo"
        )
        self.model = Wav2Vec2ForCTC.from_pretrained(
            "patrickvonplaten/wav2vec2-common_voice-tr-demo"
        ).to(self.device)
        return self

    def multiple_replace(self, text, chars_to_mapping):
        # Apply all key -> value substitutions in a single regex pass.
        pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
        return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))

    def remove_special_characters(self, text, chars_to_ignore_regex):
        return re.sub(chars_to_ignore_regex, "", text).lower() + " "

    def normalizer(self, batch, chars_to_ignore, chars_to_mapping):
        # Escape each character so "-" cannot form an accidental range
        # (e.g. "!-;") inside the character class.
        chars_to_ignore_regex = f"[{re.escape(''.join(chars_to_ignore))}]"
        text = batch["sentence"].lower().strip()
        text = text.replace("\u0307", " ").strip()  # combining dot above
        text = self.multiple_replace(text, chars_to_mapping)
        text = self.remove_special_characters(text, chars_to_ignore_regex)
        batch["sentence"] = text
        return batch

    def speech_file_to_array_fn(self, batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        speech_array = speech_array.squeeze().numpy()
        # Resample to the 16 kHz rate the model was trained on;
        # librosa >= 0.10 requires keyword arguments here.
        speech_array = librosa.resample(
            np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16_000
        )
        batch["speech"] = speech_array
        return batch

    def predict(self, batch):
        features = self.processor(
            batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True
        )
        input_values = features.input_values.to(self.device)
        attention_mask = features.attention_mask.to(self.device)
        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits
        # Greedy CTC decoding: take the highest-scoring token at each frame.
        pred_ids = torch.argmax(logits, dim=-1)
        batch["predicted"] = self.processor.batch_decode(pred_ids)[0]
        return batch

    def predict_audio_file(self, speech):
        features = self.processor(
            speech, sampling_rate=16_000, return_tensors="pt", padding=True
        )
        input_values = features.input_values.to(self.device)
        attention_mask = features.attention_mask.to(self.device)
        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits
        pred_ids = torch.argmax(logits, dim=-1)
        return self.processor.decode(pred_ids[0])

    def load_speech_with_file(self, audio_file):
        # librosa resamples to 16 kHz on load, matching the model's expected rate.
        speech, rate = librosa.load(audio_file, sr=16_000)
        return speech, rate
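
# Minimal usage sketch: load the model once, then transcribe a file.
# "example.wav" is a hypothetical path; any speech recording works,
# since load_speech_with_file resamples to 16 kHz before inference.
if __name__ == "__main__":
    recognizer = SpeechRecognition().load_model()
    speech, rate = recognizer.load_speech_with_file("example.wav")
    transcription = recognizer.predict_audio_file(speech)
    print(transcription)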