Spaces:

yavuzkomecoglu
/

Turkish-Speech-Recognition

Runtime error

App Files Files Community

Turkish-Speech-Recognition / utils.py

yavuzkomecoglu

update model

3268a18 over 3 years ago

raw

history blame contribute delete

3.53 kB


	import librosa
	import torch
	import torchaudio
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoTokenizer
	from datasets import load_dataset

	import numpy as np
	import re

	# Punctuation / symbol strings to strip from transcripts.
	# Order and duplicates are kept exactly as authored: a consumer that joins
	# these into a regex character class is sensitive to element position
	# (e.g. a "-" sitting between two other characters).
	chars_to_ignore = [
	    ",", "?", ".", "!", "-", ";",
	    ":", '""', "%", "'", '"', "�",
	    "#", "!", "?", "«", "»", "(", ")", "؛",
	    ",", "?", ".", "!", "-", ";", ":", '"',
	    "“", "%", "‘", "�", "–",
	    "…", "_", "”", '“', '„',
	]

	# Invisible formatting code points, each mapped to a plain space:
	# U+200C zero-width non-joiner, U+200D zero-width joiner,
	# U+200E left-to-right mark, U+200F right-to-left mark,
	# U+FEFF zero-width no-break space / byte-order mark.
	chars_to_mapping = {
	    "\u200c": " ",
	    "\u200d": " ",
	    "\u200e": " ",
	    "\u200f": " ",
	    "\ufeff": " ",
	}



	class SpeechRecognition:
	    """Wav2Vec2 CTC speech-to-text helper.

	    Bundles three concerns: transcript normalisation (``normalizer`` and its
	    helpers), audio loading (``speech_file_to_array_fn``,
	    ``load_speech_with_file``) and greedy CTC decoding (``predict``,
	    ``predict_audio_file``).

	    ``load_model()`` must be called before either predict method: they read
	    ``self.device``, ``self.processor`` and ``self.model``, which only
	    ``load_model`` sets.
	    """

	    def __init__(self):
	        print("init SpeechRecognition")

	    def load_model(self, model_name="patrickvonplaten/wav2vec2-common_voice-tr-demo"):
	        """Load the processor and CTC model and move the model to the best device.

	        Args:
	            model_name: Checkpoint identifier passed to ``from_pretrained``.
	                Defaults to the checkpoint this file has always loaded.
	                ("m3hrdadfi/wav2vec2-large-xlsr-turkish" was the checkpoint
	                referenced by the earlier, commented-out version.)

	        Returns:
	            ``self``, so the call can be chained after construction.
	        """
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
	        self.model = Wav2Vec2ForCTC.from_pretrained(model_name).to(self.device)
	        return self

	    def multiple_replace(self, text, chars_to_mapping):
	        """Replace every occurrence of each mapping key in ``text`` with its value.

	        Args:
	            text: Input value; converted with ``str()`` before substitution.
	            chars_to_mapping: Dict of ``{substring: replacement}``.

	        Returns:
	            The substituted string. With an empty mapping the text is returned
	            unchanged (as ``str``).
	        """
	        text = str(text)
	        # Longest keys first so a longer key wins over one of its own prefixes;
	        # empty keys are dropped because they would match at every position.
	        keys = sorted((k for k in chars_to_mapping if k), key=len, reverse=True)
	        if not keys:
	            return text
	        # Plain "|" is regex alternation. The previous "\|" separator was an
	        # escaped literal pipe, so no individual key could ever match.
	        pattern = "|".join(map(re.escape, keys))
	        return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)

	    def remove_special_characters(self, text, chars_to_ignore_regex):
	        """Delete regex matches from ``text``, lowercase it and append one space.

	        Args:
	            text: String to clean.
	            chars_to_ignore_regex: Regular expression whose matches are removed.

	        Returns:
	            The cleaned, lowercased string followed by a single trailing space.
	        """
	        text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
	        return text

	    def normalizer(self, batch, chars_to_ignore, chars_to_mapping):
	        """Normalise ``batch["sentence"]`` in place and return the batch.

	        Steps: lowercase and strip, replace U+0307 with a space, apply
	        ``chars_to_mapping``, then remove every character listed in
	        ``chars_to_ignore``.

	        Args:
	            batch: Mapping with a ``"sentence"`` string; it is overwritten.
	            chars_to_ignore: Iterable of strings whose characters are removed.
	            chars_to_mapping: Dict forwarded to ``multiple_replace``.

	        Returns:
	            The same ``batch`` object with the normalised sentence.
	        """
	        if chars_to_ignore:
	            # Escape each entry: unescaped, a "-" between two characters (such
	            # as "!", "-", ";") forms a range and silently strips digits and
	            # other symbols that are not in the list.
	            escaped = "".join(re.escape(chars) for chars in chars_to_ignore)
	            chars_to_ignore_regex = f"[{escaped}]"
	        else:
	            # "[]" is not a valid pattern; use one that can never match.
	            chars_to_ignore_regex = r"(?!)"

	        text = batch["sentence"].lower().strip()

	        # NOTE(review): str.lower() turns "İ" into "i" + U+0307; replacing the
	        # combining mark with a space (rather than removing it) puts a space
	        # inside the word. Kept as-is - confirm it matches the text convention
	        # the model expects before changing.
	        text = text.replace("\u0307", " ").strip()
	        text = self.multiple_replace(text, chars_to_mapping)
	        text = self.remove_special_characters(text, chars_to_ignore_regex)

	        batch["sentence"] = text
	        return batch

	    def speech_file_to_array_fn(self, batch):
	        """Load ``batch["path"]`` with torchaudio and store it resampled to 16 kHz.

	        Args:
	            batch: Mapping with a ``"path"`` entry readable by ``torchaudio.load``.

	        Returns:
	            The same ``batch`` with the resampled NumPy array under ``"speech"``.
	        """
	        speech_array, sampling_rate = torchaudio.load(batch["path"])
	        speech_array = speech_array.squeeze().numpy()
	        # Pass the rates by keyword: newer librosa releases make them
	        # keyword-only, and the keyword form also works on older releases.
	        speech_array = librosa.resample(
	            np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16_000
	        )

	        batch["speech"] = speech_array
	        return batch

	    def _predict_ids(self, speech):
	        """Run the model on ``speech`` and return the argmax token ids per frame."""
	        features = self.processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)

	        input_values = features.input_values.to(self.device)
	        # Not every processor configuration returns an attention mask; fall
	        # back to None instead of failing on the missing attribute.
	        attention_mask = getattr(features, "attention_mask", None)
	        if attention_mask is not None:
	            attention_mask = attention_mask.to(self.device)

	        with torch.no_grad():
	            logits = self.model(input_values, attention_mask=attention_mask).logits

	        return torch.argmax(logits, dim=-1)

	    def predict(self, batch):
	        """Transcribe ``batch["speech"]`` and store the text in ``batch["predicted"]``.

	        Args:
	            batch: Mapping with a ``"speech"`` entry accepted by the processor.

	        Returns:
	            The same ``batch`` with the first decoded transcription added.
	        """
	        pred_ids = self._predict_ids(batch["speech"])
	        batch["predicted"] = self.processor.batch_decode(pred_ids)[0]
	        return batch

	    def predict_audio_file(self, speech):
	        """Transcribe ``speech`` (audio sampled at 16 kHz) and return the text."""
	        pred_ids = self._predict_ids(speech)
	        transcriptions = self.processor.decode(pred_ids[0])
	        return transcriptions

	    def load_speech_with_file(self, audio_file):
	        """Load ``audio_file`` with librosa at 16 kHz.

	        Returns:
	            ``(speech, rate)`` exactly as returned by ``librosa.load``.
	        """
	        speech, rate = librosa.load(audio_file, sr=16000)

	        return speech, rate