"""Score a read-aloud recording with a Wav2Vec2-BERT sequence classifier.

Importing this module loads the feature extractor and the model weights for
``model_name`` and moves the model to ``device``.
"""

import torch
import librosa
import os
from model import Wav2Vec2BertForSequenceClassification
from transformers import AutoFeatureExtractor
# from optimum.bettertransformer import BetterTransformer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Optional environment tweaks, currently disabled:
# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
# os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

torch.random.manual_seed(0)

# Dependency note: protobuf==3.20.0
model_name = "arslanarjumand/wav2vec-read_aloud"
processor = AutoFeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2BertForSequenceClassification.from_pretrained(model_name).to(device)
# model = BetterTransformer.transform(model)

# Sampling rate used both for decoding the audio and for feature extraction.
TARGET_SAMPLE_RATE = 16000

# Bounds of the reported score scale and the upper edge of the band in which
# the per-score penalty is applied.
SCORE_MIN = 10
SCORE_MAX = 90
PENALTY_BAND_UPPER = 50

# (result key, index into the model logits, penalty for the low band).
# The order of the entries is the key order of the returned dictionary.
_SCORE_LAYOUT = (
    ('pronunciation_accuracy', 0, 5),
    ('content_scores', 3, 7),
    ('total_score', 2, 5),
    ('fluency_score', 1, 12),
)


def load_audio(audio_path, processor):
    """Load an audio file and convert it to model input features.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        processor: Feature extractor called on the decoded waveform.

    Returns:
        The ``input_features`` tensor produced by ``processor`` with
        ``return_tensors="pt"``.
    """
    # librosa resamples to the requested rate; the returned rate is unused.
    waveform, _ = librosa.load(audio_path, sr=TARGET_SAMPLE_RATE)
    features = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    )
    return features.input_features


@torch.inference_mode()
def get_emissions(input_values, model):
    """Run ``model`` on ``input_values`` and return the first row of logits.

    Args:
        input_values: Input features for the model; they must already live on
            the same device as ``model``.
        model: Callable model whose output exposes a ``logits`` attribute.

    Returns:
        ``logits[0]``, i.e. the entry at index 0 of the leading dimension
        (presumably the batch dimension - confirm against the model).
    """
    return model(input_values).logits[0]


def validate_range(value, minus):
    """Map a raw model output onto the ``SCORE_MIN``..``SCORE_MAX`` scale.

    The raw value is scaled as ``10 + value * 80``. Results above 90 are capped
    at 90. Results strictly between 10 and 50 are reduced by ``minus``. The
    final score is never below 10.

    Args:
        value: Raw model output to scale.
        minus: Penalty subtracted when the scaled value is in the open
            interval (10, 50).

    Returns:
        The score as an ``int`` within [10, 90].
    """
    scaled = SCORE_MIN + value * (SCORE_MAX - SCORE_MIN)
    if scaled > SCORE_MAX:
        return SCORE_MAX
    if SCORE_MIN < scaled < PENALTY_BAND_UPPER:
        scaled -= minus
    # Clamp after the penalty as well: subtracting ``minus`` from a value just
    # above the floor would otherwise yield a score below SCORE_MIN.
    return int(max(scaled, SCORE_MIN))


def speaker_pronunciation_assesment(audio_path):
    """Score the recording at ``audio_path``.

    Args:
        audio_path: Path of the audio file to assess.

    Returns:
        A dict with the integer scores ``pronunciation_accuracy``,
        ``content_scores``, ``total_score`` and ``fluency_score``.
    """
    # The feature extractor returns CPU tensors while the model lives on
    # ``device``; move the inputs so the forward pass also works on CUDA.
    input_values = load_audio(audio_path, processor).to(device)
    result_scores = get_emissions(input_values, model)

    # NOTE(review): round() without ndigits turns each raw logit into an
    # integer before scaling, so scaled values land only on 10 + 80 * k and the
    # penalty band (10, 50) in validate_range is never reached. Confirm whether
    # a finer rounding (or none) was intended.
    return {
        key: validate_range(round(result_scores[index].cpu().item()), penalty)
        for key, index, penalty in _SCORE_LAYOUT
    }


if __name__ == '__main__':
    pass