# Spaces:
# Sleeping
# Sleeping
import torch | |
import librosa | |
import os | |
from model import Wav2Vec2BertForSequenceClassification | |
from transformers import AutoFeatureExtractor | |
# from optimum.bettertransformer import BetterTransformer | |
# Fix the global PyTorch RNG seed so repeated runs start from the same state.
torch.manual_seed(0)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): the switches below were present but disabled in the original
# file; they are kept for reference only -- confirm whether any is still needed.
#   os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
#   os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
#   os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
#   pinned dependency note: protobuf==3.20.0
#   model = BetterTransformer.transform(model)

# Checkpoint identifier shared by the feature extractor and the classifier.
model_name = "arslanarjumand/wav2vec-reptiles"

# Both objects are created once at import time and reused by the functions below.
processor = AutoFeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2BertForSequenceClassification.from_pretrained(model_name)
model = model.to(device)
def load_audio(audio_path, processor, sampling_rate=16000):
    """Load an audio file and turn it into model input features.

    Args:
        audio_path: Location of the recording, passed straight to
            ``librosa.load``.
        processor: Callable feature extractor. It is invoked with the
            waveform, ``sampling_rate=...`` and ``return_tensors="pt"`` and
            must return an object exposing ``input_features``.
        sampling_rate: Rate in Hz used both when loading the file and when
            describing the waveform to ``processor``. Defaults to 16000, the
            value that was previously hard-coded in both places.

    Returns:
        The ``input_features`` attribute of the processor output.
    """
    # The rate returned by librosa is not needed: the same value is passed
    # on to the processor so the two can never drift apart.
    waveform, _ = librosa.load(audio_path, sr=sampling_rate)
    features = processor(
        waveform, sampling_rate=sampling_rate, return_tensors="pt"
    )
    return features.input_features
def get_emissions(input_values, model):
    """Run ``model`` on ``input_values`` and return the first item's logits.

    Args:
        input_values: Tensor of input features for the model.
        model: Callable whose result exposes a ``logits`` attribute. When it
            also provides ``parameters()`` (as ``torch.nn.Module`` does), the
            inputs are moved to the device of its first parameter.

    Returns:
        ``logits[0]`` of the model output, computed without gradient
        tracking.
    """
    # Put the inputs on the same device as the weights; otherwise a model
    # living on the GPU fails when it is fed CPU tensors.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        input_values = input_values.to(first_param.device)

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        return model(input_values).logits[0]
def vlaidate_range(score, lower=10, upper=90):
    """Clamp ``score`` into the closed interval ``[lower, upper]``.

    Args:
        score: Value to clamp.
        lower: Smallest value that may be returned. Defaults to 10.
        upper: Largest value that may be returned. Defaults to 90.

    Returns:
        ``upper`` when ``score`` is not ``<= upper``, ``lower`` when the
        result is not ``>= lower``, otherwise ``score`` unchanged.

    Raises:
        ValueError: If ``lower`` is greater than ``upper``.
    """
    if lower > upper:
        raise ValueError(
            f"lower bound {lower!r} exceeds upper bound {upper!r}"
        )
    # Same comparison order as the original: cap at the top first, then
    # lift to the bottom.
    capped = score if score <= upper else upper
    return capped if capped >= lower else lower
def speaker_pronunciation_assesment(audio_path):
    """Score a recording with the module-level ``processor`` and ``model``.

    The audio is loaded with ``load_audio`` and scored with
    ``get_emissions``. The first four values of the result are rounded to
    the nearest integer and passed through ``vlaidate_range``.

    Args:
        audio_path: Location of the recording to assess.

    Returns:
        A dict with the keys ``'pronunciation_accuracy'`` (output index 0),
        ``'content_scores'`` (index 3), ``'total_score'`` (index 2) and
        ``'fluency_score'`` (index 1).
    """
    input_values = load_audio(audio_path, processor)
    # Copy the whole score vector to the host once rather than issuing a
    # separate ``.cpu().item()`` transfer for every element.
    raw_scores = get_emissions(input_values, model).cpu().tolist()

    def bounded(index):
        # Round to the nearest integer, then clamp via vlaidate_range.
        return vlaidate_range(round(raw_scores[index]))

    pronunciation_score = bounded(0)
    fluency_score = bounded(1)
    total_score = bounded(2)
    content_scores = bounded(3)
    return {
        'pronunciation_accuracy': pronunciation_score,
        'content_scores': content_scores,
        'total_score': total_score,
        'fluency_score': fluency_score,
    }
# Script entry point: intentionally a no-op. Running this file directly only
# executes the module-level setup above; no assessment is performed.
if __name__ == '__main__':
    pass