File size: 2,033 Bytes
c0baabd
 
 
 
 
 
 
 
 
 
 
 
 
 
95aa3d3
c0baabd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d87408
95aa3d3
 
 
 
 
6d87408
95aa3d3
 
 
 
c0baabd
 
 
 
 
6ec9e0e
 
 
 
c0baabd
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import torch
import librosa
import os
from model import Wav2Vec2BertForSequenceClassification
from transformers import AutoFeatureExtractor
# from optimum.bettertransformer import BetterTransformer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
# os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
# Seed torch's default random number generator with a fixed value.
torch.random.manual_seed(0); 
# protobuf==3.20.0

# Checkpoint identifier handed to both `from_pretrained` calls below
# (presumably a Hugging Face Hub repo id -- confirm).
model_name = "arslanarjumand/wav2vec-read_aloud"
# The feature extractor and the classifier are loaded once, at import time,
# and kept as module-level globals; the model is moved to `device`.
processor = AutoFeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2BertForSequenceClassification.from_pretrained(model_name).to(device)
# model = BetterTransformer.transform(model)

def load_audio(audio_path, processor, sampling_rate=16000):
    """Load an audio file and convert it into model input features.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        processor: Callable feature extractor. It is invoked with the
            waveform, ``sampling_rate`` and ``return_tensors="pt"``, and its
            result must expose an ``input_features`` attribute.
        sampling_rate: Rate (Hz) requested from ``librosa.load`` and reported
            to ``processor``. Defaults to 16000, the value that used to be
            hard-coded in both calls.

    Returns:
        The ``input_features`` attribute of the processor output.
    """
    # librosa.load returns (samples, rate); the rate is the one we asked for,
    # so it is discarded instead of being bound to an unused local.
    waveform, _ = librosa.load(audio_path, sr=sampling_rate)

    # The same rate is passed to both calls so the two can never disagree.
    encoded = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
    return encoded.input_features
        
@torch.inference_mode()
def get_emissions(input_values, model):
    """Run ``model`` on ``input_values`` and return the logits of the first item.

    The call runs under ``torch.inference_mode()``, so no autograd state is
    recorded for the returned tensor.

    Args:
        input_values: Tensor of model inputs.
        model: Callable whose result exposes a ``logits`` attribute. When it
            also provides ``parameters()`` (as ``torch.nn.Module`` does), the
            inputs are moved to the device of its first parameter.

    Returns:
        ``model(input_values).logits[0]``.
    """
    # The model may live on a different device than the freshly extracted
    # features (e.g. model on CUDA, features on CPU); a forward pass with
    # mismatched devices raises, so align the inputs with the model first.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        input_values = input_values.to(first_param.device)

    return model(input_values).logits[0]

def validate_range(value, minus):
    """Rescale a raw model output to an integer score within [10, 90].

    The raw value is mapped with ``10 + value * 80``. Scores above 90 are
    capped at 90. Scores strictly between 10 and 50 are reduced by ``minus``.
    The result is never allowed below 10.

    Args:
        value: Raw numeric model output.
        minus: Penalty subtracted from scores strictly between 10 and 50.

    Returns:
        The adjusted score as an ``int`` (truncated toward zero).
    """
    score = 10 + value * 80
    if score > 90:
        return 90

    if 10 < score < 50:
        score = score - minus

    # Clamp after applying the penalty: previously a score just above 10 could
    # be pushed below the floor (e.g. 15 with a penalty of 12 produced 3).
    return int(max(score, 10))

def speaker_pronunciation_assesment(audio_path):
    """Score a recording and return the four rescaled scores as a dict.

    The audio is converted to features with the module-level ``processor``,
    run through the module-level ``model``, and each of the first four output
    values is passed through ``validate_range`` with its own penalty.

    Args:
        audio_path: Path of the audio file to assess.

    Returns:
        Dict with the keys ``pronunciation_accuracy``, ``content_scores``,
        ``total_score`` and ``fluency_score``, each holding the value returned
        by ``validate_range``.
    """
    features = load_audio(audio_path, processor)
    logits = get_emissions(features, model)

    def scaled(index, penalty):
        # Pull one output value back to a Python float and rescale it.
        return validate_range(logits[index].cpu().item(), penalty)

    # Output layout: 0 = pronunciation, 1 = fluency, 2 = total, 3 = content.
    return {
        'pronunciation_accuracy': scaled(0, 5),
        'content_scores': scaled(3, 7),
        'total_score': scaled(2, 5),
        'fluency_score': scaled(1, 12),
    }

# No script behaviour is defined: running this file directly only performs the
# module-level setup above and then does nothing further.
if __name__ == '__main__':
    pass