Delete wav2vec_aligen.py

wav2vec_aligen.py  DELETED  (+0 -285)
@@ -1,285 +0,0 @@
from dataclasses import dataclass
import torch
import librosa
import numpy as np
import os
import scipy.stats as stats

# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
os.environ['MODEL_IS_LOADED'] = '0'
# os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
# os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from optimum.bettertransformer import BetterTransformer

torch.random.manual_seed(0)

# Load the phoneme-level CTC model once at import time (CPU inference).
model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
processor = Wav2Vec2Processor.from_pretrained(model_name, phone_delimiter_token=' ', word_delimiter_token=' ')
model = Wav2Vec2ForCTC.from_pretrained(model_name).to('cpu').eval()
model = BetterTransformer.transform(model)

@dataclass
class Point:
    token_index: int
    time_index: int
    score: float

# A merged run of identical labels, with its frame span and average score.
@dataclass
class Segment:
    label: str
    start: int
    end: int
    score: float

    def __repr__(self):
        return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d}]"

    def __len__(self):
        return self.end - self.start

def get_trellis(emission, tokens, blank_id=0):
    num_frame = emission.size(0)
    num_tokens = len(tokens)

    trellis = torch.zeros((num_frame, num_tokens))
    trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
    trellis[0, 1:] = -float("inf")
    trellis[-num_tokens + 1:, 0] = float("inf")

    for t in range(num_frame - 1):
        trellis[t + 1, 1:] = torch.maximum(
            trellis[t, 1:] + emission[t, blank_id],     # Score for staying at the same token
            trellis[t, :-1] + emission[t, tokens[1:]],  # Score for changing to the next token
        )
    return trellis
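
# Shape note (illustrative, not from the original file): for an emission of
# 100 frames and a 12-token phoneme sequence, the trellis is a (100, 12)
# matrix whose cell (t, j) holds the best cumulative log-probability of
# having consumed j tokens by frame t; the -inf/inf borders keep the
# backtracking pass inside feasible paths.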


def backtrack(trellis, emission, tokens, blank_id=0):
    t, j = trellis.size(0) - 1, trellis.size(1) - 1

    alignment_path = [Point(j, t, emission[t, blank_id].exp().item())]
    while j > 0:
        # Should not happen, but guard against running out of frames.
        assert t > 0

        # 1. Figure out if the current position was stay or change.
        # Frame-wise score of stay vs change
        p_stay = emission[t - 1, blank_id]
        p_change = emission[t - 1, tokens[j]]

        # Context-aware score for stay vs change
        stayed = trellis[t - 1, j] + p_stay
        changed = trellis[t - 1, j - 1] + p_change

        # Update position
        t -= 1
        if changed > stayed:
            j -= 1

        # Store the alignment path with frame-wise probability.
        prob = (p_change if changed > stayed else p_stay).exp().item()
        alignment_path.append(Point(j, t, prob))

    # Now j == 0, which means it reached the start of the token sequence (SoS).
    # Fill up the rest for the sake of visualization.
    while t > 0:
        prob = emission[t - 1, blank_id].exp().item()
        alignment_path.append(Point(j, t - 1, prob))
        t -= 1

    return alignment_path[::-1]

def merge_repeats(alignment_path, ph):
    i1, i2 = 0, 0
    segments = []
    while i1 < len(alignment_path):
        while i2 < len(alignment_path) and alignment_path[i1].token_index == alignment_path[i2].token_index:
            i2 += 1
        score = sum(alignment_path[k].score for k in range(i1, i2)) / (i2 - i1)
        segments.append(
            Segment(
                ph[alignment_path[i1].token_index],
                alignment_path[i1].time_index,
                alignment_path[i2 - 1].time_index + 1,
                score,
            )
        )
        i1 = i2
    return segments
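
# A minimal, hedged sanity check of the three alignment primitives above on
# toy inputs. The frame count, class count, and token ids are illustrative
# only (not from the original file); index 0 plays the role of the CTC blank.
if __name__ == '__main__':
    toy_emission = torch.log_softmax(torch.randn(20, 6), dim=-1)  # 20 frames, 6 classes
    toy_tokens = [0, 1, 2, 3, 4]  # blank followed by four toy phoneme ids
    toy_trellis = get_trellis(toy_emission, toy_tokens)
    toy_path = backtrack(toy_trellis, toy_emission, toy_tokens)
    print(merge_repeats(toy_path, ['<blank>', 'a', 'b', 'c', 'd']))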


def load_model(device='cpu'):
    model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
    processor = Wav2Vec2Processor.from_pretrained(model_name, phone_delimiter_token=' ', word_delimiter_token=' ')
    model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device).eval()
    model = BetterTransformer.transform(model)
    return processor, model

def load_audio(audio_path, processor):
    # Resample to the 16 kHz rate the wav2vec2 model expects.
    audio, sr = librosa.load(audio_path, sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values
    return input_values


@torch.inference_mode()
def get_emissions(input_values, model):
    emissions = model(input_values).logits
    emissions = torch.log_softmax(emissions, dim=-1)
    emission = emissions[0].cpu().detach()
    return emission

def get_chnocial_phonemes(transcript, processor):
    # Insert a word boundary so the phonemizer does not merge "from the".
    transcript = transcript.replace('from the', 'from | the')
    phoneme_ids = processor.tokenizer(transcript).input_ids
    ph = processor.tokenizer.phonemize(transcript)
    # The phonemizer separates words with a wider run of spaces than it uses
    # between phonemes; the exact widths were lost in this page's rendering,
    # so the triple-space delimiter below is a reconstruction.
    phoneme_list = ph.replace('   ', ' ').split()
    transcript = transcript.replace('from | the', 'from the')
    words = transcript.split()
    words_phonemes = ph.split('   ')
    words_phoneme_mapping = [(w, p) for w, p in zip(words, words_phonemes)]

    return phoneme_list, phoneme_ids, words_phoneme_mapping
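
# Hedged illustration of the data layout returned above; the phoneme strings
# are approximate espeak output, shown only to document the structure:
#   get_chnocial_phonemes('go home', processor)
#   -> (['ɡ', 'oʊ', 'h', 'oʊ', 'm'],           # flat phoneme list
#       [31, 17, ...],                         # tokenizer ids (values illustrative)
#       [('go', 'ɡ oʊ'), ('home', 'h oʊ m')])  # per-word phoneme strings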


def word_level_scoring(words_phoneme_mapping, segments):
    word_scores = []
    start = 0
    for word, ph_seq in words_phoneme_mapping:
        n_ph = len(ph_seq.split())
        cum_score = 0
        wrong = 0
        for i in range(start, start + n_ph):
            s = segments[i]
            cum_score += s.score
            if s.score < 0.50:
                wrong += 1

        start += n_ph
        word_scores.append((word, np.round(cum_score / n_ph, 5), np.round(wrong / n_ph, 5)))
    return word_scores

def map_word2_class(word_scores):
    word_levels = []
    for w, sc, wrong_ratio in word_scores:
        if wrong_ratio > 0.5 or sc < 0.60:
            word_levels.append((w, '/'))
        elif sc < 0.70:
            word_levels.append((w, 'Wrong'))
        elif sc < 0.85:
            word_levels.append((w, 'Understandable'))
        else:
            word_levels.append((w, 'Correct'))
    return word_levels
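
# Worked example of the thresholds above: a word with score 0.82 and
# wrong_ratio 0.2 maps to 'Understandable'; a score of 0.55 maps to '/'
# regardless of wrong_ratio; and a score of 0.90 with wrong_ratio 0.6 also
# maps to '/' because the wrong_ratio test fires first.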

def calculate_content_scores(word_levels):
    # Start from full credit and subtract per misread word:
    # '/' costs a whole word, 'Wrong' costs half, other classes cost nothing.
    content_scores = len(word_levels)
    for w, c in word_levels:
        if c == '/':
            content_scores -= 1
        elif c == 'Wrong':
            content_scores -= 0.5
    content_scores = (content_scores / len(word_levels)) * 100
    return content_scores

def calculate_sentence_pronunciation_accuracy(word_scores):
    w_scores = 0
    error_scores = 0
    for w, sc, wrong_ratio in word_scores:
        sc = sc * 100
        # Piecewise-linear remap of raw scores above 60 onto a stricter scale:
        # (60, 70) -> (0, 20), [70, 88) -> [20, 70), [88, 100] -> [70, 100].
        if sc > 60:
            if sc < 70:
                sc = ((sc - 60) / (70 - 60)) * (20 - 0) + 0
            elif sc < 88:
                sc = ((sc - 70) / (88 - 70)) * (70 - 20) + 20
            else:
                sc = ((sc - 88) / (100 - 88)) * (100 - 70) + 70
        w_scores += sc
        error_scores += wrong_ratio
    w_scores = w_scores / len(word_scores)
    # w_scores = ((w_scores - 50) / (100 - 50)) * 100
    error_scores = (error_scores / len(word_scores)) * 40
    pronunciation_accuracy = min(w_scores, w_scores - error_scores)
    return pronunciation_accuracy
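
# Worked example of the remap: a word score of 0.79 becomes 79, falls in the
# middle band, and maps to ((79 - 70) / 18) * 50 + 20 = 45.0; a score of 0.94
# maps to ((94 - 88) / 12) * 30 + 70 = 85.0. Scores of 60 or below pass
# through unchanged.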

def get_hard_aligenment_with_scores(input_values, transcript):
    # Uses the module-level processor/model; the per-call load is kept for reference.
    # processor, model = load_model(device='cpu')
    emission = get_emissions(input_values, model)
    phoneme_list, phoneme_ids, words_phoneme_mapping = get_chnocial_phonemes(transcript, processor)
    trellis = get_trellis(emission, phoneme_ids)
    alignment_path = backtrack(trellis, emission, phoneme_ids)
    segments = merge_repeats(alignment_path, phoneme_list)
    return segments, words_phoneme_mapping

def normalize_aspect(value, mean, std):
    """Normalize an aspect of speech using the normal CDF."""
    return stats.norm(mean, std).cdf(value)
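
# Worked example: normalize_aspect(170, 170, 50) is exactly 0.5 (the mean of
# the fitted normal), and normalize_aspect(220, 170, 50) = Phi(1) ≈ 0.8413.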

def calculate_fluency_scores(audio, total_words, content_score, pron_score):
    # Constants
    content_score, pron_score = content_score / 100, pron_score / 100
    sample_rate = 16000  # Assuming a sample rate of 16 kHz
    # Means and standard deviations assumed for fluent speech
    speech_rate_mean, speech_rate_std = 170, 50
    phonation_time_mean, phonation_time_std = 50, 4

    # Calculate speaking and total duration
    non_silent_intervals = librosa.effects.split(audio, top_db=20)
    speaking_time = sum(intv[1] - intv[0] for intv in non_silent_intervals) / sample_rate
    total_duration = len(audio) / sample_rate

    # Phonation time ratio, scaled to a per-minute figure
    phonation_time_ratio = speaking_time / total_duration * 60
    phonation_time_ratio = normalize_aspect(phonation_time_ratio, phonation_time_mean, phonation_time_std)
    # Fold the CDF around 0.5 so both too little and too much phonation are
    # penalized, then rescale to [0, 1].
    if phonation_time_ratio > 0.5:
        phonation_time_ratio = 0.5 - (phonation_time_ratio - 0.5)
    phonation_time_ratio = (phonation_time_ratio / 0.5) * 1

    # Words per minute, discounted by the content score, then folded the same way
    speech_rate = total_words / (total_duration / 60)
    speech_rate = speech_rate * content_score
    speech_rate_score = normalize_aspect(speech_rate, speech_rate_mean, speech_rate_std)
    if speech_rate_score > 0.5:
        speech_rate_score = 0.5 - (speech_rate_score - 0.5)
    speech_rate_score = (speech_rate_score / 0.5) * 1

    # Weighted combination (weights sum to 1)
    w_rate_score = 0.4
    w_pho_ratio = 0.35
    w_pro = 0.25
    scaled_fluency_score = speech_rate_score * w_rate_score + phonation_time_ratio * w_pho_ratio + pron_score * w_pro
    scaled_fluency_score = scaled_fluency_score * 100
    return scaled_fluency_score, speech_rate
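
# Worked example of the weighting above: component scores of 0.6 (rate),
# 0.8 (phonation), and 0.7 (pronunciation) combine to
# (0.6 * 0.4 + 0.8 * 0.35 + 0.7 * 0.25) * 100 = 69.5.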


def speaker_pronunciation_assesment(audio_path, transcript):
    input_values = load_audio(audio_path, processor)
    segments, words_phoneme_mapping = get_hard_aligenment_with_scores(input_values, transcript)
    word_scores = word_level_scoring(words_phoneme_mapping, segments)
    word_levels = map_word2_class(word_scores)
    content_scores = calculate_content_scores(word_levels)
    pronunciation_accuracy = calculate_sentence_pronunciation_accuracy(word_scores)
    # librosa expects a NumPy array, so unwrap the first (only) batch item.
    fluency_accuracy, wpm = calculate_fluency_scores(input_values[0].numpy(), len(word_scores), content_scores, pronunciation_accuracy)

    result = {'pronunciation_accuracy': pronunciation_accuracy,
              'word_levels': word_levels,
              'content_scores': content_scores,
              'wpm': wpm,
              'stress': None,
              'fluency_score': fluency_accuracy}
    return result

# The flag is the same whether the module is run as a script or imported.
MODEL_IS_LOADED = False
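
# Hedged usage sketch: 'sample.wav' is a placeholder path (any audio file
# librosa can read works), and the transcript should be the text the speaker
# was asked to read aloud.
if __name__ == '__main__':
    result = speaker_pronunciation_assesment('sample.wav', 'i would like to go home')
    print(result['pronunciation_accuracy'], result['fluency_score'], result['word_levels'])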