|
import base64
import functools
import io
import math

import gradio as gr
import librosa
import phonemizer
import torch
from strsimpy.jaro_winkler import JaroWinkler
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Checkpoint that transcribes audio directly into phonemes.
_PHONEME_CHECKPOINT = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
# Checkpoint that transcribes audio into plain text.
_TEXT_CHECKPOINT = "facebook/wav2vec2-base-960h"

# (inclusive lower bound on the similarity score, band), highest band first.
# Scores below the last bound fall through to band 1.
_BAND_THRESHOLDS = (
    (0.91, 9),
    (0.81, 8),
    (0.73, 7),
    (0.65, 6),
    (0.60, 5),
    (0.46, 4),
    (0.35, 3),
    (0.1, 2),
)


@functools.lru_cache(maxsize=None)
def _load_wav2vec2(checkpoint):
    """Return the ``(processor, model)`` pair for *checkpoint*, loading it once.

    The result is memoised so that repeated calls to ``lark`` do not reload
    the checkpoints on every request.
    """
    processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    return processor, model


def _transcribe(checkpoint, waveform, sample_rate):
    """Run greedy CTC decoding of *waveform* with the given checkpoint.

    The same processor is used to build the model input and to decode the
    predicted ids, so the ids are always mapped through the vocabulary that
    belongs to the model which produced them.
    """
    processor, model = _load_wav2vec2(checkpoint)
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    # Inference only: no gradients are needed for either model.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]


def getBandFromSimilarityScore(similarity_score):
    """Map a similarity score to a band from 1 to 9.

    Each band has an inclusive lower bound (see ``_BAND_THRESHOLDS``); the
    first bound the score reaches wins. Anything below 0.1 is band 1.
    """
    for lower_bound, band in _BAND_THRESHOLDS:
        if similarity_score >= lower_bound:
            return band
    return 1


def lark(audioAsB64):
    """Score a base64-encoded audio clip and return ``[similarity, band]``.

    The clip is transcribed twice: once straight to phonemes, and once to
    text which is then converted to phonemes with ``phonemizer``. The
    Jaro-Winkler similarity of the two phoneme strings is mapped to a band
    by ``getBandFromSimilarityScore``.

    Args:
        audioAsB64: The audio file contents, base64-encoded, as a ``str``.

    Returns:
        A two-element list ``[similarity_score, IELTSband]``.
    """
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))

    # Decode the audio once, at 16 kHz, and reuse it for both models.
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)

    speech_to_phonemes = _transcribe(_PHONEME_CHECKPOINT, waveform, sample_rate)

    # The text ids must be decoded with the text model's own processor;
    # decoding them with the phoneme processor yields a meaningless string.
    speech_to_text = _transcribe(_TEXT_CHECKPOINT, waveform, sample_rate)

    text_to_phonemes = phonemizer.phonemize(speech_to_text)

    similarity_score = JaroWinkler().similarity(
        speech_to_phonemes, text_to_phonemes
    )
    IELTSband = getBandFromSimilarityScore(similarity_score)

    return [similarity_score, IELTSband]
|
|
|
|
|
# Expose `lark` through a Gradio interface: one text input (the base64-encoded
# audio string) and two text outputs (the similarity score and the band).
iface = gr.Interface(
    fn=lark,
    inputs="text",
    outputs=["text", "text"],
)

# Start serving the interface.
iface.launch()
|
|