import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import phonemizer
import librosa
import math
import io
import base64
from strsimpy.jaro_winkler import JaroWinkler

# pipeline overview:
# base64 to audio
# audio to phoneme transcription
# audio to text
# text to phoneme
# accuracy = jaroWinkler(transcription, phoneme)
# band = getBandFromAccuracy(accuracy)
# return accuracy, band

def lark(audioAsB64):
    # base64 to wav data conversion
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))

    # audio to transcription
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
    )
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    speechToPhonemeTranscription = processor.batch_decode(predicted_ids)[0]

    # audio to text
    processorSTT = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    modelSTT = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    input_values = processorSTT(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = modelSTT(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    speechToTextTranscription = processorSTT.batch_decode(predicted_ids)[0]

    # text to phoneme (espeak backend keeps the reference phonemes in the same
    # symbol set as the espeak-trained acoustic model above)
    graphemeToPhonemeTranscription = phonemizer.phonemize(
        speechToTextTranscription, language="en-us", backend="espeak"
    )

    # accuracy = jaroWinkler(transcription, phoneme)
    jarowinkler = JaroWinkler()
    similarity_score = jarowinkler.similarity(
        speechToPhonemeTranscription, graphemeToPhonemeTranscription
    )

    # ielts pronunciation band estimation
    def getBandFromSimilarityScore(similarity_score):
        if similarity_score >= 0.91:
            return 9
        elif similarity_score >= 0.81:
            return 8
        elif similarity_score >= 0.73:
            return 7
        elif similarity_score >= 0.65:
            return 6
        elif similarity_score >= 0.60:
            return 5
        elif similarity_score >= 0.46:
            return 4
        elif similarity_score >= 0.35:
            return 3
        elif similarity_score >= 0.1:
            return 2
        else:
            return 1

    IELTSband = getBandFromSimilarityScore(similarity_score)
    return [similarity_score, IELTSband, speechToTextTranscription]

iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text", "text"])
iface.launch()
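
# A minimal local sketch of how the function might be exercised outside the Gradio UI,
# assuming a recording named "sample.wav" sits next to this script (the file name is
# hypothetical). A client would base64-encode the audio the same way before sending it
# to the Space; calling lark() directly returns the similarity score, IELTS band, and
# the speech-to-text transcription.
#
# with open("sample.wav", "rb") as f:
#     audio_b64 = base64.b64encode(f.read()).decode("utf-8")
# score, band, text = lark(audio_b64)
# print(score, band, text)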