# test / app.py
# Aryan Wadhawan
# Implemented everything
# 3e7b6ee
# raw
# history blame
# 2.77 kB
import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import phonemizer
import librosa
import math
import io
import base64
from strsimpy.jaro_winkler import JaroWinkler
# base64 to audio ✅
# audio to transcription ✅
# audio to text ✅
# text to phoneme ✅
# accuracy = jarowinkler(transcription, phoneme) ✅
# band = getBandFromAccuracy(accuracy) ✅
# return accuracy, band ✅
# Hugging Face checkpoints: one CTC model that emits phonemes, one that emits text.
_PHONEME_CHECKPOINT = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
_TEXT_CHECKPOINT = "facebook/wav2vec2-base-960h"

# Audio is resampled to this rate when it is loaded.
_SAMPLE_RATE = 16000

# Loaded (processor, model) pairs keyed by checkpoint name, so the weights are
# read once per process instead of on every request.
_CTC_CACHE = {}

# (minimum similarity, band) pairs, checked from the highest band downwards.
_BAND_THRESHOLDS = (
    (0.91, 9),
    (0.81, 8),
    (0.73, 7),
    (0.65, 6),
    (0.60, 5),
    (0.46, 4),
    (0.35, 3),
    (0.1, 2),
)


def _load_ctc(checkpoint):
    """Return the cached ``(processor, model)`` pair for *checkpoint*."""
    if checkpoint not in _CTC_CACHE:
        _CTC_CACHE[checkpoint] = (
            Wav2Vec2Processor.from_pretrained(checkpoint),
            Wav2Vec2ForCTC.from_pretrained(checkpoint),
        )
    return _CTC_CACHE[checkpoint]


def _transcribe(checkpoint, waveform, sample_rate):
    """Greedy-decode *waveform* with the CTC model stored under *checkpoint*."""
    processor, model = _load_ctc(checkpoint)
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    # Inference only: no gradients are needed for either model.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # The ids must be decoded by the processor that belongs to this model;
    # the phoneme and text checkpoints use different vocabularies.
    return processor.batch_decode(predicted_ids)[0]


def getBandFromSimilarityScore(similarity_score):
    """Map a similarity score to an estimated IELTS pronunciation band.

    Args:
        similarity_score: Similarity between the two phoneme transcriptions.

    Returns:
        An integer band from 9 (score >= 0.91) down to 1 (score < 0.1).
    """
    for minimum, band in _BAND_THRESHOLDS:
        if similarity_score >= minimum:
            return band
    return 1


def lark(audioAsB64):
    """Estimate a pronunciation score from base64-encoded audio.

    The audio is transcribed twice: directly to phonemes, and to text that is
    then converted to phonemes with ``phonemizer``. The Jaro-Winkler similarity
    of the two phoneme strings is the score, which is mapped to a band.

    Args:
        audioAsB64: Audio file contents encoded as a base64 string.

    Returns:
        A list ``[similarity_score, IELTSband, speechToTextTranscription]``.
    """
    # base64 to audio samples
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=_SAMPLE_RATE)

    # audio to phonemes, and audio to text
    spoken_phonemes = _transcribe(_PHONEME_CHECKPOINT, waveform, sample_rate)
    text_transcription = _transcribe(_TEXT_CHECKPOINT, waveform, sample_rate)

    # Without recognised speech both transcriptions are empty, and two
    # identical strings compare as a perfect match; report the lowest score
    # instead of awarding the top band to silence.
    if not text_transcription.strip():
        return [0.0, getBandFromSimilarityScore(0.0), text_transcription]

    # text to phonemes
    expected_phonemes = phonemizer.phonemize(text_transcription)

    # accuracy = jaroWinkler(transcription, phoneme)
    similarity_score = JaroWinkler().similarity(spoken_phonemes, expected_phonemes)

    # ielts pronunciation band estimation
    band = getBandFromSimilarityScore(similarity_score)
    return [similarity_score, band, text_transcription]
# Expose lark() through a Gradio interface with one text input (the base64
# audio string) and three text outputs, then start serving it.
iface = gr.Interface(
    fn=lark,
    inputs="text",
    outputs=["text"] * 3,
)
iface.launch()