# rank_speech / app.py
# Mohammad Sabik Irbaz
# add microphone
# 6e71fd8
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import softmax
from transformers import pipeline
import time, librosa, torch, io
from pydub import AudioSegment
import gradio as gr
import numpy as np
# Single device string shared by every model loaded below.
device = 'cpu'

# Same labels that classify() returns; not referenced elsewhere in the
# visible code.
cols = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# T5 tokenizer/model pair: test_speech encodes the transcript with the
# tokenizer and hands `lm` to fluency_scoring.
tokenizer = AutoTokenizer.from_pretrained('t5-base')
lm = AutoModel.from_pretrained('t5-base').to(device)

# Sentence-embedding model read as a global by similarity_scoring.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

# Speech recognizer read as a global by speech_to_text.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    chunk_length_s=30,
    device=device,
)
def vocab_scoring(tokens, duration):
    """Score vocabulary variety as distinct tokens per unit of duration.

    Args:
        tokens: Iterable of hashable tokens (test_speech passes token ids).
        duration: Length of the recording; test_speech passes minutes.

    Returns:
        An integer band from 1 (lowest rate) to 6 (highest rate). A duration
        that is not a positive number yields 1 instead of raising
        ZeroDivisionError.
    """
    # `not duration > 0` also rejects NaN, which fails every comparison.
    if not duration > 0:
        return 1

    vocab_rate = len(set(tokens)) / duration

    # Exclusive upper bounds of bands 1..5; a rate at or above the last
    # bound belongs to band 6.
    for band, upper in enumerate((40, 45, 55, 75, 85), start=1):
        if vocab_rate < upper:
            return band
    return 6
def word_scoring(tokens, duration):
    """Score speaking rate as tokens per unit of duration.

    Args:
        tokens: Sized collection of tokens (test_speech passes token ids).
        duration: Length of the recording; test_speech passes minutes.

    Returns:
        An integer band from 1 (lowest rate) to 6 (highest rate). A duration
        that is not a positive number yields 1 instead of raising
        ZeroDivisionError.
    """
    # `not duration > 0` also rejects NaN, which fails every comparison.
    if not duration > 0:
        return 1

    word_rate = len(tokens) / duration

    # Exclusive upper bounds of bands 1..5; a rate at or above the last
    # bound belongs to band 6.
    for band, upper in enumerate((65, 90, 117, 142, 175), start=1):
        if word_rate < upper:
            return band
    return 6
def _entropy_perplexity(token_ids, model):
    """Return exp(mean entropy) of the softmax over ``last_hidden_state``.

    The same ids are fed to ``model`` as ``input_ids`` and
    ``decoder_input_ids``; the result is returned as a Python float.
    """
    with torch.no_grad():
        outputs = model(input_ids=token_ids, decoder_input_ids=token_ids)
        logits = outputs.last_hidden_state
        probas = softmax(logits, dim=-1)
        # log_softmax stays finite where softmax underflows to exactly 0, so
        # such entries contribute 0 to the entropy instead of 0 * -inf = NaN.
        log_probas = torch.log_softmax(logits, dim=-1)
        entropy = torch.sum(-probas * log_probas, dim=-1)
        return torch.exp(torch.mean(entropy)).item()


def fluency_scoring(tokenized_sentence, model):
    """Map an entropy-based perplexity of the token sequence to a 1-6 band.

    Args:
        tokenized_sentence: 2-D tensor of token ids (it is sliced as
            ``[:, :512]`` on retry).
        model: Callable accepting ``input_ids`` and ``decoder_input_ids`` and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        An integer band: 1 for a value above 120, rising to 6 for a value of
        30 or less. A NaN value is rated 1 rather than returning None.

    Raises:
        Whatever ``model`` raises if the truncated retry fails as well.
    """
    try:
        perplexity = _entropy_perplexity(tokenized_sentence, model)
    except Exception:
        # Best-effort retry with the sequence cut to 512 positions. Only
        # ordinary errors trigger it; KeyboardInterrupt/SystemExit propagate.
        perplexity = _entropy_perplexity(tokenized_sentence[:, :512], model)

    # NaN fails every comparison below; rate it lowest instead of falling
    # through to the best band.
    if perplexity != perplexity:
        return 1

    # Exclusive lower bounds of bands 1..5; anything at or below the last
    # bound belongs to band 6.
    for band, floor in enumerate((120, 100, 60, 50, 30), start=1):
        if perplexity > floor:
            return band
    return 6
def similarity_scoring(prompt, response):
    """Band the cosine similarity between the prompt and the response.

    Both texts are embedded with the module-level ``model``; the first entry
    of the cosine-similarity result is read as a Python number.

    Returns:
        1 for a similarity below 0.3, then one band per 0.1 step, up to 6
        for a similarity of 0.7 or more.
    """
    embedded = [
        model.encode(text, convert_to_tensor=True)
        for text in (prompt, response)
    ]
    cosine = util.pytorch_cos_sim(embedded[0], embedded[1])
    similarity = cosine[0].item()

    # Exclusive upper bounds of bands 1..5.
    for band, upper in enumerate((0.3, 0.4, 0.5, 0.6, 0.7), start=1):
        if similarity < upper:
            return band
    if similarity >= 0.7:
        return 6
def classify(score):
    """Translate a numeric score into an ``(index, label)`` pair.

    Scores of 1 or less map to ``(0, "A1")`` and scores of 6 or more map to
    ``(5, "C2")``. The values 2, 3, 4 and 5 map to the labels in between;
    any other score in that range yields None.
    """
    if score <= 1:
        return (0, "A1")
    if score >= 6:
        return (5, "C2")

    # Exact-match lookup for the interior bands.
    interior = {
        2: (1, "A2"),
        3: (2, "B1"),
        4: (3, "B2"),
        5: (4, "C1"),
    }
    return interior.get(score)
def speech_to_text(audio):
    """Transcribe an audio input and measure how long it lasts.

    Args:
        audio: Audio source accepted by both ``librosa.load`` and the
            module-level ``pipe`` (the UI supplies a file path).

    Returns:
        A ``(transcription, minutes)`` tuple: the recognized text and the
        duration of the audio, loaded at 16 kHz, divided by 60.
    """
    samples, sample_rate = librosa.load(audio, sr=16000)
    seconds = librosa.get_duration(y=samples, sr=sample_rate)
    recognized = pipe(audio)
    return recognized["text"], seconds / 60.0
def test_speech(prompt, audio):
    """Rank a spoken response to a prompt with a label from A1 to C2.

    The audio is transcribed, four component scores (speaking rate,
    vocabulary rate, fluency, similarity to the prompt) are computed, and
    those scores plus several rounded averages of them each cast one vote
    for a label. The label with the most votes wins; on a tie the label
    that was voted for first wins.

    Args:
        prompt: Text the response is compared against.
        audio: Audio input forwarded to ``speech_to_text``.

    Returns:
        The winning label as a string.
    """
    response, duration = speech_to_text(audio)
    response_tokens = tokenizer.encode(
        response,
        return_tensors="pt",
        add_special_tokens=True,
    )

    fluency_score = fluency_scoring(response_tokens, lm)
    tokens = response_tokens.tolist()[0]
    vocab_score = vocab_scoring(tokens, duration)
    word_score = word_scoring(tokens, duration)
    similarity_score = similarity_scoring(prompt, response)

    print(f"Fluency Score => {fluency_score}")
    print(f"Vocab Score => {vocab_score}")
    print(f"Word Score => {word_score}")
    print(f"Similarity Score => {similarity_score}")

    def blended(*parts):
        # Rounded mean of the given component scores.
        return round(sum(parts) / len(parts))

    # The order matters: it is printed and it decides ties in the vote.
    scores = [
        word_score,
        vocab_score,
        fluency_score,
        similarity_score,
        blended(word_score, vocab_score),
        blended(word_score, fluency_score),
        blended(word_score, similarity_score),
        blended(vocab_score, fluency_score),
        blended(vocab_score, similarity_score),
        blended(word_score, vocab_score, fluency_score),
        blended(word_score, vocab_score, similarity_score),
        blended(word_score, vocab_score, fluency_score, similarity_score),
    ]
    print(f"Votes =>\t{scores}")

    # Max voting. The dict keeps labels in first-seen order and max() returns
    # the first maximal key, so ties go to the label that appeared earliest.
    votes = {}
    for score in scores:
        label = classify(score)[1]
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get, default="")
# Input widgets: the prompt text and a microphone recording that is handed
# to the callback as a file path.
prompt = gr.Textbox(label="Prompt")
audio_response = gr.Audio(source="microphone", type="filepath", label="Audio")

# Output widget showing the label returned by test_speech.
rank = gr.Textbox(label="Rank (A1-C2)")

iface = gr.Interface(
    fn=test_speech,
    inputs=[prompt, audio_response],
    outputs=rank.style(show_copy_button=True),
    title="Rank Speech",
)

# Starts the app as soon as the module is executed.
iface.launch()