from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel, pipeline
from torch.nn.functional import softmax
import librosa
import torch
import gradio as gr

device = 'cpu'

# CEFR levels, from lowest (A1) to highest (C2)
cols = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# T5 is used as the language model for the fluency (pseudo-perplexity) score
tokenizer = AutoTokenizer.from_pretrained('t5-base')
lm = AutoModel.from_pretrained('t5-base').to(device)

# Sentence embeddings for prompt/response similarity
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

# Whisper ASR pipeline for transcribing the spoken response
pipe = pipeline("automatic-speech-recognition",
                model="openai/whisper-base.en",
                chunk_length_s=30,
                device="cpu")


def vocab_scoring(tokens, duration):
    """Score vocabulary richness from unique tokens per minute."""
    unique_vocab = set(tokens)
    vocab_rate = len(unique_vocab) / duration
    if vocab_rate < 40: return 1
    if vocab_rate < 45: return 2
    if vocab_rate < 55: return 3
    if vocab_rate < 75: return 4
    if vocab_rate < 85: return 5
    return 6


def word_scoring(tokens, duration):
    """Score speaking pace from tokens per minute."""
    word_rate = len(tokens) / duration
    if word_rate < 65: return 1
    if word_rate < 90: return 2
    if word_rate < 117: return 3
    if word_rate < 142: return 4
    if word_rate < 175: return 5
    return 6


def fluency_scoring(tokenized_sentence, model):
    """Score fluency from a pseudo-perplexity computed over the T5 hidden
    states (lower perplexity -> higher score)."""
    try:
        with torch.no_grad():
            outputs = model(input_ids=tokenized_sentence,
                            decoder_input_ids=tokenized_sentence)
        logits = outputs.last_hidden_state
        probas = softmax(logits, dim=-1)
        perplexity = torch.exp(torch.mean(torch.sum(-probas * torch.log(probas), dim=-1)))
    except Exception:
        # Fall back to truncating overly long inputs to 512 tokens
        tokenized_sentence = tokenized_sentence[:, :512]
        with torch.no_grad():
            outputs = model(input_ids=tokenized_sentence,
                            decoder_input_ids=tokenized_sentence)
        logits = outputs.last_hidden_state
        probas = softmax(logits, dim=-1)
        perplexity = torch.exp(torch.mean(torch.sum(-probas * torch.log(probas), dim=-1)))

    if perplexity > 120: return 1
    if perplexity > 100: return 2
    if perplexity > 60: return 3
    if perplexity > 50: return 4
    if perplexity > 30: return 5
    return 6


def similarity_scoring(prompt, response):
    """Score how closely the response addresses the prompt via cosine
    similarity of sentence embeddings."""
    prompt_embeddings = model.encode(prompt, convert_to_tensor=True)
    response_embeddings = model.encode(response, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(prompt_embeddings, response_embeddings)[0].item()
    if similarity < 0.3: return 1
    if similarity < 0.4: return 2
    if similarity < 0.5: return 3
    if similarity < 0.6: return 4
    if similarity < 0.7: return 5
    return 6


def classify(score):
    """Map a 1-6 score to its (index, CEFR label) pair."""
    if score <= 1: return (0, "A1")
    if score == 2: return (1, "A2")
    if score == 3: return (2, "B1")
    if score == 4: return (3, "B2")
    if score == 5: return (4, "C1")
    return (5, "C2")


def speech_to_text(audio):
    """Transcribe the recording and return (text, duration in minutes)."""
    audio_, rate = librosa.load(audio, sr=16000)
    duration = librosa.get_duration(y=audio_, sr=rate)
    transcription = pipe(audio)["text"]
    return transcription, duration / 60.0


def test_speech(prompt, audio):
    response, duration = speech_to_text(audio)
    response_tokens = tokenizer.encode(response, return_tensors="pt", add_special_tokens=True)

    fluency_score = fluency_scoring(response_tokens, lm)
    tokens = response_tokens.tolist()[0]
    vocab_score = vocab_scoring(tokens, duration)
    word_score = word_scoring(tokens, duration)
    similarity_score = similarity_scoring(prompt, response)

    print(f"Fluency Score => {fluency_score}")
    print(f"Vocab Score => {vocab_score}")
    print(f"Word Score => {word_score}")
    print(f"Similarity Score => {similarity_score}")

    # Individual scores plus rounded averages of their pairings/groupings
    # each act as one voter
    scores = [
        word_score,
        vocab_score,
        fluency_score,
        similarity_score,
        round((word_score + vocab_score) / 2),
        round((word_score + fluency_score) / 2),
        round((word_score + similarity_score) / 2),
        round((vocab_score + fluency_score) / 2),
        round((vocab_score + similarity_score) / 2),
        round((word_score + vocab_score + fluency_score) / 3),
        round((word_score + vocab_score + similarity_score) / 3),
        round((word_score + vocab_score + fluency_score + similarity_score) / 4),
    ]
    print(f"Votes =>\t{scores}")

    # Max voting: the CEFR label predicted most often wins
    # (ties keep the first label encountered)
    preds = [classify(score)[1] for score in scores]
    pred_dict = {}
    for p in preds:
        pred_dict[p] = pred_dict.get(p, 0) + 1

    mx_val = 0
    pred = ""
    for key, value in pred_dict.items():
        if value > mx_val:
            mx_val = value
            pred = key
    return pred


prompt = gr.Textbox(label="Prompt")
audio_response = gr.Audio(source="microphone", type="filepath", label="Audio")
rank = gr.Textbox(label="Rank (A1-C2)")

iface = gr.Interface(fn=test_speech,
                     inputs=[prompt, audio_response],
                     outputs=rank.style(show_copy_button=True),
                     title="Rank Speech")
iface.launch()