Mohammad Sabik Irbaz committed on
Commit
a4a2eb9
1 Parent(s): 8aad394

speech rank

Browse files
Files changed (2) hide show
  1. app.py +154 -3
  2. requirements.txt +11 -0
app.py CHANGED
@@ -1,7 +1,158 @@
 
 
 
 
 
 
1
  import gradio as gr
 
2
 
3
def greet(name):
    """Return the greeting ``Hello <name>!!`` for the given name string."""
    # str.join (like ``+``) raises TypeError when ``name`` is not a str.
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
 
 
 
 
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  iface.launch()
 
1
+ from sentence_transformers import SentenceTransformer, util
2
+ from transformers import AutoTokenizer, AutoModel
3
+ from torch.nn.functional import softmax
4
+ from transformers import pipeline
5
+ import time, librosa, torch, io
6
+ from pydub import AudioSegment
7
  import gradio as gr
8
+ import numpy as np
9
 
10
# Single source of truth for where every model below is placed.
device = 'cpu'

# Proficiency labels from lowest (A1) to highest (C2) -- presumably the CEFR
# scale; not referenced by the rest of the visible code.
cols = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# T5 tokenizer and model: test_speech() encodes the transcription with
# `tokenizer` and hands `lm` to fluency_scoring().
tokenizer = AutoTokenizer.from_pretrained('t5-base')
lm = AutoModel.from_pretrained('t5-base').to(device)

# Sentence-embedding model used by similarity_scoring().
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

# Speech-recognition pipeline used by speech_to_text(). It reuses `device`
# rather than repeating the literal, so all models move together.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    chunk_length_s=30,
    device=device,
)
18
 
19
def vocab_scoring(tokens, duration):
    """Score vocabulary range from the number of distinct tokens per unit time.

    Args:
        tokens: Iterable of hashable tokens (the caller in this file passes
            a list of token ids).
        duration: Length of the response; the caller in this file passes the
            audio length divided by 60.

    Returns:
        An int band from 1 (lowest rate) to 6 (highest rate).
    """
    if duration <= 0:
        # A rate is undefined without speaking time. Report the lowest band
        # instead of raising ZeroDivisionError (negative durations already
        # produced band 1 through the thresholds below).
        return 1

    vocab_rate = len(set(tokens)) / duration

    # Exclusive upper bounds of bands 1..5; anything at or above the last
    # bound is band 6.
    for band, upper_bound in enumerate((40, 45, 55, 75, 85), start=1):
        if vocab_rate < upper_bound:
            return band
    return 6
34
+
35
def word_scoring(tokens, duration):
    """Score speaking rate from the number of tokens per unit time.

    Args:
        tokens: Sized collection of tokens (the caller in this file passes
            a list of token ids, so the rate counts tokens, not words).
        duration: Length of the response; the caller in this file passes the
            audio length divided by 60.

    Returns:
        An int band from 1 (slowest) to 6 (fastest).
    """
    if duration <= 0:
        # A rate is undefined without speaking time. Report the lowest band
        # instead of raising ZeroDivisionError (negative durations already
        # produced band 1 through the thresholds below).
        return 1

    word_rate = len(tokens) / duration

    # Exclusive upper bounds of bands 1..5; anything at or above the last
    # bound is band 6.
    for band, upper_bound in enumerate((65, 90, 117, 142, 175), start=1):
        if word_rate < upper_bound:
            return band
    return 6
44
+
45
def fluency_scoring(tokenized_sentence, model):
    """Score fluency from an entropy-based "perplexity" of the model output.

    The model is called with the same ids as encoder and decoder input. A
    softmax is taken over the last dimension of ``last_hidden_state``; the
    per-position entropy is averaged and exponentiated.

    NOTE(review): the caller passes a bare ``AutoModel``, whose
    ``last_hidden_state`` is presumably hidden features rather than
    vocabulary logits, so this looks like a proxy and not a true language
    model perplexity -- confirm before relying on the name.

    Args:
        tokenized_sentence: 2-D tensor of token ids (it is sliced as
            ``[:, :512]`` on retry).
        model: Callable accepting ``input_ids`` and ``decoder_input_ids``
            and returning an object with ``last_hidden_state``.

    Returns:
        An int band from 1 (highest perplexity) to 6 (lowest perplexity).
    """

    def _perplexity(input_ids):
        # One forward pass -> exp(mean entropy) as a Python float.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
        # log_softmax keeps log-probabilities finite, so a probability that
        # underflows to 0 contributes 0 instead of 0 * -inf = NaN.
        log_probs = torch.log_softmax(outputs.last_hidden_state, dim=-1)
        entropy = -(log_probs.exp() * log_probs).sum(dim=-1)
        return torch.exp(entropy.mean()).item()

    try:
        perplexity = _perplexity(tokenized_sentence)
    except Exception:
        # Best-effort retry on the first 512 tokens, as before, but without
        # swallowing KeyboardInterrupt/SystemExit like a bare ``except``.
        perplexity = _perplexity(tokenized_sentence[:, :512])

    if perplexity != perplexity:
        # NaN (e.g. non-finite model output): every threshold comparison
        # would be False, so fall back to the lowest band explicitly.
        return 1

    # Exclusive lower bounds of bands 1..5; anything at or below the last
    # bound is band 6.
    for band, lower_bound in enumerate((120, 100, 60, 50, 30), start=1):
        if perplexity > lower_bound:
            return band
    return 6
66
+
67
def similarity_scoring(prompt, response):
    """Score how closely the response matches the prompt.

    Both texts are embedded with the module-level sentence-embedding
    ``model`` and compared by cosine similarity.

    Args:
        prompt: Prompt text.
        response: Response text (the transcription, in this file).

    Returns:
        An int band from 1 (similarity below 0.3) to 6 (0.7 or above);
        ``None`` if the similarity compares False against every threshold
        (e.g. NaN).
    """
    prompt_vec = model.encode(prompt, convert_to_tensor=True)
    response_vec = model.encode(response, convert_to_tensor=True)
    cosine = util.pytorch_cos_sim(prompt_vec, response_vec)
    similarity = cosine[0].item()

    # Exclusive upper bounds of bands 1..5.
    for band, ceiling in enumerate((0.3, 0.4, 0.5, 0.6, 0.7), start=1):
        if similarity < ceiling:
            return band
    if similarity >= 0.7:
        return 6
    return None
78
+
79
def classify(score):
    """Map a numeric band to an ``(index, label)`` pair.

    Scores at or below 1 map to ``(0, "A1")`` and scores at or above 6 map
    to ``(5, "C2")``; 2..5 map to A2, B1, B2 and C1 by exact equality.

    Returns:
        The ``(index, label)`` tuple, or ``None`` when the score matches no
        band (e.g. a fractional value strictly between 1 and 6).
    """
    if score <= 1:
        return (0, "A1")

    exact_bands = (
        (2, (1, "A2")),
        (3, (2, "B1")),
        (4, (3, "B2")),
        (5, (4, "C1")),
    )
    for value, band in exact_bands:
        if score == value:
            return band

    if score >= 6:
        return (5, "C2")
    return None
86
+
87
def speech_to_text(audio):
    """Transcribe an audio file and measure its length.

    Args:
        audio: Path to the audio file (the UI supplies a file path).

    Returns:
        A ``(transcription, duration)`` tuple, where ``duration`` is the
        value reported by ``librosa.get_duration`` divided by 60.
    """
    # The file is decoded at 16 kHz only to measure its duration; the ASR
    # pipeline receives the path and reads the file itself.
    samples, sample_rate = librosa.load(audio, sr=16000)
    length = librosa.get_duration(y=samples, sr=sample_rate)
    result = pipe(audio)
    return result["text"], length / 60.0
92
+
93
def test_speech(prompt, audio):
    """Rank a spoken response to a prompt on the A1-C2 scale.

    The audio is transcribed, four band scores (word rate, vocabulary rate,
    fluency, prompt similarity) are computed, and those scores plus rounded
    averages of several combinations each cast one vote. The label with the
    most votes wins; ties go to the label that received a vote first.

    Args:
        prompt: Prompt text the speaker was answering.
        audio: Path to the recorded or uploaded audio file, or ``None`` when
            the user submitted without audio.

    Returns:
        The winning label, one of "A1", "A2", "B1", "B2", "C1", "C2".

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Without this guard the request fails inside the audio loader with
        # an error that means nothing to the user.
        raise gr.Error("Please record or upload an audio response.")

    response, duration = speech_to_text(audio)

    response_tokens = tokenizer.encode(
        response,
        return_tensors="pt",
        add_special_tokens=True,
    )

    fluency_score = fluency_scoring(response_tokens, lm)
    tokens = response_tokens.tolist()[0]

    vocab_score = vocab_scoring(tokens, duration)
    word_score = word_scoring(tokens, duration)

    similarity_score = similarity_scoring(prompt, response)

    print(f"Fluency Score => {fluency_score}")
    print(f"Vocab Score => {vocab_score}")
    print(f"Word Score => {word_score}")
    print(f"Similarity Score => {similarity_score}")

    # One vote per entry. Note that round() rounds halves to the nearest
    # even integer (2.5 -> 2, 3.5 -> 4).
    scores = [
        # Individual scores.
        word_score,
        vocab_score,
        fluency_score,
        similarity_score,
        # Pairwise averages.
        round((word_score + vocab_score) / 2),
        round((word_score + fluency_score) / 2),
        round((word_score + similarity_score) / 2),
        round((vocab_score + fluency_score) / 2),
        round((vocab_score + similarity_score) / 2),
        # Three-way averages.
        round((word_score + vocab_score + fluency_score) / 3),
        round((word_score + vocab_score + similarity_score) / 3),
        # Average of all four.
        round((word_score + vocab_score + fluency_score + similarity_score) / 4),
    ]

    print(f"Votes =>\t{scores}")

    # Max voting: count labels in vote order. max() returns the first label
    # that reaches the top count, so ties resolve as they did before.
    vote_counts = {}
    for label in (classify(score)[1] for score in scores):
        vote_counts[label] = vote_counts.get(label, 0) + 1

    return max(vote_counts, key=vote_counts.get)
148
+
149
# UI components: a text prompt and an audio file path go in, the predicted
# rank comes out as text.
prompt = gr.Textbox(label="Prompt")
audio_response = gr.Audio(type="filepath", label="Audio")
rank = gr.Textbox(label="Rank (A1-C2)")

# Wire the components to test_speech(); the output box gets a copy button.
iface = gr.Interface(
    fn=test_speech,
    inputs=[prompt, audio_response],
    outputs=rank.style(show_copy_button=True),
    title="Rank Speech",
)

# Start the app when the module is executed.
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.23.0
2
+ librosa==0.10.0.post1
3
+ torch==1.13.1
4
+ sentence-transformers==2.2.2
5
+ sentencepiece==0.1.97
6
+ transformers==4.26.1
7
+ tokenizers==0.13.2
8
+ pydub==0.25.1
9
+ ffmpeg==1.4
10
+ numpy==1.23.5
11
+ scipy==1.10.1