Commit c0baabd · Parent(s): e7b6850
Upload 4 files

- app.py +60 -0
- model.py +97 -0
- requirements.txt +9 -0
- wav2vec_aligen.py +51 -0
app.py
ADDED
@@ -0,0 +1,60 @@
import gradio as gr
from scipy.io import wavfile
from wav2vec_aligen import speaker_pronunciation_assesment


def analyze_audio(audio):
    # Write the recorded audio to a temporary WAV file
    if audio is None:
        return 'The audio is missing.'
    temp_filename = 'temp_audio.wav'
    wavfile.write(temp_filename, audio[0], audio[1])

    result = speaker_pronunciation_assesment(temp_filename)
    accuracy_score = result['pronunciation_accuracy']
    fluency_score = result['fluency_score']
    total_score = result['total_score']
    content_scores = result['content_scores']

    result_markdown = f"""|Language Aspect| Score|
|---|---|
|Pronunciation Accuracy| {accuracy_score}|
|Fluency| {fluency_score}|
|Total Score| {total_score}|
|Content Score| {content_scores}|
"""
    return result_markdown


CHOICES = ['Diapers', 'Carbon', 'Reptiles']
SENTENCES = [
    """In Germany, over 100,000 tons of diapers are discarded each year, resulting in the wastage of valuable resources. Diaper liners, which contain special polymers known as superabsorbers, are among the materials that end up in landfills. However, researchers have made significant progress in enhancing the recycling process for these liners, leading to substantial improvements.""",

    """Across the globe, there is a widespread effort to explore methods for extracting carbon dioxide from the atmosphere or power plant emissions and transforming it into a valuable resource. Among the various ideas being explored, the concept of converting carbon dioxide into a stable fuel shows significant promise.""",

    """Around 250 million years ago, 700 species of reptiles closely related to the modern-day crocodile roamed the earth. Now new research reveals how a complex interplay between climate change, species competition and habitat can help explain why just 23 species of crocodile survive today."""
]

PAIRED_TEXT = {k: v for k, v in zip(CHOICES, SENTENCES)}


def get_paired_text(value):
    text = '## ' + PAIRED_TEXT.get(value, '')
    return text


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            with gr.Row():
                drp_down = gr.Dropdown(choices=CHOICES, scale=2)
                show_text_btn = gr.Button("Select", scale=1)
            read_text = gr.Markdown(label='Read the following text')
            show_text_btn.click(get_paired_text, inputs=drp_down, outputs=read_text)
            audio_area = gr.Audio(label='Read the sentence')
            analyze_audio_btn = gr.Button("Submit", scale=1)
        with gr.Column():
            capt_area = gr.Markdown(label='CAPT Scores')
    analyze_audio_btn.click(analyze_audio, inputs=audio_area, outputs=capt_area)

demo.launch()
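For reference, `gr.Audio` hands `analyze_audio` a `(sample_rate, numpy_array)` tuple, which is why the handler passes `audio[0]` and `audio[1]` to `wavfile.write`. A minimal sketch of that contract with a synthetic tone (the 440 Hz tone and filename are illustrative only, not part of the app):

import numpy as np
from scipy.io import wavfile

sr = 16000  # sample rate in Hz
# One second of a 440 Hz sine, scaled to int16 as Gradio records it
tone = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)
audio = (sr, tone)  # the tuple gr.Audio passes to the handler
wavfile.write('temp_audio.wav', audio[0], audio[1])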
model.py
ADDED
@@ -0,0 +1,97 @@
from transformers import Wav2Vec2BertPreTrainedModel, Wav2Vec2BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
from typing import Optional, Tuple, Union
from torch.nn import MSELoss
import torch
import torch.nn as nn

# Index of the hidden-states entry in tuple outputs; missing from the original
# upload and required by the weighted-layer-sum and non-return_dict branches below
_HIDDEN_STATES_START_POSITION = 2


class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel):
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert
    def __init__(self, config):
        super().__init__(config)

        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Sequence classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
            )
        self.wav2vec2_bert = Wav2Vec2BertModel(config)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wav2vec2_bert.parameters():
            param.requires_grad = False

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert,WAV_2_VEC_2->WAV2VEC2_BERT, input_values->input_features
    def forward(
        self,
        input_features: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.wav2vec2_bert(
            input_features,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)
        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            hidden_states[~padding_mask] = 0.0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        logits = self.classifier(pooled_output)
        # ReLU clamps the regression outputs so predicted scores are never negative
        logits = nn.functional.relu(logits)

        loss = None
        if labels is not None:
            loss_fct = MSELoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1, self.config.num_labels))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
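The attention-masked branch of the pooling above averages projected hidden states over real frames only. A toy-tensor sketch of that arithmetic (shapes are illustrative; no pretrained weights are involved):

import torch

hidden_states = torch.randn(2, 5, 8)  # (batch, frames, proj_size)
padding_mask = torch.tensor([[1, 1, 1, 0, 0],
                             [1, 1, 1, 1, 1]]).bool()
hidden_states[~padding_mask] = 0.0  # zero out padded frames
pooled = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
print(pooled.shape)  # torch.Size([2, 8])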
requirements.txt
ADDED
@@ -0,0 +1,9 @@
wave
torch
optimum
scipy
numpy
resampy
gradio
librosa
transformers
wav2vec_aligen.py
ADDED
@@ -0,0 +1,51 @@
import torch
import librosa
import os
from model import Wav2Vec2BertForSequenceClassification
from transformers import AutoFeatureExtractor
# from optimum.bettertransformer import BetterTransformer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
# os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
torch.random.manual_seed(0)
# protobuf==3.20.0

model_name = "arslanarjumand/wav2vec-reptiles"
processor = AutoFeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2BertForSequenceClassification.from_pretrained(model_name).to(device)
# model = BetterTransformer.transform(model)


def load_audio(audio_path, processor):
    # Resample to the 16 kHz rate the feature extractor expects
    audio, sr = librosa.load(audio_path, sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    return input_values


@torch.inference_mode()
def get_emissions(input_values, model):
    # Move the features to the model's device so GPU inference does not fail
    results = model(input_values.to(device)).logits[0]
    return results


def speaker_pronunciation_assesment(audio_path):
    input_values = load_audio(audio_path, processor)
    result_scores = get_emissions(input_values, model)

    pronunciation_score = round(result_scores[0].cpu().item())
    fluency_score = round(result_scores[1].cpu().item())
    total_score = round(result_scores[2].cpu().item())
    content_scores = round(result_scores[3].cpu().item())

    result = {'pronunciation_accuracy': pronunciation_score,
              'content_scores': content_scores,
              'total_score': total_score,
              'fluency_score': fluency_score}
    return result


if __name__ == '__main__':
    pass
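`speaker_pronunciation_assesment` is the entry point app.py imports. A minimal local check, assuming a speech recording librosa can decode exists at the placeholder path 'sample.wav':

from wav2vec_aligen import speaker_pronunciation_assesment

scores = speaker_pronunciation_assesment('sample.wav')  # 'sample.wav' is hypothetical
print(scores)  # e.g. {'pronunciation_accuracy': ..., 'content_scores': ..., ...}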