arslanarjumand committed
Commit
c0baabd
1 Parent(s): e7b6850

Upload 4 files

Files changed (4)
  1. app.py +60 -0
  2. model.py +97 -0
  3. requirements.txt +9 -0
  4. wav2vec_aligen.py +51 -0
app.py ADDED
@@ -0,0 +1,60 @@
+ import gradio as gr
+ from scipy.io import wavfile
+ from wav2vec_aligen import speaker_pronunciation_assesment
+
+
+ def analyze_audio(audio):
+     # Gradio's Audio component passes (sample_rate, data); write it to a temporary WAV file
+     if audio is None:
+         return 'The audio is missing.'
+     temp_filename = 'temp_audio.wav'
+     wavfile.write(temp_filename, audio[0], audio[1])
+
+     result = speaker_pronunciation_assesment(temp_filename)
+     accuracy_score = result['pronunciation_accuracy']
+     fluency_score = result['fluency_score']
+     total_score = result['total_score']
+     content_scores = result['content_scores']
+
+     result_markdown = f"""|Language Aspect| Score|
+ |---|---|
+ |Pronunciation Accuracy| {accuracy_score}|
+ |Fluency| {fluency_score}|
+ |Total Score| {total_score}|
+ |Content Score| {content_scores}|
+ """
+     return result_markdown
+
+
+ CHOICES = ['Diapers', 'Carbon', 'Reptiles']
+ SENTENCES = [
+     """In Germany, over 100,000 tons of diapers are discarded each year, resulting in the wastage of valuable resources. Diaper liners, which contain special polymers known as superabsorbers, are among the materials that end up in landfills. However, researchers have made significant progress in enhancing the recycling process for these liners, leading to substantial improvements.""",
+
+     """Across the globe, there is a widespread effort to explore methods for extracting carbon dioxide from the atmosphere or power plant emissions and transforming it into a valuable resource. Among the various ideas being explored, the concept of converting carbon dioxide into a stable fuel shows significant promise.""",
+
+     """Around 250 million years ago, 700 species of reptiles closely related to the modern-day crocodile roamed the Earth. Now new research reveals how a complex interplay between climate change, species competition and habitat can help explain why just 23 species of crocodile survive today."""
+ ]
+
+ PAIRED_TEXT = {k: v for k, v in zip(CHOICES, SENTENCES)}
+
+ def get_paired_text(value):
+     text = '## ' + PAIRED_TEXT.get(value, '')
+     return text
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             with gr.Row():
+                 drp_down = gr.Dropdown(choices=CHOICES, scale=2)
+                 show_text_btn = gr.Button("Select", scale=1)
+             read_text = gr.Markdown(label='Read the following text')
+             show_text_btn.click(get_paired_text, inputs=drp_down, outputs=read_text)
+             audio_area = gr.Audio(label='Read the sentence')
+             analyze_audio_btn = gr.Button("Submit", scale=1)
+         with gr.Column():
+             capt_area = gr.Markdown(label='CAPT Scores')
+     analyze_audio_btn.click(analyze_audio, inputs=audio_area, outputs=capt_area)
+ demo.launch()
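
As a quick sanity check of the callback contract, analyze_audio can be driven without the UI by passing the same (sample_rate, data) tuple that gr.Audio hands it. A minimal sketch, assuming the trailing demo.launch() is first guarded behind an if __name__ == '__main__' block (otherwise the import itself starts the server); the one-second sine tone is synthetic test input, not part of the commit:

import numpy as np
from app import analyze_audio

# One second of a 440 Hz tone at 16 kHz, shaped like gr.Audio's numpy output
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)

print(analyze_audio((sr, tone)))  # markdown table with the four scores
print(analyze_audio(None))        # 'The audio is missing.'
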
model.py ADDED
@@ -0,0 +1,97 @@
+ from transformers import Wav2Vec2BertPreTrainedModel, Wav2Vec2BertModel
+ from transformers.modeling_outputs import SequenceClassifierOutput
+ from typing import Optional, Tuple, Union
+ from torch.nn import MSELoss
+ import torch
+ import torch.nn as nn
+
+ # Index of the hidden-states tuple in the base model's tuple output (as in
+ # transformers' modeling_wav2vec2_bert.py); used by forward() below.
+ _HIDDEN_STATES_START_POSITION = 2
+
+ class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel):
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert
+     def __init__(self, config):
+         super().__init__(config)
+
+         if hasattr(config, "add_adapter") and config.add_adapter:
+             raise ValueError(
+                 "Sequence classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
+             )
+         self.wav2vec2_bert = Wav2Vec2BertModel(config)
+         num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+         if config.use_weighted_layer_sum:
+             self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+         self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+         self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def freeze_base_model(self):
+         """
+         Calling this function will disable the gradient computation for the base model so that its parameters will not
+         be updated during training. Only the classification head will be updated.
+         """
+         for param in self.wav2vec2_bert.parameters():
+             param.requires_grad = False
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert,WAV_2_VEC_2->WAV2VEC2_BERT, input_values->input_features
+     def forward(
+         self,
+         input_features: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         labels: Optional[torch.Tensor] = None,
+     ) -> Union[Tuple, SequenceClassifierOutput]:
+         r"""
+         labels (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`, *optional*):
+             Target scores for computing the regression loss. Unlike the upstream sequence-classification head,
+             this variant always treats the `config.num_labels` outputs as regression targets and applies a
+             Mean-Square Error loss.
+         """
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+         outputs = self.wav2vec2_bert(
+             input_features,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         if self.config.use_weighted_layer_sum:
+             # Average the hidden states of all layers with learned softmax weights
+             hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+             hidden_states = torch.stack(hidden_states, dim=1)
+             norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+             hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+         else:
+             hidden_states = outputs[0]
+
+         hidden_states = self.projector(hidden_states)
+         if attention_mask is None:
+             pooled_output = hidden_states.mean(dim=1)
+         else:
+             # Mean-pool only over non-padded frames
+             padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+             hidden_states[~padding_mask] = 0.0
+             pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+         logits = self.classifier(pooled_output)
+         # ReLU keeps the predicted scores non-negative
+         logits = nn.functional.relu(logits)
+
+         loss = None
+         if labels is not None:
+             # MSE regression over all num_labels score outputs
+             loss_fct = MSELoss()
+             loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1, self.config.num_labels))
+
+         if not return_dict:
+             output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
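
Two pieces of the forward pass above are easy to misread: the optional softmax-weighted average over all transformer layers, and the attention-mask-aware mean pooling over time. A standalone sketch of both with dummy tensors (all shapes here are illustrative assumptions, not the real model's dimensions):

import torch
import torch.nn as nn

batch, num_layers, seq_len, hidden = 2, 5, 10, 16

# use_weighted_layer_sum: average per-layer hidden states with learned softmax weights
hidden_states = tuple(torch.randn(batch, seq_len, hidden) for _ in range(num_layers))
layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)  # learned in the model
stacked = torch.stack(hidden_states, dim=1)                        # (batch, layers, time, hidden)
norm_weights = nn.functional.softmax(layer_weights, dim=-1)
features = (stacked * norm_weights.view(-1, 1, 1)).sum(dim=1)      # (batch, time, hidden)

# Masked mean pooling: zero out padded frames, divide by each example's true length
padding_mask = torch.ones(batch, seq_len, dtype=torch.bool)
padding_mask[1, 6:] = False                                        # example 1 has 6 valid frames
features = features.masked_fill(~padding_mask.unsqueeze(-1), 0.0)
pooled = features.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)  # (batch, hidden)
print(pooled.shape)  # torch.Size([2, 16])
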
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ wave
+ torch
+ optimum
+ scipy
+ numpy
+ resampy
+ gradio
+ librosa
+ transformers
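
With the dependency list above, the Space's environment can be reproduced locally with the usual `pip install -r requirements.txt` before running app.py.
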
wav2vec_aligen.py ADDED
@@ -0,0 +1,51 @@
+ import torch
+ import librosa
+ import os
+ from model import Wav2Vec2BertForSequenceClassification
+ from transformers import AutoFeatureExtractor
+ # from optimum.bettertransformer import BetterTransformer
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ # os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
+ # os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
+ # os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
+ torch.random.manual_seed(0)
+ # protobuf==3.20.0
+
+ model_name = "arslanarjumand/wav2vec-reptiles"
+ processor = AutoFeatureExtractor.from_pretrained(model_name)
+ model = Wav2Vec2BertForSequenceClassification.from_pretrained(model_name).to(device)
+ # model = BetterTransformer.transform(model)
+
+ def load_audio(audio_path, processor):
+     # librosa resamples the file to the 16 kHz rate the model expects
+     audio, sr = librosa.load(audio_path, sr=16000)
+
+     input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+     return input_values
+
+ @torch.inference_mode()
+ def get_emissions(input_values, model):
+     # Move the features onto the same device as the model before the forward pass
+     results = model(input_values.to(device)).logits[0]
+     return results
+
+
+ def speaker_pronunciation_assesment(audio_path):
+     input_values = load_audio(audio_path, processor)
+     result_scores = get_emissions(input_values, model)
+
+     # The four regression outputs map to the CAPT scores, rounded to integers
+     pronunciation_score = round(result_scores[0].cpu().item())
+     fluency_score = round(result_scores[1].cpu().item())
+     total_score = round(result_scores[2].cpu().item())
+     content_scores = round(result_scores[3].cpu().item())
+
+     result = {'pronunciation_accuracy': pronunciation_score,
+               'content_scores': content_scores,
+               'total_score': total_score,
+               'fluency_score': fluency_score}
+     return result
+
+ if __name__ == '__main__':
+     pass
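
The __main__ guard above is left empty in the commit; for local testing, the scoring entry point can be exercised directly. A minimal sketch, where sample.wav is a hypothetical local recording (librosa resamples it to 16 kHz), not a file shipped with this repository:

from wav2vec_aligen import speaker_pronunciation_assesment

scores = speaker_pronunciation_assesment('sample.wav')  # hypothetical local file
print(scores)
# {'pronunciation_accuracy': ..., 'content_scores': ..., 'total_score': ..., 'fluency_score': ...}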