arslanarjumand committed on
Commit
57ea1c7
1 Parent(s): fd4e53b

Upload 3 files

Files changed (3)
  1. app.py +71 -0
  2. logic.py +42 -0
  3. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,71 @@
+ import gradio as gr
+ from scipy.io import wavfile
+ from logic import score_audio
+
+
+ # Question IDs shown in the dropdown; each maps to a question/answer pair below.
+ CHOICES = ['2305', '2304', '195', '1956', '2302', '2301', '2299', '2300', '2298', '2297']  # '2860',
+
+ QUESTIONS = ['What are breakfast, lunch, and dinner examples of?',
+              'What do we call a collection of large quantity?',
+              'How many days does it take for a full moon to occur?',
+              'What do you use in non-verbal communication with expressions to convey a message effectively?',
+              'What do we call the behavior of not buying products from a particular company as a way of protesting?',
+              'What do we call a small soft creature with a hard round shell on its back that moves very slowly?',
+              'What do you call a place with lodging, food, and other services for travelers?',
+              'What is the term for a large, luxurious car, often driven by a chauffeur?',
+              'What is the term for a journey, especially a long one, involving travel by sea or in space?',
+              'What do we call a short trip taken for pleasure, usually lasting a day?'
+              ]
+
+ ANSWERS = ['Meal', 'Bulk', 'Thirty', 'Body language / sign language', 'Boycott', 'Snail', 'Hotel', 'Limousine',
+            'Voyage', 'Excursion']
+
+ PAIRED_QUESTIONS = {ch: {'Q': q, 'A': a} for ch, q, a in zip(CHOICES, QUESTIONS, ANSWERS)}
+
+
+ def get_paired_text(value):
+     # Return the {'Q': ..., 'A': ...} pair for a question ID, or an empty dict if unknown.
+     return PAIRED_QUESTIONS.get(value, {})
+
+
+ def get_single_text(value):
+     # Return only the question text for display.
+     return get_paired_text(value).get('Q', '')
+
+
+ def analyze_audio(audio, true_result):
+     if audio is None or true_result is None:
+         return 'The audio is missing.'
+
+     # Gradio provides (sample_rate, data); write it to a temporary WAV file for scoring.
+     temp_filename = 'temp_audio.wav'
+     wavfile.write(temp_filename, audio[0], audio[1])
+
+     true_answer = get_paired_text(true_result)['A']
+     result = score_audio(temp_filename, true_answer)
+     transcription = result['transcription']
+     score = result['score']
+
+     result_markdown = f"""# Test score: {score} / 1
+
+ # Predicted Answer
+
+ ## {transcription}
+
+ # True Answer
+
+ ## {true_answer}
+ """
+     return result_markdown
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             with gr.Row():
+                 drp_down = gr.Dropdown(choices=CHOICES, scale=2)
+                 show_text_btn = gr.Button("Select", scale=1)
+             read_text = gr.Markdown(label='Answer this question')
+             show_text_btn.click(get_single_text, inputs=drp_down, outputs=read_text)
+             audio_area = gr.Audio(label='Answer')
+             analyze_audio_btn = gr.Button("Submit", scale=1)
+         with gr.Column():
+             capt_area = gr.Markdown(label='CAPT Scores')
+     analyze_audio_btn.click(analyze_audio, inputs=[audio_area, drp_down], outputs=capt_area)
+ demo.launch()
logic.py ADDED
@@ -0,0 +1,42 @@
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ from optimum.bettertransformer import BetterTransformer
+ import torch
+ import librosa
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ # os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
+ # os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
+ # os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
+ torch.random.manual_seed(0)
+ # protobuf==3.20.0
+
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+ model = BetterTransformer.transform(model).to(device).eval()
+
+
+ def load_audio(audio_path, processor):
+     # Load and resample to 16 kHz, the rate wav2vec2-base-960h expects.
+     audio, sr = librosa.load(audio_path, sr=16000)
+     input_values = processor(audio, sampling_rate=sr, return_tensors="pt").input_values
+     return input_values
+
+
+ @torch.inference_mode()
+ def get_emissions(input_values, model):
+     return model(input_values.to(device)).logits
+
+
+ def score_audio(audio_path, true_result):
+     # The reference answer may list alternatives separated by '/'.
+     true_result = [x.strip().lower() for x in true_result.split('/')]
+     input_values = load_audio(audio_path, processor)
+     logits = get_emissions(input_values, model).cpu()
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.batch_decode(predicted_ids)[0].lower()
+
+     result = {'transcription': transcription,
+               'score': int(any(x in transcription for x in true_result)),
+               }
+     return result
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ wave
+ torch
+ optimum
+ scipy
+ numpy
+ resampy
+ gradio
+ librosa
+ transformers