arslanarjumand committed on
Commit
57ea1c7
1 Parent(s): fd4e53b

Upload 3 files

Files changed (3)
  1. app.py +71 -0
  2. logic.py +42 -0
  3. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,71 @@
+ import gradio as gr
+ from scipy.io import wavfile
+ from logic import score_audio
+
+
+ # Question IDs shown in the dropdown; each maps to a question/answer pair below.
+ CHOICES = ['2305', '2304', '195', '1956', '2302', '2301', '2299', '2300', '2298', '2297']  # '2860',
+
+ QUESTIONS = ['What are breakfast, lunch, and dinner examples of?',
+              'What do we call a collection of large quantity?',
+              'How many days does it take for a full moon to occur?',
+              'What do you use in non-verbal communication with expressions to convey a message effectively?',
+              'What do we call the behavior of not buying products from a particular company as a way of protesting?',
+              'What do we call a small soft creature with a hard round shell on its back that moves very slowly?',
+              'What do you call a place with lodging, food, and other services for travelers?',
+              'What is the term for a large, luxurious car, often driven by a chauffeur?',
+              'What is the term for a journey, especially a long one, involving travel by sea or in space?',
+              'What do we call a short trip taken for pleasure, usually lasting a day?'
+              ]
+
+ ANSWERS = ['Meal', 'Bulk', 'Thirty', 'Body language / sign language', 'Boycott', 'Snail', 'Hotel', 'Limousine',
+            'Voyage', 'Excursion']
+
+ PAIRED_QUESTIONS = {ch: {'Q': q, 'A': a} for ch, q, a in zip(CHOICES, QUESTIONS, ANSWERS)}
+
+
+ def get_paired_text(value):
+     # Return the {'Q': ..., 'A': ...} pair for a question ID, or an empty dict if unknown.
+     return PAIRED_QUESTIONS.get(value, {})
+
+
+ def get_single_text(value):
+     # Return only the question text for display.
+     return get_paired_text(value).get('Q', '')
+
+
+ def analyze_audio(audio, true_result):
+     if audio is None or true_result is None:
+         return 'The audio is missing.'
+
+     # Gradio provides (sample_rate, data); write it to a temporary WAV file for scoring.
+     temp_filename = 'temp_audio.wav'
+     wavfile.write(temp_filename, audio[0], audio[1])
+
+     true_answer = get_paired_text(true_result)['A']
+     result = score_audio(temp_filename, true_answer)
+     transcription = result['transcription']
+     score = result['score']
+
+     result_markdown = f"""# Test score: {score} / 1
+
+ # Predicted Answer
+
+ ## {transcription}
+
+ # True Answer
+
+ ## {true_answer}
+ """
+     return result_markdown
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             with gr.Row():
+                 drp_down = gr.Dropdown(choices=CHOICES, scale=2)
+                 show_text_btn = gr.Button("Select", scale=1)
+             read_text = gr.Markdown(label='Answer this question')
+             show_text_btn.click(get_single_text, inputs=drp_down, outputs=read_text)
+             audio_area = gr.Audio(label='Answer')
+             analyze_audio_btn = gr.Button("Submit", scale=1)
+         with gr.Column():
+             capt_area = gr.Markdown(label='CAPT Scores')
+     analyze_audio_btn.click(analyze_audio, inputs=[audio_area, drp_down], outputs=capt_area)
+ demo.launch()
logic.py ADDED
@@ -0,0 +1,42 @@
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ from optimum.bettertransformer import BetterTransformer
+ import torch
+ import librosa
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ # os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
+ # os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
+ # os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
+ torch.random.manual_seed(0)
+ # protobuf==3.20.0
+
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+ model = BetterTransformer.transform(model).to(device).eval()
+
+
+ def load_audio(audio_path, processor):
+     # Load and resample to 16 kHz, the rate wav2vec2-base-960h expects.
+     audio, sr = librosa.load(audio_path, sr=16000)
+     input_values = processor(audio, sampling_rate=sr, return_tensors="pt").input_values
+     return input_values
+
+
+ @torch.inference_mode()
+ def get_emissions(input_values, model):
+     return model(input_values.to(device)).logits
+
+
+ def score_audio(audio_path, true_result):
+     # The reference answer may list alternatives separated by '/'.
+     true_result = [x.strip().lower() for x in true_result.split('/')]
+     input_values = load_audio(audio_path, processor)
+     logits = get_emissions(input_values, model).cpu()
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.batch_decode(predicted_ids)[0].lower()
+
+     result = {'transcription': transcription,
+               'score': int(any(x in transcription for x in true_result)),
+               }
+     return result
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ wave
+ torch
+ optimum
+ scipy
+ numpy
+ resampy
+ gradio
+ librosa
+ transformers