mireiafarrus committed
Commit
9fe9663
1 Parent(s): 87308cb

Upload 6 files

app.py ADDED
@@ -0,0 +1,68 @@
+ from transformers import pipeline
+ import gradio as gr
+ import torch
+
+ from examples import infer_from_audio_examples
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ asr = pipeline(
+     "automatic-speech-recognition",
+     model="MaximilianChen/Casper",
+     chunk_length_s=30,
+     device=device,
+ )
+
+
+ def transcribe_audio(file=None, mic=None):
+     if mic is not None:
+         audio = mic
+     elif file is not None:
+         audio = file
+     else:
+         return "You must provide either a mic recording or a file"
+     transcription = asr(audio)["text"]
+     return transcription
+
+
+ # css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
+ with gr.Blocks() as demo:
+
+     gr.Markdown("<center><h1>CASPER</h1> "
+                 "<h2>Catalan Automatic Speech Recognition using Fine-Tuned Whisper</h2></center>")
+
+     with gr.Row():
+         with gr.Column():
+             audio_from_microphone = gr.Audio(source="microphone", label="Mic", type="filepath")
+             audio_from_file = gr.Audio(source="upload", label="File", type="filepath")
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     asr_btn = gr.Button("Transcribe!")
+                 with gr.Column(scale=0):
+                     cln_btn = gr.ClearButton(value='Clear', components=[audio_from_microphone, audio_from_file])
+         with gr.Column():
+             output_text = gr.Textbox(label="Generated Transcription")
+             del_text = gr.ClearButton(value='Delete Text', components=[output_text])
+             gr.Markdown("<sub>NOTE: This model does not generate punctuation or casing.</sub>")
+
+     asr_btn.click(fn=transcribe_audio,
+                   inputs=[audio_from_file, audio_from_microphone],
+                   outputs=output_text)
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### Audio Examples")
+             gr.Examples(examples=infer_from_audio_examples,
+                         label="From the Catalan Google TTS dataset",
+                         inputs=[audio_from_file, audio_from_microphone],
+                         outputs=output_text,
+                         fn=transcribe_audio,
+                         cache_examples=True)
+             gr.Markdown("### More Details")
+             gr.Markdown("The model is a fine-tuned small version of the Whisper architecture. "
+                         "More details about the base model are available at this [link](https://huggingface.co/openai/whisper-small). <br>"
+                         "Whisper has been fine-tuned on the Catalan CommonVoice v11 and ParlamentParla datasets. "
+                         "More information about results and evaluation can be found "
+                         "[here](https://huggingface.co/MaximilianChen/Casper).")
+
+ demo.launch()
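For quick verification outside the Gradio UI, the same pipeline can be exercised directly on one of the uploaded clips. A minimal sketch, assuming the repository root is the working directory; the model ID and chunk_length_s come from app.py above:

# Standalone smoke test for the Casper pipeline (not part of the commit);
# run from the repository root so the example WAV path resolves.
from transformers import pipeline
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
asr = pipeline(
    "automatic-speech-recognition",
    model="MaximilianChen/Casper",
    chunk_length_s=30,
    device=device,
)

# The pipeline accepts a filepath, matching the type="filepath" audio inputs.
print(asr("catalan_audio_examples/catalan_female_speech_1.wav")["text"])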
catalan_audio_examples/catalan_female_speech_1.wav ADDED
Binary file (557 kB).
 
catalan_audio_examples/catalan_female_speech_2.wav ADDED
Binary file (459 kB).
 
catalan_audio_examples/catalan_male_speech_1.wav ADDED
Binary file (778 kB).
 
examples.py ADDED
@@ -0,0 +1,6 @@
+ # List of examples for quick inference:
+ infer_from_audio_examples = [
+     ["catalan_audio_examples/catalan_female_speech_1.wav", None],
+     ["catalan_audio_examples/catalan_female_speech_2.wav", None],
+     ["catalan_audio_examples/catalan_male_speech_1.wav", None],
+ ]
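Each example row maps positionally onto the gr.Examples inputs [audio_from_file, audio_from_microphone], so the trailing None leaves the microphone slot empty and transcribe_audio takes the file branch. A hypothetical offline check of that wiring, assuming examples.py is importable from the working directory:

# Hypothetical check of the example wiring; not part of the Space itself.
from examples import infer_from_audio_examples

for file_path, mic in infer_from_audio_examples:
    # The mic slot is intentionally empty, so only the file input is set.
    assert mic is None
    print(f"Would transcribe: {file_path}")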
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ torch
+ torchaudio