unijoh committed
Commit 7a86b92
1 parent: 8c28ece

Update app.py

Files changed (1)
  1. app.py +34 -24
app.py CHANGED
@@ -1,31 +1,41 @@
-import gradio as gr
-from asr import transcribe
-from tts import synthesize_speech
-from lid import identify
+import os
+import subprocess

-def main():
-    with gr.Blocks() as demo:
-        gr.Markdown("# Faroese ASR, TTS, and LID Demo")
+# Run the setup script
+subprocess.run(['bash', 'setup.sh'], check=True)

-        with gr.Tab("ASR"):
-            audio_input = gr.Audio(source="microphone", type="filepath")
-            transcribe_button = gr.Button("Transcribe")
-            transcribe_output = gr.Textbox()
-            transcribe_button.click(fn=transcribe, inputs=audio_input, outputs=transcribe_output)
+import gradio as gr
+import torch
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import librosa

-        with gr.Tab("TTS"):
-            text_input = gr.Textbox(label="Text Input")
-            synthesize_button = gr.Button("Synthesize")
-            synthesize_output = gr.Audio()
-            synthesize_button.click(fn=synthesize_speech, inputs=text_input, outputs=synthesize_output)
+# Load pre-trained model and processor
+model_name = "facebook/wav2vec2-base-960h"
+processor = Wav2Vec2Processor.from_pretrained(model_name)
+model = Wav2Vec2ForCTC.from_pretrained(model_name)

-        with gr.Tab("LID"):
-            audio_input_lid = gr.Audio(source="microphone", type="filepath")
-            identify_button = gr.Button("Identify Language")
-            identify_output = gr.Textbox()
-            identify_button.click(fn=identify, inputs=audio_input_lid, outputs=identify_output)
+def transcribe(audio):
+    # Load audio
+    audio_input, _ = librosa.load(audio, sr=16000)
+
+    # Tokenize and process
+    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+    # Get predicted ids
+    predicted_ids = torch.argmax(logits, dim=-1)
+
+    # Decode the ids to text
+    transcription = processor.batch_decode(predicted_ids)
+    return transcription[0]

-    demo.launch()
+# Define the Gradio interface
+iface = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(source="microphone", type="filepath"),
+    outputs="text"
+)

 if __name__ == "__main__":
-    main()
+    iface.launch()
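
As a quick sanity check of the new version, the transcribe() function can also be called directly; a minimal sketch, assuming a hypothetical sample.wav in the working directory (note that importing app runs setup.sh and downloads the model as side effects):

# smoke_test.py -- hypothetical check of the new transcribe() function.
# Importing app triggers the setup.sh call and the model download.
from app import transcribe

# librosa.load() resamples to 16 kHz, so any common audio format/rate works.
print(transcribe("sample.wav"))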
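
The commit also drops the TTS and LID tabs along with the asr/tts/lid helper imports. If those tabs are wanted back on top of the new ASR function, one option is gr.TabbedInterface; a sketch, assuming synthesize_speech and identify are still importable from tts.py and lid.py:

# Sketch: restore the old three-tab layout around the new transcribe().
# Assumes tts.synthesize_speech and lid.identify still exist in the repo.
import gradio as gr
from tts import synthesize_speech
from lid import identify

asr_tab = gr.Interface(fn=transcribe,
                       inputs=gr.Audio(source="microphone", type="filepath"),
                       outputs="text")
tts_tab = gr.Interface(fn=synthesize_speech, inputs="text", outputs=gr.Audio())
lid_tab = gr.Interface(fn=identify,
                       inputs=gr.Audio(source="microphone", type="filepath"),
                       outputs="text")

demo = gr.TabbedInterface([asr_tab, tts_tab, lid_tab], ["ASR", "TTS", "LID"])
demo.launch()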