adityas2410 committed on
Commit
4942de1
1 Parent(s): 97d393d

Upload 3 files

Files changed (3)
  1. app.py +34 -0
  2. requirements.txt +7 -0
  3. ts_utilities.py +51 -0
app.py ADDED
@@ -0,0 +1,34 @@
+ import gradio as gr
+ from ts_utilities import transcribe_speech, transcribe_long_form, text_to_speech
+
+ # Gradio app: one tab per task
+ app = gr.TabbedInterface(
+     [
+         # Short-form speech transcription
+         gr.Interface(
+             fn=transcribe_speech,
+             inputs=gr.Audio(type="filepath"),
+             outputs=gr.Textbox(label="Transcription", lines=5),
+             title="Transcribe Speech",
+             allow_flagging="never",
+         ),
+         # Long-form transcription (chunked ASR)
+         gr.Interface(
+             fn=transcribe_long_form,
+             inputs=gr.Audio(type="filepath"),
+             outputs=gr.Textbox(label="Transcription", lines=10),
+             title="Long-Form Transcription",
+             allow_flagging="never",
+         ),
+         # Text-to-speech synthesis
+         gr.Interface(
+             fn=text_to_speech,
+             inputs=gr.Textbox(label="Enter Text", placeholder="Type your text here...", lines=5),
+             outputs=gr.Audio(label="Generated Speech"),
+             title="Text-to-Speech",
+             allow_flagging="never",
+         ),
+     ],
+     ["Transcribe Speech", "Long-Form Transcription", "Text-to-Speech"],
+ )
+ app.launch()
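
A quick way to sanity-check the backend is to call the utility functions directly, without launching the UI. A minimal sketch, assuming ts_utilities.py sits alongside app.py; the sample.wav path is a placeholder, not a file in this commit:

    # Smoke test: exercise the functions behind the tabs directly.
    # "sample.wav" is a hypothetical local recording.
    from ts_utilities import transcribe_speech, text_to_speech

    print(transcribe_speech("sample.wav"))
    rate, audio = text_to_speech("Hello from the VITS narrator.")
    print(rate, len(audio))
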
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ transformers
+ torch
+ librosa
+ soundfile
+ phonemizer
+ datasets
ts_utilities.py ADDED
@@ -0,0 +1,51 @@
+ import numpy as np
+ import librosa
+ import soundfile as sf
+ from datasets import load_dataset
+ from transformers import pipeline
+
+ # Initialize pipelines for the speech-recognition and TTS models
+ asr = pipeline(task="automatic-speech-recognition", model="distil-whisper/distil-small.en")
+ narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
+
+ # Speech-to-text: transcribe a short audio file
+ def transcribe_speech(filepath):
+     if filepath is None:
+         return "No audio found. Please retry."
+     output = asr(filepath)
+     return output["text"]
+
+ # Long-form transcription: downmix to mono, resample to 16 kHz, chunk
+ def transcribe_long_form(filepath):
+     if filepath is None:
+         return "No audio found. Please retry."
+     # Load the file and collapse multi-channel audio to mono
+     audio, sampling_rate = sf.read(filepath)
+     audio_transposed = np.transpose(audio)  # (frames, channels) -> (channels, frames)
+     audio_mono = librosa.to_mono(audio_transposed)
+     audio_16khz = librosa.resample(audio_mono, orig_sr=sampling_rate, target_sr=16000)  # Whisper expects 16 kHz
+     # Transcribe in 30-second chunks so long recordings fit the model's context
+     chunks = asr(audio_16khz, chunk_length_s=30, batch_size=4, return_timestamps=True)["chunks"]
+     # Combine the chunk transcriptions
+     return "\n".join(chunk["text"] for chunk in chunks)
+
+ # Text-to-speech: return (sampling_rate, waveform) for the gr.Audio output
+ def text_to_speech(text):
+     if not text or not text.strip():
+         return None  # gr.Audio cannot render an error string; show nothing instead
+     narrated_text = narrator(text)
+     audio_array = narrated_text["audio"][0].flatten()  # flatten the (1, n) array to 1-D
+     sampling_rate = narrated_text["sampling_rate"]
+     return sampling_rate, audio_array
+
+ # Fetch one of the first few LibriSpeech samples (streamed, no full download)
+ def get_dataset_sample(idx):
+     dataset = load_dataset("librispeech_asr", split="train.clean.100", streaming=True, trust_remote_code=True)
+     dataset_head = list(dataset.take(5))
+     sample = dataset_head[idx]
+     audio_array = sample["audio"]["array"]
+     sampling_rate = sample["audio"]["sampling_rate"]
+     transcription = sample["text"]
+     # (sampling_rate, waveform) matches the gr.Audio value convention
+     return (sampling_rate, audio_array), transcription
+
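
As a usage sketch, get_dataset_sample can drive the ASR path end to end: pull a LibriSpeech sample, write it to disk, and compare the reference transcript with the pipeline output. A minimal example, assuming ts_utilities.py is importable; the tmp.wav filename is illustrative only:

    # Round-trip check on one streamed LibriSpeech sample.
    # "tmp.wav" is a placeholder scratch file.
    import soundfile as sf
    from ts_utilities import get_dataset_sample, transcribe_speech

    (rate, audio), reference = get_dataset_sample(0)
    sf.write("tmp.wav", audio, rate)  # filepath API expects a file on disk
    print("Reference:", reference)
    print("Predicted:", transcribe_speech("tmp.wav"))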