File size: 4,741 Bytes
c229ede
 
 
 
 
 
ed02176
c229ede
 
ed02176
c229ede
 
ed02176
 
c229ede
 
 
ed02176
 
c229ede
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed02176
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import torch
import torchaudio
import gradio as gr
import time
import numpy as np
import scipy.io.wavfile
from omegaconf import OmegaConf  # βœ… Fix: Import omegaconf

# ✅ 1️⃣ Load Silero STT Model for CPU
# Limit PyTorch to 4 CPU threads before the model is loaded.
torch.set_num_threads(4)

# Everything runs on the CPU in single precision.
device = torch.device("cpu")
torch_dtype = torch.float32

# ✅ 2️⃣ Load the Silero model, decoder and helper utilities from torch.hub.
# `trust_repo=True` avoids the untrusted-repo warning.
model, decoder, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-models",
    model="silero_stt",
    language="en",
    device=device,
    trust_repo=True,
)

# `utils` is a 4-tuple of helpers; only `prepare_model_input` is used below.
read_batch, split_into_batches, read_audio, prepare_model_input = utils

# ✅ 3️⃣ Real-Time Streaming Transcription (Microphone)
def stream_transcribe(stream, new_chunk):
    """Transcribe all microphone audio received so far, including ``new_chunk``.

    Args:
        stream: 1-D NumPy array of 16 kHz audio accumulated by earlier calls,
            or ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` tuple; ``samples`` is a NumPy
            array that may be 1-D (mono) or 2-D with channels on axis 1.

    Returns:
        A ``(stream, text, latency)`` tuple: the updated audio buffer, the
        transcription of the whole buffer, and the processing time formatted
        as ``"<seconds> sec"``. If anything fails, the buffer is returned as
        it was at the point of failure, ``text`` carries the error message
        and ``latency`` is ``"Error"``.
    """
    start_time = time.time()
    try:
        sr, y = new_chunk

        # Down-mix multi-channel audio to mono.
        if y.ndim > 1:
            y = y.mean(axis=1)

        y = y.astype(np.float32)

        # Peak-normalise, but leave empty or all-zero (silent) chunks alone:
        # dividing by a zero peak would fill the chunk with NaNs, and since the
        # chunk is appended to the running buffer, every later transcription
        # would be corrupted too.
        peak = float(np.max(np.abs(y))) if y.size else 0.0
        if peak > 0.0:
            y = y / peak

        # Resample the chunk to 16 kHz, the rate the buffer is kept at.
        y_tensor = torch.tensor(y)
        y_resampled = torchaudio.functional.resample(
            y_tensor, orig_freq=sr, new_freq=16000
        ).numpy()

        # Append the chunk to the running buffer.
        if stream is not None:
            stream = np.concatenate([stream, y_resampled])
        else:
            stream = y_resampled

        # Build a batch of one from the whole buffer.
        input_tensor = torch.from_numpy(stream).unsqueeze(0)
        input_tensor = prepare_model_input(input_tensor, device=device)

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            transcription = model(input_tensor)
        text = decoder(transcription[0].cpu())

        latency = time.time() - start_time
        return stream, text, f"{latency:.2f} sec"

    except Exception as e:
        # Top-level UI boundary: report the failure in the textbox instead of
        # crashing the streaming event.
        print(f"Error: {e}")
        return stream, str(e), "Error"

# ✅ 4️⃣ Transcription for File Upload
def transcribe(inputs, previous_transcription):
    """Transcribe an uploaded audio clip and append the text to the prior output.

    Args:
        inputs: ``(sample_rate, samples)`` tuple; ``samples`` is a NumPy array
            that may be 1-D (mono) or 2-D with channels on axis 1.
        previous_transcription: Text already shown in the output box; the new
            transcription is appended to it.

    Returns:
        A ``(transcription, latency)`` tuple, with the latency formatted as
        ``"<seconds> sec"``. If anything fails, ``previous_transcription`` is
        returned unchanged together with ``"Error"``.
    """
    start_time = time.time()
    try:
        sample_rate, audio_data = inputs

        # Down-mix multi-channel audio to mono, as the streaming handler does.
        # Without this a (samples, channels) array would be resampled along
        # its channel axis, because resampling acts on the last dimension.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        # Convert to float32 and peak-normalise so integer PCM input reaches
        # the resampler and the model as floating point in [-1, 1].
        # Empty or silent clips are left alone to avoid dividing by zero.
        audio_data = audio_data.astype(np.float32)
        peak = float(np.max(np.abs(audio_data))) if audio_data.size else 0.0
        if peak > 0.0:
            audio_data = audio_data / peak

        # Resample to 16 kHz.
        audio_tensor = torch.tensor(audio_data)
        resampled_audio = torchaudio.functional.resample(
            audio_tensor, orig_freq=sample_rate, new_freq=16000
        ).numpy()

        # Build a batch of one.
        input_tensor = torch.from_numpy(resampled_audio).unsqueeze(0)
        input_tensor = prepare_model_input(input_tensor, device=device)

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            transcription = model(input_tensor)
        text = decoder(transcription[0].cpu())

        previous_transcription += text
        latency = time.time() - start_time

        return previous_transcription, f"{latency:.2f} sec"

    except Exception as e:
        # Top-level UI boundary: keep the existing text and flag the failure.
        print(f"Error: {e}")
        return previous_transcription, "Error"

# ✅ 5️⃣ Clear Function
def clear():
    """Return an empty string, the value used to blank an output textbox."""
    empty_text = ""
    return empty_text

# ✅ 6️⃣ Gradio Interface (Microphone Streaming)
with gr.Blocks() as microphone:
    gr.Markdown("# Silero STT - Real-Time Transcription (Optimized CPU) 🎙️")
    gr.Markdown("Using `Silero STT` for lightweight, accurate speech-to-text.")

    with gr.Row():
        input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        output = gr.Textbox(label="Live Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        clear_button = gr.Button("Clear Output")

    # Holds the audio buffer that `stream_transcribe` receives and returns on
    # every streamed chunk.
    state = gr.State()
    input_audio_microphone.stream(
        stream_transcribe,
        inputs=[state, input_audio_microphone],
        outputs=[state, output, latency_textbox],
        time_limit=30,
        stream_every=1,
    )

    # Clearing also drops the buffered audio. `stream_transcribe` re-transcribes
    # the whole buffer on each chunk, so leaving it in place would bring the
    # cleared text straight back.
    clear_button.click(lambda: (None, ""), outputs=[state, output])

# ✅ 7️⃣ Gradio Interface (File Upload)
with gr.Blocks() as file:
    gr.Markdown("# Upload Audio File for Transcription 🎵")
    gr.Markdown("Using `Silero STT` for offline, high-accuracy transcription.")

    with gr.Row():
        input_audio = gr.Audio(sources=["upload"], type="numpy")
        output = gr.Textbox(label="Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Output")

    # The current textbox content is passed in so `transcribe` can append the
    # new text to it.
    submit_button.click(
        transcribe,
        inputs=[input_audio, output],
        outputs=[output, latency_textbox],
    )
    clear_button.click(clear, outputs=[output])

# ✅ 8️⃣ Final Gradio App
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    # One tab per sub-app, in the same order as the tab labels.
    gr.TabbedInterface(
        interface_list=[microphone, file],
        tab_names=["Microphone", "Upload Audio"],
    )

# ✅ 9️⃣ Run Gradio Locally
# Launch only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()