import torch
import torchaudio
import gradio as gr
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# ✅ 1️⃣ Use "whisper-medium" for a good balance of speed and accuracy
device = "cpu"
torch_dtype = torch.float32
MODEL_NAME = "openai/whisper-medium"

# ✅ 2️⃣ Load Whisper model on CPU
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

# ✅ 3️⃣ Speed up execution with torch.compile()
model = torch.compile(model)  # ✅ Faster inference on CPU
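# torch.compile requires PyTorch 2.0+; the first forward pass pays a one-time
# compilation cost, so the initial transcription will be noticeably slower.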

# ✅ 4️⃣ Load processor & pipeline
processor = AutoProcessor.from_pretrained(MODEL_NAME)
processor.feature_extractor.sampling_rate = 16000  # ✅ Whisper expects 16 kHz (the extractor's default)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,  # ✅ Longer chunks for better accuracy
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search (deterministic, so no temperature) + English
)
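
# Quick sanity check (hypothetical file path; file decoding relies on ffmpeg):
#   print(pipe("sample.wav")["text"])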

# ✅ 5️⃣ Real-time streaming transcription (microphone)
def stream_transcribe(stream, new_chunk):
    start_time = time.time()
    try:
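        # Gradio streams (sample_rate, numpy array) tuples; the array is
        # typically int16 at the microphone's native rate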
        sr, y = new_chunk

        # ✅ Convert stereo to mono
        if y.ndim > 1:
            y = y.mean(axis=1)

        # ✅ Normalize to [-1, 1], guarding against silent (all-zero) chunks
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        # ✅ Resample audio to 16 kHz using torchaudio
        y_tensor = torch.tensor(y)
        y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()

        # ✅ Append to the running stream
        if stream is not None:
            stream = np.concatenate([stream, y_resampled])
        else:
            stream = y_resampled
            
        # ✅ Run transcription with optimized parameters
        transcription = pipe({"sampling_rate": 16000, "raw": stream})["text"]
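        # Note: the entire accumulated stream is re-transcribed on every chunk,
        # so per-chunk latency grows with recording length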
        latency = time.time() - start_time

        return stream, transcription, f"{latency:.2f} sec"

    except Exception as e:
        print(f"Error: {e}")
        return stream, str(e), "Error"

# ✅ 6️⃣ Transcription for file upload
def transcribe(inputs, previous_transcription):
    start_time = time.time()
    try:
        sample_rate, audio_data = inputs

        # ✅ Mono float32 in [-1, 1]; torchaudio's resample needs a float tensor
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        audio_data = audio_data.astype(np.float32)
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data /= peak
        audio_tensor = torch.tensor(audio_data)
        resampled_audio = torchaudio.functional.resample(audio_tensor, orig_freq=sample_rate, new_freq=16000).numpy()

        transcription = pipe({"sampling_rate": 16000, "raw": resampled_audio})["text"]

        previous_transcription += transcription
        latency = time.time() - start_time

        return previous_transcription, f"{latency:.2f} sec"

    except Exception as e:
        print(f"Error: {e}")
        return previous_transcription, "Error"

# ✅ 7️⃣ Clear function
def clear():
    return ""

# ✅ 8️⃣ Gradio interface (microphone streaming)
with gr.Blocks() as microphone:
    gr.Markdown(f"# Whisper Medium - High Accuracy Transcription (Optimized CPU) πŸŽ™οΈ")
    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for best speech-to-text performance.")

    with gr.Row():
        input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        output = gr.Textbox(label="Live Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        clear_button = gr.Button("Clear Output")

    state = gr.State()
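    # state accumulates the waveform between streaming callbacks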
    input_audio_microphone.stream(
        stream_transcribe, [state, input_audio_microphone], 
        [state, output, latency_textbox], time_limit=30, stream_every=1
    )
    clear_button.click(lambda: (None, ""), outputs=[state, output])  # reset both stream and text

# ✅ 9️⃣ Gradio interface (file upload)
with gr.Blocks() as file:
    gr.Markdown(f"# Upload Audio File for Transcription 🎡")
    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")

    with gr.Row():
        input_audio = gr.Audio(sources=["upload"], type="numpy")
        output = gr.Textbox(label="Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Output")

    submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
    clear_button.click(clear, outputs=[output])

# ✅ 🔟 Final Gradio app
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])

# ✅ 1️⃣1️⃣ Run Gradio locally
if __name__ == "__main__":
    demo.launch()
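
# To launch (assuming this file is saved as app.py):
#   pip install torch torchaudio transformers gradio numpy
#   python app.py
# Gradio serves on http://127.0.0.1:7860 by default; pass share=True to
# demo.launch() for a temporary public link.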