File size: 4,633 Bytes
de6325b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32bd0d9
 
de6325b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628f79b
de6325b
 
 
32bd0d9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import gradio as gr
from transformers import pipeline
import numpy as np
import time
from typing import Tuple
import logging
import torch


# Create a logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check that all required environment variables are set before doing any
# further work. A variable that is missing OR empty counts as unset: the
# truthiness test avoids an in-band sentinel value (a variable legitimately
# set to "NO" is accepted), and it rejects empty values, e.g. an empty
# PASSWORD that an empty password field would match.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    if not os.environ.get(required_variable):
        message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(message)
        raise ValueError(message)


# Build the speech-recognition pipeline once, at module load, and keep it in
# the module-level name `transcriber`. CUDA is used when torch reports it as
# available; otherwise the model runs on the CPU.
model_name = os.environ["MODEL_NAME"]
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
logger.info("Loading model %s with device %s...", model_name, device)
transcriber = pipeline(
    "automatic-speech-recognition",
    device=device,
    model=model_name,
)
logger.info("Model loaded successfully.")


# Start the app.
def main():
    """Build the Gradio interface and launch it."""
    create_interface().launch()

    
# Create the Gradio interface for the Whisper transcription service.
def create_interface():
    """Assemble the Gradio Blocks UI and register its event handlers.

    The layout contains, top to bottom: a title, a password field, an audio
    input (microphone or upload), a "Transcribe" button, a transcription
    output box and a non-interactive status box. Transcription is triggered
    both by clicking the button and by any change of the audio input.

    Returns:
        The assembled ``gr.Blocks`` object; launching it is left to the caller.
    """
    with gr.Blocks() as interface:
        # Page title.
        gr.Markdown("# Whisper Speech Transcription")

        # Password field, in a row of its own.
        with gr.Row(), gr.Column(scale=2):
            password_box = gr.Textbox(
                type="password",
                label="Enter Password",
                placeholder="Enter the password to access the transcription service",
            )

        # Audio recorder / uploader.
        with gr.Row(), gr.Column(scale=2):
            audio_box = gr.Audio(
                label="Record or Upload Audio",
                type="numpy",
                sources=["microphone", "upload"],
            )

        # Button that starts a transcription.
        with gr.Row():
            run_button = gr.Button("Transcribe", variant="primary")

        # Box that receives the transcribed text.
        with gr.Row():
            transcript_box = gr.Textbox(
                lines=5,
                label="Transcription Output",
                placeholder="Transcription will appear here...",
            )

        # Read-only status line (timing information).
        status_box = gr.Textbox(
            lines=1,
            interactive=False,
            label="Status",
            placeholder="Transcription status will appear here...",
        )

        # Same handler for both triggers: first the button click, then any
        # change of the audio input (new recording or upload).
        for register in (run_button.click, audio_box.change):
            register(
                fn=transcribe_audio,
                inputs=[audio_box, password_box],
                outputs=[transcript_box, status_box],
            )

    return interface


def transcribe_audio(audio: Tuple[int, np.ndarray], password: str = None) -> Tuple[str, str]:
    """Transcribe a recording after verifying the supplied password.

    Args:
        audio: ``(sampling_rate, samples)`` pair, or ``None`` when nothing was
            recorded or uploaded. Samples with more than one dimension are
            averaged over axis 1 (treated as channels) to obtain mono audio.
        password: Password entered by the user; compared against the
            ``PASSWORD`` environment variable.

    Returns:
        A ``(text, status)`` tuple. On success ``text`` is the transcription
        and ``status`` reports the elapsed time. On a wrong password or
        missing/empty audio, ``text`` is an error message and ``status`` is
        the empty string.
    """
    # Function-scope import keeps this block self-contained.
    import hmac

    # Reject wrong passwords. Fail closed when either side is missing, and use
    # a timing-safe comparison since the password comes from untrusted input.
    expected = os.environ.get("PASSWORD")
    if (
        password is None
        or expected is None
        or not hmac.compare_digest(
            password.encode("utf-8"), expected.encode("utf-8")
        )
    ):
        return "Incorrect password. Please try again.", ""

    # If there is no audio, return an error message.
    if audio is None:
        return "No audio detected. Please record some audio.", ""

    # Start measuring the time.
    start_time = time.time()

    # Unpack the audio.
    sr, y = audio

    # A zero-length recording has nothing to transcribe, and np.max below
    # would raise ValueError on an empty array.
    if y.size == 0:
        return "No audio detected. Please record some audio.", ""

    # Convert to mono if there is more than one channel.
    if y.ndim > 1:
        logger.debug("Converting %s channels to mono", y.shape[1])
        y = y.mean(axis=1)

    # Peak-normalize to float32; skip all-zero audio to avoid dividing by zero.
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:
        y /= max_abs

    audio_time = len(y) / sr
    logger.info(
        "Processing audio: %sHz, %s samples (~%.2fs)", sr, len(y), audio_time
    )

    # Run transcription.
    result = transcriber(
        {"sampling_rate": sr, "raw": y},
        chunk_length_s=30,
        stride_length_s=[6, 0],
    )
    logger.info("Transcription completed.")

    # Report how long the whole request took relative to the audio length.
    elapsed_time = time.time() - start_time
    status_string = (
        f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s "
        f"of audio with model {model_name}."
    )
    return result["text"], status_string


# Entrypoint: launch only when this file is executed as a script, so that
# importing the module does not launch the app as a side effect.
if __name__ == "__main__":
    main()