# app.py

import gradio as gr
import soundfile as sf
import os
from transformers import pipeline

asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")
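
# distil-small.en is an English-only checkpoint. For recordings much longer
# than ~30 seconds, chunked long-form decoding is the usual approach; a
# hedged sketch of that variant (the chunk_length_s / batch_size values are
# illustrative, not tuned):
#
#   asr = pipeline(task="automatic-speech-recognition",
#                  model="distil-whisper/distil-small.en",
#                  chunk_length_s=15,
#                  batch_size=4)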

def transcribe_speech(audio_filepath):
    # 'audio_filepath' is the path to an audio file on disk (e.g. 'my_audio.wav').
    # Because both Gradio Audio components below use type="filepath", the
    # recording or upload arrives here as a temporary file path.
    if audio_filepath is None:
        gr.Warning('No audio found. Please try again!')
        return ''

    # 1. Load audio from file.
    # sf.read() (from the soundfile library imported above) returns:
    # - 'audio': a NumPy array with the raw samples of the waveform.
    # - 'sr': the sampling rate in Hz, i.e. how many samples per second
    #         the array holds (e.g. 16000 Hz, 44100 Hz).
    audio, sr = sf.read(audio_filepath)

    # 2. Run the ASR pipeline defined above on the audio data.
    # Hugging Face ASR pipelines accept a dictionary with two entries:
    # - 'array': the raw waveform samples.
    # - 'sampling_rate': the corresponding sampling rate.
    # The model needs both to interpret the audio correctly. The output is a
    # dictionary containing the transcription (and possibly other metadata,
    # such as timestamps, depending on the call options).
    result = asr(
        {"array": audio, "sampling_rate": sr}
    )

    # 3. Extract and return the transcribed text from the 'text' key.
    return result['text']
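
# Optional quick sanity check before wiring up the UI. "sample.wav" is a
# hypothetical path; uncomment and point it at a real audio file to verify
# the pipeline end-to-end from the command line:
# print(transcribe_speech("sample.wav"))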

  
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(
        sources="microphone",
        type="filepath",
        label="🎀 Speak into your microphone" # Appealing label
    ),
    outputs=gr.Textbox(
        label="πŸ“ Transcription Result", # Appealing label
        lines=4, # Slightly more lines for longer transcriptions
        placeholder="Your transcribed text will appear here..."
    ),
    flagging_mode="never", # Disable flagging
    description="Record your voice directly using your device's microphone. Get an instant transcription."
)


file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(
        sources="upload", # Allow input from file upload
        type="filepath",  # Function receives audio as a temporary file path
        label="πŸ“ Upload an Audio File" # Appealing label
    ),
    outputs=gr.Textbox(
        label="πŸ“ Transcription Result", # Appealing label
        lines=4, # Slightly more lines
        placeholder="Upload an audio file (e.g., .wav, .mp3) to get its transcription."
    ),
    flagging_mode="never", # Disable flagging
    description="Upload an audio file for transcription."
)
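
# Both interfaces reuse transcribe_speech. A single-component variant is also
# possible by letting one gr.Audio accept both sources (a sketch, assuming the
# installed Gradio version accepts a list for `sources`):
#
#   combined_audio = gr.Audio(sources=["microphone", "upload"],
#                             type="filepath",
#                             label="Speak or upload audio")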


custom_css = """
/* Arial is a standard system font, so no web-font @import is needed here
   (Google Fonts does not serve Arial); add an @import line only if switching
   to a non-system font. */

/* Apply Arial to ALL text elements by default within the Gradio container */
.gradio-container, body, button, input, select, textarea, div, p, span, h1, h2, h3, h4, h5, h6 {
    font-family: 'Arial', sans-serif !important;
}

/* Overall container styling */
.gradio-container {
    max-width: 900px; /* Limit overall width for better readability */
    margin: 30px auto; /* Center the app on the page */
    padding: 30px;
    border-radius: 15px; /* Rounded corners for a softer look */
    box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
    background-color: #ffffff; /* White background for the main content area */
}

/* Titles and Headers */
h1 {
    color: #34495e; /* Darker blue-grey for main title */
    text-align: center;
    font-size: 2.5em; /* Larger main title */
    margin-bottom: 10px;
    font-weight: 700; /* Bold */
}

h3 {
    color: #5d6d7e; /* Slightly lighter blue-grey for subtitle */
    text-align: center;
    font-size: 1.2em;
    margin-top: 0;
    margin-bottom: 25px;
}

p {
    text-align: center;
    color: #7f8c8d; /* Muted grey for descriptions */
    font-size: 0.95em;
    margin-bottom: 20px;
}

/* Tabbed Interface Styling */
.tabs {
    border-radius: 10px;
    overflow: hidden; /* Ensures rounded corners on tabs */
    margin-bottom: 20px;
}

.tab-nav button {
    background-color: #ecf0f1; /* Light grey for inactive tabs */
    color: #34495e; /* Dark text for inactive tabs */
    font-weight: bold;
    padding: 12px 20px;
    border-radius: 8px 8px 0 0;
    margin-right: 5px; /* Small space between tabs */
    transition: all 0.3s ease;
}

.tab-nav button.selected {
    background-color: #4a90e2; /* Vibrant blue for active tab */
    color: white; /* White text for active tab */
    box-shadow: 0 4px 10px rgba(74, 144, 226, 0.3); /* Subtle shadow for active tab */
}

/* Input and Output Component Styling (General) */
.gr-box {
    border-radius: 10px; /* Rounded corners for input/output boxes */
    border: 1px solid #dfe6e9; /* Light border */
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05); /* Very subtle shadow */
    padding: 20px;
    background-color: #fcfcfc; /* Slightly off-white background */
}

/* Labels within components (e.g., "Upload Audio File", "Transcription Result") */
.label {
    font-weight: bold;
    color: #2c3e50; /* Dark text for labels */
    font-size: 1.1em;
    margin-bottom: 8px;
}

/* Buttons (Clear, Submit) */
.gr-button {
    background-color: #4a90e2 !important; /* Primary blue for actions */
    color: white !important;
    border: none !important;
    border-radius: 8px !important; /* Rounded buttons */
    padding: 12px 25px !important;
    font-weight: bold !important;
    transition: background-color 0.3s ease, box-shadow 0.3s ease !important;
    margin: 5px; /* Spacing between buttons */
}

.gr-button:hover {
    background-color: #3a7bd2 !important; /* Darker blue on hover */
    box-shadow: 0 4px 15px rgba(74, 144, 226, 0.4) !important;
}

/* Clear button specific */
.gr-button.secondary {
    background-color: #e0e6eb !important; /* Lighter grey for clear */
    color: #34495e !important;
}
.gr-button.secondary:hover {
    background-color: #d1d8df !important;
    box-shadow: none !important;
}

/* Textbox specific */
textarea {
    border-radius: 8px !important;
    border: 1px solid #bdc3c7 !important;
    padding: 10px !important;
    resize: vertical; /* Allow vertical resizing */
}

/* Audio component player */
.gr-audio-player {
    border-radius: 8px;
    background-color: #f0f0f0;
    padding: 10px;
}

/* Footer styling */
hr {
    border: none;
    border-top: 1px solid #e0e0e0;
    margin-top: 30px;
    margin-bottom: 15px;
}

.footer-text {
    font-size: 0.85em;
    color: #a0a0a0;
    text-align: center;
}
"""

# --- 6. Main Gradio App using Blocks for layout and styling ---
# Initialize a Gradio Blocks interface with a theme and custom CSS.
demo = gr.Blocks(
    theme=gr.themes.Soft(), # A good base theme for soft colors
    css=custom_css          # Apply our custom CSS
)
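
# Note: selectors such as .gr-box, .gr-button and .tab-nav depend on the DOM
# generated by the installed Gradio version and may need adjusting after an
# upgrade; passing elem_classes=["my-class"] to individual components gives a
# more stable hook for custom CSS.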

# Define the layout within the 'demo' Blocks context
with demo:
    # Main Title and Description using Markdown for rich formatting and appealing colors
    # Removed inline style for font-family as it's handled by global CSS now.
    gr.Markdown(
        """
        <center>
            <h1 style="color: #4A90E2;">
                πŸŽ™οΈ AI-Powered Speech-to-Text Transcriber πŸ“
            </h1>
            <h3 style="color: #6C7A89;">
                Developed by Muhammad Farhan Aslam.
            </h3>
            <h3 style="color: #6C7A89;">
                Convert spoken words into accurate text with ease and precision.
            </h3>
            <p style="color: #8C9CA7; font-size: 1.05em;">
                Effortlessly transcribe audio from your microphone or by uploading a file.
                This application leverages advanced AI to provide clear and reliable transcriptions.
            </p>
        </center>
        """
    )

    # Create a tabbed interface for microphone and file upload transcription
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["πŸ“ Transcribe Audio File", "🎀 Transcribe from Microphone"],
    )

    # Add a subtle footer for information or credits
    gr.Markdown(
        """
        <hr>
        <p class="footer-text">
            Built with ❀️ and Gradio on Hugging Face Transformers.
        </p>
        """
    )
# start_port = int(os.environ.get('PORT1', 7861))
demo.launch(share=True)
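
# If a fixed port is needed (e.g. behind a proxy), the commented-out
# start_port above could be wired in like this (sketch):
#
#   demo.launch(share=True, server_port=start_port)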