File size: 9,718 Bytes
5632b5a 7b457f9 5632b5a 7b457f9 5632b5a 6c5a3b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
# app.py
import gradio as gr
import soundfile as sf
import os
from transformers import pipeline
# Hugging Face ASR pipeline (Distil-Whisper small, English-only).
# Built once at import time so every Gradio request reuses the same loaded
# model instead of paying the download/initialization cost per call.
asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")
def transcribe_speech(audio_filepath):
    """Transcribe the audio file at *audio_filepath* and return the text.

    Parameters
    ----------
    audio_filepath : str | None
        Path to an audio file on disk. Gradio passes a temporary file path
        (``type="filepath"``), or ``None`` when the user submitted without
        recording/uploading anything.

    Returns
    -------
    str
        The transcription produced by the ASR pipeline, or ``""`` when no
        audio was provided.
    """
    if audio_filepath is None:
        gr.Warning('No audio found. Please try again!')
        # Bug fix: the original fell through here, so sf.read(None) raised
        # an unhandled exception right after the warning was shown.
        return ""
    # Load the raw waveform (NumPy array of samples) and its sampling rate.
    audio, sr = sf.read(audio_filepath)
    # Robustness: Whisper-family pipelines expect a mono (1-D) array.
    # Uploaded files are often stereo; average the channels rather than
    # letting the pipeline fail on a 2-D input.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # HF ASR pipelines accept a dict carrying the raw array plus its rate;
    # both are needed to interpret the samples correctly.
    result = asr(
        {"array": audio, "sampling_rate": sr}
    )
    # The pipeline returns a dict whose transcription lives under 'text'.
    return result['text']
# Interface #1: live transcription from the device microphone.
# NOTE(review): the emoji in these labels look mojibake-garbled ("π€",
# "π") — reproduced verbatim here; confirm the intended characters
# against the original file's encoding.
_mic_audio = gr.Audio(
    sources="microphone",        # capture directly from the user's mic
    type="filepath",             # handler receives a temp-file path
    label="π€ Speak into your microphone",
)
_mic_result = gr.Textbox(
    label="π Transcription Result",
    lines=4,                     # room for longer transcriptions
    placeholder="Your transcribed text will appear here...",
)
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=_mic_audio,
    outputs=_mic_result,
    flagging_mode="never",       # flagging is not useful for this demo
    description="Record your voice directly using your device's microphone. Get an instant transcription.",
)
# Interface #2: transcription of a pre-recorded audio file chosen by the user.
# NOTE(review): label emoji appear mojibake-garbled ("π") — kept verbatim;
# confirm against the original file's encoding.
_upload_audio = gr.Audio(
    sources="upload",            # accept a file picked from disk
    type="filepath",             # handler receives a temp-file path
    label="π Upload an Audio File",
)
_upload_result = gr.Textbox(
    label="π Transcription Result",
    lines=4,
    placeholder="Upload an audio file (e.g., .wav, .mp3) to get its transcription.",
)
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=_upload_audio,
    outputs=_upload_result,
    flagging_mode="never",
    description="Upload an audio file for transcription.",
)
custom_css = """
/* Import Google Font - Arial (or a very similar sans-serif if Arial isn't universally available on all systems) */
/* Note: Arial is typically a system font, so direct import isn't strictly necessary for it to work,
but it's good practice for other fonts. */
@import url('https://fonts.googleapis.com/css2?family=Arial:wght@400;700&display=swap');
/* Apply Arial to ALL text elements by default within the Gradio container */
.gradio-container, body, button, input, select, textarea, div, p, span, h1, h2, h3, h4, h5, h6 {
font-family: 'Arial', sans-serif !important;
}
/* Overall container styling */
.gradio-container {
max-width: 900px; /* Limit overall width for better readability */
margin: 30px auto; /* Center the app on the page */
padding: 30px;
border-radius: 15px; /* Rounded corners for a softer look */
box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
background-color: #ffffff; /* White background for the main content area */
}
/* Titles and Headers */
h1 {
color: #34495e; /* Darker blue-grey for main title */
text-align: center;
font-size: 2.5em; /* Larger main title */
margin-bottom: 10px;
font-weight: 700; /* Bold */
}
h3 {
color: #5d6d7e; /* Slightly lighter blue-grey for subtitle */
text-align: center;
font-size: 1.2em;
margin-top: 0;
margin-bottom: 25px;
}
p {
text-align: center;
color: #7f8c8d; /* Muted grey for descriptions */
font-size: 0.95em;
margin-bottom: 20px;
}
/* Tabbed Interface Styling */
.tabs {
border-radius: 10px;
overflow: hidden; /* Ensures rounded corners on tabs */
margin-bottom: 20px;
}
.tab-nav button {
background-color: #ecf0f1; /* Light grey for inactive tabs */
color: #34495e; /* Dark text for inactive tabs */
font-weight: bold;
padding: 12px 20px;
border-radius: 8px 8px 0 0;
margin-right: 5px; /* Small space between tabs */
transition: all 0.3s ease;
}
.tab-nav button.selected {
background-color: #4a90e2; /* Vibrant blue for active tab */
color: white; /* White text for active tab */
box-shadow: 0 4px 10px rgba(74, 144, 226, 0.3); /* Subtle shadow for active tab */
}
/* Input and Output Component Styling (General) */
.gr-box {
border-radius: 10px; /* Rounded corners for input/output boxes */
border: 1px solid #dfe6e9; /* Light border */
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05); /* Very subtle shadow */
padding: 20px;
background-color: #fcfcfc; /* Slightly off-white background */
}
/* Labels within components (e.g., "Upload Audio File", "Transcription Result") */
.label {
font-weight: bold;
color: #2c3e50; /* Dark text for labels */
font-size: 1.1em;
margin-bottom: 8px;
}
/* Buttons (Clear, Submit) */
.gr-button {
background-color: #4a90e2 !important; /* Primary blue for actions */
color: white !important;
border: none !important;
border-radius: 8px !important; /* Rounded buttons */
padding: 12px 25px !important;
font-weight: bold !important;
transition: background-color 0.3s ease, box-shadow 0.3s ease !important;
margin: 5px; /* Spacing between buttons */
}
.gr-button:hover {
background-color: #3a7bd2 !important; /* Darker blue on hover */
box-shadow: 0 4px 15px rgba(74, 144, 226, 0.4) !important;
}
/* Clear button specific */
.gr-button.secondary {
background-color: #e0e6eb !important; /* Lighter grey for clear */
color: #34495e !important;
}
.gr-button.secondary:hover {
background-color: #d1d8df !important;
box-shadow: none !important;
}
/* Textbox specific */
textarea {
border-radius: 8px !important;
border: 1px solid #bdc3c7 !important;
padding: 10px !important;
resize: vertical; /* Allow vertical resizing */
}
/* Audio component player */
.gr-audio-player {
border-radius: 8px;
background-color: #f0f0f0;
padding: 10px;
}
/* Footer styling */
hr {
border: none;
border-top: 1px solid #e0e0e0;
margin-top: 30px;
margin-bottom: 15px;
}
.footer-text {
font-size: 0.85em;
color: #a0a0a0;
text-align: center;
}
"""
# --- Main application: Blocks layout with the Soft theme + custom CSS ---
# gr.Blocks is its own context manager and yields itself, so construction
# and layout are fused into a single `with` statement.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Header: title, author credit and tagline. Inline colors override the
    # global CSS; font-family is inherited from the stylesheet above.
    # NOTE(review): the emoji here ("ποΈ", "π", "β€οΈ") look
    # mojibake-garbled — reproduced verbatim; confirm the source encoding.
    gr.Markdown(
        """
        <center>
        <h1 style="color: #4A90E2;">
        ποΈ AI-Powered Speech-to-Text Transcriber π
        </h1>
        <h3 style="color: #6C7A89;">
        Developed by Muhammad Farhan Aslam.
        </h3>
        <h3 style="color: #6C7A89;">
        Convert spoken words into accurate text with ease and precision.
        </h3>
        <p style="color: #8C9CA7; font-size: 1.05em;">
        Effortlessly transcribe audio from your microphone or by uploading a file.
        This application leverages advanced AI to provide clear and reliable transcriptions.
        </p>
        </center>
        """
    )
    # Two tabs sharing the same handler: file upload first, microphone second.
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["π Transcribe Audio File", "π€ Transcribe from Microphone"],
    )
    # Footer credit line, styled by the .footer-text CSS rule.
    gr.Markdown(
        """
        <hr>
        <p class="footer-text">
        Built with β€οΈ and Gradio on Hugging Face Transformers.
        </p>
        """
    )
demo.launch(share=True)
|