File size: 9,718 Bytes
5632b5a 7b457f9 5632b5a 7b457f9 5632b5a 6c5a3b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
# app.py
import gradio as gr
import soundfile as sf
import os
from transformers import pipeline
# Hugging Face ASR pipeline (Distil-Whisper small, English-only).
# Built once at import time so every Gradio request reuses the same loaded
# model instead of paying the download/initialization cost per call.
asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")
def transcribe_speech(audio_filepath):
    """Transcribe the audio file at *audio_filepath* and return the text.

    Parameters
    ----------
    audio_filepath : str | None
        Path to an audio file on disk. Gradio passes a temporary file path
        (``type="filepath"``), or ``None`` when the user submitted without
        recording/uploading anything.

    Returns
    -------
    str
        The transcription produced by the ASR pipeline, or ``""`` when no
        audio was provided.
    """
    if audio_filepath is None:
        gr.Warning('No audio found. Please try again!')
        # Bug fix: the original fell through here, so sf.read(None) raised
        # an unhandled exception right after the warning was shown.
        return ""
    # Load the raw waveform (NumPy array of samples) and its sampling rate.
    audio, sr = sf.read(audio_filepath)
    # Robustness: Whisper-family pipelines expect a mono (1-D) array.
    # Uploaded files are often stereo; average the channels rather than
    # letting the pipeline fail on a 2-D input.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # HF ASR pipelines accept a dict carrying the raw array plus its rate;
    # both are needed to interpret the samples correctly.
    result = asr(
        {"array": audio, "sampling_rate": sr}
    )
    # The pipeline returns a dict whose transcription lives under 'text'.
    return result['text']
# Interface #1: live transcription from the device microphone.
# NOTE(review): the emoji in these labels look mojibake-garbled ("π€",
# "π") — reproduced verbatim here; confirm the intended characters
# against the original file's encoding.
_mic_audio = gr.Audio(
    sources="microphone",        # capture directly from the user's mic
    type="filepath",             # handler receives a temp-file path
    label="π€ Speak into your microphone",
)
_mic_result = gr.Textbox(
    label="π Transcription Result",
    lines=4,                     # room for longer transcriptions
    placeholder="Your transcribed text will appear here...",
)
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=_mic_audio,
    outputs=_mic_result,
    flagging_mode="never",       # flagging is not useful for this demo
    description="Record your voice directly using your device's microphone. Get an instant transcription.",
)
# Interface #2: transcription of a pre-recorded audio file chosen by the user.
# NOTE(review): label emoji appear mojibake-garbled ("π") — kept verbatim;
# confirm against the original file's encoding.
_upload_audio = gr.Audio(
    sources="upload",            # accept a file picked from disk
    type="filepath",             # handler receives a temp-file path
    label="π Upload an Audio File",
)
_upload_result = gr.Textbox(
    label="π Transcription Result",
    lines=4,
    placeholder="Upload an audio file (e.g., .wav, .mp3) to get its transcription.",
)
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=_upload_audio,
    outputs=_upload_result,
    flagging_mode="never",
    description="Upload an audio file for transcription.",
)
custom_css = """
/* Import Google Font - Arial (or a very similar sans-serif if Arial isn't universally available on all systems) */
/* Note: Arial is typically a system font, so direct import isn't strictly necessary for it to work,
but it's good practice for other fonts. */
@import url('https://fonts.googleapis.com/css2?family=Arial:wght@400;700&display=swap');
/* Apply Arial to ALL text elements by default within the Gradio container */
.gradio-container, body, button, input, select, textarea, div, p, span, h1, h2, h3, h4, h5, h6 {
font-family: 'Arial', sans-serif !important;
}
/* Overall container styling */
.gradio-container {
max-width: 900px; /* Limit overall width for better readability */
margin: 30px auto; /* Center the app on the page */
padding: 30px;
border-radius: 15px; /* Rounded corners for a softer look */
box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
background-color: #ffffff; /* White background for the main content area */
}
/* Titles and Headers */
h1 {
color: #34495e; /* Darker blue-grey for main title */
text-align: center;
font-size: 2.5em; /* Larger main title */
margin-bottom: 10px;
font-weight: 700; /* Bold */
}
h3 {
color: #5d6d7e; /* Slightly lighter blue-grey for subtitle */
text-align: center;
font-size: 1.2em;
margin-top: 0;
margin-bottom: 25px;
}
p {
text-align: center;
color: #7f8c8d; /* Muted grey for descriptions */
font-size: 0.95em;
margin-bottom: 20px;
}
/* Tabbed Interface Styling */
.tabs {
border-radius: 10px;
overflow: hidden; /* Ensures rounded corners on tabs */
margin-bottom: 20px;
}
.tab-nav button {
background-color: #ecf0f1; /* Light grey for inactive tabs */
color: #34495e; /* Dark text for inactive tabs */
font-weight: bold;
padding: 12px 20px;
border-radius: 8px 8px 0 0;
margin-right: 5px; /* Small space between tabs */
transition: all 0.3s ease;
}
.tab-nav button.selected {
background-color: #4a90e2; /* Vibrant blue for active tab */
color: white; /* White text for active tab */
box-shadow: 0 4px 10px rgba(74, 144, 226, 0.3); /* Subtle shadow for active tab */
}
/* Input and Output Component Styling (General) */
.gr-box {
border-radius: 10px; /* Rounded corners for input/output boxes */
border: 1px solid #dfe6e9; /* Light border */
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05); /* Very subtle shadow */
padding: 20px;
background-color: #fcfcfc; /* Slightly off-white background */
}
/* Labels within components (e.g., "Upload Audio File", "Transcription Result") */
.label {
font-weight: bold;
color: #2c3e50; /* Dark text for labels */
font-size: 1.1em;
margin-bottom: 8px;
}
/* Buttons (Clear, Submit) */
.gr-button {
background-color: #4a90e2 !important; /* Primary blue for actions */
color: white !important;
border: none !important;
border-radius: 8px !important; /* Rounded buttons */
padding: 12px 25px !important;
font-weight: bold !important;
transition: background-color 0.3s ease, box-shadow 0.3s ease !important;
margin: 5px; /* Spacing between buttons */
}
.gr-button:hover {
background-color: #3a7bd2 !important; /* Darker blue on hover */
box-shadow: 0 4px 15px rgba(74, 144, 226, 0.4) !important;
}
/* Clear button specific */
.gr-button.secondary {
background-color: #e0e6eb !important; /* Lighter grey for clear */
color: #34495e !important;
}
.gr-button.secondary:hover {
background-color: #d1d8df !important;
box-shadow: none !important;
}
/* Textbox specific */
textarea {
border-radius: 8px !important;
border: 1px solid #bdc3c7 !important;
padding: 10px !important;
resize: vertical; /* Allow vertical resizing */
}
/* Audio component player */
.gr-audio-player {
border-radius: 8px;
background-color: #f0f0f0;
padding: 10px;
}
/* Footer styling */
hr {
border: none;
border-top: 1px solid #e0e0e0;
margin-top: 30px;
margin-bottom: 15px;
}
.footer-text {
font-size: 0.85em;
color: #a0a0a0;
text-align: center;
}
"""
# --- Main application: Blocks layout with the Soft theme + custom CSS ---
# gr.Blocks is its own context manager and yields itself, so construction
# and layout are fused into a single `with` statement.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Header: title, author credit and tagline. Inline colors override the
    # global CSS; font-family is inherited from the stylesheet above.
    # NOTE(review): the emoji here ("ποΈ", "π", "β€οΈ") look
    # mojibake-garbled — reproduced verbatim; confirm the source encoding.
    gr.Markdown(
        """
        <center>
        <h1 style="color: #4A90E2;">
        ποΈ AI-Powered Speech-to-Text Transcriber π
        </h1>
        <h3 style="color: #6C7A89;">
        Developed by Muhammad Farhan Aslam.
        </h3>
        <h3 style="color: #6C7A89;">
        Convert spoken words into accurate text with ease and precision.
        </h3>
        <p style="color: #8C9CA7; font-size: 1.05em;">
        Effortlessly transcribe audio from your microphone or by uploading a file.
        This application leverages advanced AI to provide clear and reliable transcriptions.
        </p>
        </center>
        """
    )
    # Two tabs sharing the same handler: file upload first, microphone second.
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["π Transcribe Audio File", "π€ Transcribe from Microphone"],
    )
    # Footer credit line, styled by the .footer-text CSS rule.
    gr.Markdown(
        """
        <hr>
        <p class="footer-text">
        Built with β€οΈ and Gradio on Hugging Face Transformers.
        </p>
        """
    )
demo.launch(share=True)
|