File size: 2,279 Bytes
f029287
a61fcf3
 
 
f029287
a61fcf3
f029287
e268a60
a61fcf3
 
 
 
 
 
 
f029287
 
a61fcf3
 
 
 
 
f029287
 
 
a61fcf3
 
f029287
a61fcf3
 
 
f029287
a61fcf3
 
f029287
a61fcf3
 
285119c
a61fcf3
f029287
 
a61fcf3
 
 
 
 
 
f029287
 
a61fcf3
 
 
 
f029287
 
a61fcf3
 
 
f029287
a61fcf3
f029287
a61fcf3
 
f029287
a61fcf3
 
f029287
a61fcf3
f029287
a61fcf3
f029287
a61fcf3
f029287
e268a60
f029287
a61fcf3
 
 
 
 
f029287
 
a61fcf3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import gradio as gr
import librosa
import numpy as np
import matplotlib.pyplot as plt
import tempfile
import librosa.display


def calculate_basic_metrics(y, sr):
    """Compute basic summary descriptors of an audio signal.

    Args:
        y: Audio samples, passed directly to librosa and squared for energy.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        dict with the keys 'Average Pitch', 'Number of MFCCs', 'Energy',
        'Zero Crossing Rate' and 'Spectral Centroid'. Values are plain
        Python numbers so the result serializes cleanly; 'Average Pitch'
        is ``None`` when no pitch bin was detected.
    """
    pitches, _ = librosa.piptrack(y=y, sr=sr)
    # Only strictly positive entries are treated as detected pitches.
    detected = pitches[pitches > 0]
    # Averaging an empty selection would yield NaN plus a RuntimeWarning,
    # so report the absence of pitch explicitly instead.
    average_pitch = float(np.mean(detected)) if detected.size else None

    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    energy = float(np.sum(y ** 2))
    zero_crossing_rate = float(np.mean(librosa.feature.zero_crossing_rate(y)))
    spectral_centroid = float(
        np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    )

    return {
        'Average Pitch': average_pitch,
        # The coefficient axis is the second-to-last one; the last axis
        # indexes frames, which is what was previously reported here.
        'Number of MFCCs': int(mfccs.shape[-2]),
        'Energy': energy,
        'Zero Crossing Rate': zero_crossing_rate,
        'Spectral Centroid': spectral_centroid
    }


def calculate_advanced_metrics(y, sr):
    """Compute pitch, chroma and spectral-contrast summaries of a signal.

    Args:
        y: Audio samples, passed directly to librosa.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        dict with 'Average Chroma' and 'Average Spectral Contrast', plus
        'Average F0 (YIN)' when at least one frame has a finite F0
        estimate. Values are plain Python floats.
    """
    metrics = {}

    # Frequencies above Nyquist cannot be represented, so never search
    # beyond sr / 2 even though 4000 Hz is the nominal upper bound.
    fmax = min(4000, sr / 2)
    # Pass the real sampling rate: without it pyin assumes its own default
    # rate and the estimated frequencies are scaled incorrectly.
    f0, _, _ = librosa.pyin(y, fmin=50, fmax=fmax, sr=sr)
    voiced_f0 = f0[~np.isnan(f0)]
    # Skip the metric entirely when every frame is NaN, rather than
    # emitting a NaN mean (and a RuntimeWarning) into the results.
    if voiced_f0.size:
        metrics['Average F0 (YIN)'] = float(np.mean(voiced_f0))

    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    metrics['Average Chroma'] = float(np.mean(chroma))

    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    metrics['Average Spectral Contrast'] = float(np.mean(spectral_contrast))

    return metrics


def generate_spectrogram(y, sr):
    """Render a log-frequency dB spectrogram of ``y`` to a PNG file.

    Args:
        y: Audio samples, passed to ``librosa.stft``.
        sr: Sampling rate of ``y`` in Hz, used for the plot axes.

    Returns:
        Path of the PNG file that was written. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    spectrogram_db = librosa.amplitude_to_db(
        np.abs(librosa.stft(y)), ref=np.max
    )

    # Work on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", so overlapping calls cannot draw onto each other.
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        mesh = librosa.display.specshow(
            spectrogram_db, sr=sr, x_axis='time', y_axis='log', ax=ax
        )
        fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
        ax.set_title('Spectrogram')
        fig.tight_layout()

        # Reserve a unique path, then close the handle before saving:
        # writing to a still-open temporary file by name fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp:
            image_path = tmp.name
        fig.savefig(image_path, format='png')
    finally:
        # Always release the figure, even if plotting or saving raised.
        plt.close(fig)

    return image_path


def process_audio(file):
    """Analyse one audio input and return its metrics and spectrogram.

    Args:
        file: ``None`` when nothing was provided, otherwise a
            ``(sample_rate, samples)`` pair with ``samples`` array-like.

    Returns:
        Tuple ``(metrics, image_path)``: the merged basic and advanced
        metric dicts, and the path of the rendered spectrogram image.
        For ``None`` input the result is ``({}, "placeholder.png")``.
    """
    if file is None:
        return {}, "placeholder.png"

    sr, y = file
    y = np.asarray(y)

    if np.issubdtype(y.dtype, np.integer):
        # Scale integer PCM by the dtype's maximum value.
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        # np.iinfo rejects floating dtypes (e.g. float64), so floats are
        # only cast, not rescaled.
        y = y.astype(np.float32)

    if y.ndim > 1:
        # NOTE(review): assumes multi-channel input is laid out as
        # (samples, channels) -- confirm against the Gradio Audio docs.
        # Down-mix to mono because the metric helpers expect a 1-D signal.
        y = y.mean(axis=1)

    basic_metrics = calculate_basic_metrics(y, sr)
    advanced_metrics = calculate_advanced_metrics(y, sr)

    metrics = {**basic_metrics, **advanced_metrics}

    image_path = generate_spectrogram(y, sr)

    return metrics, image_path


# Wire the analysis function into a Gradio UI: one audio input, and a JSON
# panel plus an image panel for the metrics and the spectrogram.
iface = gr.Interface(
    process_audio,
    gr.Audio(label="Upload Audio"),
    ["json", "image"],
    title="Speech-Scope",
    description="Speech and audio Metrics Analysis",
)

# Start serving the interface with debug mode enabled.
iface.launch(debug=True)