File size: 2,428 Bytes
9a50e70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a508e2
9a50e70
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import torch
import librosa
import pickle
import numpy as np
import gradio as gr

class ML_model:
    """CPU-only audio classifier: audio file in, category label out.

    The pipeline is: load audio -> fixed-length clip -> log-mel spectrogram
    -> 8-bit image -> network forward pass -> label lookup.
    """

    # Audio / mel-spectrogram settings used to prepare the network input.
    SAMPLE_RATE = 44100
    CLIP_SECONDS = 5
    N_FFT = 2048
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 20
    FMAX = 8300

    def __init__(self, model_path="support_file/resnet_carcrash_94.pth",
                 labels_path='support_file/indtocat.pkl'):
        """Load the network and the index-to-category table from disk.

        Args:
            model_path: File passed to ``torch.load``; the loaded object is
                used directly as the model.
            labels_path: Pickle file whose content is indexed by the
                predicted class index.
        """
        # NOTE(review): the loaded object is called like a module, so the
        # checkpoint is presumably a fully pickled model. On PyTorch versions
        # where ``weights_only`` defaults to True this call may need
        # ``weights_only=False`` -- confirm against the installed version.
        self.ml_model = torch.load(model_path, map_location=torch.device('cpu'))
        self.ml_model.eval()
        # Both files are unpickled; only load files from a trusted source.
        with open(labels_path, 'rb') as f:
            self.i2c = pickle.load(f)

    def spec_to_image(self, spec, eps=1e-6):
        """Standardise a spectrogram and rescale it to the 0..255 uint8 range.

        Args:
            spec: NumPy array holding the spectrogram values.
            eps: Small constant added to the standard deviation to avoid
                dividing by zero.

        Returns:
            A ``uint8`` array with the same shape as ``spec``. A constant
            input (no dynamic range) yields an all-zero image.
        """
        mean = spec.mean()
        std = spec.std()
        spec_norm = (spec - mean) / (std + eps)
        spec_min, spec_max = spec_norm.min(), spec_norm.max()
        value_range = spec_max - spec_min
        if value_range == 0:
            # Min-max scaling would compute 0/0 here and cast NaN to uint8.
            return np.zeros(spec_norm.shape, dtype=np.uint8)
        spec_scaled = 255 * (spec_norm - spec_min) / value_range
        return spec_scaled.astype(np.uint8)

    def _fit_length(self, wav, target_len):
        """Return ``wav`` reflect-padded or truncated to exactly ``target_len``."""
        n_samples = wav.shape[0]
        if n_samples == 0:
            raise ValueError("audio clip contains no samples")
        if n_samples < target_len:
            pad = int(np.ceil((target_len - n_samples) / 2))
            wav = np.pad(wav, pad, mode='reflect')
        # Symmetric ceil-padding overshoots by one sample when the shortfall
        # is odd; slicing makes the length exact in every case.
        return wav[:target_len]

    def get_melspectrogram_db(self, file_path):
        """Compute the dB-scaled mel spectrogram of a fixed-length clip.

        Args:
            file_path: Path of the audio file to load.

        Returns:
            The mel power spectrogram converted to dB relative to its
            maximum.

        Raises:
            ValueError: If the file decodes to zero samples.
        """
        # Resample while loading so the samples really are at SAMPLE_RATE;
        # loading at the native rate and then assuming 44.1 kHz distorts both
        # the clip duration and the mel frequency mapping.
        wav, sr = librosa.load(file_path, sr=self.SAMPLE_RATE)
        wav = self._fit_length(wav, self.CLIP_SECONDS * sr)
        # Compute mel spectrogram
        spec = librosa.feature.melspectrogram(
            y=wav,
            sr=sr,
            n_fft=self.N_FFT,
            hop_length=self.HOP_LENGTH,
            n_mels=self.N_MELS,
            fmin=self.FMIN,
            fmax=self.FMAX,
        )
        # Convert to dB scale
        return librosa.power_to_db(spec, ref=np.max)

    def get_prediction(self, file_path):
        """Classify an audio file and return its category.

        Args:
            file_path: Path of the audio file to classify.

        Returns:
            The entry of the index-to-category table selected by the
            highest-scoring model output.
        """
        spec_db = self.get_melspectrogram_db(file_path)
        input_image = self.spec_to_image(spec_db)
        # Two leading axes are added in front of the image before the forward
        # pass (presumably batch and channel -- confirm against the network).
        input_tensor = torch.tensor(
            input_image[np.newaxis, np.newaxis, ...], dtype=torch.float32
        ).to('cpu')
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            predictions = self.ml_model(input_tensor)
        predicted_index = predictions.argmax(dim=1).item()
        return self.i2c[predicted_index]

_shared_model = None


def _get_model():
    """Return the shared ML_model instance, creating it on first use."""
    global _shared_model
    if _shared_model is None:
        _shared_model = ML_model()
    return _shared_model


def predict(file_path):
    """Classify the audio file at ``file_path`` and return its category.

    The model is loaded once and reused across calls instead of being
    re-read from disk for every request.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        The category produced by ``ML_model.get_prediction``.

    Raises:
        gr.Error: If no file path was supplied.
    """
    if not file_path:
        # Surface a readable message in the UI rather than failing while
        # trying to load a missing file.
        raise gr.Error("Please upload an audio file first.")
    return _get_model().get_prediction(file_path)

# Audio input component, configured with type="filepath".
audio_input = gr.Audio(type="filepath", label="Upload your audio file")

# Sample clips offered in the UI; not pre-computed (cache_examples=False).
example_clips = [
    "input_fileszQ1QmqrakIA_5-talking.wav",
    "Car_Crash.mp4",
    "input_fileszQ1QmqrakIA_13-crash.wav",
]

# Wire the classifier into a simple audio-in / text-out web form.
interface = gr.Interface(
    fn=predict,
    inputs=audio_input,
    outputs="text",
    title="Car Crash Sound Detection",
    description="Upload a car crash sound clip and the model will identify the crash type.",
    examples=example_clips,
    cache_examples=False,
)

# Start the web server as soon as the module runs, with share=True.
interface.launch(share=True)