File size: 4,058 Bytes
8bb1d29
a097442
95facbc
518eabe
95facbc
 
 
8bb1d29
95facbc
 
 
 
 
 
 
 
 
 
0720448
95facbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bb1d29
95facbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bb1d29
95facbc
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import gradio as gr
import torchaudio
import torch
import os
from pydub import AudioSegment
import tempfile
from speechbrain.pretrained.separation import SepformerSeparation

class AudioDenoiser:
    """Speech enhancer built on the SpeechBrain SepFormer DNS4 16 kHz model.

    The input file is first converted to a mono 16 kHz WAV with pydub, then
    passed through the model, and the first estimated source is written to
    ``OUTPUT_DIR/enhanced_audio.wav``.
    """

    # Sample rate (Hz) used both for the converted input and the saved output.
    SAMPLE_RATE = 16000
    # Directory that receives the enhanced output file.
    OUTPUT_DIR = "enhanced_audio"

    def __init__(self):
        # Initialize the SepFormer model for audio enhancement
        self.model = SepformerSeparation.from_hparams(
            source="speechbrain/sepformer-dns4-16k-enhancement",
            savedir='pretrained_models/sepformer-dns4-16k-enhancement'
        )

        # Create output directory if it doesn't exist
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

    @staticmethod
    def _discard_temp_file(path):
        """
        Remove a temporary file on a best-effort basis

        Args:
            path (str): Path of the file to delete
        """
        try:
            os.unlink(path)
        except OSError:
            # Cleanup must never mask the real result or the real error:
            # a file that is already gone (or cannot be removed) is ignored.
            pass

    def convert_audio_to_wav(self, input_path):
        """
        Convert any audio format to WAV with proper settings

        Args:
            input_path (str): Path to input audio file

        Returns:
            str: Path to converted WAV file (the caller is responsible for
                deleting it)

        Raises:
            gr.Error: If no input was given or the conversion fails
        """
        if not input_path:
            # Happens when the form is submitted without any audio.
            raise gr.Error("Error converting audio format: no audio file was provided")

        temp_wav_path = None
        try:
            # Reserve a unique path, closing our own handle right away so the
            # descriptor is not leaked and the file can be reopened for writing.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
                temp_wav_path = temp_wav.name

            # Load audio using pydub (supports multiple formats)
            audio = AudioSegment.from_file(input_path)

            # Convert to mono if stereo
            if audio.channels > 1:
                audio = audio.set_channels(1)

            # Export as WAV with proper settings
            exported = audio.export(
                temp_wav_path,
                format='wav',
                parameters=[
                    '-ar', str(self.SAMPLE_RATE),  # Set sample rate to 16kHz
                    '-ac', '1'                     # Set channels to mono
                ]
            )
            # export() hands back the open output file; close it explicitly.
            exported.close()

            return temp_wav_path

        except Exception as e:
            # Do not leave a half-written temp file behind on failure.
            if temp_wav_path is not None:
                self._discard_temp_file(temp_wav_path)
            raise gr.Error(f"Error converting audio format: {str(e)}") from e

    def enhance_audio(self, audio_path):
        """
        Process the input audio file and return the enhanced version

        Args:
            audio_path (str): Path to the input audio file

        Returns:
            str: Path to the enhanced audio file

        Raises:
            gr.Error: If conversion, enhancement or saving fails
        """
        wav_path = None
        try:
            # Convert input audio to proper WAV format
            wav_path = self.convert_audio_to_wav(audio_path)

            # Separate and enhance the audio
            est_sources = self.model.separate_file(path=wav_path)

            # Generate output filename (recreate the directory in case it was
            # removed after start-up)
            os.makedirs(self.OUTPUT_DIR, exist_ok=True)
            output_path = os.path.join(self.OUTPUT_DIR, "enhanced_audio.wav")

            # Save the enhanced audio
            torchaudio.save(
                output_path,
                est_sources[:, :, 0].detach().cpu(),
                self.SAMPLE_RATE  # Sample rate
            )

            return output_path

        except gr.Error:
            # Already a user-facing message (e.g. from the conversion step);
            # re-raise as-is instead of wrapping it a second time.
            raise
        except Exception as e:
            raise gr.Error(f"Error processing audio: {str(e)}") from e
        finally:
            # Clean up temporary file on success and on failure alike
            if wav_path is not None:
                self._discard_temp_file(wav_path)

def create_gradio_interface():
    """
    Build the Gradio UI that wraps AudioDenoiser.enhance_audio

    Returns:
        gr.Interface: Interface taking one audio file path as input and
            producing one audio file path as output (not yet launched)
    """
    # The denoiser is created first, before any UI component
    enhancer = AudioDenoiser()

    # Input and output widgets both exchange file paths with the callback
    noisy_input = gr.Audio(type="filepath", label="Upload Noisy Audio")
    enhanced_output = gr.Audio(label="Enhanced Audio", type="filepath")

    return gr.Interface(
        fn=enhancer.enhance_audio,
        inputs=noisy_input,
        outputs=enhanced_output,
        title="Audio Denoising using SepFormer",
        description="""
        This application uses the SepFormer model from SpeechBrain to enhance audio quality
        by removing background noise. Supports various audio formats including MP3 and WAV.
        """,
        article="""
        Supported audio formats:
        - MP3
        - WAV
        - OGG
        - FLAC
        - M4A
        and more...
        
        The audio will automatically be converted to the correct format for processing.
        """
    )

# Script entry point: the interface is only built and served when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    # Build the interface (create_gradio_interface also constructs the AudioDenoiser)
    demo = create_gradio_interface()
    # Start the Gradio app with its default launch settings
    demo.launch()