# Hugging Face Space: AudioVoiceEnhancer.AI (page status residue: "Sleeping")
| import os | |
| import io | |
| import tempfile | |
| import zipfile | |
| import numpy as np | |
| import pandas as pd | |
| import librosa | |
| import librosa.display | |
| import matplotlib.pyplot as plt | |
| import soundfile as sf | |
| import gradio as gr | |
| from scipy.signal import medfilt | |
| from noisereduce import reduce_noise | |
| import webrtcvad | |
| from pesq import pesq | |
| from pystoi import stoi | |
def load_audio(file_obj):
    """Decode an uploaded audio file, resampled to 16 kHz mono.

    Returns (samples, sample_rate); sample_rate is always 16000.
    """
    samples, rate = librosa.load(file_obj, sr=16000)
    return samples, rate
def save_audio(y, sr, path):
    """Write samples `y` at rate `sr` to `path` (format inferred from extension)."""
    sf.write(path, y, sr)
def plot_waveform(y, sr, title):
    """Render the waveform of `y` and return it as an in-memory PNG buffer."""
    plt.figure(figsize=(10, 2))
    librosa.display.waveshow(y, sr=sr)
    plt.title(title)
    plt.tight_layout()
    png = io.BytesIO()
    plt.savefig(png, format='png')
    plt.close()
    png.seek(0)  # rewind so callers can read the image from the start
    return png
def plot_spectrogram(y, sr, title):
    """Render a log-frequency dB spectrogram of `y` as an in-memory PNG buffer."""
    spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    plt.figure(figsize=(10, 3))
    librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.tight_layout()
    png = io.BytesIO()
    plt.savefig(png, format='png')
    plt.close()
    png.seek(0)  # rewind so callers can read the image from the start
    return png
def vad_plot(y, sr, title):
    """Plot a per-frame voice-activity flag and return it as a PNG buffer.

    Uses webrtcvad at aggressiveness 2.  The signal is resampled to 16 kHz
    (webrtcvad only accepts 8/16/32/48 kHz) and split into 30 ms frames.
    """
    vad = webrtcvad.Vad(2)
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000
    frame_duration_ms = 30  # webrtcvad supports only 10/20/30 ms frames
    frame_size = int(sr * frame_duration_ms / 1000)
    # Zero-pad the tail so the signal splits into whole frames.
    remainder = len(y) % frame_size
    if remainder:
        y = np.pad(y, (0, frame_size - remainder))
    voiced = []
    # Guard: on empty input len(y) // frame_size is 0 and np.split(y, 0)
    # would raise; an empty `voiced` just yields an empty plot.
    if len(y) >= frame_size:
        for frame in np.split(y, len(y) // frame_size):
            # float [-1, 1] samples -> 16-bit PCM bytes, as webrtcvad requires
            pcm = (frame * 32767).astype(np.int16).tobytes()
            try:
                voiced.append(vad.is_speech(pcm, sr))
            except Exception:
                # Was a bare `except:`; keep the best-effort behavior but
                # stop swallowing KeyboardInterrupt/SystemExit.
                voiced.append(False)
    plt.figure(figsize=(10, 1.5))
    plt.plot(voiced, drawstyle='steps-mid')
    plt.title(title)
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    return buf
def compute_pesq_mfcc_stoi(original_path, enhanced_path):
    """Compute quality metrics between two audio files at 16 kHz.

    Returns (pesq_score, stoi_score, mfcc_diff): wideband PESQ, non-extended
    STOI, and the mean absolute difference of 13 MFCC coefficients.
    """
    sr = 16000
    original, _ = librosa.load(original_path, sr=sr)
    enhanced, _ = librosa.load(enhanced_path, sr=sr)
    pesq_score = pesq(sr, original, enhanced, 'wb')
    stoi_score = stoi(original, enhanced, sr, extended=False)
    # librosa >= 0.10 made feature.mfcc keyword-only; the original positional
    # call `mfcc(original, sr, n_mfcc=13)` raises TypeError there.
    mfcc_diff = np.mean(np.abs(
        librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13) -
        librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
    ))
    return pesq_score, stoi_score, mfcc_diff
def compute_snr(original, enhanced):
    """Estimate SNR in dB, treating (original - enhanced) as the noise.

    A small epsilon keeps the ratio finite when the two signals are identical.
    """
    residual = original - enhanced
    signal_power = np.sum(original ** 2)
    noise_power = np.sum(residual ** 2) + 1e-9
    return 10 * np.log10(signal_power / noise_power)
def noise_reduction(y, sr):
    """Apply spectral-gating noise reduction via the noisereduce package."""
    return reduce_noise(y=y, sr=sr)

def voice_isolation(y, sr):
    """Voice isolation is not implemented; the input is returned unchanged."""
    return y  # Placeholder

def reverb_cleanup(y, sr):
    """Smooth the signal with a 5-tap median filter."""
    return medfilt(y, kernel_size=5)

def volume_normalize(y):
    """Peak-normalize to [-1, 1]; all-zero input is returned unchanged."""
    peak = np.max(np.abs(y))
    return y / peak if peak > 0 else y

def language_aware_tuning(y, sr):
    """Apply librosa's pre-emphasis filter (boosts high frequencies)."""
    return librosa.effects.preemphasis(y)
def process_files(files, nr, vi, reverb, vol, lang, skip_metrics=False, progress=gr.Progress()):
    """Run the selected enhancements over every uploaded file.

    For each file, writes original/enhanced WAVs plus waveform, spectrogram
    and VAD plots into a fresh temp dir, bundles everything together with a
    metrics CSV into a ZIP, and returns (zip_path, preview_path), where
    preview_path is the first enhanced WAV found or None.
    """
    metrics = []
    temp_dir = tempfile.mkdtemp()
    zip_path = os.path.join(temp_dir, "enhanced_output.zip")
    total = len(files)
    # Context manager finalizes the archive even if an enhancement fails.
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for i, file_obj in enumerate(files):
            progress((i + 1) / total, desc=f"Processing {file_obj.name}")
            y, sr = load_audio(file_obj)
            original_y = y.copy()
            if nr: y = noise_reduction(y, sr)
            if vi: y = voice_isolation(y, sr)
            if reverb: y = reverb_cleanup(y, sr)
            if vol: y = volume_normalize(y)
            if lang: y = language_aware_tuning(y, sr)
            # basename is required: file_obj.name is typically an absolute
            # upload path, and os.path.join(temp_dir, <absolute>) would drop
            # temp_dir entirely, writing outputs elsewhere and breaking the
            # enhanced-file preview scan below.
            name = os.path.splitext(os.path.basename(file_obj.name))[0]
            orig_path = os.path.join(temp_dir, f"{name}_original.wav")
            enh_path = os.path.join(temp_dir, f"{name}_enhanced.wav")
            save_audio(original_y, sr, orig_path)
            save_audio(y, sr, enh_path)
            # Six plots per file: {waveform, spectrogram, vad} x {original, enhanced}.
            for plot_func, label in [(plot_waveform, "waveform"), (plot_spectrogram, "spectrogram"), (vad_plot, "vad")]:
                for typ, signal in [("original", original_y), ("enhanced", y)]:
                    buf = plot_func(signal, sr, f"{typ.title()} {label.title()}")
                    img_path = os.path.join(temp_dir, f"{name}_{label}_{typ}.png")
                    with open(img_path, "wb") as f:
                        f.write(buf.read())
                    zipf.write(img_path, arcname=os.path.basename(img_path))
            if skip_metrics:
                pesq_score = stoi_score = mfcc_diff = None
            else:
                try:
                    pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(orig_path, enh_path)
                except Exception:
                    # Metrics are best-effort (PESQ/STOI can fail on very
                    # short or silent clips); was a bare `except:`.
                    pesq_score, stoi_score, mfcc_diff = None, None, None
            snr = compute_snr(original_y, y)
            metrics.append({
                "file": file_obj.name,
                "SNR": snr,
                "PESQ": pesq_score,
                "STOI": stoi_score,
                "MFCC Diff": mfcc_diff
            })
            zipf.write(orig_path, arcname=os.path.basename(orig_path))
            zipf.write(enh_path, arcname=os.path.basename(enh_path))
        df = pd.DataFrame(metrics)
        metrics_path = os.path.join(temp_dir, "metrics.csv")
        df.to_csv(metrics_path, index=False)
        zipf.write(metrics_path, arcname="metrics.csv")
    enhanced_files = [f for f in os.listdir(temp_dir) if f.endswith("_enhanced.wav")]
    preview_path = os.path.join(temp_dir, enhanced_files[0]) if enhanced_files else None
    return zip_path, preview_path
def run_enhancement(files, nr, vi, reverb, vol, lang, skip_metrics):
    """Validate UI inputs, run the pipeline, and shape outputs for Gradio.

    Returns (zip_file, preview_audio, status_text, warning_update).
    """
    if not files:
        # No upload: nothing to process, keep the warning box hidden.
        return None, None, "Upload audio files.", gr.update(visible=False)
    if not (nr or vi or reverb or vol or lang):
        # All toggles off: surface the warning box instead of running.
        return None, None, "Select at least one enhancement.", gr.update(visible=True, value="No enhancements selected.")
    zip_path, preview = process_files(files, nr, vi, reverb, vol, lang, skip_metrics)
    return zip_path, preview, "Done!", gr.update(visible=False)
# ---- Gradio UI (built at import time) ------------------------------------
with gr.Blocks() as demo:
    # NOTE(review): heading text looks like mojibake from a lost emoji —
    # confirm the intended character before changing it.
    gr.Markdown("## π§ AudioVoiceEnhancer.AI")
    # Multi-file upload limited to wav/mp3.
    files = gr.File(label="Upload Audio", file_types=[".wav", ".mp3"], file_count="multiple")
    with gr.Row():
        # Enhancement toggles; all on by default.
        nr = gr.Checkbox(label="Noise Reduction", value=True)
        vi = gr.Checkbox(label="Voice Isolation", value=True)
        reverb = gr.Checkbox(label="Reverb Cleanup", value=True)
        vol = gr.Checkbox(label="Volume Normalize", value=True)
        lang = gr.Checkbox(label="Language-Aware Tuning", value=True)
        # Skipping PESQ/STOI avoids the slowest metric computations.
        skip_metrics = gr.Checkbox(label="π Skip PESQ/STOI for Speed", value=True)
    run_btn = gr.Button("Enhance Audio")
    # Hidden by default; run_enhancement reveals it when no toggle is selected.
    warning = gr.Textbox(visible=False, label="Warning")
    output_zip = gr.File(label="Download ZIP")
    output_audio = gr.Audio(label="Preview Enhanced", type="filepath")
    label = gr.Label("Status")
    # Outputs map 1:1 onto run_enhancement's 4-tuple return value.
    run_btn.click(
        fn=run_enhancement,
        inputs=[files, nr, vi, reverb, vol, lang, skip_metrics],
        outputs=[output_zip, output_audio, label, warning],
        show_progress=True
    )
# Queueing enables the gr.Progress updates used by process_files.
demo.queue()
demo.launch()