tee342 committed
Commit c08f175 · verified · 1 Parent(s): 07ae69f

Update app.py

Files changed (1)
  1. app.py +108 -187

app.py CHANGED
@@ -1,28 +1,9 @@
- import subprocess
-
- # Force upgrade huggingface_hub and transformers
- subprocess.run(["pip", "install", "--upgrade", "huggingface_hub>=0.23.0", "transformers>=4.40.0"])
- import subprocess
-
- # Force upgrade huggingface_hub
- subprocess.run(["pip", "install", "--upgrade", "huggingface_hub"])
- import subprocess
- subprocess.run(["pip", "install", "git+https://github.com/myshell-ai/OpenVoice.git"])
- import os
- from huggingface_hub import login
-
- hf_token = os.getenv("HF_TOKEN")
-
- if hf_token:
-     login(token=hf_token)
- else:
-     print("⚠️ No HF_TOKEN found — some models may not load")
+ import gradio as gr
  from pydub import AudioSegment
  import numpy as np
  import tempfile
  import os
  import noisereduce as nr
- import json
  import torch
  from demucs import pretrained
  from demucs.apply import apply_model
@@ -31,18 +12,15 @@ from pathlib import Path
  import matplotlib.pyplot as plt
  from io import BytesIO
  from PIL import Image
- import zipfile
+ import whisper
+ from faster_whisper import WhisperModel
+ import json
  import datetime
  import librosa
  import joblib
  import warnings
- from faster_whisper import WhisperModel
  from mutagen.mp3 import MP3
  from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
- import whisper
- from pyannote.audio import Pipeline as DiarizationPipeline
- from openvoice.api import TTS, ToneColorConverter
- from openvoice.se_extractor import get_se

  # Suppress warnings
  warnings.filterwarnings("ignore")
@@ -147,7 +125,7 @@ def stem_split(audio_path):
 
      return [gr.File(value=path) for path in stem_paths]
 
- # === Preset Loader with Fallback ===
+ # === Load Presets ===
  def load_presets():
      try:
          preset_files = [f for f in os.listdir("presets") if f.endswith(".json")]
@@ -177,7 +155,7 @@ if not preset_choices:
 
  preset_names = list(preset_choices.keys())
 
- # === Waveform + Spectrogram Generator ===
+ # === Waveform Generator ===
  def show_waveform(audio_file):
      try:
          audio = AudioSegment.from_file(audio_file)
@@ -193,27 +171,18 @@ def show_waveform(audio_file):
      except Exception as e:
          return None
 
- def detect_genre(audio_path):
-     try:
-         y, sr = torchaudio.load(audio_path)
-         mfccs = librosa.feature.mfcc(y=y.numpy().flatten(), sr=sr, n_mfcc=13).mean(axis=1).reshape(1, -1)
-         return "Speech"
-     except Exception:
-         return "Unknown"
-
  # === Session Info Export ===
- def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
+ def generate_session_log(audio_path, effects, isolate_vocals, export_format):
      log = {
          "timestamp": str(datetime.datetime.now()),
          "filename": os.path.basename(audio_path),
          "effects_applied": effects,
          "isolate_vocals": isolate_vocals,
-         "export_format": export_format,
-         "detected_genre": genre
+         "export_format": export_format
      }
      return json.dumps(log, indent=2)
 
- # === Main Processing Function with Status Updates ===
+ # === Main Processing Function ===
  def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
      status = "🔊 Loading audio..."
      try:
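Note: with detected_genre dropped, the session log is now a flat five-field JSON object. A minimal sketch of the new shape (the values below are invented for illustration):

    import datetime
    import json

    # Mirrors the trimmed generate_session_log output; values are illustrative.
    log = {
        "timestamp": str(datetime.datetime.now()),
        "filename": "clip.mp3",
        "effects_applied": ["Noise Reduction"],
        "isolate_vocals": False,
        "export_format": "MP3",
    }
    print(json.dumps(log, indent=2))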
@@ -251,92 +220,73 @@ def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, exp
      final_audio.export(output_path, format=export_format.lower())
 
      waveform_image = show_waveform(output_path)
-     genre = detect_genre(output_path)
-     session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)
+     session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format)
 
      status = "🎉 Done!"
-     return output_path, waveform_image, session_log, genre, status
+     return output_path, waveform_image, session_log, status
 
  except Exception as e:
      status = f"❌ Error: {str(e)}"
-     return None, None, status, "", status
+     return None, None, status, status
 
- # === Batch Processing Function ===
- def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
-     status = "🔊 Loading files..."
-     try:
-         output_dir = tempfile.mkdtemp()
-         results = []
-         session_logs = []
-
-         for file in files:
-             processed_path, _, log, _, _ = process_audio(file.name, selected_effects, isolate_vocals, preset_name, export_format)
-             results.append(processed_path)
-             session_logs.append(log)
-
-         zip_path = os.path.join(output_dir, "batch_output.zip")
-         with zipfile.ZipFile(zip_path, 'w') as zipf:
-             for i, res in enumerate(results):
-                 filename = f"processed_{i}.{export_format.lower()}"
-                 zipf.write(res, filename)
-                 zipf.writestr(f"session_info_{i}.json", session_logs[i])
-
-         return zip_path, "📦 ZIP created successfully!"
-
-     except Exception as e:
-         return None, f"❌ Batch processing failed: {str(e)}"
-
- # === Load Models Once at Start ===
-
- # 🧠 Speaker Diarization Model
- diarize_model = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="YOUR_HF_TOKEN")
-
- # 🎤 OpenVoice TTS + Converter
- tts_model = TTS(lang='en')
- tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
-
- # === Transcribe & Diarize Tab ===
+ # === Transcribe & Edit Tab ===
  whisper_model = WhisperModel("base")
 
+ def transcribe_audio(audio_path):
+     segments, info = whisper_model.transcribe(audio_path, beam_size=5)
+     text = " ".join([seg.text for seg in segments])
+     return text
+
+ # === Speaker Diarization Tab ===
+ try:
+     from pyannote.audio import Pipeline as DiarizationPipeline
+     from huggingface_hub import login
+
+     hf_token = os.getenv("HF_TOKEN")
+     if hf_token:
+         login(token=hf_token)
+     else:
+         print("⚠️ HF_TOKEN not set — some models may not load")
+
+     diarize_pipeline = DiarizationPipeline.from_pretrained(
+         "pyannote/speaker-diarization",
+         use_auth_token=hf_token or True
+     )
+ except Exception as e:
+     print(f"⚠️ Failed to load diarization: {e}")
+     diarize_pipeline = None
+
  def diarize_and_transcribe(audio_path):
+     if diarize_pipeline is None:
+         return "⚠️ Diarization model not loaded — check HF_TOKEN"
+
      # Run diarization
      audio = AudioSegment.from_file(audio_path)
      temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
      audio.export(temp_wav, format="wav")
-     diarization = diarize_model(temp_wav)
-
-     # Run transcription
-     result = whisper.transcribe(temp_wav)
-
-     segments = []
-     for turn, _, speaker in diarization.itertracks(yield_label=True):
-         text = " ".join([seg.text for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
-         segments.append({
-             "speaker": speaker,
-             "start": turn.start,
-             "end": turn.end,
-             "text": text
-         })
-
-     return segments
-
- # === Voice Cloning (Dubbing) ===
- def clone_voice(source_audio, target_audio, text):
-     source_se, _ = get_se(source_audio)
-     target_se, _ = get_se(target_audio)
-
-     out_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
-
-     tts_model.tts_to_file(text=text, file_path=out_path)
-     tone_converter.convert(
-         audio_src_path=out_path,
-         src_se=source_se,
-         tgt_se=target_se,
-         output_path=out_path
-     )
-     return out_path
+
+     try:
+         from pyannote.audio import Pipeline as DiarizationPipeline
+         diarization = diarize_pipeline(temp_wav)
+
+         # Run transcription (openai-whisper's module-level transcribe needs a model object, so load one)
+         result = whisper.load_model("base").transcribe(temp_wav)
+
+         segments = []
+         for turn, _, speaker in diarization.itertracks(yield_label=True):
+             text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
+             segments.append({
+                 "speaker": speaker,
+                 "start": turn.start,
+                 "end": turn.end,
+                 "text": text
+             })
+
+         return segments
+     except Exception as e:
+         return f"⚠️ Diarization failed: {str(e)}"
 
- # === UI ===
+ # === UI Setup ===
  effect_options = [
      "Noise Reduction",
      "Compress Dynamic Range",
@@ -367,7 +317,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
                  gr.Audio(label="Processed Audio", type="filepath"),
                  gr.Image(label="Waveform Preview"),
                  gr.Textbox(label="Session Log (JSON)", lines=5),
-                 gr.Textbox(label="Detected Genre", lines=1),
                  gr.Textbox(label="Status", value="✅ Ready", lines=1)
              ],
              title="Edit One File at a Time",
@@ -377,7 +326,54 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
              clear_btn=None
          )
 
+     # --- Transcribe & Edit Tab ---
+     with gr.Tab("📝 Transcribe & Edit"):
+         gr.Interface(
+             fn=transcribe_audio,
+             inputs=gr.Audio(label="Upload Audio", type="filepath"),
+             outputs=gr.Textbox(label="Transcribed Text", lines=10),
+             title="Transcribe Spoken Content",
+             description="Convert voice to text and edit it before exporting again."
+         )
+
+     # --- Diarization Tab (Who Spoke When?) ---
+     if diarize_pipeline:
+         with gr.Tab("🧍‍♂️ Who Spoke When?"):
+             gr.Interface(
+                 fn=diarize_and_transcribe,
+                 inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+                 outputs=gr.JSON(label="Diarized Transcript"),
+                 title="Split By Speaker + Transcribe",
+                 description="Use AI to split podcast by speaker and transcribe their speech.",
+                 flagging_mode="never"
+             )
+
      # --- Batch Processing ---
+     def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
+         import zipfile  # needed here: the top-level zipfile import was removed in this commit
+         status = "🔊 Loading files..."
+         try:
+             output_dir = tempfile.mkdtemp()
+             results = []
+             session_logs = []
+
+             for file in files:
+                 processed_path, _, log, _ = process_audio(file.name, selected_effects, isolate_vocals, preset_name, export_format)
+                 results.append(processed_path)
+                 session_logs.append(log)
+
+             zip_path = os.path.join(output_dir, "batch_output.zip")
+             with zipfile.ZipFile(zip_path, 'w') as zipf:
+                 for i, res in enumerate(results):
+                     filename = f"processed_{i}.{export_format.lower()}"
+                     zipf.write(res, filename)
+                     zipf.writestr(f"session_info_{i}.json", session_logs[i])
+
+             return zip_path, "📦 ZIP created successfully!"
+
+         except Exception as e:
+             return None, f"❌ Batch processing failed: {str(e)}"
+
      with gr.Tab("🔊 Batch Processing"):
          gr.Interface(
              fn=batch_process_audio,
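Note: batch_process_audio packages each processed file plus its JSON log into one archive. A standalone sketch of that packaging with stand-in files (paths and names are illustrative only):

    import os
    import tempfile
    import zipfile

    output_dir = tempfile.mkdtemp()
    results = []
    for i in range(2):
        path = os.path.join(output_dir, f"clip_{i}.mp3")
        open(path, "wb").close()  # stand-in for a processed file
        results.append(path)

    zip_path = os.path.join(output_dir, "batch_output.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for i, res in enumerate(results):
            zipf.write(res, f"processed_{i}.mp3")          # audio entry
            zipf.writestr(f"session_info_{i}.json", "{}")  # matching log
    print(zip_path)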
@@ -399,78 +394,4 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
          clear_btn=None
      )
 
-     # --- Remix Mode ---
-     with gr.Tab("🎛 Remix Mode"):
-         gr.Interface(
-             fn=stem_split,
-             inputs=gr.Audio(label="Upload Music Track", type="filepath"),
-             outputs=[
-                 gr.File(label="Vocals"),
-                 gr.File(label="Drums"),
-                 gr.File(label="Bass"),
-                 gr.File(label="Other")
-             ],
-             title="Split Into Drums, Bass, Vocals, and More",
-             description="Use AI to separate musical elements like vocals, drums, and bass.",
-             flagging_mode="never",
-             clear_btn=None
-         )
-
-     # --- Transcribe & Edit ===
-     with gr.Tab("📝 Transcribe & Edit"):
-         gr.Interface(
-             fn=transcribe_audio,
-             inputs=gr.Audio(label="Upload Audio", type="filepath"),
-             outputs=gr.Textbox(label="Transcribed Text", lines=10),
-             title="Transcribe & Edit Spoken Content",
-             description="Convert voice to text and edit it before exporting again."
-         )
-
-     # --- Speaker Diarization ===
-     with gr.Tab("🧍‍♂️ Who Spoke When?"):
-         gr.Interface(
-             fn=diarize_and_transcribe,
-             inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
-             outputs=gr.JSON(label="Diarized Transcript"),
-             title="Split By Speaker + Transcribe",
-             description="Detect speakers and transcribe their speech automatically."
-         )
-
-     # --- Voice Cloning (Dubbing) ===
-     with gr.Tab("🎭 Voice Cloning (Dubbing)"):
-         gr.Interface(
-             fn=clone_voice,
-             inputs=[
-                 gr.File(label="Source Voice Clip"),
-                 gr.File(label="Target Voice Clip"),
-                 gr.Textbox(label="Text to Clone", lines=5)
-             ],
-             outputs=gr.Audio(label="Cloned Output", type="filepath"),
-             title="Replace One Voice With Another",
-             description="Clone voice from source to target speaker using AI"
-         )
-
-     # --- TTS Voice Generator ===
-     with gr.Tab("💬 TTS Voice Generator"):
-         gr.Interface(
-             fn=generate_tts,
-             inputs=gr.Textbox(label="Enter Text", lines=5),
-             outputs=gr.Audio(label="Generated Speech", type="filepath"),
-             title="Text-to-Speech Generator",
-             description="Type anything and turn it into natural-sounding speech."
-         )
-
-     # --- Audio Analysis Dashboard ===
-     with gr.Tab("📊 Audio Analysis"):
-         gr.Interface(
-             fn=analyze_audio,
-             inputs=gr.Audio(label="Upload Track", type="filepath"),
-             outputs=[
-                 gr.JSON(label="Audio Stats"),
-                 gr.Image(label="Waveform Graph")
-             ],
-             title="View Loudness, BPM, Silence, and More",
-             description="Analyze audio loudness, tempo, and frequency content."
-         )
-
  demo.launch()
 