tee342 committed
Commit c08f175 · verified · 1 Parent(s): 07ae69f

Update app.py

Files changed (1)
  1. app.py +108 -187

app.py CHANGED
@@ -1,28 +1,9 @@
- import subprocess
-
- # Force upgrade huggingface_hub and transformers
- subprocess.run(["pip", "install", "--upgrade", "huggingface_hub>=0.23.0", "transformers>=4.40.0"])
- import subprocess
-
- # Force upgrade huggingface_hub
- subprocess.run(["pip", "install", "--upgrade", "huggingface_hub"])
- import subprocess
- subprocess.run(["pip", "install", "git+https://github.com/myshell-ai/OpenVoice.git"])
- import os
- from huggingface_hub import login
-
- hf_token = os.getenv("HF_TOKEN")
-
- if hf_token:
-     login(token=hf_token)
- else:
-     print("⚠️ No HF_TOKEN found — some models may not load")
+ import gradio as gr
  from pydub import AudioSegment
  import numpy as np
  import tempfile
  import os
  import noisereduce as nr
- import json
  import torch
  from demucs import pretrained
  from demucs.apply import apply_model
@@ -31,18 +12,15 @@ from pathlib import Path
  import matplotlib.pyplot as plt
  from io import BytesIO
  from PIL import Image
- import zipfile
+ import whisper
+ from faster_whisper import WhisperModel
+ import json
  import datetime
  import librosa
  import joblib
  import warnings
- from faster_whisper import WhisperModel
  from mutagen.mp3 import MP3
  from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
- import whisper
- from pyannote.audio import Pipeline as DiarizationPipeline
- from openvoice.api import TTS, ToneColorConverter
- from openvoice.se_extractor import get_se

  # Suppress warnings
  warnings.filterwarnings("ignore")
@@ -147,7 +125,7 @@ def stem_split(audio_path):
 
      return [gr.File(value=path) for path in stem_paths]
 
- # === Preset Loader with Fallback ===
+ # === Load Presets ===
  def load_presets():
      try:
          preset_files = [f for f in os.listdir("presets") if f.endswith(".json")]
@@ -177,7 +155,7 @@ if not preset_choices:
 
  preset_names = list(preset_choices.keys())
 
- # === Waveform + Spectrogram Generator ===
+ # === Waveform Generator ===
  def show_waveform(audio_file):
      try:
          audio = AudioSegment.from_file(audio_file)
@@ -193,27 +171,18 @@ def show_waveform(audio_file):
      except Exception as e:
          return None
 
- def detect_genre(audio_path):
-     try:
-         y, sr = torchaudio.load(audio_path)
-         mfccs = librosa.feature.mfcc(y=y.numpy().flatten(), sr=sr, n_mfcc=13).mean(axis=1).reshape(1, -1)
-         return "Speech"
-     except Exception:
-         return "Unknown"
-
  # === Session Info Export ===
- def generate_session_log(audio_path, effects, isolate_vocals, export_format, genre):
+ def generate_session_log(audio_path, effects, isolate_vocals, export_format):
      log = {
          "timestamp": str(datetime.datetime.now()),
          "filename": os.path.basename(audio_path),
          "effects_applied": effects,
          "isolate_vocals": isolate_vocals,
-         "export_format": export_format,
-         "detected_genre": genre
+         "export_format": export_format
      }
      return json.dumps(log, indent=2)
 
- # === Main Processing Function with Status Updates ===
+ # === Main Processing Function ===
  def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, export_format):
      status = "🔊 Loading audio..."
      try:
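Note: with detected_genre dropped, the session log is now a flat five-field JSON object. A minimal sketch of the new shape (the values below are invented for illustration):

    import datetime
    import json

    # Mirrors the trimmed generate_session_log output; values are illustrative.
    log = {
        "timestamp": str(datetime.datetime.now()),
        "filename": "clip.mp3",
        "effects_applied": ["Noise Reduction"],
        "isolate_vocals": False,
        "export_format": "MP3",
    }
    print(json.dumps(log, indent=2))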
@@ -251,92 +220,73 @@ def process_audio(audio_file, selected_effects, isolate_vocals, preset_name, exp
      final_audio.export(output_path, format=export_format.lower())
 
      waveform_image = show_waveform(output_path)
-     genre = detect_genre(output_path)
-     session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format, genre)
+     session_log = generate_session_log(audio_file, effects_to_apply, isolate_vocals, export_format)
 
      status = "🎉 Done!"
-     return output_path, waveform_image, session_log, genre, status
+     return output_path, waveform_image, session_log, status
 
  except Exception as e:
      status = f"❌ Error: {str(e)}"
-     return None, None, status, "", status
+     return None, None, status, status
 
- # === Batch Processing Function ===
- def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
-     status = "🔊 Loading files..."
-     try:
-         output_dir = tempfile.mkdtemp()
-         results = []
-         session_logs = []
-
-         for file in files:
-             processed_path, _, log, _, _ = process_audio(file.name, selected_effects, isolate_vocals, preset_name, export_format)
-             results.append(processed_path)
-             session_logs.append(log)
-
-         zip_path = os.path.join(output_dir, "batch_output.zip")
-         with zipfile.ZipFile(zip_path, 'w') as zipf:
-             for i, res in enumerate(results):
-                 filename = f"processed_{i}.{export_format.lower()}"
-                 zipf.write(res, filename)
-                 zipf.writestr(f"session_info_{i}.json", session_logs[i])
-
-         return zip_path, "📦 ZIP created successfully!"
-
-     except Exception as e:
-         return None, f"❌ Batch processing failed: {str(e)}"
-
- # === Load Models Once at Start ===
-
- # 🧠 Speaker Diarization Model
- diarize_model = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="YOUR_HF_TOKEN")
-
- # 🎤 OpenVoice TTS + Converter
- tts_model = TTS(lang='en')
- tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
-
- # === Transcribe & Diarize Tab ===
+ # === Transcribe & Edit Tab ===
  whisper_model = WhisperModel("base")
 
+ def transcribe_audio(audio_path):
+     segments, info = whisper_model.transcribe(audio_path, beam_size=5)
+     text = " ".join([seg.text for seg in segments])
+     return text
+
+ # === Speaker Diarization Tab ===
+ try:
+     from pyannote.audio import Pipeline as DiarizationPipeline
+     from huggingface_hub import login
+
+     hf_token = os.getenv("HF_TOKEN")
+     if hf_token:
+         login(token=hf_token)
+     else:
+         print("⚠️ HF_TOKEN not set — some models may not load")
+
+     diarize_pipeline = DiarizationPipeline.from_pretrained(
+         "pyannote/speaker-diarization",
+         use_auth_token=hf_token or True
+     )
+ except Exception as e:
+     print(f"⚠️ Failed to load diarization: {e}")
+     diarize_pipeline = None
+
  def diarize_and_transcribe(audio_path):
+     if diarize_pipeline is None:
+         return "⚠️ Diarization model not loaded — check HF_TOKEN"
+
      # Run diarization
      audio = AudioSegment.from_file(audio_path)
      temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
      audio.export(temp_wav, format="wav")
-     diarization = diarize_model(temp_wav)
-
-     # Run transcription
-     result = whisper.transcribe(temp_wav)
-
-     segments = []
-     for turn, _, speaker in diarization.itertracks(yield_label=True):
-         text = " ".join([seg.text for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
-         segments.append({
-             "speaker": speaker,
-             "start": turn.start,
-             "end": turn.end,
-             "text": text
-         })
-
-     return segments
-
- # === Voice Cloning (Dubbing) ===
- def clone_voice(source_audio, target_audio, text):
-     source_se, _ = get_se(source_audio)
-     target_se, _ = get_se(target_audio)
-
-     out_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
-
-     tts_model.tts_to_file(text=text, file_path=out_path)
-     tone_converter.convert(
-         audio_src_path=out_path,
-         src_se=source_se,
-         tgt_se=target_se,
-         output_path=out_path
-     )
-     return out_path
+
+     try:
+         from pyannote.audio import Pipeline as DiarizationPipeline
+         diarization = diarize_pipeline(temp_wav)
+
+         # Run transcription (openai-whisper's module-level transcribe needs a model object, so load one)
+         result = whisper.load_model("base").transcribe(temp_wav)
+
+         segments = []
+         for turn, _, speaker in diarization.itertracks(yield_label=True):
+             text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
+             segments.append({
+                 "speaker": speaker,
+                 "start": turn.start,
+                 "end": turn.end,
+                 "text": text
+             })
+
+         return segments
+     except Exception as e:
+         return f"⚠️ Diarization failed: {str(e)}"
 
- # === UI ===
+ # === UI Setup ===
  effect_options = [
      "Noise Reduction",
      "Compress Dynamic Range",
@@ -367,7 +317,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
                  gr.Audio(label="Processed Audio", type="filepath"),
                  gr.Image(label="Waveform Preview"),
                  gr.Textbox(label="Session Log (JSON)", lines=5),
-                 gr.Textbox(label="Detected Genre", lines=1),
                  gr.Textbox(label="Status", value="✅ Ready", lines=1)
              ],
              title="Edit One File at a Time",
@@ -377,7 +326,54 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
              clear_btn=None
          )
 
+     # --- Transcribe & Edit Tab ---
+     with gr.Tab("📝 Transcribe & Edit"):
+         gr.Interface(
+             fn=transcribe_audio,
+             inputs=gr.Audio(label="Upload Audio", type="filepath"),
+             outputs=gr.Textbox(label="Transcribed Text", lines=10),
+             title="Transcribe Spoken Content",
+             description="Convert voice to text and edit it before exporting again."
+         )
+
+     # --- Diarization Tab (Who Spoke When?) ---
+     if diarize_pipeline:
+         with gr.Tab("🧍‍♂️ Who Spoke When?"):
+             gr.Interface(
+                 fn=diarize_and_transcribe,
+                 inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+                 outputs=gr.JSON(label="Diarized Transcript"),
+                 title="Split By Speaker + Transcribe",
+                 description="Use AI to split podcast by speaker and transcribe their speech.",
+                 flagging_mode="never"
+             )
+
      # --- Batch Processing ---
+     def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, export_format):
+         import zipfile  # needed here: the top-level zipfile import was removed in this commit
+         status = "🔊 Loading files..."
+         try:
+             output_dir = tempfile.mkdtemp()
+             results = []
+             session_logs = []
+
+             for file in files:
+                 processed_path, _, log, _ = process_audio(file.name, selected_effects, isolate_vocals, preset_name, export_format)
+                 results.append(processed_path)
+                 session_logs.append(log)
+
+             zip_path = os.path.join(output_dir, "batch_output.zip")
+             with zipfile.ZipFile(zip_path, 'w') as zipf:
+                 for i, res in enumerate(results):
+                     filename = f"processed_{i}.{export_format.lower()}"
+                     zipf.write(res, filename)
+                     zipf.writestr(f"session_info_{i}.json", session_logs[i])
+
+             return zip_path, "📦 ZIP created successfully!"
+
+         except Exception as e:
+             return None, f"❌ Batch processing failed: {str(e)}"
+
      with gr.Tab("🔊 Batch Processing"):
          gr.Interface(
              fn=batch_process_audio,
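Note: batch_process_audio packages each processed file plus its JSON log into one archive. A standalone sketch of that packaging with stand-in files (paths and names are illustrative only):

    import os
    import tempfile
    import zipfile

    output_dir = tempfile.mkdtemp()
    results = []
    for i in range(2):
        path = os.path.join(output_dir, f"clip_{i}.mp3")
        open(path, "wb").close()  # stand-in for a processed file
        results.append(path)

    zip_path = os.path.join(output_dir, "batch_output.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for i, res in enumerate(results):
            zipf.write(res, f"processed_{i}.mp3")          # audio entry
            zipf.writestr(f"session_info_{i}.json", "{}")  # matching log
    print(zip_path)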
@@ -399,78 +394,4 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
          clear_btn=None
      )
 
-     # --- Remix Mode ---
-     with gr.Tab("🎛 Remix Mode"):
-         gr.Interface(
-             fn=stem_split,
-             inputs=gr.Audio(label="Upload Music Track", type="filepath"),
-             outputs=[
-                 gr.File(label="Vocals"),
-                 gr.File(label="Drums"),
-                 gr.File(label="Bass"),
-                 gr.File(label="Other")
-             ],
-             title="Split Into Drums, Bass, Vocals, and More",
-             description="Use AI to separate musical elements like vocals, drums, and bass.",
-             flagging_mode="never",
-             clear_btn=None
-         )
-
-     # --- Transcribe & Edit ===
-     with gr.Tab("📝 Transcribe & Edit"):
-         gr.Interface(
-             fn=transcribe_audio,
-             inputs=gr.Audio(label="Upload Audio", type="filepath"),
-             outputs=gr.Textbox(label="Transcribed Text", lines=10),
-             title="Transcribe & Edit Spoken Content",
-             description="Convert voice to text and edit it before exporting again."
-         )
-
-     # --- Speaker Diarization ===
-     with gr.Tab("🧍‍♂️ Who Spoke When?"):
-         gr.Interface(
-             fn=diarize_and_transcribe,
-             inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
-             outputs=gr.JSON(label="Diarized Transcript"),
-             title="Split By Speaker + Transcribe",
-             description="Detect speakers and transcribe their speech automatically."
-         )
-
-     # --- Voice Cloning (Dubbing) ===
-     with gr.Tab("🎭 Voice Cloning (Dubbing)"):
-         gr.Interface(
-             fn=clone_voice,
-             inputs=[
-                 gr.File(label="Source Voice Clip"),
-                 gr.File(label="Target Voice Clip"),
-                 gr.Textbox(label="Text to Clone", lines=5)
-             ],
-             outputs=gr.Audio(label="Cloned Output", type="filepath"),
-             title="Replace One Voice With Another",
-             description="Clone voice from source to target speaker using AI"
-         )
-
-     # --- TTS Voice Generator ===
-     with gr.Tab("💬 TTS Voice Generator"):
-         gr.Interface(
-             fn=generate_tts,
-             inputs=gr.Textbox(label="Enter Text", lines=5),
-             outputs=gr.Audio(label="Generated Speech", type="filepath"),
-             title="Text-to-Speech Generator",
-             description="Type anything and turn it into natural-sounding speech."
-         )
-
-     # --- Audio Analysis Dashboard ===
-     with gr.Tab("📊 Audio Analysis"):
-         gr.Interface(
-             fn=analyze_audio,
-             inputs=gr.Audio(label="Upload Track", type="filepath"),
-             outputs=[
-                 gr.JSON(label="Audio Stats"),
-                 gr.Image(label="Waveform Graph")
-             ],
-             title="View Loudness, BPM, Silence, and More",
-             description="Analyze audio loudness, tempo, and frequency content."
-         )
-
  demo.launch()
 