| """ |
| Audio generator with explicit backend tracking. |
| |
| Phase 2: Stability over realism. |
| - Backend is always recorded (never ambiguous) |
| - Fallback ambient is the deterministic baseline |
| - AudioLDM 2 used only if explicitly available and stable |
| |
| Upgrade note: AudioLDM 1 → AudioLDM 2 (cvssp/audioldm2) |
| - Better audio quality, same API surface |
| - unload() method added for sequential model loading within 16GB RAM |
| """ |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass, asdict |
| from typing import Optional, Dict, Any |
|
|
| import numpy as np |
| import soundfile as sf |
| from pathlib import Path |
|
|
|
|
@dataclass(frozen=True)
class AudioGenResult:
    """Immutable record describing one generated audio file."""
    audio_path: str            # where the audio was written
    backend: str               # which backend actually produced it
    prompt_hash: int           # 32-bit hash tying the file to its prompt/seed
    duration_sec: float
    sample_rate: int
    note: Optional[str] = None  # diagnostic message (e.g. fallback reason)

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict, e.g. for JSON serialization."""
        return {
            "audio_path": self.audio_path,
            "backend": self.backend,
            "prompt_hash": self.prompt_hash,
            "duration_sec": self.duration_sec,
            "sample_rate": self.sample_rate,
            "note": self.note,
        }
|
|
|
|
class AudioGenerator:
    """
    Audio generator with explicit backend selection.

    Strategy (Phase 2B):
    - Default: fallback_ambient (fully deterministic, always works)
    - Optional: AudioLDM 2 (if force_audioldm=True and model is available)

    The fallback ambient generator produces prompt-seeded ambient soundscapes.
    This is acceptable for a case study testing alignment behavior, not audio realism.
    """

    def __init__(self, device: str = "cpu", force_audioldm: bool = False):
        """
        Args:
            device: torch device string ("cpu", "cuda", "mps").
            force_audioldm: attempt to load AudioLDM 2. Any load failure is
                recorded (not raised) and the deterministic fallback is used.
        """
        self.device = device
        self._audioldm_pipe = None           # diffusers pipeline, if loaded
        self._audioldm_backend_name = None   # human-readable backend label
        self._torch = None                   # torch module, kept for seeding/unload
        self._audioldm_error = None          # load error, surfaced via result.note

        if force_audioldm:
            try:
                from diffusers import AudioLDM2Pipeline
                import torch

                model_id = "cvssp/audioldm2"
                self._audioldm_pipe = AudioLDM2Pipeline.from_pretrained(
                    model_id,
                    # fp16 only off-CPU; CPU inference requires fp32
                    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
                )
                self._audioldm_pipe.to(self.device)
                self._audioldm_backend_name = f"AudioLDM2Pipeline({model_id})"
                self._torch = torch
            except Exception as exc:
                # Never hard-fail on model load: record why and fall back.
                self._audioldm_error = str(exc)

    @staticmethod
    def _stable_seed(prompt: str, seed: Optional[int]) -> int:
        """
        Derive a 32-bit seed from (prompt, seed) that is stable across processes.

        Bug fix: the previous implementation used built-in hash(), which is
        randomized per process for strings (PYTHONHASHSEED), so the
        "deterministic" fallback was not reproducible across runs. CRC32 of the
        UTF-8 bytes is process-independent and already unsigned 32-bit.
        """
        import zlib

        base = zlib.crc32(prompt.encode("utf-8"))
        if seed is not None:
            base = (base + seed) % (2 ** 32)
        return base

    def generate(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float = 6.0,
        sr: int = 48000,
        seed: Optional[int] = None,
    ) -> AudioGenResult:
        """
        Generate audio for a prompt and write it to out_path.

        Backend selection:
        1. If AudioLDM was loaded (force_audioldm=True): try it, fallback on error
        2. Otherwise: use fallback_ambient (deterministic baseline)

        Returns:
            AudioGenResult recording which backend actually produced the file.
        """
        if self._audioldm_pipe is not None:
            try:
                return self._generate_audioldm(prompt, out_path, duration_sec, sr, seed)
            except Exception as exc:
                # Runtime failure of the model must not kill an experiment run.
                return self._generate_fallback(
                    prompt, out_path, duration_sec, sr, seed,
                    note=f"AudioLDM failed at runtime: {exc}",
                )

        return self._generate_fallback(
            prompt, out_path, duration_sec, sr, seed,
            note=self._audioldm_error or "Using deterministic fallback (default)",
        )

    def unload(self) -> None:
        """Free GPU/MPS memory by deleting the pipeline. Critical for 16GB RAM constraint."""
        if self._audioldm_pipe is not None:
            del self._audioldm_pipe
            self._audioldm_pipe = None
        if self._torch is not None:
            if self._torch.cuda.is_available():
                self._torch.cuda.empty_cache()
            elif hasattr(self._torch.backends, "mps") and self._torch.backends.mps.is_available():
                # torch.mps may be missing on older builds even when
                # backends.mps reports available — guard before calling.
                if hasattr(self._torch, "mps"):
                    self._torch.mps.empty_cache()
        import gc
        gc.collect()

    def _generate_audioldm(
        self, prompt: str, out_path: str, duration_sec: float, sr: int, seed: Optional[int],
    ) -> AudioGenResult:
        """Generate with AudioLDM 2 and write the waveform to out_path."""
        generator = None
        if seed is not None and self._torch is not None:
            # torch.Generator is not supported on MPS; seed on CPU instead.
            gen_device = "cpu" if self.device == "mps" else self.device
            generator = self._torch.Generator(device=gen_device).manual_seed(seed)
        kwargs = {"audio_length_in_s": duration_sec}
        if generator is not None:
            kwargs["generator"] = generator
        result = self._audioldm_pipe(prompt, **kwargs)
        audio = result.audios[0]
        # NOTE(review): the pipeline's native sample rate (16 kHz for AudioLDM 2)
        # may differ from sr; the file is tagged with sr without resampling — confirm.
        sf.write(out_path, audio, sr)

        return AudioGenResult(
            audio_path=out_path,
            backend="audioldm2",
            # Process-stable hash (was abs(hash((prompt, seed))), which is
            # randomized per run for strings).
            prompt_hash=self._stable_seed(prompt, seed),
            duration_sec=duration_sec,
            sample_rate=sr,
        )

    def _generate_fallback(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float,
        sr: int,
        seed: Optional[int],
        note: str = "",
    ) -> AudioGenResult:
        """
        Deterministic ambient soundscape generator.

        Produces prompt-dependent audio by seeding RNG from a stable hash of
        (prompt, seed). Different prompts produce different spectral
        characteristics:
        - Drone frequency varies with prompt
        - Noise filtering varies with prompt
        - Amplitude envelope varies with prompt

        This ensures wrong_audio perturbations produce genuinely different audio.
        """
        base_seed = self._stable_seed(prompt, seed)
        rng = np.random.default_rng(base_seed)

        n = int(duration_sec * sr)
        t = np.linspace(0, duration_sec, n, endpoint=False)

        # Prompt-derived spectral parameters (ordinal sum is process-stable).
        prompt_val = sum(ord(c) for c in prompt)
        drone_freq = 80.0 + (prompt_val % 200)
        filter_width = 2000 + (prompt_val % 6000)
        noise_amplitude = 0.02 + (prompt_val % 50) * 0.001
        drone_amplitude = 0.06 + (prompt_val % 40) * 0.001

        # Low-pass noise bed: moving-average smoothing of white noise.
        noise = rng.normal(0, 1, size=n).astype(np.float32)
        kernel = np.ones(filter_width, dtype=np.float32) / filter_width
        noise = np.convolve(noise, kernel, mode="same")

        # Base drone tone.
        drone = drone_amplitude * np.sin(2 * np.pi * drone_freq * t).astype(np.float32)

        # Quieter detuned harmonic for timbral variety.
        harmonic_freq = drone_freq * 1.5 + (prompt_val % 100)
        harmonic = (drone_amplitude * 0.3) * np.sin(2 * np.pi * harmonic_freq * t).astype(np.float32)

        audio = (noise_amplitude * noise + drone + harmonic).astype(np.float32)
        audio = np.clip(audio, -1.0, 1.0)

        sf.write(out_path, audio, sr)

        return AudioGenResult(
            audio_path=out_path,
            backend="fallback_ambient",
            prompt_hash=base_seed,
            duration_sec=duration_sec,
            sample_rate=sr,
            note=note,
        )
|
|
|
|
def generate_audio(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> str:
    """
    Generate audio for a prompt and return the path to the written file.

    Uses deterministic fallback by default (stable for experiments).
    """
    target = Path(out_dir) / filename
    target.parent.mkdir(parents=True, exist_ok=True)
    result = AudioGenerator(device=device).generate(
        prompt=prompt,
        out_path=str(target),
        seed=seed if deterministic else None,
    )
    return result.audio_path
|
|
|
|
def generate_audio_with_metadata(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> AudioGenResult:
    """
    Generate audio and return the full AudioGenResult metadata.

    Use this in experiment pipelines where backend tracking matters.
    """
    target = Path(out_dir) / filename
    target.parent.mkdir(parents=True, exist_ok=True)
    chosen_seed = seed if deterministic else None
    return AudioGenerator(device=device).generate(
        prompt=prompt,
        out_path=str(target),
        seed=chosen_seed,
    )
|
|