Training in progress - step 1000
- alignment.py +6 -3
- asr_config.py +16 -24
- asr_modeling.py +54 -116
- asr_pipeline.py +19 -11
- asr_processing.py +3 -5
- diarization.py +0 -2
- projectors.py +15 -41
alignment.py
CHANGED
@@ -120,16 +120,19 @@ class ForcedAligner:
 
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
-                token_frames[j - 1].
+                token_frames[j - 1].append(t - 1)
                 j -= 1
-            # Always decrement time (monotonic)
             t -= 1
 
         # Handle any remaining tokens at the start (edge case)
         while j > 0:
-            token_frames[j - 1].
+            token_frames[j - 1].append(0)
             j -= 1
 
+        # We appended in reverse-time order; restore monotonic order
+        for frames in token_frames:
+            frames.reverse()
+
         # Convert to spans
         token_spans: list[tuple[int, float, float]] = []
         for token_idx, frames in enumerate(token_frames):
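For intuition, a minimal standalone sketch of what the added reversal buys: backtracking walks frames from last to first, so each token's frame list arrives newest-first and needs one reverse to become monotonic. The frame indices below are made up for illustration.

# token_frames as produced by backtracking (appended in reverse-time order)
token_frames = [[7, 5, 4], [12, 10, 9]]
for frames in token_frames:
    frames.reverse()
print(token_frames)  # [[4, 5, 7], [9, 10, 12]] -> monotonic, ready for span conversion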
asr_config.py
CHANGED
@@ -2,6 +2,9 @@ from typing import Optional
 
 import transformers
 
+# Default conv layers for Whisper/GLM-ASR audio encoders: [(pad, kernel, stride), ...]
+DEFAULT_ENCODER_CONV_LAYERS = [(1, 3, 1), (1, 3, 2)]
+
 
 class ASRConfig(transformers.PretrainedConfig):
     """Configuration class for the ASR model.
@@ -107,8 +110,7 @@ class ASRConfig(transformers.PretrainedConfig):
         self.system_prompt = system_prompt
         self.encoder_dim = encoder_dim
         self.llm_dim = llm_dim
-
-        self.encoder_conv_layers = encoder_conv_layers or [(1, 3, 1), (1, 3, 2)]
+        self.encoder_conv_layers = encoder_conv_layers or DEFAULT_ENCODER_CONV_LAYERS
         self.audio_sample_rate = audio_sample_rate
         self.projector_init_std = projector_init_std
         self.projector_pool_stride = projector_pool_stride
@@ -151,28 +153,18 @@ class ASRConfig(transformers.PretrainedConfig):
         ]
         self.freeze_projector = freeze_projector
 
-
-
-
-
-
-
-
-
-
-
-
-            else
-        )
-        self.length_penalty = (
-            length_penalty if length_penalty is not None else generation_defaults["length_penalty"]
-        )
-        self.no_repeat_ngram_size = (
-            no_repeat_ngram_size
-            if no_repeat_ngram_size is not None
-            else generation_defaults["no_repeat_ngram_size"]
-        )
-        self.use_cache = use_cache if use_cache is not None else generation_defaults["use_cache"]
+        explicit_generation_args = {
+            "num_beams": num_beams,
+            "max_new_tokens": max_new_tokens,
+            "min_new_tokens": min_new_tokens,
+            "repetition_penalty": repetition_penalty,
+            "length_penalty": length_penalty,
+            "no_repeat_ngram_size": no_repeat_ngram_size,
+            "use_cache": use_cache,
+        }
+        for key, default in generation_defaults.items():
+            value = explicit_generation_args[key]
+            setattr(self, key, value if value is not None else default)
         self.do_sample = do_sample
         self.temperature = temperature
         self.top_p = top_p
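A small sketch of how the new defaults loop resolves generation arguments. Only the key names come from the hunk; the default values below are made up for illustration.

generation_defaults = {"num_beams": 1, "max_new_tokens": 256}        # illustrative defaults
explicit_generation_args = {"num_beams": 4, "max_new_tokens": None}  # user set num_beams only

resolved = {}
for key, default in generation_defaults.items():
    value = explicit_generation_args[key]
    resolved[key] = value if value is not None else default
print(resolved)  # {'num_beams': 4, 'max_new_tokens': 256}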
asr_modeling.py
CHANGED
@@ -5,8 +5,8 @@ from typing import Iterator, Optional, Union
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F  # noqa: N812
 from transformers import (
-    AutoConfig,
     AutoModel,
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -27,6 +27,26 @@ except ImportError:
     from torchaudio.transforms import SpecAugment
 
 
+def _gather_audio_embeds(audio_embeds: torch.Tensor, token_counts: torch.Tensor) -> torch.Tensor:
+    """Flatten per-sample audio embeddings into a packed tensor.
+
+    For each row i, takes the first ``token_counts[i]`` rows of
+    ``audio_embeds[i]`` and concatenates them. If any token count exceeds
+    ``audio_embeds.shape[1]``, the deficit is zero-padded.
+
+    Equivalent to a per-sample slice/cat loop but with O(1) host-device
+    syncs per call (one ``max().item()``) instead of one per sample.
+    """
+    _, max_len, _ = audio_embeds.shape
+    needed = int(token_counts.max().item())
+    if needed > max_len:
+        audio_embeds = F.pad(audio_embeds, (0, 0, 0, needed - max_len))
+        max_len = needed
+    indices = torch.arange(max_len, device=audio_embeds.device).unsqueeze(0)
+    mask = indices < token_counts.unsqueeze(1)
+    return audio_embeds[mask]
+
+
 class ASRModel(PreTrainedModel, GenerationMixin):
     """Audio-to-text model combining an audio encoder, projector, and language model."""
 
@@ -402,61 +422,25 @@ class ASRModel(PreTrainedModel, GenerationMixin):
     def _encode_audio(
         self,
         audio_features: torch.Tensor,
-
-        expected_token_counts: torch.Tensor | None = None,
+        expected_token_counts: torch.Tensor,
     ) -> torch.Tensor:
-        """Encode audio and
+        """Encode audio features and return flattened embeddings matching expected_token_counts.
 
         Args:
             audio_features: Mel spectrogram features (batch, n_mels, mel_len)
-
-            expected_token_counts: Expected number of audio tokens per sample from input_ids.
-                If provided, output will match these counts exactly (padding/truncating as needed).
+            expected_token_counts: Per-sample audio token counts as int64 tensor (batch,).
 
         Returns:
-            Flattened audio embeddings of shape (
+            Flattened audio embeddings of shape (sum(expected_token_counts), hidden_dim).
         """
         with torch.no_grad():
            encoder_out = self.audio_tower(input_features=audio_features)
            hidden_states = encoder_out.last_hidden_state
 
-        # Project to LLM space
         audio_embeds = self.projector(hidden_states)
 
-
-
-            token_counts = expected_token_counts
-        else:
-            # Compute per-sample encoder output lengths using conv formulas
-            encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
-            token_counts = torch.tensor(
-                [
-                    self.projector.get_output_length(int(length.item()))
-                    for length in encoder_lengths
-                ],
-                device=audio_embeds.device,
-            )
-
-        # Extract embeddings matching expected token counts per sample
-        batch_size = audio_embeds.shape[0]
-        hidden_dim = audio_embeds.shape[2]
-
-        result_embeds = []
-        for i in range(batch_size):
-            count = int(token_counts[i].item())
-            sample_embeds = audio_embeds[i, :count, :]  # Take first 'count' embeddings
-            # Pad with zeros if we don't have enough embeddings
-            if sample_embeds.shape[0] < count:
-                padding = torch.zeros(
-                    count - sample_embeds.shape[0],
-                    hidden_dim,
-                    device=audio_embeds.device,
-                    dtype=audio_embeds.dtype,
-                )
-                sample_embeds = torch.cat([sample_embeds, padding], dim=0)
-            result_embeds.append(sample_embeds)
-
-        return torch.cat(result_embeds, dim=0)
+        token_counts = expected_token_counts.to(device=audio_embeds.device, dtype=torch.long)
+        return _gather_audio_embeds(audio_embeds, token_counts)
 
     def forward(
         self,
@@ -470,34 +454,33 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         labels: Optional[torch.Tensor] = None,
         use_cache: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        audio_token_counts: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
         """Forward pass for training and inference."""
-        # Get text embeddings if not provided
         if inputs_embeds is None:
             inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
 
         if input_features is not None and input_ids is not None:
-            # Apply SpecAugment during training if enabled
             if self.training and self.spec_augment is not None:
                 input_features = self.spec_augment(input_features)
 
-
-            audio_token_counts
+            is_audio_token = input_ids == self.audio_token_id
+            if audio_token_counts is None:
+                audio_token_counts = is_audio_token.sum(dim=-1)
+            else:
+                audio_token_counts = audio_token_counts.to(
+                    device=input_ids.device, dtype=torch.long
+                )
 
-
-            audio_embeds = self._encode_audio(
-                input_features, audio_attention_mask, audio_token_counts
-            )
+            audio_embeds = self._encode_audio(input_features, audio_token_counts)
 
-
-            audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
+            audio_token_mask = is_audio_token.unsqueeze(-1)
             inputs_embeds = inputs_embeds.masked_scatter(
                 audio_token_mask.to(inputs_embeds.device),
                 audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),
             )
 
-        # Run through language model (let it compute loss if labels provided)
         outputs = self.language_model(
             attention_mask=attention_mask,
             position_ids=position_ids,
@@ -509,7 +492,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             **kwargs,
         )
 
-        # Add auxiliary loss from MoE projectors if available
         if outputs.loss is not None and hasattr(self.projector, "get_aux_loss"):
             aux_loss = self.projector.get_aux_loss()
             if aux_loss is not None and aux_loss.numel() > 0:
@@ -569,7 +551,13 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         batch_size = input_features.shape[0]
 
         # Encode audio -> flattened embeddings
-
+        encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
+        token_counts = torch.tensor(
+            [self.projector.get_output_length(int(length.item())) for length in encoder_lengths],
+            device=input_features.device,
+            dtype=torch.long,
+        )
+        audio_embeds = self._encode_audio(input_features, token_counts)
 
         # If input_ids not provided, build prompt with correct number of audio tokens
         if input_ids is None:
@@ -653,7 +641,13 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         batch_size = input_features.shape[0]
 
         # Encode audio -> flattened embeddings
-
+        encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
+        token_counts = torch.tensor(
+            [self.projector.get_output_length(int(length.item())) for length in encoder_lengths],
+            device=input_features.device,
+            dtype=torch.long,
+        )
+        audio_embeds = self._encode_audio(input_features, token_counts)
 
         # Build prompt with correct number of audio tokens
         num_audio_tokens = self._get_num_audio_tokens(audio_attention_mask)
@@ -747,63 +741,11 @@ class ASRModel(PreTrainedModel, GenerationMixin):
 
         thread.join()
 
-    @torch.no_grad()
-    def generate_text_only(
-        self,
-        messages: list[dict[str, str]],
-        max_new_tokens: int = 256,
-        **generate_kwargs,
-    ) -> str:
-        """Generate text using only the LLM (no audio encoding).
-
-        Used for SIFT-style response generation from metadata prompts.
-
-        Args:
-            messages: List of chat messages [{"role": "user", "content": "..."}]
-            max_new_tokens: Maximum tokens to generate
-            **generate_kwargs: Additional generation arguments
-
-        Returns:
-            Generated text response
-        """
-        device = next(self.language_model.parameters()).device
-
-        # Apply chat template
-        input_ids = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_tensors="pt",
-            enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
-        ).to(device)
-
-        if input_ids.dim() == 1:
-            input_ids = input_ids.unsqueeze(0)
-
-        attention_mask = torch.ones_like(input_ids)
-
-        # Generate using language model directly
-        output = self.language_model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=max_new_tokens,
-            do_sample=False,
-            pad_token_id=self.tokenizer.pad_token_id,
-            eos_token_id=self.tokenizer.eos_token_id,
-            **generate_kwargs,
-        )
-
-        # Decode only the new tokens
-        new_tokens = output[0, input_ids.shape[1] :]
-        response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
-        return response.strip()
-
     def save_pretrained(self, save_directory: Union[str, Path], **kwargs) -> None:
         """Save model, tokenizer, and processor."""
         import shutil
-        from pathlib import Path as PathlibPath
 
-        save_dir =
+        save_dir = Path(save_directory)
         save_dir.mkdir(parents=True, exist_ok=True)
 
         # Update config with actual vocab size
@@ -874,7 +816,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             json.dump(processor_config, f, indent=2)
 
         # Copy source files for auto-loading
-        src_dir =
+        src_dir = Path(__file__).parent
         for asr_file in src_dir.glob("asr_*.py"):
             shutil.copy(asr_file, save_dir / asr_file.name)
         # Copy projectors module
@@ -896,11 +838,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         # Call parent's push_to_hub
         return super().push_to_hub(repo_id, **kwargs)
 
-    def create_or_update_model_card(self, output_dir: Union[str, Path]) -> None:
-        """No-op for model card creation - we use MODEL_CARD.md in repo instead."""
-        pass
-
 
 # Register with transformers Auto classes
-AutoConfig.register
+# (AutoConfig.register is handled in asr_config.py at module load.)
 AutoModel.register(ASRConfig, ASRModel)
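A quick shape check of the packing trick used by the new _gather_audio_embeds helper. The helper body is repeated here verbatim so the snippet runs standalone; the tensor sizes are arbitrary.

import torch
import torch.nn.functional as F

def _gather_audio_embeds(audio_embeds, token_counts):
    _, max_len, _ = audio_embeds.shape
    needed = int(token_counts.max().item())
    if needed > max_len:
        audio_embeds = F.pad(audio_embeds, (0, 0, 0, needed - max_len))
        max_len = needed
    indices = torch.arange(max_len, device=audio_embeds.device).unsqueeze(0)
    mask = indices < token_counts.unsqueeze(1)
    return audio_embeds[mask]  # boolean mask keeps rows in (sample, frame) order

audio_embeds = torch.randn(2, 10, 4)   # (batch, max_frames, hidden)
token_counts = torch.tensor([3, 7])    # per-sample audio token counts
packed = _gather_audio_embeds(audio_embeds, token_counts)
print(packed.shape)                    # torch.Size([10, 4]) == (3 + 7, hidden)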
asr_pipeline.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Any
 import numpy as np
 import torch
 import transformers
+from transformers.pipelines.audio_utils import ffmpeg_read
 
 try:
     from .alignment import ForcedAligner
@@ -20,6 +21,13 @@ except ImportError:
 # Re-export for backwards compatibility
 __all__ = ["ForcedAligner", "SpeakerDiarizer", "ASRPipeline"]
 
+_THINK_TAG_RE = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)
+_DEFAULT_MIN_REPEATS = 3
+_TRAILING_CHAR_RE = re.compile(r"(.)\1{" + str(_DEFAULT_MIN_REPEATS - 1) + r",}$")
+_TRAILING_WORD_RE = re.compile(
+    r"\b(\w+)(?:\s+\1){" + str(_DEFAULT_MIN_REPEATS - 1) + r",}\s*$", re.IGNORECASE
+)
+
 
 class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
     """ASR Pipeline for audio-to-text transcription."""
@@ -152,8 +160,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
 
     def _extract_audio(self, inputs) -> dict | None:
         """Extract audio array from various input formats using HF utilities."""
-        from transformers.pipelines.audio_utils import ffmpeg_read
-
         if isinstance(inputs, dict):
             if "array" in inputs:
                 return {
@@ -257,8 +263,8 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
 
         text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
         # Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
-
-
+        if "<think>" in text:
+            text = _THINK_TAG_RE.sub("", text).strip()
         text = _truncate_repetitions(text)
         return {"text": text}
 
@@ -281,14 +287,16 @@ def _truncate_repetitions(text: str, min_repeats: int = 3) -> str:
     if not text:
         return text
 
-
-
-
+    if min_repeats == _DEFAULT_MIN_REPEATS:
+        char_pattern = _TRAILING_CHAR_RE
+        word_pattern = _TRAILING_WORD_RE
+    else:
+        char_pattern = re.compile(r"(.)\1{" + str(min_repeats - 1) + r",}$")
+        word_pattern = re.compile(
+            r"\b(\w+)(?:\s+\1){" + str(min_repeats - 1) + r",}\s*$", re.IGNORECASE
+        )
 
-
-    word_pattern = re.compile(
-        r"\b(\w+)(?:\s+\1){" + str(min_repeats - 1) + r",}\s*$", re.IGNORECASE
-    )
+    text = char_pattern.sub(r"\1", text)
     while word_pattern.search(text):
        text = word_pattern.sub(r"\1", text)
 
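Illustration of the post-processing path the hunks above set up, using the module-level regexes from the diff on a made-up transcript:

import re

_THINK_TAG_RE = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)
_TRAILING_CHAR_RE = re.compile(r"(.)\1{2,}$")
_TRAILING_WORD_RE = re.compile(r"\b(\w+)(?:\s+\1){2,}\s*$", re.IGNORECASE)

text = "<think>transcribing...</think>the cat sat on the mat mat mat mat"
if "<think>" in text:
    text = _THINK_TAG_RE.sub("", text).strip()
text = _TRAILING_CHAR_RE.sub(r"\1", text)          # collapse a trailing repeated character
while _TRAILING_WORD_RE.search(text):              # collapse trailing repeated words
    text = _TRAILING_WORD_RE.sub(r"\1", text)
print(text)  # "the cat sat on the mat"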
asr_processing.py
CHANGED
@@ -5,9 +5,9 @@ import transformers
 from transformers import ProcessorMixin
 
 try:
-    from .asr_config import ASRConfig
+    from .asr_config import DEFAULT_ENCODER_CONV_LAYERS, ASRConfig
 except ImportError:
-    from asr_config import ASRConfig  # type: ignore[no-redef]
+    from asr_config import DEFAULT_ENCODER_CONV_LAYERS, ASRConfig  # type: ignore[no-redef]
 
 
 class ASRProcessor(ProcessorMixin):
@@ -18,8 +18,6 @@ class ASRProcessor(ProcessorMixin):
     tokenizer_class = "AutoTokenizer"
     AUDIO_TOKEN = "<audio>"
     TRANSCRIBE_PROMPT = ""
-    # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
-    DEFAULT_ENCODER_CONV_LAYERS = [(1, 3, 1), (1, 3, 2)]
 
     def __init__(
         self,
@@ -40,7 +38,7 @@ class ASRProcessor(ProcessorMixin):
         self.tokenizer = tokenizer
         self.audio_token_id = tokenizer.convert_tokens_to_ids(self.AUDIO_TOKEN)
         self.projector = projector
-        self.encoder_conv_layers = encoder_conv_layers or
+        self.encoder_conv_layers = encoder_conv_layers or DEFAULT_ENCODER_CONV_LAYERS
 
     def _compute_encoder_output_length(self, mel_length: int) -> int:
         """Compute encoder output length using conv layer formulas."""
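For reference, a sketch of the conv-length arithmetic this kind of processor relies on. conv_output_length is a hypothetical standalone name (the real method is _compute_encoder_output_length); the tuples are the DEFAULT_ENCODER_CONV_LAYERS from the diff, interpreted as (pad, kernel, stride) with the standard Conv1d output-length formula.

DEFAULT_ENCODER_CONV_LAYERS = [(1, 3, 1), (1, 3, 2)]  # (pad, kernel, stride) per layer

def conv_output_length(mel_length: int, conv_layers=DEFAULT_ENCODER_CONV_LAYERS) -> int:
    # out = (in + 2*pad - kernel) // stride + 1, applied layer by layer
    length = mel_length
    for pad, kernel, stride in conv_layers:
        length = (length + 2 * pad - kernel) // stride + 1
    return length

print(conv_output_length(3000))  # 1500: a 30 s Whisper mel is halved by the stride-2 layer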
diarization.py
CHANGED
@@ -154,8 +154,6 @@ class SpeakerClusterer:
         Returns:
             Cluster labels of shape [N]
         """
-        import warnings
-
         if len(embeddings.shape) != 2:
             raise ValueError(f"Expected 2D array, got shape {embeddings.shape}")
 
projectors.py
CHANGED
@@ -58,13 +58,7 @@ class MLPAudioProjector(nn.Module):
         Returns:
             Projected features of shape [batch, (seq_len - k) // k + 1, llm_dim]
         """
-
-        # Truncate to match GLM-ASR: use (seq - k) // k + 1 frames
-        # This drops trailing frames that don't fill a complete k-frame window
-        out_len = (seq - self.k) // self.k + 1
-        x = x[:, : out_len * self.k, :]  # Truncate to exact multiple
-        x = x.reshape(batch, out_len, dim * self.k)
-
+        x = _frame_stack(x, self.k)
         x = self.linear_1(x)
         x = self.norm(x)
         x = self.act(x)
@@ -76,6 +70,17 @@
 # =============================================================================
 
 
+def _frame_stack(x: torch.Tensor, k: int) -> torch.Tensor:
+    """Stack k adjacent frames along the feature dim.
+
+    Truncates trailing frames that don't fill a complete k-frame window,
+    matching GLM-ASR's `(seq_len - k) // k + 1` formula.
+    """
+    batch, seq, dim = x.shape
+    out_len = (seq - k) // k + 1
+    return x[:, : out_len * k, :].reshape(batch, out_len, dim * k)
+
+
 class SimpleAdapter(nn.Module):
     """Simple 2-layer GELU adapter (from MOSA paper)."""
 
@@ -89,34 +94,6 @@
         return self.fc2(self.act(self.fc1(x)))
 
 
-class SwiGLU(nn.Module):
-    """SwiGLU activation with gated linear units (used in LLaMA, Mistral, etc.)."""
-
-    def __init__(self, dim: int, hidden_dim: int, bias: bool = False):
-        super().__init__()
-        self.w1 = nn.Linear(dim, hidden_dim, bias=bias)  # Gate
-        self.w2 = nn.Linear(dim, hidden_dim, bias=bias)  # Value
-        self.w3 = nn.Linear(hidden_dim, dim, bias=bias)  # Output
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.w3(F.silu(self.w1(x)) * self.w2(x))
-
-
-class AsymmetricSwiGLU(nn.Module):
-    """SwiGLU that handles different input and output dimensions."""
-
-    def __init__(
-        self, in_features: int, hidden_features: int, out_features: int, bias: bool = False
-    ):
-        super().__init__()
-        self.w1 = nn.Linear(in_features, hidden_features, bias=bias)  # Gate
-        self.w2 = nn.Linear(in_features, hidden_features, bias=bias)  # Value
-        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)  # Output
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.w3(F.silu(self.w1(x)) * self.w2(x))
-
-
 class MOSAProjector(nn.Module):
     """MOSA-Base projector: simple 2-layer ReLU router with 4 simple adapters.
 
@@ -281,13 +258,10 @@ class MoEAudioProjector(nn.Module):
         Returns:
             Projected features of shape [batch, out_len, llm_dim]
         """
-
-        batch,
-        out_len = (seq - self.k) // self.k + 1
-        x = x[:, : out_len * self.k, :]
-        x = x.reshape(batch, out_len, dim * self.k)
+        x = _frame_stack(x, self.k)
+        batch, out_len, _ = x.shape
 
-        #
+        # Normalize stacked input (like main branch SharedMoEBlock)
         x = self.norm(x)
         flat_x = x.view(-1, x.size(-1))  # [tokens, in_dim]
 
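A toy shape check for the shared _frame_stack helper factored out above (dimensions are arbitrary):

import torch

def _frame_stack(x: torch.Tensor, k: int) -> torch.Tensor:
    # Same logic as the helper in the diff: drop incomplete trailing windows,
    # then merge each group of k frames into one wider feature vector.
    batch, seq, dim = x.shape
    out_len = (seq - k) // k + 1
    return x[:, : out_len * k, :].reshape(batch, out_len, dim * k)

x = torch.randn(2, 101, 768)   # 101 encoder frames, feature dim 768
y = _frame_stack(x, k=4)
print(y.shape)                 # torch.Size([2, 25, 3072]); the stray 101st frame is dropped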