Update custom model files, README, and requirements

Browse files

Files changed (5) hide show

alignment.py +8 -7
asr_config.py +17 -24
asr_modeling.py +3 -3
diarization.py +35 -8
handler.py +0 -8

alignment.py CHANGED Viewed

@@ -73,7 +73,7 @@ class ForcedAligner:
         # Force alignment to use all tokens by preventing staying in blank
         # at the end when there are still tokens to emit
         if num_tokens > 1:
-            trellis[-num_tokens + 1:, 0] = float("inf")
         for t in range(num_frames):
             for j in range(num_tokens + 1):
@@ -113,7 +113,12 @@ class ForcedAligner:
             # Alignment failed - fall back to uniform distribution
             frames_per_token = num_frames / num_tokens
             return [
-                (tokens[i], i * frames_per_token, (i + 1) * frames_per_token, (i + 0.5) * frames_per_token)
                 for i in range(num_tokens)
             ]
@@ -280,11 +285,7 @@ class ForcedAligner:
                 last_char_peak = peak_frame
         # Don't forget the last word
-        if (
-            first_char_peak is not None
-            and last_char_peak is not None
-            and word_idx < len(words)
-        ):
             start_time = max(0.0, first_char_peak * frame_duration - start_offset)
             end_time = max(0.0, (last_char_peak + 1) * frame_duration - end_offset)
             word_timestamps.append(

         # Force alignment to use all tokens by preventing staying in blank
         # at the end when there are still tokens to emit
         if num_tokens > 1:
+            trellis[-num_tokens + 1 :, 0] = float("inf")
         for t in range(num_frames):
             for j in range(num_tokens + 1):
             # Alignment failed - fall back to uniform distribution
             frames_per_token = num_frames / num_tokens
             return [
+                (
+                    tokens[i],
+                    i * frames_per_token,
+                    (i + 1) * frames_per_token,
+                    (i + 0.5) * frames_per_token,
+                )
                 for i in range(num_tokens)
             ]
                 last_char_peak = peak_frame
         # Don't forget the last word
+        if first_char_peak is not None and last_char_peak is not None and word_idx < len(words):
             start_time = max(0.0, first_char_peak * frame_duration - start_offset)
             end_time = max(0.0, (last_char_peak + 1) * frame_duration - end_offset)
             word_timestamps.append(

asr_config.py CHANGED Viewed

@@ -21,7 +21,7 @@ class ASRConfig(transformers.PretrainedConfig):
         self,
         audio_model_id: str = "zai-org/GLM-ASR-Nano-2512",
         text_model_id: str = "Qwen/Qwen3-0.6B",
-        attn_implementation: str = "flash_attention_2",
         model_dtype: str = "bfloat16",
         num_beams: Optional[int] = None,
         system_prompt: str = "You are a helpful assistant.",
@@ -64,6 +64,7 @@ class ASRConfig(transformers.PretrainedConfig):
         lora_target_modules: Optional[list] = None,  # Default: all linear layers
         freeze_projector: bool = False,  # True for Stage 2 (LoRA-only training)
         do_sample: bool = False,
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
@@ -80,7 +81,7 @@ class ASRConfig(transformers.PretrainedConfig):
         Args:
             audio_model_id: HuggingFace model ID for audio encoder (GLM-ASR/Whisper)
             text_model_id: HuggingFace model ID for text decoder (Qwen)
-            attn_implementation: Attention implementation ("flash_attention_2", "sdpa", "eager")
             model_dtype: Model dtype ("bfloat16", "float16", "float32")
             projector_type: Projector architecture ("mlp", "mosa", "moe", "qformer")
             use_lora: Enable LoRA adapters for Stage 2 fine-tuning
@@ -151,29 +152,21 @@ class ASRConfig(transformers.PretrainedConfig):
         ]
         self.freeze_projector = freeze_projector
-        # Generation parameters (use explicit value if provided, else use default)
-        self.num_beams = num_beams if num_beams is not None else generation_defaults["num_beams"]
-        self.max_new_tokens = (
-            max_new_tokens if max_new_tokens is not None else generation_defaults["max_new_tokens"]
-        )
-        self.min_new_tokens = (
-            min_new_tokens if min_new_tokens is not None else generation_defaults["min_new_tokens"]
-        )
-        self.repetition_penalty = (
-            repetition_penalty
-            if repetition_penalty is not None
-            else generation_defaults["repetition_penalty"]
-        )
-        self.length_penalty = (
-            length_penalty if length_penalty is not None else generation_defaults["length_penalty"]
-        )
-        self.no_repeat_ngram_size = (
-            no_repeat_ngram_size
-            if no_repeat_ngram_size is not None
-            else generation_defaults["no_repeat_ngram_size"]
-        )
-        self.use_cache = use_cache if use_cache is not None else generation_defaults["use_cache"]
         self.do_sample = do_sample
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k

         self,
         audio_model_id: str = "zai-org/GLM-ASR-Nano-2512",
         text_model_id: str = "Qwen/Qwen3-0.6B",
+        attn_implementation: str = "sdpa",
         model_dtype: str = "bfloat16",
         num_beams: Optional[int] = None,
         system_prompt: str = "You are a helpful assistant.",
         lora_target_modules: Optional[list] = None,  # Default: all linear layers
         freeze_projector: bool = False,  # True for Stage 2 (LoRA-only training)
         do_sample: bool = False,
+        enable_thinking: bool = False,  # Enable Qwen3 thinking mode for omni models
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
         Args:
             audio_model_id: HuggingFace model ID for audio encoder (GLM-ASR/Whisper)
             text_model_id: HuggingFace model ID for text decoder (Qwen)
+            attn_implementation: Attention implementation ("sdpa", "flash_attention_2", "eager")
             model_dtype: Model dtype ("bfloat16", "float16", "float32")
             projector_type: Projector architecture ("mlp", "mosa", "moe", "qformer")
             use_lora: Enable LoRA adapters for Stage 2 fine-tuning
         ]
         self.freeze_projector = freeze_projector
+        # Generation parameters: check named param first, then kwargs (from config.json), then default
+        def get_gen_param(name, named_value):
+            if named_value is not None:
+                return named_value
+            return kwargs.get(name, generation_defaults[name])
+        self.num_beams = get_gen_param("num_beams", num_beams)
+        self.max_new_tokens = get_gen_param("max_new_tokens", max_new_tokens)
+        self.min_new_tokens = get_gen_param("min_new_tokens", min_new_tokens)
+        self.repetition_penalty = get_gen_param("repetition_penalty", repetition_penalty)
+        self.length_penalty = get_gen_param("length_penalty", length_penalty)
+        self.no_repeat_ngram_size = get_gen_param("no_repeat_ngram_size", no_repeat_ngram_size)
+        self.use_cache = get_gen_param("use_cache", use_cache)
         self.do_sample = do_sample
+        self.enable_thinking = enable_thinking
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k

asr_modeling.py CHANGED Viewed

@@ -582,7 +582,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
                 tokenize=True,
                 add_generation_prompt=True,
                 return_tensors="pt",
-                enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
             )
             input_ids = chat_result.input_ids.to(device)
@@ -665,7 +665,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             tokenize=True,
             add_generation_prompt=True,
             return_tensors="pt",
-            enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
         )
         input_ids = chat_result.input_ids.to(device)
@@ -764,7 +764,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             tokenize=True,
             add_generation_prompt=True,
             return_tensors="pt",
-            enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
         ).to(device)
         if input_ids.dim() == 1:

                 tokenize=True,
                 add_generation_prompt=True,
                 return_tensors="pt",
+                enable_thinking=getattr(self.config, "enable_thinking", False),
             )
             input_ids = chat_result.input_ids.to(device)
             tokenize=True,
             add_generation_prompt=True,
             return_tensors="pt",
+            enable_thinking=getattr(self.config, "enable_thinking", False),
         )
         input_ids = chat_result.input_ids.to(device)
             tokenize=True,
             add_generation_prompt=True,
             return_tensors="pt",
+            enable_thinking=getattr(self.config, "enable_thinking", False),
         ).to(device)
         if input_ids.dim() == 1:

diarization.py CHANGED Viewed

@@ -91,20 +91,47 @@ class SpectralCluster:
     def get_spec_embs(
         self, laplacian: np.ndarray, k_oracle: int | None = None
     ) -> tuple[np.ndarray, int]:
-        """Extract spectral embeddings from Laplacian."""
         lambdas, eig_vecs = scipy.linalg.eigh(laplacian)
-        if k_oracle is not None:
-            num_of_spk = k_oracle
-        else:
-            lambda_gap_list = self.get_eigen_gaps(
-                lambdas[self.min_num_spks - 1 : self.max_num_spks + 1]
-            )
-            num_of_spk = np.argmax(lambda_gap_list) + self.min_num_spks
         emb = eig_vecs[:, :num_of_spk]
         return emb, num_of_spk
     def cluster_embs(self, emb: np.ndarray, k: int) -> np.ndarray:
         """Cluster spectral embeddings using k-means."""
         _, labels, _ = k_means(emb, k, n_init=10)

     def get_spec_embs(
         self, laplacian: np.ndarray, k_oracle: int | None = None
     ) -> tuple[np.ndarray, int]:
         """Build the spectral embedding used for speaker clustering.

         The Laplacian is eigendecomposed with ``scipy.linalg.eigh`` (which
         returns eigenvalues in ascending order) and the eigenvectors that
         belong to the smallest eigenvalues are kept as the embedding.

         Args:
             laplacian: Symmetric graph Laplacian to decompose.
             k_oracle: Known number of speakers. When ``None`` the count is
                 estimated from the eigenvalues via ``_estimate_num_speakers``.

         Returns:
             A ``(embedding, num_speakers)`` pair, where ``embedding`` holds the
             first ``num_speakers`` eigenvector columns.
         """
         eigenvalues, eigenvectors = scipy.linalg.eigh(laplacian)
         if k_oracle is None:
             # No oracle supplied: fall back to the eigengap-based estimate.
             speaker_count = self._estimate_num_speakers(eigenvalues)
         else:
             speaker_count = k_oracle
         return eigenvectors[:, :speaker_count], speaker_count
+    def _estimate_num_speakers(self, lambdas: np.ndarray) -> int:
+        """Estimate number of speakers using refined eigengap heuristic.
+        For spectral clustering, we look for the largest gap in eigenvalues.
+        The eigenvalues corresponding to clusters are close to 0, and there
+        should be a significant jump to the remaining eigenvalues.
+        """
+        # Consider eigenvalues from index 1 to max_num_spks (skip first, it's always ~0)
+        # We need gaps between positions, so look at indices 1 to max_num_spks+1
+        max_idx = min(self.max_num_spks + 1, len(lambdas))
+        relevant_lambdas = lambdas[1:max_idx]  # Skip first eigenvalue
+        if len(relevant_lambdas) < 2:
+            return self.min_num_spks
+        # Compute absolute gaps (not ratios - ratios are unstable near 0)
+        gaps = np.diff(relevant_lambdas)
+        # Find the largest gap - the index gives us (k-1) since we skipped first
+        # Add 1 to convert from gap index to number of speakers
+        # Add 1 again because we skipped the first eigenvalue
+        max_gap_idx = int(np.argmax(gaps))
+        num_of_spk = max_gap_idx + 2  # +1 for gap->count, +1 for skipped eigenvalue
+        # Clamp between min and max
+        return max(self.min_num_spks, min(num_of_spk, self.max_num_spks))
     def cluster_embs(self, emb: np.ndarray, k: int) -> np.ndarray:
         """Cluster spectral embeddings using k-means."""
         _, labels, _ = k_means(emb, k, n_init=10)

handler.py CHANGED Viewed

@@ -39,8 +39,6 @@ class EndpointHandler:
             "torch_dtype": "auto",
             "low_cpu_mem_usage": True,
         }
-        if self._is_flash_attn_available():
-            model_kwargs["attn_implementation"] = "flash_attention_2"
         # Load model (this loads the model, tokenizer, and feature extractor)
         self.model = ASRModel.from_pretrained(path, **model_kwargs)
@@ -56,12 +54,6 @@ class EndpointHandler:
             device=self.device,
         )
-    def _is_flash_attn_available(self):
-        """Check if flash attention is available."""
-        import importlib.util
-        return importlib.util.find_spec("flash_attn") is not None
     def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
         """Process an inference request.

             "torch_dtype": "auto",
             "low_cpu_mem_usage": True,
         }
         # Load model (this loads the model, tokenizer, and feature extractor)
         self.model = ASRModel.from_pretrained(path, **model_kwargs)
             device=self.device,
         )
     def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
         """Process an inference request.