Update custom model files, README, and requirements
asr_pipeline.py  CHANGED  +77 -50
@@ -84,9 +84,9 @@ class ForcedAligner:
             if j > 0:
                 move = trellis[t, j - 1] + emission[t, tokens[j - 1]]
             else:
-                move =
+                move = -float("inf")

-            trellis[t + 1, j] =
+            trellis[t + 1, j] = max(stay, move)  # Viterbi: take best path

         return trellis

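For context, the two branches above are the inner step of a standard Viterbi fill over a (num_frames + 1) x (num_tokens + 1) trellis: stay scores emitting blank at frame t without advancing the token index, move scores emitting token j - 1, and j = 0 is only reachable by staying, hence the -inf sentinel. A minimal sketch of such a fill, assuming log-probability emissions; the surrounding loop and the stay definition sit outside this hunk (stay here mirrors the stay_score formula used later in _backtrack), so build_trellis and its initialisation are illustrative assumptions, not the repository's code:

    import torch

    def build_trellis(emission: torch.Tensor, tokens: list[int], blank_id: int = 0) -> torch.Tensor:
        # emission: (num_frames, vocab_size) log-probabilities from the acoustic model
        num_frames, num_tokens = emission.size(0), len(tokens)
        trellis = torch.full((num_frames + 1, num_tokens + 1), -float("inf"))
        trellis[0, 0] = 0.0  # start state: no frames consumed, no tokens emitted
        for t in range(num_frames):
            for j in range(num_tokens + 1):
                stay = trellis[t, j] + emission[t, blank_id]  # emit blank, keep j
                if j > 0:
                    move = trellis[t, j - 1] + emission[t, tokens[j - 1]]
                else:
                    move = -float("inf")  # no token can have been emitted before the first
                trellis[t + 1, j] = max(stay, move)  # Viterbi: take best path
        return trellis

On a toy input, build_trellis(torch.randn(5, 4).log_softmax(-1), [1, 2]) fills a 6 x 3 table whose bottom-right cell holds the best alignment score that _backtrack then traces.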
@@ -93,42 +93,52 @@ class ForcedAligner:
     @staticmethod
     def _backtrack(
         trellis: torch.Tensor, emission: torch.Tensor, tokens: list[int], blank_id: int = 0
-    ) -> list[tuple[int, int,
-        """Backtrack through trellis to find optimal alignment
+    ) -> list[tuple[int, int, float]]:
+        """Backtrack through trellis to find optimal forced monotonic alignment.
+
+        Guarantees:
+        - All tokens are emitted exactly once
+        - Strictly monotonic: each token's frames come after previous token's
+        - No frame skipping or token teleporting

         Returns list of (token_id, start_frame, end_frame) for each token.
         """
         num_frames = emission.size(0)
         num_tokens = len(tokens)

-
-
-        j = num_tokens
-        path = []  # Will store (frame, token_index) pairs
+        if num_tokens == 0:
+            return []

-
-
-
-        #
+        # Find the best ending point (should be at num_tokens)
+        # But verify trellis reached a valid state
+        if trellis[num_frames, num_tokens] == -float("inf"):
+            # Alignment failed - fall back to uniform distribution
+            frames_per_token = num_frames / num_tokens
+            return [
+                (tokens[i], i * frames_per_token, (i + 1) * frames_per_token)
+                for i in range(num_tokens)
+            ]

-
-
-
-
+        # Backtrack: find where each token transition occurred
+        # path[i] = frame where token i was first emitted
+        token_frames: list[list[int]] = [[] for _ in range(num_tokens)]
+
+        t = num_frames
+        j = num_tokens

-
+        while t > 0 and j > 0:
+            # Check: did we transition from j-1 to j at frame t-1?
             stay_score = trellis[t - 1, j] + emission[t - 1, blank_id]
             move_score = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]

-        if move_score
+            if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
-
+                token_frames[j - 1].insert(0, t - 1)
                 j -= 1
-
+            # Always decrement time (monotonic)
             t -= 1

-
-
-
-
-        return []
+        # Handle any remaining tokens at the start (edge case)
+        while j > 0:
+            token_frames[j - 1].insert(0, 0)
+            j -= 1
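The backtracking pass can be sanity-checked on a toy emission matrix in which each token has one obvious peak. This reuses the hypothetical build_trellis sketch above together with the _backtrack method from this diff (assuming ForcedAligner is importable from asr_pipeline); the scores are made up:

    # 4 frames, vocab {0: blank, 1, 2}; log-scores favour token 1 at frame 0
    # and token 2 at frame 2, with blank cheap everywhere else.
    emission = torch.full((4, 3), -10.0)
    emission[:, 0] = -1.0   # blank
    emission[0, 1] = -0.1   # token 1 peaks at frame 0
    emission[2, 2] = -0.1   # token 2 peaks at frame 2

    trellis = build_trellis(emission, tokens=[1, 2])
    spans = ForcedAligner._backtrack(trellis, emission, tokens=[1, 2])
    # Per the docstring guarantees: both tokens get spans, token 1 before token 2.

Note the >= in move_score >= stay_score: on an exact tie the loop prefers advancing the token index, which is what keeps every token emitted even when blank and token scores are equal.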
@@ -135,24 +145,44 @@ class ForcedAligner:

+        # Convert to spans with sub-frame interpolation
         token_spans = []
-
-
-
-
-
-
-
-
-
-
+        for token_idx, frames in enumerate(token_frames):
+            if not frames:
+                # Token never emitted - assign minimal span after previous
+                if token_spans:
+                    prev_end = token_spans[-1][2]
+                    frames = [int(prev_end)]
+                else:
+                    frames = [0]
+
+            token_id = tokens[token_idx]
+            frame_probs = emission[frames, token_id]
+            peak_idx = int(torch.argmax(frame_probs).item())
+            peak_frame = frames[peak_idx]
+
+            # Sub-frame interpolation using quadratic fit around peak
+            if len(frames) >= 3 and 0 < peak_idx < len(frames) - 1:
+                y0 = frame_probs[peak_idx - 1].item()
+                y1 = frame_probs[peak_idx].item()
+                y2 = frame_probs[peak_idx + 1].item()
+
+                denom = y0 - 2 * y1 + y2
+                if abs(denom) > 1e-10:
+                    offset = 0.5 * (y0 - y2) / denom
+                    offset = max(-0.5, min(0.5, offset))
+                else:
+                    offset = 0.0
+                refined_frame = peak_frame + offset
+            else:
+                refined_frame = float(peak_frame)

-        token_spans.append((
-        i += 1
+            token_spans.append((token_id, refined_frame, refined_frame + 1.0))

         return token_spans

-        #
-        #
-
+    # Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
+    # Calibrated on librispeech-alignments dataset
+    START_OFFSET = 0.06  # Subtract from start times (shift earlier)
+    END_OFFSET = -0.03  # Add to end times (shift later)

     @classmethod
     def align(
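The sub-frame refinement is ordinary three-point parabolic peak interpolation: fit a parabola through the scores at the peak frame and its two neighbours, take the vertex, and clamp it to half a frame. A worked example with made-up scores:

    # Emission scores for one token at frames 9, 10, 11 (made-up numbers):
    y0, y1, y2 = -2.0, -0.5, -1.0     # peak at the middle frame (frame 10)

    denom = y0 - 2 * y1 + y2          # -2.0, the parabola's curvature
    offset = 0.5 * (y0 - y2) / denom  # 0.5 * (-1.0) / -2.0 = 0.25
    refined_frame = 10 + offset       # 10.25: the true peak sits a quarter-frame late

At the model's 20 ms stride that quarter-frame is 5 ms, which is the resolution gained by refining below the frame grid.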
@@ -162,7 +192,6 @@ class ForcedAligner:
         sample_rate: int = 16000,
         _language: str = "eng",
         _batch_size: int = 16,
-        offset_compensation: float | None = None,
     ) -> list[dict]:
         """Align transcript to audio and return word-level timestamps.

@@ -174,9 +203,6 @@ class ForcedAligner:
             sample_rate: Audio sample rate (default 16000)
             _language: ISO-639-3 language code (default "eng" for English, unused)
             _batch_size: Batch size for alignment model (unused)
-            offset_compensation: Time offset in seconds to subtract from timestamps
-                to compensate for Wav2Vec2 look-ahead (default: 0.04s / 40ms).
-                Set to 0 to disable.

         Returns:
             List of dicts with 'word', 'start', 'end' keys
@@ -232,8 +258,9 @@ class ForcedAligner:
         # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
         frame_duration = 320 / cls._bundle.sample_rate

-        # Apply offset compensation for Wav2Vec2
-
+        # Apply separate offset compensation for start/end (Wav2Vec2 systematic bias)
+        start_offset = cls.START_OFFSET
+        end_offset = cls.END_OFFSET

         # Group aligned tokens into words based on pipe separator
         words = text.split()
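Concretely, with the 320-sample stride at 16 kHz each frame is 20 ms, so the conversion plus the calibrated constants works out as follows (frame indices are made up):

    frame_duration = 320 / 16000      # 0.02 s per frame
    START_OFFSET, END_OFFSET = 0.06, -0.03

    start_frame, end_frame = 50, 75   # hypothetical word span
    start_time = max(0.0, start_frame * frame_duration - START_OFFSET)  # 1.00 - 0.06 = 0.94
    end_time = max(0.0, end_frame * frame_duration - END_OFFSET)        # 1.50 + 0.03 = 1.53

Because END_OFFSET is negative, subtracting it pushes end times later, exactly as the "shift later" comment on the constant says.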
@@ -246,8 +273,8 @@ class ForcedAligner:
         for token_id, start_frame, end_frame in alignment_path:
             if token_id == separator_id:  # Word separator
                 if current_word_start is not None and word_idx < len(words):
-                    start_time = max(0.0, current_word_start * frame_duration -
-                    end_time = max(0.0, current_word_end * frame_duration -
+                    start_time = max(0.0, current_word_start * frame_duration - start_offset)
+                    end_time = max(0.0, current_word_end * frame_duration - end_offset)
                     word_timestamps.append(
                         {
                             "word": words[word_idx],
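The max(0.0, ...) clamp matters for words at the very start of the audio: with a 60 ms start offset, any word beginning before frame 3 would otherwise get a negative start time. Continuing the numbers above:

    start_frame = 1                                     # word begins almost immediately
    raw = start_frame * frame_duration - START_OFFSET   # 0.02 - 0.06 = -0.04
    start_time = max(0.0, raw)                          # clamped to 0.0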
@@ -265,8 +292,8 @@ class ForcedAligner:

         # Don't forget the last word
         if current_word_start is not None and word_idx < len(words):
-            start_time = max(0.0, current_word_start * frame_duration -
-            end_time = max(0.0, current_word_end * frame_duration -
+            start_time = max(0.0, current_word_start * frame_duration - start_offset)
+            end_time = max(0.0, current_word_end * frame_duration - end_offset)
             word_timestamps.append(
                 {
                     "word": words[word_idx],
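Taken together, the outward-facing change is small: offset_compensation is gone from align's signature and the bias correction is baked in as class constants. A minimal usage sketch; only sample_rate, _language, _batch_size and the return shape appear in this diff, so the leading waveform/transcript arguments are assumptions about the rest of the signature:

    import torch
    from asr_pipeline import ForcedAligner

    waveform = torch.zeros(16000 * 2)   # placeholder: 2 s of 16 kHz mono audio
    transcript = "hello world"

    words = ForcedAligner.align(waveform, transcript, sample_rate=16000)
    for w in words:                     # dicts with 'word', 'start', 'end' keys
        print(f"{w['word']}: {w['start']:.2f}s to {w['end']:.2f}s")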
|