Spaces:

ohollo
/

harmonic-analysis

Running

App Files Files Community

ohollo committed on Nov 14, 2025

Commit

a7d861a

1 Parent(s): dba5265

Foundations of codebase

Browse files

Files changed (6) hide show

assets/all_labels.csv +3 -0
src/analysis.py +26 -0
src/methodology.py +38 -0
src/neighbours.py +58 -0
src/scorer.py +37 -0
src/utils.py +13 -0

assets/all_labels.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:875d994568b29d69e14d114fcfa8b77d5fc59964d4117fdd763260cb66f39249
+size 6013069

src/analysis.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from dataclasses import dataclass
+import pandas as pd
+from src.methodology import SimpleMethodology
+from src.neighbours import EmbeddingClosestNeighbours
+from src.scorer import EmbeddingScorer
+class EmbeddingsAnalysis:
+    def __init__(self, index, all_labels, lookup, scalers, close_threshold=0.95):
+        all_labels_np = all_labels['track_id'].to_numpy()
+        all_lengths_np = all_labels['length'].to_numpy()
+        self._ecn = EmbeddingClosestNeighbours(index, all_labels_np, all_lengths_np, lookup, close_threshold=close_threshold)
+        specific_scalers = {i: scaler for (l, r), scaler in scalers.items() for i in range(l, r)}
+        sm = SimpleMethodology(specific_scalers, specific_scalers[99])
+        self._scorer = EmbeddingScorer(index, all_labels_np, sm)
+    def get_score(self, embeddings, lengths):
+        score = self._scorer.score(embeddings, lengths)
+        return score
+    def get_neighbours(self, embeddings, limit=None):
+        neighbours = self._ecn.get(embeddings, limit)
+        return neighbours

src/methodology.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from abc import ABC, abstractmethod
+import pandas as pd
+_SCALER_X_LABEL = 'score'
+class _TransformerProtocol:
+    """Stand-in type used to annotate scaler objects that expose a ``transform`` method."""
+    def transform(self, X):
+        """Placeholder only: the body is a bare ``...``, so calling it returns ``None``."""
+        ...
+class CountBasedMethodology(ABC):
+    @abstractmethod
+    def execute(self, neighbours_df: pd.DataFrame) -> pd.Series:
+        ...
+    @abstractmethod
+    def radii_needed(self) -> list[float]:
+        ...
+class SimpleMethodology(CountBasedMethodology):
+    def __init__(self, scalers: dict[int, _TransformerProtocol], fallback_scaler: _TransformerProtocol):
+        self._scalers = scalers
+        self._fallback_scaler = fallback_scaler
+    def radii_needed(self) -> list[float]:
+        return [0.8, 0.85, 0.9, 0.925, 0.95]
+    def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
+        unscaled = (neighbours_df['0.8'] - 1 ) * 1 + (neighbours_df['0.85'] - 1) * 2 + (neighbours_df['0.9'] - 1) * 3 + (neighbours_df['0.925'] - 1) * 4 + (neighbours_df['0.95'] - 1) * 5
+        scaled = unscaled.apply(
+            lambda row: self._scalers.get(row['length'], self._fallback_scaler).transform(pd.DataFrame({_SCALER_X_LABEL: row['unscaled']}, index=[0]))[0][0],
+            axis=1
+        )
+        return scaled

src/neighbours.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from typing import NamedTuple
+import faiss
+import numpy as np
+import pandas as pd
+from src.utils import indices_distances_gen
+class Neighbour(NamedTuple):
+    """One neighbour returned by ``EmbeddingClosestNeighbours.get``."""
+    # Value reported by the index's range search for this hit, cast to ``float``.
+    distance: float
+    # Entry of the labels array for this hit.
+    label: str
+    # Entry of the lengths array for this hit, cast to ``int``.
+    length: int
+    # Metadata row for ``label``, looked up with ``.loc`` and converted via ``to_dict()``.
+    metadata: dict
+class EmbeddingClosestNeighbours:
+    """
+    Analyzes embeddings to find close neighbors based on a similarity threshold.
+    :param index: FAISS index for similarity search.
+    :param labels: 1-d Numpy array of labels corresponding to index entries.
+    :param metadata: Pandas DataFrame containing metadata for each indexed entry. Index should be aligned with labels.
+    :param close_threshold: Similarity threshold to consider embeddings as "close".
+    """
+    def __init__(self, index: faiss.Index, labels: np.ndarray, lengths: np.ndarray, metadata: pd.DataFrame, close_threshold: float = CLOSE_THRESHOLD):
+        self._index = index
+        self._labels = labels
+        self._lengths = lengths
+        self._metadata = metadata
+        self._close_threshold = close_threshold
+    def get(self, embeddings: np.ndarray, limit: int = None) -> list[list[Neighbour]]:
+        lims, D, I = self._index.range_search(embeddings, self._close_threshold)
+        all_neighbours = []
+        for indices_, distances_ in indices_distances_gen(embeddings, self._close_threshold, self._index):
+            lengths_ = self._lengths[indices_]
+            labels, unique_indices = np.unique(self._labels[indices_], return_index=True)
+            distances = distances_[unique_indices]
+            lengths = lengths_[unique_indices]
+            sorted_indices = np.flip(np.argsort(distances))
+            sorted_labels = labels[sorted_indices]
+            sorted_distances = distances[sorted_indices]
+            sorted_lengths = lengths[sorted_indices]
+            neighbours = [
+                Neighbour(
+                    distance=float(sorted_distances[j]),
+                    label=sorted_labels[j],
+                    length=int(sorted_lengths[j]),
+                    metadata=self._metadata.loc[sorted_labels[j]].to_dict()
+                )
+                for j in range(len(sorted_labels))
+            ]
+            if limit is not None:
+                neighbours = neighbours[:limit]
+            all_neighbours.append(neighbours)
+        return all_neighbours

src/scorer.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import faiss
+import numpy as np
+import pandas as pd
+from src.methodology import CountBasedMethodology
+from src.utils import indices_distances_gen
+def _count_unique_neighbours(embeddings, radius, index, all_labels):
+    res = []
+    for indices_, _ in indices_distances_gen(embeddings, radius, index):
+        neighbours = np.unique(all_labels[indices_])
+        res.append(neighbours.shape[0])
+    return res
+class EmbeddingScorer:
+    """
+    Scores embeddings based on their originality. Specifically using counts of unique neighbours within certain radii.
+    :param index: FAISS index for similarity search.
+    :param labels: 1-d Numpy array of labels corresponding to index entries.
+    :param scorer: Methodology that takes dataframe where columns are the different radii (specified in radii argument), along with length of chord sequence. Each row represents an embedding to be scored.
+   """
+    def __init__(self, index: faiss.Index, labels: np.ndarray, methodology: CountBasedMethodology):
+        self._index = index
+        self._labels = labels
+        self._methodology = methodology
+    def score(self, embeddings: np.ndarray, lengths: pd.Series) -> list[float]:
+        radii = self._methodology.radii_needed()
+        counts = {}
+        for radius in radii:
+            counts[str(radius)] = _count_unique_neighbours(embeddings, radius, self._index, self._labels)
+        neighbours_df = pd.DataFrame(counts)
+        scores = self._methodology.execute(neighbours_df, lengths)
+        return scores

src/utils.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import faiss
+import numpy as np
+def indices_distances_gen(embeddings: np.ndarray, radius: float, index: faiss.Index):
+    lims, D, I = index.range_search(embeddings, radius)
+    # Iterate over lims and get indices per embedding
+    for i in range(len(lims) - 1):
+        start = lims[i]
+        end = lims[i + 1]
+        indices_ = I[start:end]
+        distances_ = D[start:end]
+        yield indices_, distances_