Spaces:
Running
Running
Foundations of codebase
Browse files- assets/all_labels.csv +3 -0
- src/analysis.py +26 -0
- src/methodology.py +38 -0
- src/neighbours.py +58 -0
- src/scorer.py +37 -0
- src/utils.py +13 -0
assets/all_labels.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:875d994568b29d69e14d114fcfa8b77d5fc59964d4117fdd763260cb66f39249
|
| 3 |
+
size 6013069
|
src/analysis.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from src.methodology import SimpleMethodology
|
| 5 |
+
from src.neighbours import EmbeddingClosestNeighbours
|
| 6 |
+
from src.scorer import EmbeddingScorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class EmbeddingsAnalysis:
    """Facade wiring neighbour lookup and originality scoring together.

    :param index: FAISS index used for similarity search.
    :param all_labels: DataFrame with 'track_id' and 'length' columns aligned with the index.
    :param lookup: metadata DataFrame handed to the neighbour finder.
    :param scalers: mapping of (lo, hi) length ranges to fitted scaler objects.
    :param close_threshold: similarity threshold for counting a neighbour as "close".
    """

    def __init__(self, index, all_labels, lookup, scalers, close_threshold=0.95):
        labels_np = all_labels['track_id'].to_numpy()
        lengths_np = all_labels['length'].to_numpy()
        self._ecn = EmbeddingClosestNeighbours(
            index, labels_np, lengths_np, lookup, close_threshold=close_threshold
        )
        # Expand each (lo, hi) range key into one scaler entry per concrete length.
        per_length_scalers = {
            length: scaler
            for (lo, hi), scaler in scalers.items()
            for length in range(lo, hi)
        }
        # NOTE(review): assumes length 99 is always covered by some range and is
        # a sensible fallback scaler — confirm with the scaler-fitting code.
        methodology = SimpleMethodology(per_length_scalers, per_length_scalers[99])
        self._scorer = EmbeddingScorer(index, labels_np, methodology)

    def get_score(self, embeddings, lengths):
        """Score embeddings for originality; delegates to EmbeddingScorer.score."""
        return self._scorer.score(embeddings, lengths)

    def get_neighbours(self, embeddings, limit=None):
        """Return close neighbours per embedding, optionally capped at `limit`."""
        return self._ecn.get(embeddings, limit)
|
src/methodology.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Column name under which the raw score is handed to a scaler's transform().
_SCALER_X_LABEL = 'score'
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class _TransformerProtocol:
    """Duck-typed stand-in describing the scaler interface this module relies on.

    Used only for type annotations; real scalers (presumably sklearn-style
    transformers — confirm with callers) are supplied at construction time.
    """

    def transform(self, X):
        """Scale `X`; intentionally unimplemented here."""
        ...
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class CountBasedMethodology(ABC):
    """Strategy interface: turn per-radius neighbour counts into scores.

    Implementations receive a DataFrame whose columns are the stringified radii
    from radii_needed() (one row per embedding) plus the matching sequence
    lengths, and return one score per row.
    """

    @abstractmethod
    def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
        """Compute a score per embedding from its neighbour counts.

        Bug fix: the abstract signature previously omitted `lengths`, yet both
        the concrete SimpleMethodology.execute and the EmbeddingScorer.score
        call site pass it — the contract now matches actual use.
        """
        ...

    @abstractmethod
    def radii_needed(self) -> list[float]:
        """Radii at which unique-neighbour counts must be gathered."""
        ...
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class SimpleMethodology(CountBasedMethodology):
    """Weighted-count scoring: counts at tighter radii weigh more, then each
    row is passed through the scaler fitted for its sequence length.

    :param scalers: maps a sequence length to the scaler fitted for it.
    :param fallback_scaler: used when a length has no dedicated scaler.
    """

    # Weight applied to the (self-excluded) neighbour count at each radius.
    _RADIUS_WEIGHTS = {'0.8': 1, '0.85': 2, '0.9': 3, '0.925': 4, '0.95': 5}

    def __init__(self, scalers: dict[int, _TransformerProtocol], fallback_scaler: _TransformerProtocol):
        self._scalers = scalers
        self._fallback_scaler = fallback_scaler

    def radii_needed(self) -> list[float]:
        return [0.8, 0.85, 0.9, 0.925, 0.95]

    def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
        # Subtract 1 from every count to exclude the embedding itself.
        unscaled = sum(
            (neighbours_df[col] - 1) * weight
            for col, weight in self._RADIUS_WEIGHTS.items()
        )
        # Bug fix: the original called Series.apply(..., axis=1) on `unscaled`
        # (Series.apply has no axis argument -> TypeError) and its lambda
        # indexed row['length'] / row['unscaled'] as if rows of a DataFrame;
        # `lengths` was accepted but never used. Build the row-wise frame
        # explicitly (positional alignment via to_numpy avoids index mismatch).
        work = pd.DataFrame({
            'unscaled': unscaled.to_numpy(),
            'length': pd.Series(lengths).to_numpy(),
        })
        scaled = work.apply(
            lambda row: self._scalers.get(row['length'], self._fallback_scaler)
            .transform(pd.DataFrame({_SCALER_X_LABEL: [row['unscaled']]}))[0][0],
            axis=1,
        )
        return scaled
|
src/neighbours.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from typing import NamedTuple
|
| 3 |
+
|
| 4 |
+
import faiss
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from src.utils import indices_distances_gen
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Neighbour(NamedTuple):
    """One indexed entry found within the similarity threshold of a query."""

    # Similarity value returned by the FAISS range search.
    distance: float
    # Track identifier taken from the labels array.
    label: str
    # Length of the neighbouring sequence (from the lengths array).
    length: int
    # Metadata row looked up by label and converted via .to_dict().
    metadata: dict
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class EmbeddingClosestNeighbours:
    """
    Analyzes embeddings to find close neighbors based on a similarity threshold.

    :param index: FAISS index for similarity search.
    :param labels: 1-d Numpy array of labels corresponding to index entries.
    :param lengths: 1-d Numpy array of sequence lengths aligned with labels.
    :param metadata: Pandas DataFrame containing metadata for each indexed entry. Index should be aligned with labels.
    :param close_threshold: Similarity threshold to consider embeddings as "close".
    """

    # Bug fix: the default previously referenced an undefined CLOSE_THRESHOLD
    # constant (NameError when this module is imported); 0.95 matches the
    # default used by EmbeddingsAnalysis.
    def __init__(self, index: faiss.Index, labels: np.ndarray, lengths: np.ndarray, metadata: pd.DataFrame, close_threshold: float = 0.95):
        self._index = index
        self._labels = labels
        self._lengths = lengths
        self._metadata = metadata
        self._close_threshold = close_threshold

    def get(self, embeddings: np.ndarray, limit: int = None) -> list[list[Neighbour]]:
        """Return, per query embedding, its unique close neighbours sorted by
        decreasing similarity, optionally truncated to `limit` entries.
        """
        # Bug fix: a redundant `self._index.range_search(...)` call was removed
        # here — indices_distances_gen performs the search itself, so the extra
        # call doubled the work and its result was never used.
        all_neighbours = []
        for indices_, distances_ in indices_distances_gen(embeddings, self._close_threshold, self._index):
            lengths_ = self._lengths[indices_]
            # Deduplicate labels; np.unique keeps the first occurrence's index.
            labels, unique_indices = np.unique(self._labels[indices_], return_index=True)
            distances = distances_[unique_indices]
            lengths = lengths_[unique_indices]
            # Highest similarity first.
            order = np.flip(np.argsort(distances))
            sorted_labels = labels[order]
            sorted_distances = distances[order]
            sorted_lengths = lengths[order]
            neighbours = [
                Neighbour(
                    distance=float(sorted_distances[j]),
                    label=sorted_labels[j],
                    length=int(sorted_lengths[j]),
                    metadata=self._metadata.loc[sorted_labels[j]].to_dict(),
                )
                for j in range(len(sorted_labels))
            ]
            if limit is not None:
                neighbours = neighbours[:limit]
            all_neighbours.append(neighbours)
        return all_neighbours
|
src/scorer.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from src.methodology import CountBasedMethodology
|
| 6 |
+
from src.utils import indices_distances_gen
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _count_unique_neighbours(embeddings, radius, index, all_labels):
    """Count distinct labels found within `radius` of each embedding."""
    return [
        np.unique(all_labels[found]).shape[0]
        for found, _ in indices_distances_gen(embeddings, radius, index)
    ]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class EmbeddingScorer:
    """
    Scores embeddings based on their originality. Specifically using counts of unique neighbours within certain radii.

    :param index: FAISS index for similarity search.
    :param labels: 1-d Numpy array of labels corresponding to index entries.
    :param methodology: Methodology that takes a dataframe whose columns are the different radii (from radii_needed()), along with the length of each chord sequence; each row represents one embedding to be scored.
    """

    def __init__(self, index: faiss.Index, labels: np.ndarray, methodology: CountBasedMethodology):
        self._index = index
        self._labels = labels
        self._methodology = methodology

    def score(self, embeddings: np.ndarray, lengths: pd.Series) -> list[float]:
        """Gather unique-neighbour counts at every required radius, then
        delegate the actual scoring to the configured methodology."""
        counts = {
            str(radius): _count_unique_neighbours(embeddings, radius, self._index, self._labels)
            for radius in self._methodology.radii_needed()
        }
        return self._methodology.execute(pd.DataFrame(counts), lengths)
|
src/utils.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def indices_distances_gen(embeddings: np.ndarray, radius: float, index: faiss.Index):
    """Yield (indices, distances) arrays of in-range matches per query embedding.

    Wraps faiss range_search, whose flat result arrays (ids, distances) are
    delimited per query by the `lims` offsets.
    """
    lims, distances, ids = index.range_search(embeddings, radius)
    # lims[i]:lims[i+1] delimits the matches belonging to query embedding i.
    for start, stop in zip(lims[:-1], lims[1:]):
        yield ids[start:stop], distances[start:stop]
|