ohollo committed on
Commit
a7d861a
·
1 Parent(s): dba5265

Foundations of codebase

Browse files
assets/all_labels.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:875d994568b29d69e14d114fcfa8b77d5fc59964d4117fdd763260cb66f39249
3
+ size 6013069
src/analysis.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ import pandas as pd
4
+ from src.methodology import SimpleMethodology
5
+ from src.neighbours import EmbeddingClosestNeighbours
6
+ from src.scorer import EmbeddingScorer
7
+
8
+
9
+
10
class EmbeddingsAnalysis:
    """
    Bundles closest-neighbour lookup and scoring for embeddings over a single index.

    :param index: Search index passed to both the neighbour finder and the scorer.
    :param all_labels: Table providing a ``track_id`` column and a ``length`` column.
    :param lookup: Passed to ``EmbeddingClosestNeighbours`` as its metadata table.
    :param scalers: Mapping of ``(start, stop)`` length ranges to a scaler; ``stop`` is exclusive.
    :param close_threshold: Threshold forwarded to ``EmbeddingClosestNeighbours``.
    :param fallback_length: Length whose scaler is used for lengths not covered by any range.
    :raises KeyError: If no range in ``scalers`` covers ``fallback_length``.
    """
    def __init__(self, index, all_labels, lookup, scalers, close_threshold=0.95, fallback_length=99):
        track_ids = all_labels['track_id'].to_numpy()
        track_lengths = all_labels['length'].to_numpy()
        self._ecn = EmbeddingClosestNeighbours(
            index, track_ids, track_lengths, lookup, close_threshold=close_threshold
        )
        # Expand every (start, stop) range into one entry per individual length.
        per_length_scalers = {
            length: scaler
            for (start, stop), scaler in scalers.items()
            for length in range(start, stop)
        }
        if fallback_length not in per_length_scalers:
            raise KeyError(
                f"no scaler range covers fallback_length={fallback_length!r}"
            )
        methodology = SimpleMethodology(per_length_scalers, per_length_scalers[fallback_length])
        self._scorer = EmbeddingScorer(index, track_ids, methodology)

    def get_score(self, embeddings, lengths):
        """Return the scorer's result for ``embeddings`` and their ``lengths``."""
        return self._scorer.score(embeddings, lengths)

    def get_neighbours(self, embeddings, limit=None):
        """Return the neighbours found for each embedding, at most ``limit`` each when given."""
        return self._ecn.get(embeddings, limit)
src/methodology.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ import pandas as pd
3
+
4
+
5
_SCALER_X_LABEL = 'score'

# (radius, weight) pairs: each radius is a count column (named str(radius)) and
# its weight is the multiplier applied to that column's count.
_RADIUS_WEIGHTS = ((0.8, 1), (0.85, 2), (0.9, 3), (0.925, 4), (0.95, 5))


class _TransformerProtocol:
    """Interface of the scaler objects used here: anything exposing ``transform(X)``."""

    def transform(self, X):
        ...


class CountBasedMethodology(ABC):
    """Turns per-radius neighbour counts into one score per embedding."""

    @abstractmethod
    def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
        """
        Score every row of ``neighbours_df``.

        :param neighbours_df: One row per embedding, one column per radius named ``str(radius)``.
        :param lengths: One length per row of ``neighbours_df``, in the same order.
        :return: One score per row.
        """
        ...

    @abstractmethod
    def radii_needed(self) -> list[float]:
        """Return the radii for which neighbour counts must be supplied."""
        ...


class SimpleMethodology(CountBasedMethodology):
    """
    Weighted sum of neighbour counts, rescaled by a scaler chosen by length.

    :param scalers: Scaler to use for each individual length.
    :param fallback_scaler: Scaler used when a length has no entry in ``scalers``.
    """
    def __init__(self, scalers: dict[int, _TransformerProtocol], fallback_scaler: _TransformerProtocol):
        self._scalers = scalers
        self._fallback_scaler = fallback_scaler

    def radii_needed(self) -> list[float]:
        return [radius for radius, _ in _RADIUS_WEIGHTS]

    def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
        """
        Score each row as ``sum((count - 1) * weight)`` and rescale it.

        ``lengths`` is matched to rows by position, not by index label.

        :raises ValueError: If ``lengths`` and ``neighbours_df`` differ in size.
        """
        length_values = list(lengths)
        if len(length_values) != len(neighbours_df):
            raise ValueError(
                f"got {len(length_values)} lengths for {len(neighbours_df)} rows"
            )
        # NOTE(review): the "- 1" presumably discounts the query's own entry in
        # the index -- confirm against how the index is built.
        unscaled = sum(
            (neighbours_df[str(radius)] - 1) * weight
            for radius, weight in _RADIUS_WEIGHTS
        )
        scaled = [
            self._scale(score, length)
            for score, length in zip(unscaled, length_values)
        ]
        return pd.Series(scaled, index=neighbours_df.index, dtype=float)

    def _scale(self, unscaled_score, length) -> float:
        """Rescale one score with the scaler registered for ``length`` (or the fallback)."""
        scaler = self._scalers.get(length, self._fallback_scaler)
        frame = pd.DataFrame({_SCALER_X_LABEL: [unscaled_score]})
        # transform() is expected to return a 2-d result; take its single cell.
        return float(scaler.transform(frame)[0][0])
src/neighbours.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import NamedTuple
3
+
4
+ import faiss
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from src.utils import indices_distances_gen
9
+
10
+
11
# Default threshold for a hit to count as "close"; same value as the default
# EmbeddingsAnalysis passes in.
CLOSE_THRESHOLD = 0.95


class Neighbour(NamedTuple):
    """One indexed entry found close to a query embedding."""
    distance: float  # value reported by the index's range search for this hit
    label: str       # label of the matched index entry
    length: int      # length recorded for the matched index entry
    metadata: dict   # metadata row for the label, as a dict


class EmbeddingClosestNeighbours:
    """
    Analyzes embeddings to find close neighbors based on a similarity threshold.

    :param index: FAISS index for similarity search.
    :param labels: 1-d Numpy array of labels corresponding to index entries.
    :param lengths: 1-d Numpy array of lengths corresponding to index entries.
    :param metadata: Pandas DataFrame of metadata, indexed by label (rows are fetched with ``.loc[label]``).
    :param close_threshold: Similarity threshold to consider embeddings as "close".
    """
    def __init__(self, index: faiss.Index, labels: np.ndarray, lengths: np.ndarray, metadata: pd.DataFrame, close_threshold: float = CLOSE_THRESHOLD):
        self._index = index
        self._labels = labels
        self._lengths = lengths
        self._metadata = metadata
        self._close_threshold = close_threshold

    def get(self, embeddings: np.ndarray, limit: int = None) -> list[list[Neighbour]]:
        """
        Find the close neighbours of every embedding.

        :param embeddings: Query embeddings, one per row.
        :param limit: Keep at most this many neighbours per embedding; ``None`` keeps all.
        :return: One list per embedding, each label appearing once, ordered by descending distance value.
        """
        # A single range search covers all queries; lims[i]:lims[i + 1] is the
        # slice of hits belonging to query i.
        lims, distances, indices = self._index.range_search(embeddings, self._close_threshold)
        return [
            self._unique_neighbours(indices[start:end], distances[start:end], limit)
            for start, end in zip(lims[:-1], lims[1:])
        ]

    def _unique_neighbours(self, hit_indices, hit_distances, limit) -> list[Neighbour]:
        """Build one query's neighbour list: one entry per label, highest distance value first."""
        # Order hits by descending value first, so that de-duplicating by label
        # keeps each label's best hit rather than whichever comes first in the index.
        order = np.argsort(hit_distances)[::-1]
        hit_indices = hit_indices[order]
        hit_distances = hit_distances[order]
        _, first_positions = np.unique(self._labels[hit_indices], return_index=True)
        # first_positions index into the ordered hits, so sorting them restores
        # the descending order; slicing here avoids metadata lookups past the limit.
        kept = np.sort(first_positions)[:limit]
        neighbours = []
        for position in kept:
            entry = hit_indices[position]
            label = self._labels[entry]
            neighbours.append(
                Neighbour(
                    distance=float(hit_distances[position]),
                    label=label,
                    length=int(self._lengths[entry]),
                    metadata=self._metadata.loc[label].to_dict(),
                )
            )
        return neighbours
src/scorer.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+ from src.methodology import CountBasedMethodology
6
+ from src.utils import indices_distances_gen
7
+
8
+
9
def _count_unique_neighbours(embeddings, radius, index, all_labels):
    """
    Count the distinct labels found within ``radius`` of each embedding.

    :return: One count per embedding, in the order the range search yields them.
    """
    return [
        np.unique(all_labels[hit_indices]).shape[0]
        for hit_indices, _ in indices_distances_gen(embeddings, radius, index)
    ]
15
+
16
+
17
class EmbeddingScorer:
    """
    Scores embeddings based on their originality. Specifically using counts of unique neighbours within certain radii.

    :param index: FAISS index for similarity search.
    :param labels: 1-d Numpy array of labels corresponding to index entries.
    :param methodology: Methodology that names the radii it needs (``radii_needed``) and turns the resulting
        counts into scores (``execute``). It receives a dataframe with one column per radius, named
        ``str(radius)``, and one row per embedding, along with the lengths passed to ``score``.
    """
    def __init__(self, index: faiss.Index, labels: np.ndarray, methodology: CountBasedMethodology):
        self._index = index
        self._labels = labels
        self._methodology = methodology

    def score(self, embeddings: np.ndarray, lengths: pd.Series) -> pd.Series:
        """
        Score each embedding.

        :param embeddings: Embeddings to score, one per row.
        :param lengths: Lengths handed to the methodology alongside the counts.
        :return: Whatever the methodology's ``execute`` returns (declared as a Series, one score per embedding).
        """
        counts = {
            str(radius): _count_unique_neighbours(embeddings, radius, self._index, self._labels)
            for radius in self._methodology.radii_needed()
        }
        return self._methodology.execute(pd.DataFrame(counts), lengths)
src/utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+
4
+
5
def indices_distances_gen(embeddings: np.ndarray, radius: float, index: faiss.Index):
    """
    Run one range search and yield its results one query at a time.

    :param embeddings: Queries passed to ``index.range_search``.
    :param radius: Radius passed to ``index.range_search``.
    :param index: Object whose ``range_search`` returns ``(lims, D, I)``.
    :return: Generator of ``(indices, distances)`` pairs, one per consecutive pair of entries in ``lims``.
    """
    boundaries, all_distances, all_indices = index.range_search(embeddings, radius)
    # Consecutive boundaries delimit the slice of results for one query.
    for lower, upper in zip(boundaries[:-1], boundaries[1:]):
        yield all_indices[lower:upper], all_distances[lower:upper]