Spaces:
Running
Running
Update assets and fix up scoring
Browse files- assets/chords_20251119.index +3 -0
- assets/quantile_transformers_20251122.joblib +3 -0
- src/__pycache__/analysis.cpython-311.pyc +0 -0
- src/__pycache__/convert.cpython-311.pyc +0 -0
- src/__pycache__/methodology.cpython-311.pyc +0 -0
- src/__pycache__/neighbours.cpython-311.pyc +0 -0
- src/__pycache__/scorer.cpython-311.pyc +0 -0
- src/__pycache__/utils.cpython-311.pyc +0 -0
- src/analysis.py +4 -4
- src/convert.py +28 -0
- src/methodology.py +2 -1
- src/neighbours.py +5 -2
- src/scorer.py +1 -1
- src/utils.py +9 -1
assets/chords_20251119.index
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1170d567f73e344462260571285ece153c468a79be65cd21cf6d52fc435bce6
|
| 3 |
+
size 281385005
|
assets/quantile_transformers_20251122.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20b23160289a8a3b3c56ce9a27193c1e2eb658824f843fd2c8b8db103c15818d
|
| 3 |
+
size 164145
|
src/__pycache__/analysis.cpython-311.pyc
ADDED
|
Binary file (2.41 kB). View file
|
|
|
src/__pycache__/convert.cpython-311.pyc
ADDED
|
Binary file (2.13 kB). View file
|
|
|
src/__pycache__/methodology.cpython-311.pyc
ADDED
|
Binary file (3.74 kB). View file
|
|
|
src/__pycache__/neighbours.cpython-311.pyc
ADDED
|
Binary file (3.93 kB). View file
|
|
|
src/__pycache__/scorer.cpython-311.pyc
ADDED
|
Binary file (2.77 kB). View file
|
|
|
src/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (1.47 kB). View file
|
|
|
src/analysis.py
CHANGED
|
@@ -3,7 +3,7 @@ from dataclasses import dataclass
|
|
| 3 |
import pandas as pd
|
| 4 |
from src.methodology import SimpleMethodology
|
| 5 |
from src.neighbours import EmbeddingClosestNeighbours
|
| 6 |
-
from src.scorer import
|
| 7 |
|
| 8 |
|
| 9 |
|
|
@@ -14,11 +14,11 @@ class EmbeddingsAnalysis:
|
|
| 14 |
self._ecn = EmbeddingClosestNeighbours(index, all_labels_np, all_lengths_np, lookup, close_threshold=close_threshold)
|
| 15 |
specific_scalers = {i: scaler for (l, r), scaler in scalers.items() for i in range(l, r)}
|
| 16 |
sm = SimpleMethodology(specific_scalers, specific_scalers[99])
|
| 17 |
-
self._scorer =
|
| 18 |
|
| 19 |
|
| 20 |
-
def
|
| 21 |
-
score = self._scorer.score(embeddings, lengths)
|
| 22 |
return score
|
| 23 |
|
| 24 |
def get_neighbours(self, embeddings, limit=None):
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
from src.methodology import SimpleMethodology
|
| 5 |
from src.neighbours import EmbeddingClosestNeighbours
|
| 6 |
+
from src.scorer import EmbeddingsOriginalityScorer
|
| 7 |
|
| 8 |
|
| 9 |
|
|
|
|
| 14 |
self._ecn = EmbeddingClosestNeighbours(index, all_labels_np, all_lengths_np, lookup, close_threshold=close_threshold)
|
| 15 |
specific_scalers = {i: scaler for (l, r), scaler in scalers.items() for i in range(l, r)}
|
| 16 |
sm = SimpleMethodology(specific_scalers, specific_scalers[99])
|
| 17 |
+
self._scorer = EmbeddingsOriginalityScorer(index, all_labels_np, sm)
|
| 18 |
|
| 19 |
|
| 20 |
+
def get_scores(self, embeddings, lengths):
|
| 21 |
+
score = self._scorer.score(embeddings, pd.Series(lengths))
|
| 22 |
return score
|
| 23 |
|
| 24 |
def get_neighbours(self, embeddings, limit=None):
|
src/convert.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from gradio_client import Client
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
_CONSTANT_GAP_SECS = 2
|
| 7 |
+
_SEQ_EMBED_SPACE = 'ohollo/chord-seq-embed'
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
_client = Client(_SEQ_EMBED_SPACE)
|
| 11 |
+
|
| 12 |
+
def _call_embedding_service(chords_w_timestamps):
    """
    Send chord/timestamp payloads to the remote embedding Space and decode the reply.

    :param chords_w_timestamps: JSON-serializable payload of chord labels with timestamps.
    :return: Decoded JSON response from the service.
    """
    payload = json.dumps(chords_w_timestamps)
    raw_response = _client.predict(payload, api_name="/predict")
    return json.loads(raw_response)
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_embeddings_from_chord_sequences(chord_sequences: list[list[str]], constant_gap_secs: float = _CONSTANT_GAP_SECS) -> np.ndarray:
    """
    Converts chord sequences into their corresponding embeddings.

    Timestamps are synthesized at a constant gap of ``constant_gap_secs`` between
    consecutive chords, since only the chord order is known here.

    :param chord_sequences: List of chord sequences, each a list of chord labels.
    :param constant_gap_secs: Gap in seconds between consecutive synthetic timestamps.
    :return: 2-d numpy array of embeddings, one row per chord sequence.
    """
    chords_w_timestamps = [
        {
            'label': chord_sequence,
            # Only the index matters for the synthetic timestamps; avoid the
            # unused-element `enumerate` unpack of the original.
            'timestamp': [i * constant_gap_secs for i in range(len(chord_sequence))],
        }
        for chord_sequence in chord_sequences
    ]
    # NOTE(review): assumes the service reply always carries an 'embeddings' key —
    # confirm against the Space's /predict contract.
    return np.array(_call_embedding_service(chords_w_timestamps)['embeddings'])
src/methodology.py
CHANGED
|
@@ -31,7 +31,8 @@ class SimpleMethodology(CountBasedMethodology):
|
|
| 31 |
|
| 32 |
def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
|
| 33 |
unscaled = (neighbours_df['0.8'] - 1 ) * 1 + (neighbours_df['0.85'] - 1) * 2 + (neighbours_df['0.9'] - 1) * 3 + (neighbours_df['0.925'] - 1) * 4 + (neighbours_df['0.95'] - 1) * 5
|
| 34 |
-
|
|
|
|
| 35 |
lambda row: self._scalers.get(row['length'], self._fallback_scaler).transform(pd.DataFrame({_SCALER_X_LABEL: row['unscaled']}, index=[0]))[0][0],
|
| 36 |
axis=1
|
| 37 |
)
|
|
|
|
| 31 |
|
| 32 |
def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
|
| 33 |
unscaled = (neighbours_df['0.8'] - 1 ) * 1 + (neighbours_df['0.85'] - 1) * 2 + (neighbours_df['0.9'] - 1) * 3 + (neighbours_df['0.925'] - 1) * 4 + (neighbours_df['0.95'] - 1) * 5
|
| 34 |
+
concat = pd.concat([unscaled.rename('unscaled'), lengths.rename('length')], axis=1)
|
| 35 |
+
scaled = concat.apply(
|
| 36 |
lambda row: self._scalers.get(row['length'], self._fallback_scaler).transform(pd.DataFrame({_SCALER_X_LABEL: row['unscaled']}, index=[0]))[0][0],
|
| 37 |
axis=1
|
| 38 |
)
|
src/neighbours.py
CHANGED
|
@@ -7,6 +7,8 @@ import pandas as pd
|
|
| 7 |
|
| 8 |
from src.utils import indices_distances_gen
|
| 9 |
|
|
|
|
|
|
|
| 10 |
|
| 11 |
class Neighbour(NamedTuple):
|
| 12 |
distance: float
|
|
@@ -24,7 +26,8 @@ class EmbeddingClosestNeighbours:
|
|
| 24 |
:param metadata: Pandas DataFrame containing metadata for each indexed entry. Index should be aligned with labels.
|
| 25 |
:param close_threshold: Similarity threshold to consider embeddings as "close".
|
| 26 |
"""
|
| 27 |
-
def __init__(self, index: faiss.Index, labels: np.ndarray, lengths: np.ndarray,
|
|
|
|
| 28 |
self._index = index
|
| 29 |
self._labels = labels
|
| 30 |
self._lengths = lengths
|
|
@@ -32,7 +35,7 @@ class EmbeddingClosestNeighbours:
|
|
| 32 |
self._close_threshold = close_threshold
|
| 33 |
|
| 34 |
def get(self, embeddings: np.ndarray, limit: int = None) -> list[list[Neighbour]]:
|
| 35 |
-
lims, D, I = self._index.range_search(embeddings, self._close_threshold)
|
| 36 |
all_neighbours = []
|
| 37 |
for indices_, distances_ in indices_distances_gen(embeddings, self._close_threshold, self._index):
|
| 38 |
lengths_ = self._lengths[indices_]
|
|
|
|
| 7 |
|
| 8 |
from src.utils import indices_distances_gen
|
| 9 |
|
| 10 |
+
_CLOSE_THRESHOLD_DEFAULT = 0.99
|
| 11 |
+
|
| 12 |
|
| 13 |
class Neighbour(NamedTuple):
|
| 14 |
distance: float
|
|
|
|
| 26 |
:param metadata: Pandas DataFrame containing metadata for each indexed entry. Index should be aligned with labels.
|
| 27 |
:param close_threshold: Similarity threshold to consider embeddings as "close".
|
| 28 |
"""
|
| 29 |
+
def __init__(self, index: faiss.Index, labels: np.ndarray, lengths: np.ndarray,
|
| 30 |
+
metadata: pd.DataFrame, close_threshold: float = _CLOSE_THRESHOLD_DEFAULT):
|
| 31 |
self._index = index
|
| 32 |
self._labels = labels
|
| 33 |
self._lengths = lengths
|
|
|
|
| 35 |
self._close_threshold = close_threshold
|
| 36 |
|
| 37 |
def get(self, embeddings: np.ndarray, limit: int = None) -> list[list[Neighbour]]:
|
| 38 |
+
# lims, D, I = self._index.range_search(embeddings, self._close_threshold)
|
| 39 |
all_neighbours = []
|
| 40 |
for indices_, distances_ in indices_distances_gen(embeddings, self._close_threshold, self._index):
|
| 41 |
lengths_ = self._lengths[indices_]
|
src/scorer.py
CHANGED
|
@@ -14,7 +14,7 @@ def _count_unique_neighbours(embeddings, radius, index, all_labels):
|
|
| 14 |
return res
|
| 15 |
|
| 16 |
|
| 17 |
-
class
|
| 18 |
"""
|
| 19 |
Scores embeddings based on their originality. Specifically using counts of unique neighbours within certain radii.
|
| 20 |
|
|
|
|
| 14 |
return res
|
| 15 |
|
| 16 |
|
| 17 |
+
class EmbeddingsOriginalityScorer:
|
| 18 |
"""
|
| 19 |
Scores embeddings based on their originality. Specifically using counts of unique neighbours within certain radii.
|
| 20 |
|
src/utils.py
CHANGED
|
@@ -3,7 +3,15 @@ import numpy as np
|
|
| 3 |
|
| 4 |
|
| 5 |
def indices_distances_gen(embeddings: np.ndarray, radius: float, index: faiss.Index):
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
# Iterate over lims and get indices per embedding
|
| 8 |
for i in range(len(lims) - 1):
|
| 9 |
start = lims[i]
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def indices_distances_gen(embeddings: np.ndarray, radius: float, index: faiss.Index):
|
| 6 |
+
"""
|
| 7 |
+
Generator that yields indices and distances of neighbors within a given radius for each embedding.
|
| 8 |
+
:param embeddings: 2-d Numpy array where each row is an embedding to search neighbors for.
|
| 9 |
+
:param radius: Similarity radius to search within.
|
| 10 |
+
:param index: FAISS index for similarity search.
|
| 11 |
+
"""
|
| 12 |
+
embeddings_copy = embeddings.copy().astype(np.float32)
|
| 13 |
+
faiss.normalize_L2(embeddings_copy)
|
| 14 |
+
lims, D, I = index.range_search(embeddings_copy, radius)
|
| 15 |
# Iterate over lims and get indices per embedding
|
| 16 |
for i in range(len(lims) - 1):
|
| 17 |
start = lims[i]
|