ohollo commited on
Commit
681b241
·
1 Parent(s): a7d861a

Update assets and fix up scoring

Browse files
assets/chords_20251119.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1170d567f73e344462260571285ece153c468a79be65cd21cf6d52fc435bce6
3
+ size 281385005
assets/quantile_transformers_20251122.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20b23160289a8a3b3c56ce9a27193c1e2eb658824f843fd2c8b8db103c15818d
3
+ size 164145
src/__pycache__/analysis.cpython-311.pyc ADDED
Binary file (2.41 kB). View file
 
src/__pycache__/convert.cpython-311.pyc ADDED
Binary file (2.13 kB). View file
 
src/__pycache__/methodology.cpython-311.pyc ADDED
Binary file (3.74 kB). View file
 
src/__pycache__/neighbours.cpython-311.pyc ADDED
Binary file (3.93 kB). View file
 
src/__pycache__/scorer.cpython-311.pyc ADDED
Binary file (2.77 kB). View file
 
src/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.47 kB). View file
 
src/analysis.py CHANGED
@@ -3,7 +3,7 @@ from dataclasses import dataclass
3
  import pandas as pd
4
  from src.methodology import SimpleMethodology
5
  from src.neighbours import EmbeddingClosestNeighbours
6
- from src.scorer import EmbeddingScorer
7
 
8
 
9
 
@@ -14,11 +14,11 @@ class EmbeddingsAnalysis:
14
  self._ecn = EmbeddingClosestNeighbours(index, all_labels_np, all_lengths_np, lookup, close_threshold=close_threshold)
15
  specific_scalers = {i: scaler for (l, r), scaler in scalers.items() for i in range(l, r)}
16
  sm = SimpleMethodology(specific_scalers, specific_scalers[99])
17
- self._scorer = EmbeddingScorer(index, all_labels_np, sm)
18
 
19
 
20
- def get_score(self, embeddings, lengths):
21
- score = self._scorer.score(embeddings, lengths)
22
  return score
23
 
24
  def get_neighbours(self, embeddings, limit=None):
 
3
  import pandas as pd
4
  from src.methodology import SimpleMethodology
5
  from src.neighbours import EmbeddingClosestNeighbours
6
+ from src.scorer import EmbeddingsOriginalityScorer
7
 
8
 
9
 
 
14
  self._ecn = EmbeddingClosestNeighbours(index, all_labels_np, all_lengths_np, lookup, close_threshold=close_threshold)
15
  specific_scalers = {i: scaler for (l, r), scaler in scalers.items() for i in range(l, r)}
16
  sm = SimpleMethodology(specific_scalers, specific_scalers[99])
17
+ self._scorer = EmbeddingsOriginalityScorer(index, all_labels_np, sm)
18
 
19
 
20
+ def get_scores(self, embeddings, lengths):
21
+ score = self._scorer.score(embeddings, pd.Series(lengths))
22
  return score
23
 
24
  def get_neighbours(self, embeddings, limit=None):
src/convert.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from gradio_client import Client
3
+ import os
4
+ import json
5
+
6
+ _CONSTANT_GAP_SECS = 2
7
+ _SEQ_EMBED_SPACE = 'ohollo/chord-seq-embed'
8
+
9
+
10
+ _client = Client(_SEQ_EMBED_SPACE)
11
+
12
def _call_embedding_service(chords_w_timestamps):
    """Send chord/timestamp payloads to the remote embedding Space and return the decoded JSON response."""
    raw_response = _client.predict(json.dumps(chords_w_timestamps), api_name="/predict")
    return json.loads(raw_response)
15
+
16
+
17
def get_embeddings_from_chord_sequences(chord_sequences: list[list[str]], constant_gap_secs: float = _CONSTANT_GAP_SECS) -> np.ndarray:
    """
    Convert chord sequences into their corresponding embeddings.

    Each sequence is paired with synthetic, evenly spaced timestamps
    (0, gap, 2*gap, ...) before being sent to the remote embedding service,
    since the service expects timed chord events.

    :param chord_sequences: List of chord sequences, each a list of chord labels.
    :param constant_gap_secs: Fixed gap in seconds between consecutive synthetic timestamps.
    :return: 2-d numpy array of embeddings, one row per chord sequence.
    """
    chords_w_timestamps = [
        {
            'label': chord_sequence,
            # Fabricate evenly spaced timestamps; only relative timing matters here.
            'timestamp': [i * constant_gap_secs for i in range(len(chord_sequence))],
        }
        for chord_sequence in chord_sequences
    ]
    return np.array(_call_embedding_service(chords_w_timestamps)['embeddings'])
src/methodology.py CHANGED
@@ -31,7 +31,8 @@ class SimpleMethodology(CountBasedMethodology):
31
 
32
  def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
33
  unscaled = (neighbours_df['0.8'] - 1 ) * 1 + (neighbours_df['0.85'] - 1) * 2 + (neighbours_df['0.9'] - 1) * 3 + (neighbours_df['0.925'] - 1) * 4 + (neighbours_df['0.95'] - 1) * 5
34
- scaled = unscaled.apply(
 
35
  lambda row: self._scalers.get(row['length'], self._fallback_scaler).transform(pd.DataFrame({_SCALER_X_LABEL: row['unscaled']}, index=[0]))[0][0],
36
  axis=1
37
  )
 
31
 
32
  def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
33
  unscaled = (neighbours_df['0.8'] - 1 ) * 1 + (neighbours_df['0.85'] - 1) * 2 + (neighbours_df['0.9'] - 1) * 3 + (neighbours_df['0.925'] - 1) * 4 + (neighbours_df['0.95'] - 1) * 5
34
+ concat = pd.concat([unscaled.rename('unscaled'), lengths.rename('length')], axis=1)
35
+ scaled = concat.apply(
36
  lambda row: self._scalers.get(row['length'], self._fallback_scaler).transform(pd.DataFrame({_SCALER_X_LABEL: row['unscaled']}, index=[0]))[0][0],
37
  axis=1
38
  )
src/neighbours.py CHANGED
@@ -7,6 +7,8 @@ import pandas as pd
7
 
8
  from src.utils import indices_distances_gen
9
 
 
 
10
 
11
  class Neighbour(NamedTuple):
12
  distance: float
@@ -24,7 +26,8 @@ class EmbeddingClosestNeighbours:
24
  :param metadata: Pandas DataFrame containing metadata for each indexed entry. Index should be aligned with labels.
25
  :param close_threshold: Similarity threshold to consider embeddings as "close".
26
  """
27
- def __init__(self, index: faiss.Index, labels: np.ndarray, lengths: np.ndarray, metadata: pd.DataFrame, close_threshold: float = CLOSE_THRESHOLD):
 
28
  self._index = index
29
  self._labels = labels
30
  self._lengths = lengths
@@ -32,7 +35,7 @@ class EmbeddingClosestNeighbours:
32
  self._close_threshold = close_threshold
33
 
34
  def get(self, embeddings: np.ndarray, limit: int = None) -> list[list[Neighbour]]:
35
- lims, D, I = self._index.range_search(embeddings, self._close_threshold)
36
  all_neighbours = []
37
  for indices_, distances_ in indices_distances_gen(embeddings, self._close_threshold, self._index):
38
  lengths_ = self._lengths[indices_]
 
7
 
8
  from src.utils import indices_distances_gen
9
 
10
+ _CLOSE_THRESHOLD_DEFAULT = 0.99
11
+
12
 
13
  class Neighbour(NamedTuple):
14
  distance: float
 
26
  :param metadata: Pandas DataFrame containing metadata for each indexed entry. Index should be aligned with labels.
27
  :param close_threshold: Similarity threshold to consider embeddings as "close".
28
  """
29
+ def __init__(self, index: faiss.Index, labels: np.ndarray, lengths: np.ndarray,
30
+ metadata: pd.DataFrame, close_threshold: float = _CLOSE_THRESHOLD_DEFAULT):
31
  self._index = index
32
  self._labels = labels
33
  self._lengths = lengths
 
35
  self._close_threshold = close_threshold
36
 
37
  def get(self, embeddings: np.ndarray, limit: int = None) -> list[list[Neighbour]]:
38
+ # lims, D, I = self._index.range_search(embeddings, self._close_threshold)
39
  all_neighbours = []
40
  for indices_, distances_ in indices_distances_gen(embeddings, self._close_threshold, self._index):
41
  lengths_ = self._lengths[indices_]
src/scorer.py CHANGED
@@ -14,7 +14,7 @@ def _count_unique_neighbours(embeddings, radius, index, all_labels):
14
  return res
15
 
16
 
17
- class EmbeddingScorer:
18
  """
19
  Scores embeddings based on their originality. Specifically using counts of unique neighbours within certain radii.
20
 
 
14
  return res
15
 
16
 
17
+ class EmbeddingsOriginalityScorer:
18
  """
19
  Scores embeddings based on their originality. Specifically using counts of unique neighbours within certain radii.
20
 
src/utils.py CHANGED
@@ -3,7 +3,15 @@ import numpy as np
3
 
4
 
5
  def indices_distances_gen(embeddings: np.ndarray, radius: float, index: faiss.Index):
6
- lims, D, I = index.range_search(embeddings, radius)
 
 
 
 
 
 
 
 
7
  # Iterate over lims and get indices per embedding
8
  for i in range(len(lims) - 1):
9
  start = lims[i]
 
3
 
4
 
5
  def indices_distances_gen(embeddings: np.ndarray, radius: float, index: faiss.Index):
6
+ """
7
+ Generator that yields indices and distances of neighbors within a given radius for each embedding.
8
+ :param embeddings: 2-d Numpy array where each row is an embedding to search neighbors for.
9
+ :param radius: Similarity radius to search within.
10
+ :param index: FAISS index for similarity search.
11
+ """
12
+ embeddings_copy = embeddings.copy().astype(np.float32)
13
+ faiss.normalize_L2(embeddings_copy)
14
+ lims, D, I = index.range_search(embeddings_copy, radius)
15
  # Iterate over lims and get indices per embedding
16
  for i in range(len(lims) - 1):
17
  start = lims[i]