ohollo committed on
Commit
007017f
·
1 Parent(s): 87093fc

Introduce score power

Browse files
Files changed (5) hide show
  1. app.py +5 -5
  2. cfg.py +4 -3
  3. src/analysis.py +2 -2
  4. src/convert.py +4 -1
  5. src/methodology.py +4 -3
app.py CHANGED
@@ -20,7 +20,7 @@ logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
22
  # Load models and data
23
- print("Loading models and data...")
24
  all_labels = pd.read_csv(cfg.LABELS_LOCATION)
25
  scalers = joblib.load(cfg.SCALER_DICT_LOCATION)
26
  index = faiss.read_index(cfg.INDEX_LOCATION)
@@ -28,8 +28,8 @@ ds = load_dataset(cfg.LOOKUP_DS_NAME)
28
  lookup = ds['train'].to_pandas().set_index('track_id')[['title', 'artist']]
29
 
30
  # Initialize analysis
31
- ea = EmbeddingsAnalysis(index, all_labels, lookup, scalers, cfg.RADII, close_threshold=cfg.CLOSE_THRESHOLD)
32
- print("Models loaded successfully!")
33
 
34
  # Load how it works content
35
  with open(cfg.HOW_IT_WORKS_MD_LOCATION, 'r') as f:
@@ -130,8 +130,8 @@ def analyze_chord_sequence_text(chord_text: str) -> tuple[Optional[float], Optio
130
  embeddings = get_embeddings_from_chord_sequences([chords])
131
  neighbour_embeddings = None
132
  if len(chords) < cfg.MIN_SEQUENCE_LENGTH_FOR_NEIGHBOURS:
133
- padded_chords = _pad_sequence_by_repetition(chords, cfg.MIN_SEQUENCE_LENGTH_FOR_NEIGHBOURS)
134
- neighbour_embeddings = get_embeddings_from_chord_sequences([padded_chords])
135
  score, neighbours = _perform_analysis(embeddings, [len(chords)], neighbour_embeddings)
136
  return score, neighbours
137
  except AppError as e:
 
20
  logger = logging.getLogger(__name__)
21
 
22
  # Load models and data
23
+ logging.info("Loading models and data...")
24
  all_labels = pd.read_csv(cfg.LABELS_LOCATION)
25
  scalers = joblib.load(cfg.SCALER_DICT_LOCATION)
26
  index = faiss.read_index(cfg.INDEX_LOCATION)
 
28
  lookup = ds['train'].to_pandas().set_index('track_id')[['title', 'artist']]
29
 
30
  # Initialize analysis
31
+ ea = EmbeddingsAnalysis(index, all_labels, lookup, scalers, cfg.RADII, close_threshold=cfg.CLOSE_THRESHOLD, score_power=cfg.SCORE_POWER)
32
+ logging.info("Models loaded successfully!")
33
 
34
  # Load how it works content
35
  with open(cfg.HOW_IT_WORKS_MD_LOCATION, 'r') as f:
 
130
  embeddings = get_embeddings_from_chord_sequences([chords])
131
  neighbour_embeddings = None
132
  if len(chords) < cfg.MIN_SEQUENCE_LENGTH_FOR_NEIGHBOURS:
133
+ chords = _pad_sequence_by_repetition(chords, cfg.MIN_SEQUENCE_LENGTH_FOR_NEIGHBOURS)
134
+ neighbour_embeddings = get_embeddings_from_chord_sequences([chords])
135
  score, neighbours = _perform_analysis(embeddings, [len(chords)], neighbour_embeddings)
136
  return score, neighbours
137
  except AppError as e:
cfg.py CHANGED
@@ -6,6 +6,7 @@ SCALER_DICT_LOCATION = './assets/quantile_transformers.joblib'
6
  MIN_SEQUENCE_LENGTH_FOR_NEIGHBOURS = 18
7
  HOW_IT_WORKS_MD_LOCATION = './how_it_works.md'
8
  HOW_IT_WORKS_SVG_LOCATION = './assets/harmonic_analysis_simple.svg'
9
- RADII = (0.8, 0.85, 0.9, 0.925, 0.95)
10
- # RADII = (0.7, 0.925, 0.95, 0.99, 0.995)
11
- # RADII = (0.7, 0.85, 0.9, 0.99, 0.995)
 
 
6
  MIN_SEQUENCE_LENGTH_FOR_NEIGHBOURS = 18
7
  HOW_IT_WORKS_MD_LOCATION = './how_it_works.md'
8
  HOW_IT_WORKS_SVG_LOCATION = './assets/harmonic_analysis_simple.svg'
9
+ # RADII = (0.8, 0.85, 0.9, 0.925, 0.95)
10
+ RADII = (0.85, 0.9) + tuple(range(1, 50)) + (0.925, 0.95, 0.975)
11
+ SCORE_POWER = 0.5
12
+
src/analysis.py CHANGED
@@ -17,12 +17,12 @@ class EmbeddingsAnalysis:
17
  :param scalers: Dictionary mapping length ranges to quantile transformers for score normalization.
18
  :param close_threshold: Similarity threshold for neighbor search.
19
  """
20
- def __init__(self, index, all_labels, lookup, scalers, radii, close_threshold=0.95):
21
  all_labels_np = all_labels['track_id'].to_numpy()
22
  all_lengths_np = all_labels['length'].to_numpy()
23
  self._ecn = EmbeddingClosestNeighbours(index, all_labels_np, all_lengths_np, lookup, close_threshold=close_threshold)
24
  specific_scalers = {i: scaler for (l, r), scaler in scalers.items() for i in range(l, r)}
25
- sm = SimpleMethodology(specific_scalers, specific_scalers[_FALLBACK_INDEX])
26
  self._scorer = EmbeddingsOriginalityScorer(index, all_labels_np, radii, sm)
27
 
28
 
 
17
  :param scalers: Dictionary mapping length ranges to quantile transformers for score normalization.
18
  :param close_threshold: Similarity threshold for neighbor search.
19
  """
20
+ def __init__(self, index, all_labels, lookup, scalers, radii, close_threshold=0.95, score_power=1.0):
21
  all_labels_np = all_labels['track_id'].to_numpy()
22
  all_lengths_np = all_labels['length'].to_numpy()
23
  self._ecn = EmbeddingClosestNeighbours(index, all_labels_np, all_lengths_np, lookup, close_threshold=close_threshold)
24
  specific_scalers = {i: scaler for (l, r), scaler in scalers.items() for i in range(l, r)}
25
+ sm = SimpleMethodology(specific_scalers, specific_scalers[_FALLBACK_INDEX], score_power=score_power)
26
  self._scorer = EmbeddingsOriginalityScorer(index, all_labels_np, radii, sm)
27
 
28
 
src/convert.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import numpy as np
2
  from gradio_client import Client
3
  import os
@@ -5,6 +6,8 @@ import json
5
  import time
6
  import httpx
7
 
 
 
8
  from chord_extractor.extractors import Chordino
9
  from chord_extractor import clear_conversion_cache, LabelledChordSequence
10
 
@@ -28,7 +31,7 @@ def _create_client():
28
  _client = _create_client()
29
 
30
  def _call_embedding_service(chords_w_timestamps):
31
- print(chords_w_timestamps)
32
  result = _client.predict(json.dumps(chords_w_timestamps), api_name="/predict")
33
  return json.loads(result)
34
 
 
1
+ import logging
2
  import numpy as np
3
  from gradio_client import Client
4
  import os
 
6
  import time
7
  import httpx
8
 
9
+ logger = logging.getLogger(__name__)
10
+
11
  from chord_extractor.extractors import Chordino
12
  from chord_extractor import clear_conversion_cache, LabelledChordSequence
13
 
 
31
  _client = _create_client()
32
 
33
  def _call_embedding_service(chords_w_timestamps):
34
+ logger.info(chords_w_timestamps)
35
  result = _client.predict(json.dumps(chords_w_timestamps), api_name="/predict")
36
  return json.loads(result)
37
 
src/methodology.py CHANGED
@@ -18,15 +18,16 @@ class CountBasedMethodology(ABC):
18
 
19
 
20
  class SimpleMethodology(CountBasedMethodology):
21
- def __init__(self, scalers: dict[int, _TransformerProtocol], fallback_scaler: _TransformerProtocol):
22
  self._scalers = scalers
23
  self._fallback_scaler = fallback_scaler
 
24
 
25
  def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
26
- unscaled = sum((neighbours_df[col] - 1) * (i + 1) for i, col in enumerate(neighbours_df.columns))
27
  concat = pd.concat([unscaled.rename('unscaled'), lengths.rename('length')], axis=1)
28
  scaled = concat.apply(
29
  lambda row: self._scalers.get(row['length'], self._fallback_scaler).transform(pd.DataFrame({_SCALER_X_LABEL: row['unscaled']}, index=[0]))[0][0],
30
  axis=1
31
  )
32
- return 1 - scaled
 
18
 
19
 
20
  class SimpleMethodology(CountBasedMethodology):
21
+ def __init__(self, scalers: dict[int, _TransformerProtocol], fallback_scaler: _TransformerProtocol, score_power: float = 1.0):
22
  self._scalers = scalers
23
  self._fallback_scaler = fallback_scaler
24
+ self._score_power = score_power
25
 
26
  def execute(self, neighbours_df: pd.DataFrame, lengths: pd.Series) -> pd.Series:
27
+ unscaled = sum(neighbours_df[col] * (i + 1) for i, col in enumerate(neighbours_df.columns))
28
  concat = pd.concat([unscaled.rename('unscaled'), lengths.rename('length')], axis=1)
29
  scaled = concat.apply(
30
  lambda row: self._scalers.get(row['length'], self._fallback_scaler).transform(pd.DataFrame({_SCALER_X_LABEL: row['unscaled']}, index=[0]))[0][0],
31
  axis=1
32
  )
33
+ return 1 - scaled ** self._score_power