Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import numpy as np | |
| import pandas as pd | |
| import time | |
| import math | |
| from difflib import SequenceMatcher | |
def recalc_diversity(portfolio, player_columns):
    """
    Score every row of ``portfolio`` by how much its players differ from the
    players of all other rows.

    Each row is reduced to the set of distinct, non-blank labels found in
    ``player_columns``. A row's raw score is its mean Jaccard distance
    (``1 - |A & B| / |A | B|``) to every *other* row. Raw scores are min-max
    normalised to the 0-1 range whenever they are not all identical;
    otherwise they are returned as computed.

    Args:
        portfolio: pandas DataFrame with one row per entry to be scored.
        player_columns: list of column labels in ``portfolio`` that hold the
            player names.

    Returns:
        1-D float NumPy array with one score per row of ``portfolio``.
        Portfolios with fewer than two rows have nothing to be compared
        against, so they yield an all-zero array of matching length.
    """
    n_rows = len(portfolio)
    if n_rows < 2:
        # No pairwise comparison is possible; avoid max() on an empty array
        # (0 rows) and a division by zero (1 row).
        return np.zeros(n_rows, dtype=float)

    frame = portfolio[player_columns]
    # Record missing cells *before* stringifying: astype(str) may turn NaN
    # into the literal 'nan', which would then be counted as a shared player.
    missing = frame.isna().to_numpy()
    labels = frame.astype(str).to_numpy()

    # One set of player labels per row; missing and blank cells are skipped.
    rosters = []
    for row_labels, row_missing in zip(labels, missing):
        rosters.append({
            label
            for label, is_missing in zip(row_labels, row_missing)
            if not is_missing and label.strip() != ''
        })

    # Map every distinct player to a column index of the membership matrix.
    all_players = set().union(*rosters)
    player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
    n_players = len(player_to_id)

    # membership[i, j] == 1 when row i contains player j. int32 (not int8)
    # so the dot product below cannot wrap for rows with >127 players.
    membership = np.zeros((n_rows, n_players), dtype=np.int32)
    for i, roster in enumerate(rosters):
        for player in roster:
            membership[i, player_to_id[player]] = 1

    # Pairwise |A & B| via a matrix product, |A | B| by inclusion-exclusion.
    intersection = membership @ membership.T
    sizes = membership.sum(axis=1)
    union = sizes[:, np.newaxis] + sizes - intersection

    # Pairs of two empty rows have union 0; their similarity is left at 0
    # (distance 1) instead of dividing by zero.
    similarity = np.divide(
        intersection,
        union,
        out=np.zeros(intersection.shape, dtype=float),
        where=union != 0,
    )
    distance = 1 - similarity

    # Exclude self-comparison, then average over the remaining n_rows - 1 rows.
    np.fill_diagonal(distance, 0)
    scores = distance.sum(axis=1) / (n_rows - 1)

    # Normalise to the 0-1 range when there is any spread to normalise.
    lowest = scores.min()
    spread = scores.max() - lowest
    if spread > 0:
        scores = (scores - lowest) / spread
    return scores