# PawMatchAI / smart_breed_matcher.py
# DawnC's picture
# Update smart_breed_matcher.py
# edb9086
# raw
# history blame
# 22.5 kB
import torch
import re
import numpy as np
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from breed_health_info import breed_health_info
from breed_noise_info import breed_noise_info
from dog_database import dog_data
from scoring_calculation_system import UserPreferences
from sentence_transformers import SentenceTransformer, util
class SmartBreedMatcher:
    """Rank dog breeds against a free-text description or a named breed.

    ``dog_data`` is a list of per-breed tuples. The positions this class
    reads are: 1 (breed name), 2 (size), 4 (temperament), 7 (exercise
    needs) and 9 (description).
    """

    def __init__(self, dog_data: List[Tuple]):
        """Store the breed table and load the sentence-embedding model."""
        self.dog_data = dog_data
        self.model = SentenceTransformer('all-mpnet-base-v2')
        # text -> embedding, so each distinct string is encoded only once.
        self._embedding_cache = {}

    def _get_cached_embedding(self, text: str) -> np.ndarray:
        """Return the embedding of ``text``, encoding it on first use.

        The value is whatever ``self.model.encode(text)`` returns; with the
        default ``encode`` arguments that is a NumPy array.
        """
        if text not in self._embedding_cache:
            self._embedding_cache[text] = self.model.encode(text)
        return self._embedding_cache[text]

    def _categorize_breeds(self) -> Dict:
        """Group breed names into coarse categories by description keywords.

        Each breed lands in at most one category: the first rule (in the
        order listed below) whose keywords appear in the lower-cased
        description. Breeds matching no rule are left out.

        Returns:
            Dict mapping category name to a list of breed names.
        """
        # Order matters: the first matching rule wins.
        rules = (
            ('herding_dogs', ('herding', 'shepherd', 'cattle', 'flock')),
            ('hunting_dogs', ('hunting', 'hunt', 'retriever', 'pointer')),
            ('companion_dogs', ('companion', 'toy', 'family', 'lap')),
            ('guard_dogs', ('guard', 'protection', 'watchdog')),
            ('working_dogs', ('working', 'draft', 'cart')),
        )
        categories = {
            'working_dogs': [],
            'herding_dogs': [],
            'hunting_dogs': [],
            'companion_dogs': [],
            'guard_dogs': [],
        }
        for breed_info in self.dog_data:
            description = breed_info[9].lower()
            for category, keywords in rules:
                if any(word in description for word in keywords):
                    categories[category].append(breed_info[1])
                    break
        return categories

    @staticmethod
    def _extract_features(breed: Tuple) -> Dict:
        """Pick the fields used for breed-to-breed comparison out of a row."""
        return {
            'breed_name': breed[1],
            'size': breed[2],
            'temperament': breed[4],
            'exercise': breed[7],
            'description': breed[9],
        }

    def find_similar_breeds(self, breed_name: str, top_n: int = 5) -> List[Tuple[str, float]]:
        """Find the breeds most similar to ``breed_name``.

        Args:
            breed_name: Exact breed name as stored at index 1 of a row.
            top_n: Maximum number of results.

        Returns:
            ``(name, similarity)`` pairs sorted by descending similarity,
            excluding ``breed_name`` itself. Empty if the breed is unknown.
        """
        target_breed = next(
            (breed for breed in self.dog_data if breed[1] == breed_name), None
        )
        if not target_breed:
            return []

        target_features = self._extract_features(target_breed)
        similarities = [
            (
                breed[1],
                self._calculate_breed_similarity(
                    target_features, self._extract_features(breed)
                ),
            )
            for breed in self.dog_data
            if breed[1] != breed_name
        ]
        return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]

    def _calculate_breed_similarity(self, breed1_features: Dict, breed2_features: Dict) -> float:
        """Compute a weighted similarity between two breeds.

        Combines description and temperament embedding similarity with
        size, exercise, health and noise similarity.

        Args:
            breed1_features: Feature dict of the reference breed (keys:
                ``breed_name``, ``size``, ``temperament``, ``exercise``,
                ``description``).
            breed2_features: Feature dict of the candidate breed.

        Returns:
            The weighted sum of the six component similarities.
        """
        desc1_embedding = self._get_cached_embedding(breed1_features['description'])
        desc2_embedding = self._get_cached_embedding(breed2_features['description'])
        description_similarity = float(util.pytorch_cos_sim(desc1_embedding, desc2_embedding))

        # The preferred size range is derived from the reference breed's
        # own description text.
        size_similarity = self._calculate_size_similarity(
            breed1_features['size'],
            breed2_features['size'],
            self._get_preferred_size_range(breed1_features['description'])
        )

        exercise_similarity = self._calculate_exercise_similarity(
            breed1_features['exercise'], breed2_features['exercise']
        )

        temp1_embedding = self._get_cached_embedding(breed1_features['temperament'])
        temp2_embedding = self._get_cached_embedding(breed2_features['temperament'])
        temperament_similarity = float(util.pytorch_cos_sim(temp1_embedding, temp2_embedding))

        health_score1 = self._calculate_health_score(breed1_features['breed_name'])
        health_score2 = self._calculate_health_score(breed2_features['breed_name'])
        health_similarity = 1.0 - abs(health_score1 - health_score2)

        noise_similarity = self._calculate_noise_similarity(
            breed1_features['breed_name'],
            breed2_features['breed_name']
        )

        # Weights sum to 1.0; size carries as much weight as the text features.
        weights = {
            'description': 0.20,
            'temperament': 0.20,
            'exercise': 0.20,
            'size': 0.20,
            'health': 0.10,
            'noise': 0.10
        }
        final_similarity = (
            description_similarity * weights['description'] +
            temperament_similarity * weights['temperament'] +
            exercise_similarity * weights['exercise'] +
            size_similarity * weights['size'] +
            health_similarity * weights['health'] +
            noise_similarity * weights['noise']
        )
        return final_similarity

    def _calculate_final_scores(self, breed_name: str, base_scores: Dict,
                                smart_score: float, is_preferred: bool,
                                similarity_score: float = 0.0) -> Dict:
        """Blend base, smart-match and bonus scores into a final score.

        Args:
            breed_name: Breed name (not used in the calculation).
            base_scores: Per-category base scores; ``'overall'`` is used as
                the base score and defaults to 0.7 when absent.
            smart_score: Smart-matching score.
            is_preferred: Whether this is the breed the user named.
            similarity_score: Similarity to the named breed.

        Returns:
            Dict with ``final_score``, ``base_score``, ``bonus_score`` and
            ``scores`` (a boosted copy of ``base_scores``), each rounded to
            four decimals.
        """
        weights = {
            'base': 0.6,
            'smart': 0.25,
            'bonus': 0.15
        }
        base_score = base_scores.get('overall', 0.7)

        bonus_score = 0.0
        if is_preferred:
            # The named breed receives the top bonus.
            bonus_score = 0.95
        elif similarity_score > 0:
            # Similar breeds get a partial bonus, capped at 80% of the top one.
            bonus_score = min(0.8, similarity_score) * 0.95

        final_score = (
            base_score * weights['base'] +
            smart_score * weights['smart'] +
            bonus_score * weights['bonus']
        )

        # Work on a copy so the caller's dict is not mutated.
        scores = base_scores.copy()
        if is_preferred:
            # Lift every non-overall score by 10%, capped at 1.0.
            for key in scores:
                if key != 'overall':
                    scores[key] = min(1.0, scores[key] * 1.1)
        elif similarity_score > 0:
            # Lift by similarity * 5%, capped at 0.95.
            boost_factor = 1.0 + (similarity_score * 0.05)
            for key in scores:
                if key != 'overall':
                    scores[key] = min(0.95, scores[key] * boost_factor)

        return {
            'final_score': round(final_score, 4),
            'base_score': round(base_score, 4),
            'bonus_score': round(bonus_score, 4),
            'scores': {k: round(v, 4) for k, v in scores.items()}
        }

    def _get_preferred_size_range(self, description: str) -> tuple:
        """Infer a preferred size range from size words in ``description``.

        Sizes use the numeric scale of ``_calculate_size_similarity``
        (1 = small, 2 = medium, 3 = large, 4 = giant). Matching is by
        substring on the lower-cased text.

        Negated phrases ("not too large") are removed before positive
        keywords are searched, and longer keywords are consumed before
        shorter ones, so "medium to large" is not re-read as "large".
        When several categories match, the largest-listed one wins.

        Returns:
            ``(preferred_min, preferred_max)`` with ``min <= max``;
            ``(2, 3)`` when no size word is found.
        """
        text = description.lower()

        negative_indicators = {
            'small': ['not too small', 'not small'],
            'large': ['not too large', 'not too big', 'not large'],
            'giant': ['not giant', 'not huge']
        }
        # Listed from smallest to largest; a later match overrides an earlier one.
        size_indicators = (
            ('small', ['small', 'tiny', 'little'], (1, 2)),
            ('medium', ['medium', 'medium-sized', 'moderate size'], (2, 2)),
            ('medium-large', ['medium to large', 'slightly larger', 'medium-large'], (2, 3)),
            ('large', ['large', 'big'], (3, 3)),
            ('giant', ['giant', 'huge', 'very large'], (3, 4)),
        )

        # Pass 1: record negated sizes and blank them out so the size word
        # inside the phrase is not also counted as a positive mention.
        rejected = set()
        for size, phrases in negative_indicators.items():
            for phrase in phrases:
                if phrase in text:
                    rejected.add(size)
                    text = text.replace(phrase, ' ')

        # Pass 2: positive mentions, longest keyword first, consuming each
        # match so its substrings cannot match a different category.
        keyword_sizes = [
            (keyword, size)
            for size, keywords, _ in size_indicators
            for keyword in keywords
        ]
        mentioned = set()
        for keyword, size in sorted(keyword_sizes, key=lambda item: -len(item[0])):
            if keyword in text:
                mentioned.add(size)
                text = text.replace(keyword, ' ')

        # Default range is medium to large.
        preferred_min, preferred_max = 2, 3
        for size, _, size_range in size_indicators:
            if size in mentioned:
                preferred_min, preferred_max = size_range

        # Tighten the range according to the negated sizes.
        if 'small' in rejected:
            preferred_min = max(2, preferred_min)
        if 'large' in rejected:
            preferred_max = min(2, preferred_max)
        if 'giant' in rejected:
            preferred_max = min(3, preferred_max)

        # Contradictory text ("large ... not too large") could leave
        # min > max; let the negation win instead of returning an inverted range.
        if preferred_min > preferred_max:
            preferred_min = preferred_max
        return (preferred_min, preferred_max)

    def _calculate_size_similarity(self, size1: str, size2: str, preferred_range: tuple = None) -> float:
        """Score how close two size labels are, in ``[0, 1]``.

        Args:
            size1: Size label of the reference breed.
            size2: Size label of the candidate breed.
            preferred_range: Optional ``(min, max)`` on the numeric size
                scale. If ``size2`` falls outside it, the score is reduced
                by 20% per unit of distance to the range.

        Returns:
            Similarity clamped to ``[0, 1]``. Unknown labels are treated as
            medium (2).
        """
        size_map = {
            'Tiny': 0.5,
            'Small': 1,
            'Small-Medium': 1.5,
            'Medium': 2,
            'Medium-Large': 2.5,
            'Large': 3,
            'Giant': 4
        }
        value1 = size_map.get(size1, 2)
        value2 = size_map.get(size2, 2)

        # 3.5 is the largest possible gap (Tiny vs Giant).
        base_similarity = 1.0 - (abs(value1 - value2) / 3.5)

        if preferred_range:
            preferred_min, preferred_max = preferred_range
            in_range = (preferred_min <= value2 <= preferred_max)
            if not in_range:
                distance_to_range = min(
                    abs(value2 - preferred_min),
                    abs(value2 - preferred_max)
                )
                penalty = distance_to_range * 0.2
                base_similarity *= (1 - penalty)

        return max(0.0, min(1.0, base_similarity))

    def _calculate_exercise_similarity(self, exercise1: str, exercise2: str) -> float:
        """Score how close two exercise-need labels are, in ``[0, 1]``.

        Unknown labels are treated as ``'Moderate'``.
        """
        exercise_map = {'Low': 1, 'Moderate': 2, 'High': 3, 'Very High': 4}
        value1 = exercise_map.get(exercise1, 2)
        value2 = exercise_map.get(exercise2, 2)
        # 3 is the largest possible gap (Low vs Very High).
        exercise_similarity = 1.0 - abs(value1 - value2) / 3
        return max(0.0, exercise_similarity)

    def _calculate_health_score(self, breed_name: str) -> float:
        """Derive a health score for a breed from its health notes.

        Starts at 1.0 and subtracts 0.1 for each severe keyword and 0.05
        for each moderate keyword found in the breed's ``health_notes``.
        If the instance has a ``user_preferences`` attribute, further
        reductions are applied for households with children and for high
        health sensitivity.

        Returns:
            Score clamped to ``[0.3, 1.0]``; 0.5 when the breed has no
            entry in ``breed_health_info``.
        """
        if breed_name not in breed_health_info:
            return 0.5
        health_notes = breed_health_info[breed_name]['health_notes'].lower()

        severe_conditions = [
            'cancer', 'cardiomyopathy', 'epilepsy', 'dysplasia',
            'bloat', 'progressive', 'syndrome'
        ]
        # NOTE(review): matching is by substring, so 'ear' also matches
        # words such as "heart" — confirm whether that is intended.
        moderate_conditions = [
            'allergies', 'infections', 'thyroid', 'luxation',
            'skin problems', 'ear'
        ]
        severe_count = sum(1 for condition in severe_conditions if condition in health_notes)
        moderate_count = sum(1 for condition in moderate_conditions if condition in health_notes)

        health_score = 1.0
        health_score -= (severe_count * 0.1)
        health_score -= (moderate_count * 0.05)

        # Optional adjustment; ``user_preferences`` is not set by __init__.
        if hasattr(self, 'user_preferences'):
            if self.user_preferences.has_children:
                if 'requires frequent' in health_notes or 'regular monitoring' in health_notes:
                    health_score *= 0.9
            if self.user_preferences.health_sensitivity == 'high':
                health_score *= 0.9

        return max(0.3, min(1.0, health_score))

    def _calculate_noise_similarity(self, breed1: str, breed2: str) -> float:
        """Score how close two breeds' noise levels are, in ``[0, 1]``.

        Breeds missing from ``breed_noise_info`` (or with an unrecognised
        level) are treated as ``'Moderate'``.
        """
        noise_levels = {
            'Low': 1,
            'Moderate': 2,
            'High': 3,
            'Unknown': 2
        }
        noise1 = breed_noise_info.get(breed1, {}).get('noise_level', 'Unknown')
        noise2 = breed_noise_info.get(breed2, {}).get('noise_level', 'Unknown')
        level1 = noise_levels.get(noise1, 2)
        level2 = noise_levels.get(noise2, 2)
        # The largest possible gap is 2 (Low vs High).
        difference = abs(level1 - level2)
        similarity = 1.0 - (difference / 2)
        return similarity

    def _general_matching(self, description: str, top_n: int = 10) -> List[Dict]:
        """Rank all breeds against a free-text description.

        Used when the description names no specific breed. The score
        blends embedding similarity of the description against each
        breed's description and temperament with noise and health terms.

        Returns:
            Up to ``top_n`` match dicts sorted by descending ``score``.
        """
        matches = []
        desc_embedding = self._get_cached_embedding(description)
        weights = {
            'description': 0.35,
            'temperament': 0.25,
            'noise': 0.2,
            'health': 0.2
        }
        for breed in self.dog_data:
            breed_name = breed[1]
            breed_description = breed[9]
            temperament = breed[4]

            breed_desc_embedding = self._get_cached_embedding(breed_description)
            breed_temp_embedding = self._get_cached_embedding(temperament)
            desc_similarity = float(util.pytorch_cos_sim(desc_embedding, breed_desc_embedding))
            temp_similarity = float(util.pytorch_cos_sim(desc_embedding, breed_temp_embedding))

            # NOTE(review): the breed is compared with itself, so this is
            # always 1.0 and adds the same constant to every breed —
            # confirm whether a user noise preference was intended here.
            noise_similarity = self._calculate_noise_similarity(breed_name, breed_name)
            health_score = self._calculate_health_score(breed_name)
            # Closeness of the breed's health score to a fixed target of 0.8.
            health_similarity = 1.0 - abs(health_score - 0.8)

            final_score = (
                desc_similarity * weights['description'] +
                temp_similarity * weights['temperament'] +
                noise_similarity * weights['noise'] +
                health_similarity * weights['health']
            )
            matches.append({
                'breed': breed_name,
                'score': final_score,
                'is_preferred': False,
                'similarity': final_score,
                'reason': "Matched based on description, temperament, noise level, and health score"
            })
        return sorted(matches, key=lambda x: -x['score'])[:top_n]

    def _detect_breed_preference(self, description: str) -> Optional[str]:
        """Return the breed explicitly named in ``description``, if any.

        Breed names are compared case-insensitively with underscores read
        as spaces, on word boundaries. When several breed names match, the
        longest one wins, so a specific name is not shadowed by a shorter
        name it contains; ties keep the earlier row in ``dog_data``.

        Returns:
            The breed name as stored in ``dog_data``, or ``None``.
        """
        description_lower = description.lower()
        best_name = None
        best_length = 0
        for breed_info in self.dog_data:
            breed_name = breed_info[1]
            normalized_breed = breed_name.lower().replace('_', ' ')
            # Only a strictly longer name can replace the current best.
            if len(normalized_breed) <= best_length:
                continue
            pattern = rf"\b{re.escape(normalized_breed)}\b"
            if re.search(pattern, description_lower):
                best_name = breed_name
                best_length = len(normalized_breed)
        return best_name

    @staticmethod
    def _rank_matches(matches: List[Dict], top_n: int) -> List[Dict]:
        """Order match dicts for presentation and keep the first ``top_n``.

        Sort order: ascending ``priority`` (1 is the highest priority),
        preferred breed first, then descending score, then breed name.
        The score is ``final_score`` when present, otherwise ``score``.
        """
        def sort_key(match: Dict) -> tuple:
            score = match.get('final_score', match.get('score', 0))
            return (
                match.get('priority', 3),
                not match.get('is_preferred', False),
                -float(score),
                match.get('breed', ''),
            )

        return sorted(matches, key=sort_key)[:top_n]

    def match_user_preference(self, description: str, top_n: int = 10) -> List[Dict]:
        """Return the breeds that best fit the user's description.

        If the description names a known breed, that breed is returned
        first (priority 1) followed by the breeds most similar to it
        (priority 2). Otherwise all breeds are ranked by general matching
        (priority 3).

        Args:
            description: Free-text description from the user.
            top_n: Maximum number of results.

        Returns:
            Up to ``top_n`` match dicts, best match first.
        """
        preferred_breed = self._detect_breed_preference(description)

        if not preferred_breed:
            matches = self._general_matching(description, top_n)
            for match in matches:
                match['priority'] = 3
            return self._rank_matches(matches, top_n)

        matches = []
        # The named breed gets the maximum base and smart scores.
        scores = self._calculate_final_scores(
            preferred_breed,
            {'overall': 1.0},
            smart_score=1.0,
            is_preferred=True,
            similarity_score=1.0
        )
        matches.append({
            'breed': preferred_breed,
            'score': 1.0,
            'final_score': scores['final_score'],
            'base_score': scores['base_score'],
            'bonus_score': scores['bonus_score'],
            'is_preferred': True,
            'priority': 1,
            'health_score': self._calculate_health_score(preferred_breed),
            'noise_level': breed_noise_info.get(preferred_breed, {}).get('noise_level', 'Unknown'),
            'reason': "Directly matched your preferred breed"
        })

        # One slot is taken by the named breed; never pass a negative
        # count, which would slice "all but the last" instead of nothing.
        similar_breeds = self.find_similar_breeds(preferred_breed, top_n=max(0, top_n - 1))
        for breed_name, similarity in similar_breeds:
            if breed_name == preferred_breed:
                continue
            scores = self._calculate_final_scores(
                breed_name,
                {'overall': similarity * 0.9},  # slightly below the named breed
                smart_score=similarity,
                is_preferred=False,
                similarity_score=similarity
            )
            matches.append({
                'breed': breed_name,
                'score': min(0.95, similarity),  # stay below the named breed
                'final_score': scores['final_score'],
                'base_score': scores['base_score'],
                'bonus_score': scores['bonus_score'],
                'is_preferred': False,
                'priority': 2,
                'health_score': self._calculate_health_score(breed_name),
                'noise_level': breed_noise_info.get(breed_name, {}).get('noise_level', 'Unknown'),
                'reason': f"Similar to {preferred_breed}"
            })

        return self._rank_matches(matches, top_n)