import numpy as np
from typing import List, Tuple, Dict, Any
from tf_data_pipeline import TFDataPipeline
from logger_config import config_logger
logger = config_logger(__name__)
class ResponseQualityChecker:
"""
The Response Quality Checker measures:
- Relevance (embedding or cross-encoder)
- Diversity among top responses
- Length
- Score gap
- Confidence
"""
def __init__(
self,
data_pipeline: "TFDataPipeline",
confidence_threshold: float = 0.45,
diversity_threshold: float = 0.10,
min_response_length: int = 5,
similarity_cap: float = 0.90
):
"""
Args:
data_pipeline: TFDataPipeline for encoding
confidence_threshold: Min top_score for 'confident'
diversity_threshold: Min average diversity for top responses
min_response_length: Min word count - 'valid length'
similarity_cap: Cap pairwise similarity to reduce outliers
"""
self.data_pipeline = data_pipeline
self.confidence_threshold = confidence_threshold
self.diversity_threshold = diversity_threshold
self.min_response_length = min_response_length
self.similarity_cap = similarity_cap
# Additional thresholds
self.thresholds = {
'relevance': 0.30,
'length_score': 0.80,
'score_gap': 0.05
}
def check_response_quality(
self,
query: str,
responses: List[Tuple[str, float]]
) -> Dict[str, Any]:
"""
Evaluate the quality of top-k responses:
- response_diversity
- query_response_relevance
- response_length_score
- top_score
- top_3_score_gap
- is_confident
"""
if not responses:
return {
'response_diversity': 0.0,
'query_response_relevance': 0.0,
'response_length_score': 0.0,
'top_score': 0.0,
'top_3_score_gap': 0.0,
'is_confident': False
}
metrics = {}
metrics['response_diversity'] = self._calc_diversity(responses)
metrics['query_response_relevance'] = self._calc_relevance(query, responses)
metrics['response_length_score'] = self._calc_length_score(responses)
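        # Index 0 is the top-ranked response (list assumed sorted by score descending).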
metrics['top_score'] = responses[0][1]
metrics['top_3_score_gap'] = self._calc_score_gap([score for _, score in responses])
metrics['is_confident'] = self._determine_confidence(metrics)
return metrics
def _calc_diversity(self, responses: List[Tuple[str, float]]) -> float:
"""
Average similarity among top response embeddings, capped by self.similarity_cap.
"""
if len(responses) < 2:
return 1.0 # Single response
texts = [r for r, _ in responses]
embs = self.data_pipeline.encode_responses(texts)
sim_matrix = self._cosine_similarity(embs, embs)
# Zero out diagonal
np.fill_diagonal(sim_matrix, 0.0)
# Cap similarity
sim_matrix = np.minimum(sim_matrix, self.similarity_cap)
sum_sims = np.sum(sim_matrix)
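        # N * (N - 1) ordered off-diagonal pairs (the diagonal was zeroed above)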
count = len(responses) * (len(responses) - 1)
avg_sim = sum_sims / count if count > 0 else 0.0
return 1.0 - avg_sim
def _calc_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
"""
Weighted average of exponential-transformed similarities for top-k.
Encourages a high similarity with the top responses.
"""
if not responses:
return 0.0
query_emb = self.data_pipeline.encode_query(query)
texts = [r for r, _ in responses]
resp_embs = self.data_pipeline.encode_responses(texts)
query_emb = query_emb / (np.linalg.norm(query_emb) + 1e-8)
norms = (np.linalg.norm(resp_embs, axis=1, keepdims=True) + 1e-8)
resp_embs = resp_embs / norms
# Cosine similarity, then exponential transform
sims = np.sum(query_emb[np.newaxis, :] * resp_embs, axis=1) # shape [k]
sims = np.exp(sims - 1.0)
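        # exp(s - 1) maps the cosine range [-1, 1] onto [e^-2, 1], compressing weak matches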
# Weighted average to boost top responses
weights = np.exp(-np.arange(len(responses)) / 2.0)
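        # exp(-i / 2) gives ranks 0, 1, 2, ... weights 1.0, ~0.61, ~0.37, ...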
weighted_avg = np.average(sims, weights=weights)
return float(weighted_avg)
def _calc_length_score(self, responses: List[Tuple[str, float]]) -> float:
"""
Average length-based score across top responses.
"""
scores = []
for text, _ in responses:
words = len(text.strip().split())
if words < self.min_response_length:
# Penalty for too short
s = words / float(self.min_response_length)
elif words > 50:
# Penalty for excessive length
s = max(0.5, 50.0 / words)
else:
s = 1.0
scores.append(s)
return float(np.mean(scores)) if scores else 0.0
def _calc_score_gap(self, scores: List[float], top_n: int = 3) -> float:
"""
Average difference between consecutive ranks for top_n.
"""
if len(scores) < 2:
return 0.0
top_n = min(top_n, len(scores))
gaps = []
for i in range(top_n - 1):
gaps.append(scores[i] - scores[i + 1])
return float(np.mean(gaps)) if gaps else 0.0
def _determine_confidence(self, m: Dict[str, float]) -> bool:
"""
Require:
- top_score >= self.confidence_threshold
- response_diversity >= self.diversity_threshold
- response_length_score >= self.thresholds['length_score']
Secondary conditions (2 of 3 required):
- query_response_relevance >= self.thresholds['relevance']
- top_3_score_gap >= self.thresholds['score_gap']
- top_score >= (confidence_threshold + 0.05)
"""
primary = [
m['top_score'] >= self.confidence_threshold,
m['response_diversity'] >= self.diversity_threshold,
m['response_length_score'] >= self.thresholds['length_score']
]
secondary = [
m['query_response_relevance'] >= self.thresholds['relevance'],
m['top_3_score_gap'] >= self.thresholds['score_gap'],
m['top_score'] >= (self.confidence_threshold + 0.05)
]
        return all(primary) and sum(secondary) >= 2
def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
"""Manual cosine sim matrix: a-> shape [N, d], b-> shape [M, d]. Return shape [N, M]."""
a_norm = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-8)
b_norm = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-8)
return np.dot(a_norm, b_norm.T)
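

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): _StubPipeline stands in for
    # TFDataPipeline, whose real encode_query/encode_responses are assumed to
    # return dense NumPy embeddings. Random vectors are used purely for demo.
    class _StubPipeline:
        def encode_query(self, query: str) -> np.ndarray:
            # Deterministic pseudo-embedding seeded from the text itself
            rng = np.random.default_rng(abs(hash(query)) % (2 ** 32))
            return rng.standard_normal(64)

        def encode_responses(self, texts: List[str]) -> np.ndarray:
            return np.stack([self.encode_query(t) for t in texts])

    checker = ResponseQualityChecker(data_pipeline=_StubPipeline())
    candidates = [
        ("You can reset your password from the account settings page.", 0.62),
        ("Password resets are handled under account settings.", 0.55),
        ("Please contact support for further help.", 0.41),
    ]
    metrics = checker.check_response_quality("How do I reset my password?", candidates)
    for name, value in metrics.items():
        logger.info("%s: %s", name, value)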