import numpy as np
from typing import List, Tuple, Dict, Any
from tf_data_pipeline import TFDataPipeline
from logger_config import config_logger

logger = config_logger(__name__)

class ResponseQualityChecker:
    """
    The Response Quality Checker measures:
      - Relevance (embedding or cross-encoder)
      - Diversity among top responses
      - Length
      - Score gap
      - Confidence
    """
    
    def __init__(
        self,
        data_pipeline: "TFDataPipeline",
        confidence_threshold: float = 0.45,
        diversity_threshold: float = 0.10,
        min_response_length: int = 5,
        similarity_cap: float = 0.90
    ):
        """
        Args:
            data_pipeline: TFDataPipeline for encoding
            confidence_threshold: Min top_score for 'confident'
            diversity_threshold: Min average diversity for top responses
            min_response_length: Min word count - 'valid length'
            similarity_cap: Cap pairwise similarity to reduce outliers
        """
        self.data_pipeline = data_pipeline
        self.confidence_threshold = confidence_threshold
        self.diversity_threshold = diversity_threshold
        self.min_response_length = min_response_length
        self.similarity_cap = similarity_cap
        
        # Per-metric thresholds used by _determine_confidence
        self.thresholds = {
            'relevance': 0.30,
            'length_score': 0.80,
            'score_gap': 0.05
        }
        
    def check_response_quality(
        self,
        query: str,
        responses: List[Tuple[str, float]]
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of top-k responses:
         - response_diversity
         - query_response_relevance
         - response_length_score
         - top_score
         - top_3_score_gap
         - is_confident
        """
        if not responses:
            return {
                'response_diversity': 0.0,
                'query_response_relevance': 0.0,
                'response_length_score': 0.0,
                'top_score': 0.0,
                'top_3_score_gap': 0.0,
                'is_confident': False
            }
            
        metrics = {}
        metrics['response_diversity'] = self._calc_diversity(responses)
        metrics['query_response_relevance'] = self._calc_relevance(query, responses)
        metrics['response_length_score'] = self._calc_length_score(responses)
        metrics['top_score'] = responses[0][1]
        metrics['top_3_score_gap'] = self._calc_score_gap([score for _, score in responses])
        metrics['is_confident'] = self._determine_confidence(metrics)
        
        return metrics
    
    def _calc_diversity(self, responses: List[Tuple[str, float]]) -> float:
        """
        Average similarity among top response embeddings, capped by self.similarity_cap.
        """
        if len(responses) < 2:
            return 1.0  # Single response
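
        # Example: two responses whose capped pairwise similarity is 0.8
        # yield avg_sim = 0.8 and a diversity of 1.0 - 0.8 = 0.2.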
        
        texts = [r for r, _ in responses]
        embs = self.data_pipeline.encode_responses(texts)
        sim_matrix = self._cosine_similarity(embs, embs)
        
        # Zero out diagonal
        np.fill_diagonal(sim_matrix, 0.0)
        
        # Cap similarity
        sim_matrix = np.minimum(sim_matrix, self.similarity_cap)
        
        sum_sims = np.sum(sim_matrix)
        count = len(responses) * (len(responses) - 1)
        avg_sim = sum_sims / count if count > 0 else 0.0
        
        return 1.0 - avg_sim
    
    def _calc_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
        """
        Weighted average of exponential-transformed similarities for top-k.
        Encourages a high similarity with the top responses.
        """
        if not responses:
            return 0.0
        
        query_emb = self.data_pipeline.encode_query(query)
        texts = [r for r, _ in responses]
        resp_embs = self.data_pipeline.encode_responses(texts)
        
        query_emb = query_emb / (np.linalg.norm(query_emb) + 1e-8)
        norms = (np.linalg.norm(resp_embs, axis=1, keepdims=True) + 1e-8)
        resp_embs = resp_embs / norms
        
        # Cosine similarity, then exponential transform
        sims = np.sum(query_emb[np.newaxis, :] * resp_embs, axis=1)  # shape [k]
        sims = np.exp(sims - 1.0)
        
        # Weighted average to boost top responses
        weights = np.exp(-np.arange(len(responses)) / 2.0)
        weighted_avg = np.average(sims, weights=weights)
        return float(weighted_avg)
    
    def _calc_length_score(self, responses: List[Tuple[str, float]]) -> float:
        """
        Average length-based score across top responses.
        """
        scores = []
        for text, _ in responses:
            words = len(text.strip().split())
            if words < self.min_response_length:
                # Penalty for too short
                s = words / float(self.min_response_length)
            elif words > 50:
                # Penalty for excessive length
                s = max(0.5, 50.0 / words)
            else:
                s = 1.0
            scores.append(s)
            
        return float(np.mean(scores)) if scores else 0.0
    
    def _calc_score_gap(self, scores: List[float], top_n: int = 3) -> float:
        """
        Average difference between consecutive ranks for top_n.
        """
        if len(scores) < 2:
            return 0.0
        top_n = min(top_n, len(scores))
        gaps = [scores[i] - scores[i + 1] for i in range(top_n - 1)]
        return float(np.mean(gaps)) if gaps else 0.0
    
    def _determine_confidence(self, m: Dict[str, float]) -> bool:
        """
        Require:
         - top_score >= self.confidence_threshold
         - response_diversity >= self.diversity_threshold
         - response_length_score >= self.thresholds['length_score']
         
        Secondary conditions (2 of 3 required):
         - query_response_relevance >= self.thresholds['relevance']
         - top_3_score_gap >= self.thresholds['score_gap']
         - top_score >= (confidence_threshold + 0.05)
        """
        primary = [
            m['top_score'] >= self.confidence_threshold,
            m['response_diversity'] >= self.diversity_threshold,
            m['response_length_score'] >= self.thresholds['length_score']
        ]
        secondary = [
            m['query_response_relevance'] >= self.thresholds['relevance'],
            m['top_3_score_gap'] >= self.thresholds['score_gap'],
            m['top_score'] >= (self.confidence_threshold + 0.05)
        ]
        
        return all(primary) and sum(secondary) >= 2
    
    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
        """Manual cosine sim matrix: a-> shape [N, d], b-> shape [M, d]. Return shape [N, M]."""
        a_norm = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-8)
        b_norm = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-8)
        return np.dot(a_norm, b_norm.T)
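

# Usage sketch (illustrative only): `StubPipeline` is a hypothetical stand-in
# for TFDataPipeline. The checker only calls encode_query and encode_responses,
# both returning numpy arrays, so any object with those two methods suffices.
if __name__ == "__main__":
    class StubPipeline:
        """Hypothetical pipeline producing deterministic toy embeddings."""

        def encode_query(self, query: str) -> np.ndarray:
            # Seed from the text bytes so repeated calls agree per text.
            rng = np.random.default_rng(sum(query.encode("utf-8")))
            return rng.normal(size=64)

        def encode_responses(self, texts: List[str]) -> np.ndarray:
            return np.stack([self.encode_query(t) for t in texts])

    checker = ResponseQualityChecker(data_pipeline=StubPipeline())
    candidates = [
        ("You can reset your password from the account settings page.", 0.62),
        ("Go to settings, choose security, then click reset password.", 0.55),
        ("Passwords can be reset by an administrator on request.", 0.41),
    ]
    metrics = checker.check_response_quality("How do I reset my password?", candidates)
    for name, value in metrics.items():
        logger.info("%s: %s", name, value)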