<!-- File size: 11,164 Bytes | 18d7e32 -->
# Smart Binary Model: Usage Examples
## 1. Basic Retrieval Example
```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Load the smart binary model
model = SentenceTransformer('ThanhLe0125/e5-math-smart-binary')

# Example query and candidate chunks, already carrying the
# "query: " / "passage: " prefixes
query = "query: Định nghĩa đạo hàm của hàm số"
chunks = [
    "passage: Đạo hàm của hàm số f(x) tại x₀ là giới hạn của tỉ số...",  # Correct
    "passage: Các quy tắc tính đạo hàm: (xⁿ)' = nxⁿ⁻¹, (sin x)' = cos x...",  # Related
    "passage: Tích phân xác định của hàm số trên đoạn [a,b]...",  # Irrelevant
    "passage: Phương trình vi phân bậc nhất có dạng y' + P(x)y = Q(x)",  # Irrelevant
]

# Embed the query and the chunks, then score every chunk against the query
query_emb = model.encode([query])
chunk_embs = model.encode(chunks)
similarities = cosine_similarity(query_emb, chunk_embs)[0]

# Human-readable label for each chunk, in the same order as `chunks`
chunk_labels = ("CORRECT", "RELATED", "IRRELEVANT", "IRRELEVANT")

# Chunk indices ordered from most to least similar
ranked_indices = np.argsort(similarities)[::-1]

print("Smart Binary Rankings:")
for rank, idx in enumerate(ranked_indices, start=1):
    chunk_type = chunk_labels[idx]
    preview = chunks[idx][:70]
    print(f"Rank {rank}: {chunk_type} (Score: {similarities[idx]:.4f})")
    print(f" {preview}...")
    print()

# Expected smart binary results:
# Rank 1: CORRECT (Score: ~0.87)
# Rank 2: RELATED (Score: ~0.65)
# Rank 3: IRRELEVANT (Score: ~0.25)
# Rank 4: IRRELEVANT (Score: ~0.20)
```
## 2. Batch Processing Multiple Queries
```python
# Multiple Vietnamese math queries
queries = [
    "query: Cách giải phương trình bậc hai",
    "query: Định nghĩa hàm số đồng biến",
    "query: Công thức tính thể tích hình cầu",
]
math_content_pool = [
    "passage: Phương trình bậc hai ax² + bx + c = 0 có nghiệm x = (-b ± √Δ)/2a",
    "passage: Hàm số đồng biến trên khoảng I khi f'(x) > 0 với mọi x ∈ I",
    "passage: Thể tích hình cầu bán kính R là V = (4/3)πR³",
    "passage: Diện tích hình tròn bán kính r là S = πr²",
    "passage: Định lý Pythagoras: a² + b² = c² trong tam giác vuông",
]

# The content pool is the same for every query, so embed it once up front
# instead of re-encoding it on each loop iteration.
chunk_embs = model.encode(math_content_pool)

# Process all queries against the shared pool embeddings
for query in queries:
    print(f"\nQuery: {query.replace('query: ', '')}")
    query_emb = model.encode([query])
    similarities = cosine_similarity(query_emb, chunk_embs)[0]

    # Keep the three highest-scoring chunks
    top_3_indices = similarities.argsort()[::-1][:3]
    for rank, idx in enumerate(top_3_indices, 1):
        score = similarities[idx]
        confidence = "HIGH" if score > 0.8 else "MEDIUM" if score > 0.5 else "LOW"
        print(f" {rank}. [{confidence}] {score:.3f} - {math_content_pool[idx]}")
```
## 3. Production Class Implementation
```python
class SmartBinaryMathRetriever:
    """Rank content chunks against a query using a SentenceTransformer model.

    Queries and chunks are prefixed with "query: " / "passage: " (unless they
    already carry the prefix), encoded with the model, and ranked by cosine
    similarity.
    """

    # Default similarity cut-off shared by single and batch retrieval.
    _DEFAULT_MIN_CONFIDENCE = 0.3

    # (inclusive lower bound, label) pairs, checked from highest to lowest.
    _CONFIDENCE_LEVELS = (
        (0.85, "VERY_HIGH"),
        (0.7, "HIGH"),
        (0.5, "MEDIUM"),
        (0.3, "LOW"),
    )

    def __init__(self, model_name='ThanhLe0125/e5-math-smart-binary'):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)
        print(f"Smart Binary Model loaded: {model_name}")

    def retrieve_with_confidence(self, query, chunks, top_k=5,
                                 min_confidence=_DEFAULT_MIN_CONFIDENCE):
        """Retrieve the best-matching chunks for one query.

        Args:
            query: Vietnamese math question, with or without "query:" prefix.
            chunks: Educational content strings, with or without "passage:"
                prefix.
            top_k: Maximum number of results to return.
            min_confidence: Minimum similarity a chunk needs to be returned.

        Returns:
            A list of dicts sorted by descending similarity, each holding
            'chunk_index', 'chunk' (the text as passed in), 'similarity',
            'confidence_level' and a 1-based 'rank'. Empty when ``chunks``
            is empty.
        """
        chunks = list(chunks)
        if not chunks:
            # Nothing to rank; skip encoding rather than scoring an empty set.
            return []

        query_emb = self.model.encode([self._format_query(query)])
        chunk_embs = self.model.encode(self._format_chunks(chunks))
        similarities = cosine_similarity(query_emb, chunk_embs)[0]
        return self._rank_results(similarities, chunks, top_k, min_confidence)

    def _get_confidence_level(self, similarity):
        """Convert a similarity score to a confidence level label."""
        for lower_bound, label in self._CONFIDENCE_LEVELS:
            if similarity >= lower_bound:
                return label
        return "VERY_LOW"

    def batch_retrieve(self, queries, chunk_pool, top_k_per_query=3):
        """Retrieve results for several queries against one shared chunk pool.

        The pool is encoded once and the queries are encoded in one batch,
        instead of re-encoding the whole pool for every query.

        Args:
            queries: Query strings; each is used as a key of the result.
            chunk_pool: Content strings every query is ranked against.
            top_k_per_query: Maximum number of results kept per query.

        Returns:
            A dict mapping each query (as passed in) to its result list, in
            the format produced by ``retrieve_with_confidence`` with the
            default ``min_confidence``.
        """
        queries = list(queries)
        chunk_pool = list(chunk_pool)
        if not queries:
            return {}
        if not chunk_pool:
            return {query: [] for query in queries}

        chunk_embs = self.model.encode(self._format_chunks(chunk_pool))
        query_embs = self.model.encode(
            [self._format_query(query) for query in queries]
        )
        # One row of similarities per query, one column per chunk.
        similarity_matrix = cosine_similarity(query_embs, chunk_embs)

        return {
            query: self._rank_results(
                row, chunk_pool, top_k_per_query, self._DEFAULT_MIN_CONFIDENCE
            )
            for query, row in zip(queries, similarity_matrix)
        }

    @staticmethod
    def _format_query(query):
        """Prefix ``query`` with "query: " unless it already starts with "query:"."""
        return query if query.startswith("query:") else f"query: {query}"

    @staticmethod
    def _format_chunks(chunks):
        """Prefix each chunk with "passage: " unless it already starts with "passage:"."""
        return [
            chunk if chunk.startswith("passage:") else f"passage: {chunk}"
            for chunk in chunks
        ]

    def _rank_results(self, similarities, chunks, top_k, min_confidence):
        """Filter by ``min_confidence``, sort descending, truncate and add ranks."""
        results = [
            {
                'chunk_index': idx,
                'chunk': chunks[idx],
                'similarity': float(similarity),
                'confidence_level': self._get_confidence_level(similarity),
            }
            for idx, similarity in enumerate(similarities)
            if similarity >= min_confidence
        ]
        # list.sort is stable, so equal scores keep their original chunk order.
        results.sort(key=lambda item: item['similarity'], reverse=True)
        results = results[:top_k]
        for rank, result in enumerate(results, 1):
            result['rank'] = rank
        return results
# Usage example: build the retriever once, then reuse it
retriever = SmartBinaryMathRetriever()

# Single query against a small set of un-prefixed chunks
query = "Cách tính đạo hàm của hàm hợp"
chunks = [
    "Đạo hàm hàm hợp: (f(g(x)))' = f'(g(x)) × g'(x)",
    "Ví dụ: Tính đạo hàm của (x² + 1)³",
    "Tích phân từng phần: ∫u dv = uv - ∫v du",
]
results = retriever.retrieve_with_confidence(
    query,
    chunks,
    top_k=3,
    min_confidence=0.2,
)

print("Smart Binary Retrieval Results:")
for result in results:
    # Show only the first 60 characters of each chunk
    preview = result['chunk'][:60]
    print(f"Rank {result['rank']}: {result['confidence_level']}")
    print(f" Similarity: {result['similarity']:.4f}")
    print(f" Content: {preview}...")
    print()
```
## 4. Comparison and Evaluation
```python
# Compare the smart binary model with the base model
def compare_models(query, chunks):
    """Print per-chunk similarity scores from the base and smart binary models.

    Both models encode the same "query: "-prefixed query and
    "passage: "-prefixed chunks; for every chunk the two cosine similarities
    and their difference (smart binary minus base) are printed.

    Args:
        query: Query text without the "query: " prefix.
        chunks: Chunk texts without the "passage: " prefix.

    Returns:
        None. The comparison is written to stdout.
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to score: print the header only and skip loading the
        # models, instead of failing on an empty similarity computation.
        print(f"Query: {query}")
        print("="*50)
        return

    # Load models
    base_model = SentenceTransformer('intfloat/multilingual-e5-base')
    smart_binary_model = SentenceTransformer('ThanhLe0125/e5-math-smart-binary')

    # Format query and chunks
    formatted_query = f"query: {query}"
    formatted_chunks = [f"passage: {chunk}" for chunk in chunks]

    # Encode with both models
    query_emb_base = base_model.encode([formatted_query])
    query_emb_smart = smart_binary_model.encode([formatted_query])
    chunk_embs_base = base_model.encode(formatted_chunks)
    chunk_embs_smart = smart_binary_model.encode(formatted_chunks)

    # Calculate similarities
    similarities_base = cosine_similarity(query_emb_base, chunk_embs_base)[0]
    similarities_smart = cosine_similarity(query_emb_smart, chunk_embs_smart)[0]

    # Report the scores chunk by chunk, in input order
    print(f"Query: {query}")
    print("="*50)
    scored_chunks = zip(chunks, similarities_base, similarities_smart)
    for number, (chunk, base_score, smart_score) in enumerate(scored_chunks, 1):
        improvement = smart_score - base_score
        print(f"Chunk {number}:")
        print(f" Base Model: {base_score:.4f}")
        print(f" Smart Binary: {smart_score:.4f}")
        print(f" Improvement: {improvement:+.4f}")
        print(f" Content: {chunk[:50]}...")
        print()
# Example comparison: one query against a correct, a related and an
# irrelevant chunk
example_query = "Định nghĩa hàm số liên tục"
example_chunks = [
    "Hàm số f liên tục tại x₀ nếu lim(x→x₀) f(x) = f(x₀)",  # Correct
    "Ví dụ hàm số liên tục: f(x) = x², g(x) = sin(x)",  # Related
    "Phương trình vi phân có nghiệm tổng quát y = Ce^x",  # Irrelevant
]
compare_models(example_query, example_chunks)
```
## 5. Advanced Analytics
```python
def analyze_smart_binary_performance(queries, chunks, ground_truth):
    """Compute MRR and Hit@1/3/5 for the smart binary model and print a report.

    Every query is ranked against all chunks by cosine similarity. The rank
    of its ground-truth chunk feeds the metrics. All metrics are averaged
    over the total number of queries, so a query whose ground-truth index is
    never ranked counts as a miss (reciprocal rank 0).

    Args:
        queries: List of test queries (without the "query: " prefix).
        chunks: List of content chunks (without the "passage: " prefix).
        ground_truth: Correct chunk index for each query, aligned with
            ``queries``.

    Returns:
        Dict with float values under 'mrr', 'hit_at_1', 'hit_at_3' and
        'hit_at_5'. All values are 0.0 when ``queries`` or ``chunks`` is
        empty.
    """
    total_queries = len(queries)
    reciprocal_ranks = []
    hit_counts = {1: 0, 3: 0, 5: 0}

    # Only load the model when there is something to rank.
    if total_queries and len(chunks):
        model = SentenceTransformer('ThanhLe0125/e5-math-smart-binary')

        # The chunk embeddings are identical for every query: encode once.
        chunk_embs = model.encode([f"passage: {chunk}" for chunk in chunks])

        for i, query in enumerate(queries):
            query_emb = model.encode([f"query: {query}"])
            similarities = cosine_similarity(query_emb, chunk_embs)[0]

            # Chunk indices from most to least similar
            ranked_indices = similarities.argsort()[::-1]

            # Position of the correct chunk in the ranking, if present
            positions = np.flatnonzero(ranked_indices == ground_truth[i])
            if positions.size == 0:
                continue
            correct_rank = int(positions[0]) + 1

            reciprocal_ranks.append(1.0 / correct_rank)
            for k in hit_counts:
                if correct_rank <= k:
                    hit_counts[k] += 1

    # Guard against division by zero when no queries were supplied.
    denominator = total_queries or 1
    avg_mrr = float(sum(reciprocal_ranks)) / denominator
    hit_1_rate = hit_counts[1] / denominator
    hit_3_rate = hit_counts[3] / denominator
    hit_5_rate = hit_counts[5] / denominator

    print("Smart Binary Model Performance Analysis:")
    print(f" MRR (Mean Reciprocal Rank): {avg_mrr:.4f}")
    print(f" Hit@1 (Accuracy): {hit_1_rate:.4f} ({hit_counts[1]}/{total_queries})")
    print(f" Hit@3: {hit_3_rate:.4f} ({hit_counts[3]}/{total_queries})")
    print(f" Hit@5: {hit_5_rate:.4f} ({hit_counts[5]}/{total_queries})")

    return {
        'mrr': avg_mrr,
        'hit_at_1': hit_1_rate,
        'hit_at_3': hit_3_rate,
        'hit_at_5': hit_5_rate,
    }
# Example usage: each query is paired with the chunk that answers it
labelled_examples = [
    ("Công thức tính đạo hàm", "Đạo hàm của hàm số f(x) = lim[h→0] (f(x+h)-f(x))/h"),
    ("Định nghĩa tích phân", "Tích phân của f(x) trên [a,b] = ∫[a,b] f(x)dx"),
    ("Cách giải phương trình bậc hai", "Nghiệm phương trình ax²+bx+c=0 là x = (-b±√Δ)/2a"),
]
# Extra chunks that answer none of the queries
distractor_chunks = [
    "Định lý vi phân trung bình",
    "Công thức Taylor",
]

test_queries = [question for question, _ in labelled_examples]
test_chunks = [answer for _, answer in labelled_examples] + distractor_chunks
# The answers come first in test_chunks, so query i maps to chunk index i
ground_truth = list(range(len(labelled_examples)))

performance = analyze_smart_binary_performance(test_queries, test_chunks, ground_truth)
```
These examples demonstrate the smart binary model's balanced approach to precision and recall, making it ideal for Vietnamese mathematical content retrieval with optimal user experience.