Spaces:
Sleeping
Sleeping
File size: 5,369 Bytes
2b88d9f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import re
from collections import defaultdict
from typing import Any, Dict, List, Tuple

import numpy as np
class EnhancedRetriever:
    """Enhanced RAG-style retriever with simple feature-based semantic scoring.

    Guidelines are read from a plain-text file where a line containing a
    colon (e.g. ``instagram:``) starts a new category and the following
    ``- item`` lines belong to that category.
    """

    # Compiled once: common emoji blocks — Misc Symbols/Dingbats
    # (U+2600-U+27BF) plus the supplementary emoji planes
    # (U+1F300-U+1FAFF).  The original pattern ``[π-π]`` was a
    # mojibake'd emoji range and matched only the letter pi.
    _EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001FAFF]')

    def __init__(self, guideline_path: str = "tone_guidelines.txt"):
        """Load guidelines from *guideline_path* and set up the embedding cache.

        Args:
            guideline_path: Path to the guideline text file.
        """
        self.guideline_path = guideline_path
        self.guidelines = self._load_guidelines()
        # Memoizes _simple_embedding results, keyed by the raw input text.
        self.embeddings_cache: Dict[str, np.ndarray] = {}

    def _load_guidelines(self) -> Dict[str, List[str]]:
        """Parse the guideline file into ``{category: [guideline, ...]}``.

        A line containing ``:`` starts a new category; the key is the text
        before the first colon, lower-cased.  Subsequent non-empty lines are
        stored under that category with any leading ``- `` bullet stripped.

        Returns:
            Plain dict mapping lower-case category names to guideline lists.
        """
        guidelines: Dict[str, List[str]] = defaultdict(list)
        current_key = None
        with open(self.guideline_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if ":" in line:
                    # Only the text before the first colon forms the key, so
                    # trailing text on a header line never pollutes it.
                    current_key = line.split(":", 1)[0].strip().lower()
                elif current_key:
                    guidelines[current_key].append(line.strip("- ").strip())
        return dict(guidelines)

    def _simple_embedding(self, text: str) -> np.ndarray:
        """Build a 7-dimension feature vector for *text*.

        Results are cached in ``self.embeddings_cache`` so repeated
        ``semantic_search`` calls do not recompute item embeddings.

        Returns:
            ``np.float32`` vector of (length, has_emoji, has_exclamation,
            formal_words, casual_words, cta_presence, hashtag_mention).
        """
        cached = self.embeddings_cache.get(text)
        if cached is not None:
            return cached
        # Normalize text
        lowered = text.lower()
        # Extract key features (dict insertion order fixes the vector layout)
        features = {
            'length': len(lowered.split()),
            'has_emoji': int(bool(self._EMOJI_RE.search(lowered))),
            'has_exclamation': int('!' in lowered),
            'formal_words': sum(1 for word in ('professional', 'value', 'benefits', 'business') if word in lowered),
            'casual_words': sum(1 for word in ('fun', 'playful', 'emoji', 'snappy') if word in lowered),
            'cta_presence': int(any(word in lowered for word in ('cta', 'button', 'click'))),
            'hashtag_mention': int('#' in lowered or 'hashtag' in lowered),
        }
        # Convert to vector and memoize before returning
        vec = np.array(list(features.values()), dtype=np.float32)
        self.embeddings_cache[text] = vec
        return vec

    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Return the cosine similarity of two vectors (0.0 on zero norm)."""
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        # Cast so the signature's promise of a Python float holds.
        return float(dot_product / (norm1 * norm2))

    def semantic_search(self, query: str, top_k: int = 5) -> List[Tuple[str, str, float]]:
        """Score every guideline against *query* and return the best matches.

        Args:
            query: Free-text query to embed and compare.
            top_k: Maximum number of results to return.

        Returns:
            ``(category, guideline, similarity)`` tuples, highest first.
        """
        query_embedding = self._simple_embedding(query)
        results = []
        for category, items in self.guidelines.items():
            for item in items:
                item_embedding = self._simple_embedding(item)
                similarity = self._cosine_similarity(query_embedding, item_embedding)
                results.append((category, item, similarity))
        # Sort by similarity score, best first
        results.sort(key=lambda x: x[2], reverse=True)
        return results[:top_k]

    def retrieve_with_relevance(self, tone: str, platforms: List[str]) -> Dict[str, Any]:
        """Combine direct category lookups with semantic search results.

        Args:
            tone: Desired tone (matched case-insensitively against categories).
            platforms: Platform names (matched case-insensitively).

        Returns:
            Dict with ``direct_matches`` (exact category hits, relevance 1.0),
            ``semantic_matches`` (scored near-misses, direct hits excluded),
            and ``relevance_scores``.
        """
        context_query = f"{tone} tone for {' '.join(platforms)} platforms"
        semantic_results = self.semantic_search(context_query)
        # Structure the response with relevance scores
        response: Dict[str, Any] = {
            "direct_matches": {},
            "semantic_matches": [],
            "relevance_scores": {}
        }
        # Track directly-matched categories in lower case so the dedupe
        # check below compares like-with-like (search categories are
        # lower-cased; the caller's tone/platform strings may not be).
        matched_categories = set()
        tone_lower = tone.lower()
        if tone_lower in self.guidelines:
            response["direct_matches"][tone] = self.guidelines[tone_lower]
            response["relevance_scores"][tone] = 1.0
            matched_categories.add(tone_lower)
        for platform in platforms:
            p_lower = platform.lower()
            if p_lower in self.guidelines:
                response["direct_matches"][platform] = self.guidelines[p_lower]
                response["relevance_scores"][platform] = 1.0
                matched_categories.add(p_lower)
        # Add semantic matches, skipping categories already matched directly
        for category, item, score in semantic_results:
            if category not in matched_categories:
                response["semantic_matches"].append({
                    "category": category,
                    "guideline": item,
                    "relevance": score
                })
        return response

    def format_guidance_with_scores(self, retrieval_result: Dict) -> str:
        """Render a ``retrieve_with_relevance`` result as human-readable text.

        Args:
            retrieval_result: Dict produced by :meth:`retrieve_with_relevance`.

        Returns:
            Newline-joined report: direct matches with their relevance, then
            up to three semantic matches.
        """
        output = []
        # Direct matches
        for key, guidelines in retrieval_result["direct_matches"].items():
            score = retrieval_result["relevance_scores"].get(key, 0)
            output.append(f"\n{key} Guidelines (Relevance: {score:.2f}):")
            for guideline in guidelines:
                output.append(f"  - {guideline}")
        # Semantic matches
        if retrieval_result["semantic_matches"]:
            output.append("\nAdditional Relevant Guidelines:")
            for match in retrieval_result["semantic_matches"][:3]:  # Top 3
                output.append(f"  - [{match['category']}] {match['guideline']} (Score: {match['relevance']:.2f})")
        return "\n".join(output)