"""Flask service that evaluates semantic search results by re-ranking them
with several sentence-transformer models and reporting per-model metrics."""

from flask import Flask, request, jsonify
from sentence_transformers import SentenceTransformer, util
import pandas as pd

app = Flask(__name__)


class EnhancedSemanticSearchEvaluator:
    """Scores search results against a query with several embedding models
    and reports a pass/fail status for each model."""

    def __init__(self, relevance_threshold=3, top_k=300, similarity_threshold=0.5):
        # Candidate embedding models to compare against one another.
        self.models = {
            "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
            "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
            "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
        }
        self.relevance_threshold = relevance_threshold
        self.top_k = top_k
        # Note: similarity_threshold is stored for configurability but is not
        # used by the scoring logic below.
        self.similarity_threshold = similarity_threshold

    def compute_similarity(self, model, query, matches):
        """Return cosine similarities between the query and every match.

        Each match is expected to carry its searchable text in its
        'metadata' field.
        """
        query_embedding = model.encode(query, convert_to_tensor=True)
        match_embeddings = model.encode(
            [match['metadata'] for match in matches], convert_to_tensor=True
        )
        # squeeze(0) flattens the 1 x N similarity matrix into a plain list.
        scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
        return scores

    def rank_results(self, model, query, matches):
        """Attach a similarity score to each match and sort best-first."""
        similarity_scores = self.compute_similarity(model, query, matches)
        for match, score in zip(matches, similarity_scores):
            match['similarity_score'] = score
        ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
        return ranked_matches

    def evaluate_results(self, query, results):
        """Rank the results under every model, derive relevance metrics, and
        report a pass/fail status per model."""
        all_metrics = {}
        results_status = {}

        for model_name, model in self.models.items():
            ranked_matches = self.rank_results(model, query, results['matches'])

            results_with_scores = []
            for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                doc_id = match['id']
                similarity_score = match['similarity_score']

                # Map cosine similarity onto a 1-5 relevance score. Despite
                # the name, llm_score comes from these fixed heuristic
                # cut-offs, not from an actual LLM call.
                if similarity_score >= 0.7:
                    llm_score = 5
                elif similarity_score >= 0.5:
                    llm_score = 4
                elif similarity_score >= 0.3:
                    llm_score = 3
                elif similarity_score >= 0.1:
                    llm_score = 2
                else:
                    llm_score = 1

                results_with_scores.append({
                    "Rank": rank,
                    "Document ID": doc_id,
                    "Similarity Score": similarity_score,
                    "LLM Score": llm_score
                })

            results_df = pd.DataFrame(results_with_scores)
            results_df['Pass'] = results_df['LLM Score'] >= self.relevance_threshold

            # Compute precision and recall once instead of repeating the
            # expressions inside the F1 formula. Note that results_df already
            # holds only the top-k rows, so head(top_k) spans the whole frame.
            top_k_passes = results_df.head(self.top_k)['Pass']
            pass_rate = results_df['Pass'].mean()
            precision_at_k = top_k_passes.mean()
            recall_at_k = top_k_passes.sum() / max(results_df['Pass'].sum(), 1)
            denominator = precision_at_k + recall_at_k
            metrics = {
                "Pass Rate": pass_rate,
                "Precision@K": precision_at_k,
                "Recall@K": recall_at_k,
                "F1@K": 2 * precision_at_k * recall_at_k / denominator if denominator > 0 else 0,
            }

            all_metrics[model_name] = metrics
            results_status[model_name] = "Test Passed" if pass_rate > 0.5 else "Test Failed"

        # Return the detailed metrics alongside the status so the computed
        # values are not silently discarded.
        return {"metrics": all_metrics, "status": results_status}

# Instantiate once at import time so the (expensive) model loading happens
# only once, not per request.
evaluator = EnhancedSemanticSearchEvaluator()

@app.route('/evaluate', methods=['POST'])
def evaluate():
    content = request.json
    query = content['query']
    results = content['results']
    # Reuse the module-level evaluator rather than reloading all three models.
    evaluation_result = evaluator.evaluate_results(query, results)
    return jsonify(evaluation_result)

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=8000)
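
# A minimal usage sketch (assumptions: the service is running locally on
# port 8000, and each match carries an 'id' plus a text 'metadata' field).
# This is illustrative only, not part of the service itself:
#
#   import requests
#
#   payload = {
#       "query": "how do transformers encode word order?",
#       "results": {
#           "matches": [
#               {"id": "doc-1", "metadata": "Positional encodings add order information."},
#               {"id": "doc-2", "metadata": "Cooking pasta requires salted water."}
#           ]
#       }
#   }
#   response = requests.post("http://localhost:8000/evaluate", json=payload)
#   print(response.json())  # per-model metrics and "Test Passed"/"Test Failed"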