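"""Semantic QA search over precomputed embeddings.

Embeds queries via an OpenAI-compatible embeddings endpoint, retrieves
the most similar question-answer pairs by cosine similarity, and
reranks the merged candidates with TF-IDF before deduplication.
"""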
import ast
import logging
from typing import List, Tuple

import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# OpenAI API configuration
API_KEY = "sk-u0S4iYA2kJmaDNBgBc48D2A6Fa904fF0B6E19dF0F6A39717"
API_URL = "https://api.ltcld.cn/v1/embeddings"
MODEL = "text-embedding-ada-002"

def generate_embeddings(text: str) -> List[float]:
    """Fetch an embedding vector for `text` from the embeddings endpoint.

    Returns an empty list if the request fails, so callers must check
    for that before using the result.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }
    data = {
        "input": text,
        "model": MODEL
    }

    try:
        # A timeout keeps the call from hanging indefinitely on network issues.
        response = requests.post(API_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        return response.json()["data"][0]["embedding"]
    except requests.exceptions.RequestException as e:
        logging.error(f"OpenAI API request failed: {e}")
        return []

def rerank_results(query: str, documents: List[str]) -> List[Tuple[str, float]]:
    """Rerank `documents` by TF-IDF cosine similarity to `query`.

    Returns (document, score) pairs sorted from most to least similar.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Project the query into the same TF-IDF space as the documents.
    query_vector = vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    reranked_results = list(zip(documents, similarity_scores))
    reranked_results.sort(key=lambda x: x[1], reverse=True)
    return reranked_results

def search_dataset(queries: List[str], top_k: int = 5, similarity_threshold: float = 0.5) -> List[dict]:
    """Embed each query, retrieve its top-k nearest QA pairs, then rerank
    the merged candidates with TF-IDF and return deduplicated matches
    whose embedding similarity meets `similarity_threshold`.
    """
    results = []

    for query in queries:
        query_embedding = generate_embeddings(query)
        if not query_embedding:
            logging.warning(f"Skipping query '{query}': embedding request failed")
            continue

        embeddings = df['Embedding'].tolist()
        similarity_scores = cosine_similarity([query_embedding], embeddings)[0]
        # Score on a copy so the global DataFrame is left unmodified.
        scored_df = df.assign(Similarity=similarity_scores)

        logging.info(f"Similarity scores for query '{query}': {similarity_scores}")

        top_results = scored_df.sort_values('Similarity', ascending=False).head(top_k)
        logging.info(f"Top {top_k} results for query '{query}':\n{top_results}")

        query_results = []
        for _, row in top_results.iterrows():
            if row['Similarity'] >= similarity_threshold:
                query_results.append({
                    'question': row['Question'],
                    'answer': row['Answer'],
                    'similarity': row['Similarity']
                })

        logging.info(f"Filtered results for query '{query}': {query_results}")
        results.append(query_results)

    # Merge the per-query hits into one candidate pool.
    merged_results = []
    for query_results in results:
        merged_results.extend(query_results)
    logging.info(f"Merged results: {merged_results}")

    if not merged_results:
        return []

    # Rerank the pool against every query. Each hit is copied so that
    # rerank scores from different queries do not overwrite one another.
    final_results = []
    for query in queries:
        documents = [result['question'] + ' ' + result['answer'] for result in merged_results]
        reranked_results = rerank_results(query, documents)

        for doc, score in reranked_results:
            for result in merged_results:
                if doc == result['question'] + ' ' + result['answer']:
                    final_results.append({**result, 'score': score})
                    break

    # Deduplicate by question and answer text, keeping the first
    # (highest-ranked) occurrence of each.
    unique_results = []
    seen_questions = set()
    seen_answers = set()
    for result in final_results:
        if result['question'] not in seen_questions and result['answer'] not in seen_answers:
            unique_results.append(result)
            seen_questions.add(result['question'])
            seen_answers.add(result['answer'])
    logging.info(f"Unique results: {unique_results}")

    filtered_results = [result for result in unique_results if result['similarity'] >= similarity_threshold]
    logging.info(f"Filtered results: {filtered_results}")

    return filtered_results

# Load the precomputed QA embeddings; the Embedding column is stored as
# stringified lists, so parse it safely with ast.literal_eval.
df = pd.read_csv('output/qa_embeddings.csv')
df['Embedding'] = df['Embedding'].apply(ast.literal_eval)

# Example usage: search two queries and print the reranked matches.
if __name__ == "__main__":
    search_queries = ["原神", "minecraft"]
    search_results = search_dataset(search_queries, top_k=1, similarity_threshold=0.5)

    for i, result in enumerate(search_results):
        print(f"Search Result {i+1}:")
        print(f"Question: {result['question']}")
        print(f"Answer: {result['answer']}")
        print(f"Similarity: {result['similarity']}")
        print(f"Rerank Score: {result['score']}")
        print("----------------------------------------------------")