File size: 6,292 Bytes
eee239e
f0539b9
 
32046db
 
 
72d58ce
32046db
9c1e786
 
 
6fda102
69f5cc3
 
9c1e786
 
 
 
 
 
 
f0539b9
 
 
72d58ce
f0539b9
7ffa9fc
9b0bd3d
7ffa9fc
9b0bd3d
 
 
 
86f7fa2
9b0bd3d
7ffa9fc
9b0bd3d
 
 
 
32046db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b0bd3d
72d58ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eee239e
9c1e786
eee239e
 
3adff1b
32046db
 
 
 
 
eee239e
32046db
72d58ce
 
f0539b9
d4bf6ac
72d58ce
f0539b9
d4bf6ac
72d58ce
eee239e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63cf07b
d4bf6ac
eee239e
 
 
 
 
 
 
 
 
 
 
 
 
767e056
 
 
eee239e
 
 
767e056
eee239e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0539b9
 
eee239e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import hmac
import math
import os
from time import perf_counter as timer

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import login
from openai import OpenAI

# Load credentials from environment variables or a secure source
def load_credentials(max_slots=50):
    """Collect ``login_<i>`` / ``password_<i>`` pairs from environment variables.

    Scans slots 1..max_slots (default 50, matching the original hard-coded
    range; the old comment incorrectly said 10). A pair is only included when
    both variables are set and non-empty.

    Returns:
        dict mapping username -> password.
    """
    credentials = {}
    for slot in range(1, max_slots + 1):
        username = os.environ.get(f"login_{slot}")
        password = os.environ.get(f"password_{slot}")
        if username and password:
            credentials[username] = password
    return credentials

# Authentication function
def authenticate(username, password, credentials):
    """Return True iff *password* matches the stored credential for *username*.

    Fixes two issues with the naive ``credentials.get(username) == password``:
    a missing user plus a None password compared equal (None == None -> True),
    and ``==`` leaks timing information; ``hmac.compare_digest`` is
    constant-time.
    """
    expected = credentials.get(username)
    if expected is None or password is None:
        return False
    return hmac.compare_digest(str(expected), str(password))

def load_data(database_file):
    """Read the pre-embedded passages DataFrame from a parquet file."""
    return pd.read_parquet(database_file)

def save_reactions_to_dataset(user_type, query, results):
    """Append one row per result reaction and push the dataset to the Hub.

    Each row records the user type, the query, the retrieved passage text and
    the reaction given to it. Existing rows on the Hub are preserved.
    """
    new_rows = {
        "user_type": [user_type for _ in results],
        "query": [query for _ in results],
        "retrieved_text": [entry["text"] for entry in results],
        "reaction": [entry["reaction"] for entry in results],
    }

    # Fetch whatever is already on the Hub; fall back to empty columns when
    # the dataset does not exist yet (first run).
    try:
        existing = load_dataset(
            "HumbleBeeAI/al-ghazali-rag-retrieval-evaluation", split="train"
        ).to_dict()
    except Exception:
        existing = {key: [] for key in ("user_type", "query", "retrieved_text", "reaction")}

    # Merge the new rows into the existing columns.
    for key, values in new_rows.items():
        existing[key].extend(values)

    # Rebuild and publish the combined dataset.
    Dataset.from_dict(existing).push_to_hub(
        "HumbleBeeAI/al-ghazali-rag-retrieval-evaluation"
    )

def generate_openai_embeddings(client, text):
    """Embed *text* with OpenAI's text-embedding-3-small model and return the vector."""
    result = client.embeddings.create(model="text-embedding-3-small", input=text)
    return result.data[0].embedding

def cosine_similarity(embedding_0, embedding_1):
    """Return the cosine similarity of two equal-length numeric vectors.

    Fix: the original divided by ``norm_0 * norm_1`` unconditionally and
    raised ZeroDivisionError for a zero vector; similarity to a zero vector
    is now defined as 0.0.
    """
    dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
    norm_0 = math.sqrt(sum(a * a for a in embedding_0))
    norm_1 = math.sqrt(sum(b * b for b in embedding_1))
    denominator = norm_0 * norm_1
    if denominator == 0.0:
        return 0.0
    return dot_product / denominator

def search_query(client, query, df, n=3):
    """Return the *n* rows of *df* most similar to *query*.

    Embeds the query once, scores every row's ``openai_embedding`` by cosine
    similarity, and returns the top *n* rows (with a ``similarities`` column)
    sorted descending.

    Fix: the original assigned ``df['similarities'] = ...``, mutating the
    caller's DataFrame in place; ``assign`` scores a copy instead.
    """
    query_embedding = generate_openai_embeddings(client, query)
    scored = df.assign(
        similarities=df["openai_embedding"].apply(
            lambda emb: cosine_similarity(emb, query_embedding)
        )
    )
    return scored.sort_values("similarities", ascending=False).head(n)

def main(username, password, user_type, query, reactions=None):
    """Authenticate, run a top-3 similarity search, and optionally save reactions.

    Parameters:
        username, password: checked against environment-provided credentials.
        user_type: stored alongside each saved reaction.
        query: free-text search query.
        reactions: optional mapping of result index -> reaction string; when
            truthy the reactions are pushed to the Hub.

    Returns:
        (status_message, results, save_message) where results is a list of
        {"text": ..., "index": ...} dicts (empty on failure).
    """
    credentials = load_credentials()
    if not authenticate(username, password, credentials):
        return "Invalid username or password", [], []

    # Token is required so save_reactions_to_dataset() can push to the Hub.
    huggingface_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
    if huggingface_token:
        login(token=huggingface_token)
    else:
        return "Hugging Face API token not found in environment variables.", [], []

    # OpenAI client reads its API key from the environment.
    client = OpenAI()

    # Pre-embedded source text; reloaded fresh on every request.
    database_file = '[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet'

    try:
        df = load_data(database_file)
        start_time = timer()
        res = search_query(client, query, df, n=3)
        end_time = timer()

        # Fix: read the matched rows from the result frame itself. The original
        # `df.iloc[int(idx)]` treated index *labels* as positions, which is only
        # correct for a default RangeIndex.
        # NOTE(review): the text column is named "ext" here — confirm against
        # the parquet schema ("text" would be the more usual name).
        results = [{"text": row["ext"], "index": idx} for idx, row in res.iterrows()]

        # If reactions are provided, save them to the dataset.
        if reactions:
            # Keys of `reactions` are index labels from a previous search.
            reaction_results = [
                {"text": df.loc[idx, "ext"], "reaction": reaction}
                for idx, reaction in reactions.items()
            ]
            save_reactions_to_dataset(user_type, query, reaction_results)
            return f"Time taken to compute scores: {end_time - start_time:.5f} seconds", results, "Reactions saved successfully!"

        return f"Time taken to compute scores: {end_time - start_time:.5f} seconds", results, ""

    except Exception as e:
        # Catch-all keeps the UI responsive; message kept from the original even
        # though search errors also land here.
        return f"Failed to load database: {str(e)}", [], []

# Gradio interface for collecting reactions
def collect_reactions(results, reaction_1, reaction_2, reaction_3):
    """Map each result's index to its chosen reaction (up to three results)."""
    if not results:
        return {}
    marks = (reaction_1, reaction_2, reaction_3)
    # zip stops at the shorter sequence, so extra reactions are ignored.
    return {entry["index"]: mark for entry, mark in zip(results, marks)}

# Define the Gradio interface
def gradio_interface(username, password, user_type, query, reaction_1=None, reaction_2=None, reaction_3=None):
    """Gradio entry point: run the search, then persist reactions when supplied."""
    time_taken, results, save_message = main(username, password, user_type, query)

    # Only collect reactions if at least one was provided.
    supplied = [mark for mark in (reaction_1, reaction_2, reaction_3) if mark is not None]
    if supplied:
        reactions = collect_reactions(results, reaction_1, reaction_2, reaction_3)
        if any(reactions.values()):
            # Re-run main with the reactions attached so they get saved.
            _, _, save_message = main(username, password, user_type, query, reactions)

    return time_taken, results, save_message

# Input and output components for Gradio
REACTION_CHOICES = ["👎", "🤷", "👍"]

inputs = [
    gr.Textbox(label="Username"),
    gr.Textbox(label="Password", type="password"),
    gr.Radio(["Layman", "Enthusiast", "Ustaz (Expert)"], label="Select your user type:"),
    gr.Textbox(label="Enter your query:"),
]
# One reaction selector per returned result (three results per query).
inputs += [gr.Radio(REACTION_CHOICES, label=f"Reaction for Result {slot}") for slot in (1, 2, 3)]

outputs = [
    gr.Textbox(label="Time taken"),
    gr.JSON(label="Results"),
    gr.Textbox(label="Save Status"),
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=inputs,
    outputs=outputs,
    title="EnlightenQalb (Alchemy of Happiness)",
    description="Search and rate results from The Alchemy of Happiness.",
)

if __name__ == "__main__":
    iface.launch()