File size: 6,292 Bytes
eee239e
f0539b9
 
32046db
 
 
72d58ce
32046db
9c1e786
 
 
6fda102
69f5cc3
 
9c1e786
 
 
 
 
 
 
f0539b9
 
 
72d58ce
f0539b9
7ffa9fc
9b0bd3d
7ffa9fc
9b0bd3d
 
 
 
86f7fa2
9b0bd3d
7ffa9fc
9b0bd3d
 
 
 
32046db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b0bd3d
72d58ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eee239e
9c1e786
eee239e
 
3adff1b
32046db
 
 
 
 
eee239e
32046db
72d58ce
 
f0539b9
d4bf6ac
72d58ce
f0539b9
d4bf6ac
72d58ce
eee239e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63cf07b
d4bf6ac
eee239e
 
 
 
 
 
 
 
 
 
 
 
 
767e056
 
 
eee239e
 
 
767e056
eee239e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0539b9
 
eee239e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import hmac
import math
import os
from time import perf_counter as timer

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import login
from openai import OpenAI

# Load credentials from environment variables or a secure source
def load_credentials(max_slots=50):
    """Collect ``login_<i>`` / ``password_<i>`` pairs from environment variables.

    Scans slots 1..max_slots (default 50, matching the original hard-coded
    range; the old comment incorrectly said 10). A pair is only included when
    both variables are set and non-empty.

    Returns:
        dict mapping username -> password.
    """
    credentials = {}
    for slot in range(1, max_slots + 1):
        username = os.environ.get(f"login_{slot}")
        password = os.environ.get(f"password_{slot}")
        if username and password:
            credentials[username] = password
    return credentials

# Authentication function
def authenticate(username, password, credentials):
    """Return True iff *password* matches the stored credential for *username*.

    Fixes two issues with the naive ``credentials.get(username) == password``:
    a missing user plus a None password compared equal (None == None -> True),
    and ``==`` leaks timing information; ``hmac.compare_digest`` is
    constant-time.
    """
    expected = credentials.get(username)
    if expected is None or password is None:
        return False
    return hmac.compare_digest(str(expected), str(password))

def load_data(database_file):
    """Read the pre-embedded passages DataFrame from a parquet file."""
    return pd.read_parquet(database_file)

def save_reactions_to_dataset(user_type, query, results):
    """Append one row per result reaction and push the dataset to the Hub.

    Each row records the user type, the query, the retrieved passage text and
    the reaction given to it. Existing rows on the Hub are preserved.
    """
    new_rows = {
        "user_type": [user_type for _ in results],
        "query": [query for _ in results],
        "retrieved_text": [entry["text"] for entry in results],
        "reaction": [entry["reaction"] for entry in results],
    }

    # Fetch whatever is already on the Hub; fall back to empty columns when
    # the dataset does not exist yet (first run).
    try:
        existing = load_dataset(
            "HumbleBeeAI/al-ghazali-rag-retrieval-evaluation", split="train"
        ).to_dict()
    except Exception:
        existing = {key: [] for key in ("user_type", "query", "retrieved_text", "reaction")}

    # Merge the new rows into the existing columns.
    for key, values in new_rows.items():
        existing[key].extend(values)

    # Rebuild and publish the combined dataset.
    Dataset.from_dict(existing).push_to_hub(
        "HumbleBeeAI/al-ghazali-rag-retrieval-evaluation"
    )

def generate_openai_embeddings(client, text):
    """Embed *text* with OpenAI's text-embedding-3-small model and return the vector."""
    result = client.embeddings.create(model="text-embedding-3-small", input=text)
    return result.data[0].embedding

def cosine_similarity(embedding_0, embedding_1):
    """Return the cosine similarity of two equal-length numeric vectors.

    Fix: the original divided by ``norm_0 * norm_1`` unconditionally and
    raised ZeroDivisionError for a zero vector; similarity to a zero vector
    is now defined as 0.0.
    """
    dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
    norm_0 = math.sqrt(sum(a * a for a in embedding_0))
    norm_1 = math.sqrt(sum(b * b for b in embedding_1))
    denominator = norm_0 * norm_1
    if denominator == 0.0:
        return 0.0
    return dot_product / denominator

def search_query(client, query, df, n=3):
    """Return the *n* rows of *df* most similar to *query*.

    Embeds the query once, scores every row's ``openai_embedding`` by cosine
    similarity, and returns the top *n* rows (with a ``similarities`` column)
    sorted descending.

    Fix: the original assigned ``df['similarities'] = ...``, mutating the
    caller's DataFrame in place; ``assign`` scores a copy instead.
    """
    query_embedding = generate_openai_embeddings(client, query)
    scored = df.assign(
        similarities=df["openai_embedding"].apply(
            lambda emb: cosine_similarity(emb, query_embedding)
        )
    )
    return scored.sort_values("similarities", ascending=False).head(n)

def main(username, password, user_type, query, reactions=None):
    """Authenticate, run a top-3 similarity search, and optionally save reactions.

    Parameters:
        username, password: checked against environment-provided credentials.
        user_type: stored alongside each saved reaction.
        query: free-text search query.
        reactions: optional mapping of result index -> reaction string; when
            truthy the reactions are pushed to the Hub.

    Returns:
        (status_message, results, save_message) where results is a list of
        {"text": ..., "index": ...} dicts (empty on failure).
    """
    credentials = load_credentials()
    if not authenticate(username, password, credentials):
        return "Invalid username or password", [], []

    # Token is required so save_reactions_to_dataset() can push to the Hub.
    huggingface_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
    if huggingface_token:
        login(token=huggingface_token)
    else:
        return "Hugging Face API token not found in environment variables.", [], []

    # OpenAI client reads its API key from the environment.
    client = OpenAI()

    # Pre-embedded source text; reloaded fresh on every request.
    database_file = '[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet'

    try:
        df = load_data(database_file)
        start_time = timer()
        res = search_query(client, query, df, n=3)
        end_time = timer()

        # Fix: read the matched rows from the result frame itself. The original
        # `df.iloc[int(idx)]` treated index *labels* as positions, which is only
        # correct for a default RangeIndex.
        # NOTE(review): the text column is named "ext" here — confirm against
        # the parquet schema ("text" would be the more usual name).
        results = [{"text": row["ext"], "index": idx} for idx, row in res.iterrows()]

        # If reactions are provided, save them to the dataset.
        if reactions:
            # Keys of `reactions` are index labels from a previous search.
            reaction_results = [
                {"text": df.loc[idx, "ext"], "reaction": reaction}
                for idx, reaction in reactions.items()
            ]
            save_reactions_to_dataset(user_type, query, reaction_results)
            return f"Time taken to compute scores: {end_time - start_time:.5f} seconds", results, "Reactions saved successfully!"

        return f"Time taken to compute scores: {end_time - start_time:.5f} seconds", results, ""

    except Exception as e:
        # Catch-all keeps the UI responsive; message kept from the original even
        # though search errors also land here.
        return f"Failed to load database: {str(e)}", [], []

# Gradio interface for collecting reactions
def collect_reactions(results, reaction_1, reaction_2, reaction_3):
    """Map each result's index to its chosen reaction (up to three results)."""
    if not results:
        return {}
    marks = (reaction_1, reaction_2, reaction_3)
    # zip stops at the shorter sequence, so extra reactions are ignored.
    return {entry["index"]: mark for entry, mark in zip(results, marks)}

# Define the Gradio interface
def gradio_interface(username, password, user_type, query, reaction_1=None, reaction_2=None, reaction_3=None):
    """Gradio entry point: run the search, then persist reactions when supplied."""
    time_taken, results, save_message = main(username, password, user_type, query)

    # Only collect reactions if at least one was provided.
    supplied = [mark for mark in (reaction_1, reaction_2, reaction_3) if mark is not None]
    if supplied:
        reactions = collect_reactions(results, reaction_1, reaction_2, reaction_3)
        if any(reactions.values()):
            # Re-run main with the reactions attached so they get saved.
            _, _, save_message = main(username, password, user_type, query, reactions)

    return time_taken, results, save_message

# Input and output components for Gradio
REACTION_CHOICES = ["👎", "🤷", "👍"]

inputs = [
    gr.Textbox(label="Username"),
    gr.Textbox(label="Password", type="password"),
    gr.Radio(["Layman", "Enthusiast", "Ustaz (Expert)"], label="Select your user type:"),
    gr.Textbox(label="Enter your query:"),
]
# One reaction selector per returned result (three results per query).
inputs += [gr.Radio(REACTION_CHOICES, label=f"Reaction for Result {slot}") for slot in (1, 2, 3)]

outputs = [
    gr.Textbox(label="Time taken"),
    gr.JSON(label="Results"),
    gr.Textbox(label="Save Status"),
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=inputs,
    outputs=outputs,
    title="EnlightenQalb (Alchemy of Happiness)",
    description="Search and rate results from The Alchemy of Happiness.",
)

if __name__ == "__main__":
    iface.launch()