update: Enhance dataset management by loading existing data and pushing updates to the Hugging Face Hub
Browse files
app.py
CHANGED
@@ -3,7 +3,10 @@ import torch as t
|
|
3 |
import pandas as pd
|
4 |
from sentence_transformers import SentenceTransformer, util
|
5 |
from time import perf_counter as timer
|
6 |
-
from datasets import Dataset
|
|
|
|
|
|
|
7 |
|
8 |
def load_data(database_file):
|
9 |
df = pd.read_parquet(database_file)
|
@@ -26,8 +29,28 @@ def save_reactions_to_dataset(user_type, query, results):
|
|
26 |
data["retrieved_text"].append(result["text"])
|
27 |
data["reaction"].append(result["reaction"])
|
28 |
|
29 |
-
dataset
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
def main():
|
33 |
st.title("Semantic Text Retrieval Evaluation Interface")
|
@@ -42,6 +65,13 @@ def main():
|
|
42 |
if "results_saved" not in st.session_state:
|
43 |
st.session_state.results_saved = False
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
# Select device
|
46 |
device = "cuda" if t.cuda.is_available() else "cpu"
|
47 |
st.write(f"Using device: {device}")
|
|
|
3 |
import pandas as pd
|
4 |
from sentence_transformers import SentenceTransformer, util
|
5 |
from time import perf_counter as timer
|
6 |
+
from datasets import Dataset, load_dataset
|
7 |
+
from huggingface_hub import login
|
8 |
+
import os
|
9 |
+
|
10 |
|
11 |
def load_data(database_file):
|
12 |
df = pd.read_parquet(database_file)
|
|
|
29 |
data["retrieved_text"].append(result["text"])
|
30 |
data["reaction"].append(result["reaction"])
|
31 |
|
32 |
+
# Load existing dataset from the Hub (if it exists)
|
33 |
+
try:
|
34 |
+
dataset = load_dataset("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation", split="train")
|
35 |
+
existing_data = dataset.to_dict()
|
36 |
+
except Exception:
|
37 |
+
# If the dataset can't be loaded for any reason (e.g. it doesn't exist yet), start with an empty dataset
|
38 |
+
existing_data = {
|
39 |
+
"user_type": [],
|
40 |
+
"query": [],
|
41 |
+
"retrieved_text": [],
|
42 |
+
"reaction": []
|
43 |
+
}
|
44 |
+
|
45 |
+
# Append new data to existing data
|
46 |
+
for key in data:
|
47 |
+
existing_data[key].extend(data[key])
|
48 |
+
|
49 |
+
# Create a new dataset from the combined data
|
50 |
+
updated_dataset = Dataset.from_dict(existing_data)
|
51 |
+
|
52 |
+
# Push the updated dataset to the Hub
|
53 |
+
updated_dataset.push_to_hub("HumbleBeeAI/al-ghazali-rag-retrieval-evaluation")
|
54 |
|
55 |
def main():
|
56 |
st.title("Semantic Text Retrieval Evaluation Interface")
|
|
|
65 |
if "results_saved" not in st.session_state:
|
66 |
st.session_state.results_saved = False
|
67 |
|
68 |
+
# Access the Hugging Face token from the environment variable
|
69 |
+
huggingface_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
|
70 |
+
if huggingface_token:
|
71 |
+
login(token=huggingface_token)
|
72 |
+
else:
|
73 |
+
st.error("Hugging Face API token not found in environment variables.")
|
74 |
+
|
75 |
# Select device
|
76 |
device = "cuda" if t.cuda.is_available() else "cpu"
|
77 |
st.write(f"Using device: {device}")
|