Update app.py
Browse files
app.py
CHANGED
@@ -4,26 +4,51 @@ import os
|
|
4 |
|
5 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
6 |
|
7 |
-
st.set_page_config(page_title="
|
8 |
-
st.title("
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
return ds
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
|
20 |
-
samples = load_data(source)
|
21 |
-
n_samples = len(samples)
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
st.
|
26 |
-
st.markdown(samples[index_example]["prompt"])
|
27 |
|
28 |
-
|
29 |
-
st.markdown(
|
|
|
4 |
|
5 |
# Hugging Face access token read from the environment; None when the
# HF_TOKEN variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

st.set_page_config(page_title="FW Clusters inspection", layout="wide")
st.title("FW clusters inspection")

# Intro text shown at the top of the page. The last paragraph explains why the
# "Select Category Type" dropdown exists.
st.markdown("""
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).

Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material.

Additionally, the model was tasked with assigning a category to each cluster from 23 predefined categories found in [AFAIK](https://afaik.io/).

Sometimes, the model may define its own category. This can happen either within the context of AFAIK topics or separately. Hence the `Select Category Type` dropdown in our interface.
""")
|
19 |
+
|
20 |
+
@st.cache_data
def load_data(educational_topic):
    """Load the cluster dataset, optionally restricted by educational flag.

    Args:
        educational_topic: 'Yes' or 'No' keeps only the rows whose
            `is_topic_educational` field equals that value; any other value
            returns the dataset unfiltered.

    Returns:
        The `train` split of HuggingFaceTB/FW_clusters_under_afaik_topics,
        filtered as described above.
    """
    dataset = load_dataset(
        "HuggingFaceTB/FW_clusters_under_afaik_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    # Guard clause: anything other than an explicit Yes/No means "no filter".
    if educational_topic not in ['Yes', 'No']:
        return dataset
    return dataset.filter(
        lambda row: row['is_topic_educational'] == educational_topic
    )
|
26 |
|
27 |
+
@st.cache_data
def get_categories_by_type(_ds, category_type):
    """Return the distinct categories of the rows with the given category type.

    Args:
        _ds: Dataset exposing `category_type` and `category` fields. The
            leading underscore follows Streamlit's convention for arguments
            excluded from cache hashing.
        category_type: Value matched against each row's `category_type`.

    Returns:
        A list of unique `category` values, sorted by their string form.
    """
    filtered_ds = _ds.filter(lambda x: x['category_type'] == category_type)
    # A bare set has an iteration order that can change between interpreter
    # processes (string hash randomization), which would reshuffle the
    # dropdown and its default choice on every restart. Sorting makes the
    # order stable; key=str keeps the sort from failing on a non-string
    # (e.g. missing) category value.
    return sorted(set(filtered_ds['category']), key=str)
|
31 |
+
|
32 |
+
|
33 |
+
# --- Cluster selection widgets -------------------------------------------
st.subheader("Cluster information")
col_1, col_2, col_3 = st.columns(3)
with col_1:
    educational_topic = st.selectbox(
        'Are the topics deemed educational by the LLM?', ["Yes", "No"]
    )

# Rows matching the educational flag picked in the first column.
ds = load_data(educational_topic)

with col_2:
    category_types = ['afaik', 'defined_by_llm', 'defined_by_llm_under_afaik']
    selected_category_type = st.selectbox("Select Category Type", category_types)
with col_3:
    # The category choices depend on the category type picked in column 2.
    categories = get_categories_by_type(ds, selected_category_type)
    selected_category = st.selectbox("Select Category", categories)

selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# Select sample index
# Row count of the filtered dataset; equal to the length of its "examples"
# column, without materializing that column just to count it.
n_samples = len(selected_cluster)
if n_samples == 0:
    # With no rows, max_value would be -1 (< min_value) and st.number_input
    # would fail; report the situation and end this script run instead.
    st.warning("No samples found for the current selection.")
    st.stop()

# The label shows the inclusive range of valid indices (0 .. n_samples - 1).
index_example = st.number_input(
    f"Index of a sample: 0 - {n_samples - 1}",
    min_value=0,
    max_value=n_samples - 1,
    value=0,
    step=1,
)

# Fetch only the chosen row rather than the whole "examples" column.
sample = selected_cluster[index_example]["examples"]
st.markdown(sample)
|