File size: 2,790 Bytes
8aac646
 
 
 
 
 
bf9b80e
 
b19b634
 
bf9b80e
b19b634
bc78067
 
b19b634
bc78067
b19b634
 
85a8c20
b19b634
85a8c20
770946a
 
85a8c20
 
 
6881bc0
 
85a8c20
 
 
 
 
8aac646
 
b19b634
f5985dd
2865184
01206ed
2865184
01206ed
85a8c20
01206ed
85a8c20
 
 
64b11d2
01206ed
b19b634
8aac646
b19b634
c6c5724
 
 
 
 
8aac646
c6c5724
 
 
 
 
 
2858b59
c6c5724
 
2b533cd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import streamlit as st
from datasets import load_dataset
import os 

# Optional Hugging Face access token read from the environment; None when the
# variable is not set. It is forwarded to `load_dataset` further down.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Page-level configuration and heading.
st.set_page_config(layout="wide", page_title="Web Clusters inspection")
st.title("Web clusters inspection")

# Introductory text rendered at the top of the page. The trailing backslash
# inside the literal joins two source lines into a single markdown paragraph.
INTRO_MARKDOWN = """
We clustered 100k web samples using [text-clustering](https://github.com/huggingface/text-clustering). 

Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. \
Technically, we provide it with 10 random examples from the cluster in the prompt and ask it to judge their topics.

Additionally, the model was tasked with finding the topic of each cluster (based on the 10 random examples). 
"""
st.markdown(INTRO_MARKDOWN)


@st.cache_data
def load_data(min_score=1, max_score=10, show_special=False):
    """Load the clusters dataset and keep only the rows selected by the filters.

    Args:
        min_score: Lowest educational score (inclusive) to keep.
        max_score: Highest educational score (inclusive) to keep.
        show_special: When true, keep only the rows whose
            ``educational_score`` cannot be converted to an integer and drop
            every row that has a usable score.

    Returns:
        The filtered ``train`` split of the dataset.
    """
    # Alternative dataset: HuggingFaceTB/FW_clusters_free_topics
    dataset = load_dataset(
        "HuggingFaceTB/FW_clusters_100k_145_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )

    def keep_row(row):
        """Return True when `row` should remain in the filtered dataset."""
        try:
            score = int(row['educational_score'])
            if show_special:
                # Rows with a valid score are hidden in "special only" mode.
                return False
            return min_score <= score <= max_score
        except (ValueError, TypeError):
            # Score is missing or not numeric: keep the row only when the
            # caller asked for the special (undefined score) clusters.
            return show_special

    return dataset.filter(keep_row)

# --- Filter controls -------------------------------------------------------
st.subheader("Cluster information")
col_1, col_2, col_3 = st.columns(3)
with col_1:
    show_special = st.checkbox('Show only clusters with undefined educational score', False)
with col_2:
    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
with col_3:
    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')

# Load data based on slider values and checkbox status
ds = load_data(min_value, max_value, show_special)

# Sort the distinct categories so the dropdown order is stable; plain set
# iteration order is arbitrary. `key=str` keeps the sort from failing should
# the column ever hold non-string values such as None.
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# --- Cluster / example selection --------------------------------------------
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )

    # Fetch the selected row once and reuse it for both the examples and the score.
    cluster = selected_cluster[index_cluster]
    files = cluster["examples"]

    # An empty example list would make max_value (-1) smaller than min_value (0),
    # which st.number_input rejects, so the picker is only shown when there is
    # something to pick.
    if files:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )

    st.markdown(f"**Educational score of the cluster**: {cluster['educational_score']}")
    if files:
        st.markdown(files[index_example])
    else:
        st.markdown("No files found, change the cluster.")
else:
    st.markdown("No files found, change the cluster.")