loubnabnl HF staff committed on
Commit
64b11d2
1 Parent(s): 94366ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -25
app.py CHANGED
@@ -5,45 +5,34 @@ import os
5
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
 
7
  st.set_page_config(page_title="FW Clusters inspection", layout="wide")
8
- st.title("FW clusters inspection (on AFAIK topics)")
9
 
10
  st.markdown("""
11
  We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
12
 
13
- Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material.
14
 
15
- Additionally, the model was tasked with assigning a category to each cluster from 23 predefined categories found in [AFAIK](https://afaik.io/).
16
-
17
- Sometimes, the model may define its own category. This can happen either within the context of AFAIK topics or seperately. Hence the `Select Category Type` dropdown in our interface.
18
  """)
19
 
20
  @st.cache_data
21
- def load_data(educational_topic):
22
- ds = load_dataset("HuggingFaceTB/FW_clusters_under_afaik_topics", split="train", token=HF_TOKEN, num_proc=2)
23
- if educational_topic in ['Yes', 'No']:
24
- ds = ds.filter(lambda x: x['is_topic_educational'] == educational_topic)
25
  return ds
26
 
27
- @st.cache_data
28
- def get_categories_by_type(_ds, category_type):
29
- filtered_ds = _ds.filter(lambda x: x['category_type'] == category_type)
30
- return list(set(filtered_ds['category']))
31
-
32
 
33
  st.subheader("Cluster information")
34
- col_1, col_2, col_3 = st.columns(3)
35
- with col_1:
36
- educational_topic = st.selectbox('Are the topics deemed educational by the LLM?', ["Yes", "No"])
 
 
37
 
38
- ds = load_data(educational_topic)
 
39
 
40
- with col_2:
41
- category_types = ['afaik', 'defined_by_llm', 'defined_by_llm_under_afaik']
42
- default_index = 0 if educational_topic == "Yes" else 1
43
- selected_category_type = st.selectbox("Select Category Type", category_types, index=default_index)
44
- with col_3:
45
- categories = get_categories_by_type(ds, selected_category_type)
46
- selected_category = st.selectbox("Select Category", categories)
47
 
48
  selected_cluster = ds.filter(lambda x: x['category'] == selected_category)
49
 
 
5
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
 
7
  st.set_page_config(page_title="FW Clusters inspection", layout="wide")
8
+ st.title("FW clusters inspection (free topics)")
9
 
10
  st.markdown("""
11
  We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
12
 
13
+ Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10.
14
 
15
+ Additionally, the model was tasked with finding the topic of each cluster.
 
 
16
  """)
17
 
18
  @st.cache_data
19
+ def load_data(educational_topic, min_score=1, max_score=10):
20
+ ds = load_dataset("HuggingFaceTB/FW_clusters_free_topics", split="train", token=HF_TOKEN, num_proc=2)
21
+ ds = ds.filter(lambda x: x['educational_score'] <= max_score and x['educational_score'] >= min_score)
 
22
  return ds
23
 
 
 
 
 
 
24
 
25
  st.subheader("Cluster information")
26
+ min_score, max_score = st.columns(2)
27
+ with min_score:
28
+ min_value = st.slider('Select minimum educational score', 1, 10, 1)
29
+ with max_score:
30
+ max_value = st.slider('Select maximum educational score', 1, 10, 10)
31
 
32
+ ds = load_data(educational_topic, min_score, max_score)
33
+ categories = list(set(ds["category"]))
34
 
35
+ selected_category_type = st.selectbox("Select a topic", categories)
 
 
 
 
 
 
36
 
37
  selected_cluster = ds.filter(lambda x: x['category'] == selected_category)
38