File size: 2,077 Bytes
1b0a5d8
 
 
 
 
 
 
 
 
 
 
 
3f7c271
1b0a5d8
 
 
3f7c271
 
 
1b0a5d8
 
 
 
 
c99d57a
 
1b0a5d8
 
3f7c271
c99d57a
3f7c271
 
1b0a5d8
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import streamlit as st
import pandas as pd
import streamlit.components.v1 as components

# Sidebar: logo and tagline first, then one titled note per external resource.
sidebar = st.sidebar
sidebar.image("images/logo.png", use_column_width=True)
sidebar.write("Bunka Summarizes & Visualizes Information as Maps using LLMs.")

# Each entry renders as a sidebar title followed by its descriptive text.
for heading, blurb in (
    (
        "Github Page",
        "Have a look at the following package on GitHub: https://github.com/charlesdedampierre/BunkaTopics",
    ),
    (
        "Dataset",
        "We used a subset of the Open Assistant 2 dataset: https://huggingface.co/datasets/OpenAssistant/oasst2",
    ),
):
    sidebar.title(heading)
    sidebar.write(blurb)

# Intro section: headline, a note on how the prompts were sampled, and a
# preview table of the sample CSV.
st.title("How to understand large textual datasets?")
st.info("We sampled every prompt from the English subset of the oasst2 dataset. Here is a sample:")

# The first CSV column is used as the index; only the id and text columns of
# the first 300 rows are displayed.
preview_columns = ["message_id", "text"]
df = pd.read_csv("data/data_sample.csv", index_col=[0])[preview_columns].head(300)
st.dataframe(df, use_container_width=True)
# Map section: headline, an explanatory note, the embedded HTML map, and a
# closing note on how to read the clusters.
st.title("Inside the OASST2 dataset")

# Read the map's HTML inside a context manager so the file handle is closed
# as soon as the markup is in memory, instead of staying open for the rest of
# the script run.
with open("images/map_prompt.html", "r", encoding="utf-8") as map_file:
    map_html = map_file.read()

st.info("This mapping can be extended to include the assistant answers, and the prompts can be selected on a topic basis through the python package, allowing to filter and curate the data.")

# Embed the markup in a fixed 900x900 frame.
components.html(map_html, height=900, width=900)
st.info(
    "The different clusters allow to explore the main topics evoked by the prompt. For instance, in the blink of an eye, one may see which topics are dealt with and which topics are lacking content. This visualisation can therefore be used as a stepping stone to investigate bias or content, providing resources to fuel the discussion open by the OASS paper available here: https://drive.google.com/file/d/10iR5hKwFqAKhL3umx8muOWSRm7hs5FqX/view"
)


# Per-topic summary table: name, size, and share of each topic.
st.title("Some insights by territory")

df_info = pd.read_csv("data/topics_info.csv", index_col=[0])[["name", "size", "percent"]]
# Show the share as a truncated integer followed by a percent sign.
df_info["percent"] = df_info["percent"].apply(lambda share: f"{int(share)}%")
# Drop the index read from the CSV in favour of a plain 0..n-1 index.
df_info = df_info.reset_index(drop=True)

st.dataframe(df_info, use_container_width=True)

# Closing section: headline plus the pipeline diagram scaled to the column.
st.title("Bunka Exploration Engine")
# NOTE(review): newer Streamlit releases appear to deprecate `use_column_width`
# on st.image — confirm against the pinned Streamlit version before changing.
st.image("images/pipeline.png", use_column_width=True)