"""Streamlit page presenting a Bunka map of the OASST2 prompts.

The script renders, top to bottom: a sidebar with project and dataset links,
a sample of the prompt data, an embedded HTML map, a table of topic
statistics, and a pipeline image.
"""

import pandas as pd
import streamlit as st
import streamlit.components.v1 as components

# --- Sidebar: logo, tagline and external links -----------------------------
# NOTE(review): recent Streamlit releases may deprecate `use_column_width` in
# favour of `use_container_width` for images -- confirm against the pinned
# Streamlit version before changing it.
st.sidebar.image("images/logo.png", use_column_width=True)
st.sidebar.write("Bunka Summarizes & Visualizes Information as Maps using LLMs.")

st.sidebar.title("Github Page")
st.sidebar.write(
    "Have a look at the following package on GitHub: https://github.com/charlesdedampierre/BunkaTopics"
)

st.sidebar.title("Dataset")
st.sidebar.write(
    "We used a subset of the Open Assistant 2 dataset: https://huggingface.co/datasets/OpenAssistant/oasst2"
)

# --- Data sample -----------------------------------------------------------
st.title("How to understand large textual datasets?")
st.info(
    "We sampled every prompt from the English subset of the oasst2 dataset. Here is a sample:"
)

# The first CSV column is used as the index; only the id and text columns of
# the first 300 rows are displayed.
df = pd.read_csv("data/data_sample.csv", index_col=[0])
df = df[["message_id", "text"]]
df = df.head(300)
st.dataframe(df, use_container_width=True)

# --- Embedded map ----------------------------------------------------------
st.title("Inside the OASST2 dataset")

# Read the HTML inside a context manager so the file handle is closed
# immediately instead of leaking on every script rerun.
with open("images/map_prompt.html", "r", encoding="utf-8") as map_file:
    map_html = map_file.read()

st.info(
    "This mapping can be extended to include the assistant answers, and the prompts can be selected on a topic basis through the python package, allowing to filter and curate the data."
)
components.html(map_html, height=900, width=900)

# The message is written as two adjacent literals; the explicit "\n" encodes
# the line break that sits between the two sentences.
st.info(
    "The different clusters allow to explore the main topics evoked by the prompt. For instance, in the blink of an eye, one may see which topics are dealt with and which topics are lacking content. \n"
    "This visualisation can therefore be used as a stepping stone to investigate bias or content, providing resources to fuel the discussion open by the OASS paper available here: https://drive.google.com/file/d/10iR5hKwFqAKhL3umx8muOWSRm7hs5FqX/view"
)

# --- Topic statistics ------------------------------------------------------
st.title("Some insights by territory")

df_info = pd.read_csv("data/topics_info.csv", index_col=[0])
df_info = df_info[["name", "size", "percent"]]
# Truncate each percentage to an integer and append a "%" sign for display.
df_info["percent"] = df_info["percent"].apply(lambda x: f"{int(x)}%")
df_info = df_info.reset_index(drop=True)
st.dataframe(df_info, use_container_width=True)

# --- Pipeline overview -----------------------------------------------------
st.title("Bunka Exploration Engine")
st.image(
    "images/pipeline.png",
    use_column_width=True,
)