Alcime committed on
Commit
5bfd360
1 Parent(s): d2c3cd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -12
app.py CHANGED
@@ -3,35 +3,34 @@ import pandas as pd
3
  import streamlit.components.v1 as components
4
 
5
  st.sidebar.image("images/logo.png", use_column_width=True)
6
- st.sidebar.write("Bunka Summarizes & Visualizes Information as Maps using LLMs.")
7
  st.sidebar.title("Github Page")
8
  st.sidebar.write(
9
- "Have a look at the following package on GitHub: https://github.com/charlesdedampierre/BunkaTopics"
10
  )
11
  st.sidebar.title("Dataset")
12
  st.sidebar.write(
13
- "We used a subset of the Open Assistant 2 dataset: https://huggingface.co/datasets/OpenAssistant/oasst2"
14
  )
15
 
16
- st.title("How to understand large textual datasets?")
17
  st.info(
18
- "We sampled every prompt from the English subset of the oasst2 dataset. Here is a sample:"
19
  )
20
- df = pd.read_csv("data/data_sample.csv", index_col=[0])
21
- df = df[["message_id", "text"]]
22
  df = df.head(300)
23
  st.dataframe(df, use_container_width=True)
24
- st.title("Inside the OASST2 dataset")
25
- element = open("images/map_prompt.html", "r", encoding="utf-8")
26
- st.info("This mapping can be extended to include the assistant answers, and the prompts can be selected on a topic basis through the python package, allowing to filter and curate the data.")
27
 
28
  components.html(element.read(), height=900, width=900)
29
  st.info(
30
- "The different clusters allow to explore the main topics evoked by the prompt. For instance, in the blink of an eye, one may see which topics are dealt with and which topics are lacking content. This visualisation can therefore be used as a stepping stone to investigate bias or content, providing resources to fuel the discussion open by the OASS paper available here: https://drive.google.com/file/d/10iR5hKwFqAKhL3umx8muOWSRm7hs5FqX/view"
31
  )
32
 
33
 
34
- st.title("Some insights by territory")
35
  df_info = pd.read_csv("data/topics_info.csv", index_col=[0])
36
  df_info = df_info[["name", "size", "percent"]]
37
  df_info["percent"] = df_info["percent"].apply(lambda x: str(int(x)) + "%")
 
3
  import streamlit.components.v1 as components
4
 
5
  st.sidebar.image("images/logo.png", use_column_width=True)
6
+ st.sidebar.write("Bunka Visualise & Explore l'nformation via des cartes utilisant des LLMs.")
7
  st.sidebar.title("Github Page")
8
  st.sidebar.write(
9
+ "N'hésitez pas à découvrir notre package Github: https://github.com/charlesdedampierre/BunkaTopics"
10
  )
11
  st.sidebar.title("Dataset")
12
  st.sidebar.write(
13
+ "Nous avons scrappé tous les publications TTSO publiées en 2023, puis nous avons découpé chaque publication en articles. "
14
  )
15
 
16
+ st.title("Un an de TTSO, par où commencer ?")
17
  st.info(
18
+ "Nous avons commencé par extraire tout le contenu des publications de TTSO en 2023. Voici un extrait:"
19
  )
20
+ df = pd.read_csv("data/tts_cleaned_data.csv", names=["Date", "Contenu"])
 
21
  df = df.head(300)
22
  st.dataframe(df, use_container_width=True)
23
+ st.title("A l'intérieur de TTSO")
24
+ element = open("images/ttso_carte.html", "r", encoding="utf-8")
25
+ st.info("Cette carte permet d'explorer les différents sujets abordés par TTSO en 2023, regroupés automatiquement par thème via la technologie de Bunka.")
26
 
27
  components.html(element.read(), height=900, width=900)
28
  st.info(
29
+ "En un clin d'oeil, il est possible d'explorer les différents sujets abordés par TTSO. On constate le mélange d'actualité internationales, avec en haut de la carte les conflits israelo-palestiniens et la guerre en Ukraine, avec des actualités de plus en plus française lorsque l'on avance vers le bas de la carte."
30
  )
31
 
32
 
33
+ st.title("Des insights par sujet")
34
  df_info = pd.read_csv("data/topics_info.csv", index_col=[0])
35
  df_info = df_info[["name", "size", "percent"]]
36
  df_info["percent"] = df_info["percent"].apply(lambda x: str(int(x)) + "%")