|
import os

import streamlit as st
|
|
|
|
|
# Directory containing the plot images; file names are built as
# "<language code>_<filter name with underscores>.png".
PATH_PLOTS = "./plots"

# Display name of each language -> the code used in plot file names.
# The key order is the order in which languages are listed in the sidebar.
LANGUAGES = {
    "Arabic": "ar",
    "Basque": "eu",
    "Bengali": "bn",
    "Catalan": "ca",
    "Chinese": "zh",
    "English": "en",
    "French": "fr",
    "Hindi": "hi",
    "Indonesian": "id",
    "Portuguese": "pt",
    "Spanish": "es",
    "Urdu": "ur",
    "Vietnamese": "vi",
}

# Filter names as shown in the sidebar; spaces are replaced by underscores
# when a name is turned into a plot file name.
FILTERS = [
    "number of words",
    "character repetition ratio",
    "word repetition ratio",
    "special character ratio",
    "closed class word ratio",
    "flagged word ratio",
    "perplexity score",
]
|
|
|
|
|
class Visualization:
    """Streamlit page that shows one plot image per (language, filter) choice.

    ``choose_language`` and ``choose_filter`` store the user's selections on
    the instance (``chosen_language`` and ``chosen_filter``); ``display_plot``
    reads both, so it must run after them. ``visualization`` calls everything
    in the required order.
    """

    # Language preselected in the sidebar. Looked up by name so that adding
    # or reordering entries in LANGUAGES cannot silently change the default.
    DEFAULT_LANGUAGE = "English"

    def __init__(self) -> None:
        """Create the page object; selections are made by the ``choose_*`` methods."""

    def set_title(self) -> None:
        """Render the page title."""
        st.title("Visualization of the distributions of the filter values for the BigScience Corpus")

    def choose_language(self) -> None:
        """Show the language selector in the sidebar and store the chosen code.

        Sets ``self.chosen_language`` to the LANGUAGES value (language code)
        of the selected display name.
        """
        language_names = list(LANGUAGES)
        try:
            default_index = language_names.index(self.DEFAULT_LANGUAGE)
        except ValueError:
            # Default language is not offered: fall back to the first entry.
            default_index = 0

        chosen_language = st.sidebar.selectbox(
            "Language",
            options=language_names,
            index=default_index,
        )
        self.chosen_language = LANGUAGES[chosen_language]

    def choose_filter(self) -> None:
        """Show the filter selector in the sidebar and store the chosen filter.

        Sets ``self.chosen_filter`` to the selected filter name with spaces
        replaced by underscores, the form used in plot file names.
        """
        chosen_filter = st.sidebar.selectbox(
            "Filter on the",
            options=FILTERS,
            index=0,
        )
        self.chosen_filter = chosen_filter.replace(" ", "_")

    def display_plot(self) -> None:
        """Display the plot for the current language and filter.

        The image is placed in the wide middle column of a 1:6:1 layout. If
        no image file exists for the current selection, a warning is shown
        in its place.
        """
        path_image = f"{PATH_PLOTS}/{self.chosen_language}_{self.chosen_filter}.png"

        # The two outer columns are left empty and act as side margins.
        _, center, _ = st.columns([1, 6, 1])
        with center:
            if os.path.isfile(path_image):
                st.image(path_image)
            else:
                # Not every language/filter combination necessarily has a plot.
                st.warning(f"No plot available at {path_image}.")

    def visualization(self) -> None:
        """Build the whole page: title, sidebar selectors, then the plot."""
        self.set_title()
        self.choose_language()
        self.choose_filter()
        self.display_plot()
|
|
|
|
|
if __name__ == "__main__":
    # Switch to the wide layout before any page content is produced.
    st.set_page_config(layout="wide")

    # Build and render the page.
    app = Visualization()
    app.visualization()
|
|