|
import streamlit as st |
|
from collections import defaultdict |
|
import tqdm |
|
import transformers |
|
from transformers import AutoTokenizer |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import numpy as np |
|
import plotly.figure_factory as ff |
|
import plotly.express as px |
|
|
|
# Hugging Face model IDs whose tokenizers are offered in the sidebar picker.
# Order matters: it is the display order of the `st.selectbox` below, and the
# first entry is its default selection.
tokenizer_names_to_test = [
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "facebook/nllb-200-distilled-600M",
    "EleutherAI/gpt-neox-20b",
]
|
|
|
with st.sidebar:
    # Load the validation dataset, showing a spinner while the CSV is read.
    with st.spinner('Loading dataset...'):
        val_data = pd.read_csv('MassiveDatasetValidationData.csv')
        st.success(f'Data loaded: {len(val_data)}')

    # Offer every language present in the dataset, capped at five picks.
    language_options = sorted(val_data.lang.unique())
    languages = st.multiselect(
        'Select languages',
        options=language_options,
        default=['English', 'Spanish', 'Chinese'],
        max_selections=5,
    )
|
|
|
|
|
|
|
# Tokenizer picker lives in the sidebar; echo the choice in the main pane.
tokenizer_name = st.sidebar.selectbox(
    'Tokenizers',
    options=tokenizer_names_to_test,
)
st.write('You selected:', tokenizer_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with st.container():
    # FIX: the original re-bound `tokenizer_name` to this hard-coded column
    # name, silently discarding the tokenizer chosen in the sidebar. Keep the
    # column in its own local so the user's selection survives.
    # TODO: derive this column from `tokenizer_name` once per-tokenizer
    # token-count columns exist in the CSV — verify the schema first.
    metric_column = 'num_tokens_openai'

    # One token-count series per selected language, in selection order so the
    # series line up with the `group_labels` legend below.
    # (Removed an unused `subset_df` filter the original computed each rerun.)
    per_language_counts = [
        val_data[val_data.lang == _lang][metric_column] for _lang in languages
    ]

    # Smoothed density curves only (no histogram bars), one trace per language.
    fig = ff.create_distplot(
        per_language_counts,
        group_labels=languages,
        show_hist=False,
    )
    st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|