import pandas as pd
import plotly.figure_factory as ff
import streamlit as st
from transformers import AutoTokenizer

# Tokenizers to compare; comments note family or provenance.
tokenizer_names_to_test = [
    "xlm-roberta-base",  # old style
    "bert-base-uncased",  # old style
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",  # HuggingFace
    "StabilityAI/stablelm-base-alpha-7b",  # StableLM with Open Assistant
    "google/flan-t5-base",  # Flan-T5 (better than T5), Google
    "facebook/mbart-large-50",  # Facebook
    "facebook/nllb-200-distilled-600M",  # Facebook
    "EleutherAI/gpt-neox-20b",  # same tokenizer as Pythia
]

with st.sidebar:
    with st.spinner('Loading dataset...'):
        val_data = pd.read_csv('MassiveDatasetValidationData.csv')
    st.success(f'Data loaded: {len(val_data)} rows')

    languages = st.multiselect(
        'Select languages',
        options=sorted(val_data.lang.unique()),
        default=['English', 'Spanish', 'Chinese'],
        max_selections=5,
    )

    # TODO: multi-select tokenizers
    # TODO: add OpenAI tokenizers to these options
    # Already inside the sidebar context, so plain st.selectbox suffices.
    tokenizer_name = st.selectbox('Tokenizers', options=tokenizer_names_to_test)
    st.write('You selected:', tokenizer_name)

# On-the-fly tokenization is disabled for now; the app reads a precomputed
# token-count column instead (see the cached sketch at the bottom of this file).
# with st.spinner('Loading tokenizer...'):
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# st.success(f'Tokenizer loaded: {tokenizer_name}')
# # TODO: preload the tokenized versions ... much easier!
# # TODO: maybe add the metadata as well, later on
# with st.spinner('Calculating tokenization for data...'):
#     if tokenizer_name not in val_data.columns:
#         val_data[tokenizer_name] = val_data.text.apply(lambda x: len(tokenizer.encode(x)))
# st.success('Completed.')

with st.container():
    # Until per-tokenizer columns exist, plot the precomputed OpenAI counts.
    tokenizer_name = 'num_tokens_openai'
    subset_data = [
        val_data[val_data.lang == _lang][tokenizer_name] for _lang in languages
    ]
    fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
    st.plotly_chart(fig, use_container_width=True)

    # Alternative left for reference: one distplot per language with a
    # normal-curve overlay instead of the default KDE.
    # for _lang in languages:
    #     subset = val_data[val_data.lang == _lang]
    #     fig = ff.create_distplot([subset[tokenizer_name]], group_labels=[_lang],
    #                              bin_size=.5, curve_type='normal')
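
# --- Sketch: cached on-the-fly token counting ---------------------------------
# A minimal sketch of the disabled tokenization path above, assuming the
# selected checkpoint is downloadable from the Hugging Face Hub and that
# val_data has a `text` column. The helper names (`load_tokenizer`,
# `count_tokens`) are illustrative, not part of the original app; Streamlit's
# cache decorators keep reruns cheap.

@st.cache_resource
def load_tokenizer(name: str):
    # Tokenizer objects are unhashable, so cache them as a shared resource.
    return AutoTokenizer.from_pretrained(name)

@st.cache_data
def count_tokens(texts: pd.Series, name: str) -> pd.Series:
    # Token count per row; results are memoized on (texts, name).
    tok = load_tokenizer(name)
    return texts.apply(lambda x: len(tok.encode(x)))

# Usage, in place of the hardcoded 'num_tokens_openai' column:
# if tokenizer_name not in val_data.columns:
#     val_data[tokenizer_name] = count_tokens(val_data.text, tokenizer_name)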