import streamlit as st
from transformers import AutoTokenizer
import pandas as pd
import plotly.figure_factory as ff

tokenizer_names_to_test = [
  "xlm-roberta-base",  # old style
  "bert-base-uncased",  # old style
  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
  "bigscience/bloom",  # HuggingFace
  "StabilityAI/stablelm-base-alpha-7b",  # StableLM with Open Assistant
  "google/flan-t5-base",  # Flan T5 (better than T5), Google
  "facebook/mbart-large-50",  # Facebook
  "facebook/nllb-200-distilled-600M",  # Facebook
  "EleutherAI/gpt-neox-20b",  # same as Pythia
]
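
# A minimal sketch of helpers that the commented-out tokenization block in the sidebar
# below could use; `load_tokenizer` and `count_tokens` are hypothetical names, and
# `st.cache_resource` assumes a recent Streamlit release.
@st.cache_resource
def load_tokenizer(name):
	# Download/load the tokenizer once and reuse it across Streamlit reruns.
	return AutoTokenizer.from_pretrained(name)

def count_tokens(tokenizer, text):
	# Token count for one example (special tokens included, as in the commented code below).
	return len(tokenizer.encode(text))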

with st.sidebar:
	with st.spinner('Loading dataset...'):
		val_data = pd.read_csv('MassiveDatasetValidationData.csv')
	st.success(f'Data loaded: {len(val_data)} rows')

	languages = st.multiselect(
		'Select languages',
		options=sorted(val_data.lang.unique()),
		default=['English', 'Spanish', 'Chinese'],
		max_selections=5
	)

	# TODO multi-select tokenizers
	# TODO add OpenAI tokenizers to these options
	tokenizer_name = st.selectbox('Tokenizer', options=tokenizer_names_to_test)
	st.write('You selected:', tokenizer_name)

	# with st.spinner('Loading tokenizer...'):
	#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
	# st.success(f'Tokenizer loaded: {tokenizer_name}')

	# # TODO - preload the tokenized versions ... much easier!
	# # TODO - add the metadata data as well??? later on maybe
	# with st.spinner('Calculating tokenization for data...'):
	# 	if tokenizer_name not in val_data.columns:
	# 		val_data[f'{tokenizer_name}'] = val_data.text.apply(lambda x: len(tokenizer.encode(x)))
	# st.success('Completed.')

with st.container():

	# NOTE: for now the chart reads the precomputed 'num_tokens_openai' column from the CSV,
	# ignoring the tokenizer selected in the sidebar (see the precompute sketch at the end of this file).
	tokenizer_name = 'num_tokens_openai'

	subset_df = val_data[val_data.lang.isin(languages)]
	subset_data = [subset_df[subset_df.lang == _lang][tokenizer_name] for _lang in languages]
	fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
	st.plotly_chart(fig, use_container_width=True)


	# Alternative: one distplot per language with a fitted normal curve instead of the default KDE
	# for _lang in languages:
	# 	subset = val_data[val_data.lang == _lang][tokenizer_name]
	# 	fig = ff.create_distplot([subset], group_labels=[_lang], bin_size=.5,
	# 	                         curve_type='normal')  # override default 'kde'
	# 	st.plotly_chart(fig, use_container_width=True)
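

# A rough sketch of the "preload the tokenized versions" TODO above: run it once offline to add
# one token-count column per Hugging Face tokenizer to the CSV, plus the 'num_tokens_openai'
# column the chart reads. `precompute_token_counts` and the cl100k_base encoding are assumptions
# for illustration, not part of the app itself.
def precompute_token_counts(csv_path='MassiveDatasetValidationData.csv'):
	import tiktoken  # assumed available; only needed for this offline step
	df = pd.read_csv(csv_path)
	# OpenAI token counts via tiktoken (the encoding choice here is an assumption).
	openai_enc = tiktoken.get_encoding('cl100k_base')
	df['num_tokens_openai'] = df.text.apply(lambda x: len(openai_enc.encode(x)))
	# One column per tokenizer, mirroring the commented-out logic in the sidebar above.
	for name in tokenizer_names_to_test:
		tok = AutoTokenizer.from_pretrained(name)
		df[name] = df.text.apply(lambda x: len(tok.encode(x)))
	df.to_csv(csv_path, index=False)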