import random

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import streamlit as st
from transformers import AutoTokenizer  # only needed if token counts are computed on the fly (see commented block below)

@st.cache_data
def load_data():
	"""Load the precomputed Massive validation data (one row per language per utterance)."""
	return pd.read_csv('MassiveDatasetValidationData.csv')


def reload_example_text_data(subset_df, tokenizer_name):
	"""Sample one random utterance id and store its translations, with token counts, in session state."""
	random_id = random.choice(subset_df['id'].unique())
	tempdf = subset_df[subset_df['id'] == random_id].copy()
	tempdf = tempdf.rename(columns={'lang': 'Language'}).set_index('Language')
	tempdf = tempdf[['iso', 'text', tokenizer_name]]
	tempdf.columns = ['ISO', 'Text', 'Num Tokens']
	tempdf = tempdf.sort_values(by='ISO')
	st.session_state.examplesdf = tempdf




# TODO: allow user-supplied tokenizers from the Hugging Face Hub (see the commented sketch after the list)
tokenizer_names_to_test = [
  "openai/gpt4",
  "Xenova/gpt-4o", 
  "Xenova/claude-tokenizer", 
  "CohereForAI/aya-101",
  "meta-llama/Meta-Llama-3-70B", 
  "mistralai/Mixtral-8x22B-v0.1",
  "google/gemma-7b",
  "facebook/nllb-200-distilled-600M",  # Facebook
  "xlm-roberta-base",  # old style
  "bert-base-uncased",  # old style
  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
  "bigscience/bloom",  # HuggingFace
  "StabilityAI/stablelm-base-alpha-7b",  # StableLM with Open Assistant
  "google/flan-t5-base",  # Flan T5 (better than T5), Google
  "facebook/mbart-large-50",  # Facebook
  "EleutherAI/gpt-neox-20b",  # same as Pythia
]
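
# A possible way to address the TODO above, left as a commented sketch rather than
# wired-in behavior: let the user type a Hugging Face Hub repo id and, if the
# tokenizer loads, append it to the list. Note that the plots below also need a
# precomputed token-count column for that tokenizer, so loading it is only half
# the work. The widget and variable names here are assumptions, not part of the original app.
# custom_tokenizer = st.sidebar.text_input('Add a tokenizer (Hugging Face repo id)', value='')
# if custom_tokenizer:
# 	try:
# 		AutoTokenizer.from_pretrained(custom_tokenizer)  # check that the repo id resolves
# 		tokenizer_names_to_test.append(custom_tokenizer)
# 	except Exception as e:
# 		st.sidebar.error(f'Could not load tokenizer {custom_tokenizer}: {e}')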

with st.sidebar:

	st.header('All languages are NOT created (tokenized) equal!')
	link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese)."
	st.markdown(link)
	link="This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
	st.markdown(link)

	st.header('Data Visualization')
	st.subheader('Tokenizer')
	# TODO: allow selecting multiple tokenizers at once (see the commented sketch below)
	tokenizer_name = st.selectbox('Select tokenizer', options=tokenizer_names_to_test, label_visibility='collapsed')
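	# Sketch only (not part of the original app): a multiselect would return a list
	# of tokenizer names, and the plotting code further down would then need to loop
	# over that list instead of using a single tokenizer_name.
	# tokenizer_names = st.multiselect(
	# 	'Select tokenizers',
	# 	options=tokenizer_names_to_test,
	# 	default=tokenizer_names_to_test[:1],
	# )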

	if tokenizer_name not in ['openai/gpt4']:
		url = f'https://huggingface.co/{tokenizer_name}'
		link = f'Tokenizer is available [on the HuggingFace hub]({url})'
		st.markdown(link, unsafe_allow_html=True)
	else:
		link="Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
		st.markdown(link)


	st.subheader('Data')
	with st.spinner('Loading dataset...'):
		val_data = load_data()
	st.success(f'Data loaded: {len(val_data):,} rows')

	# st.write(val_data.columns, val_data.head())

	with st.expander('Data Source'):
		st.write("The data in this figure is the validation set of the [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) dataset, which consists of 2033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)")


	st.subheader('Languages')
	languages = st.multiselect(
		'Select languages',
		options=sorted(val_data.lang.unique()),
		default=['English', 'Spanish', 'Chinese', 'Burmese'],
		max_selections=6,
		label_visibility='collapsed'
	)
	
	st.subheader('Figure')
	show_hist = st.checkbox('Show histogram', value=False)



	# dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)

	# with st.spinner('Loading tokenizer...'):
	#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
	# st.success(f'Tokenizer loaded: {tokenizer_name}')

	# # TODO - add the metadata data as well??? later on maybe
	# with st.spinner('Calculating tokenization for data...'):
	# 	if tokenizer_name not in val_data.columns:
	# 		val_data[f'{tokenizer_name}'] = val_data.text.apply(lambda x: len(tokenizer.encode(x)))
	# st.success('Completed.')

with st.container():
	if tokenizer_name in val_data.columns:
		subset_df = val_data[val_data.lang.isin(languages)]
		subset_data = [val_data[val_data.lang == _lang][tokenizer_name] for _lang in languages]
	else:
		# Without a precomputed token-count column for this tokenizer, none of the plots below can be drawn.
		st.error(f'No precomputed token counts found for `{tokenizer_name}` in the dataset.')
		st.stop()

	# st.header(f'Comparing languages for {tokenizer_name}')

	st.subheader(f'Median Token Length for `{tokenizer_name}`')
	metric_cols = st.columns(len(languages))
	for i, _lang in enumerate(languages):
		metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))


	fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=show_hist)

	fig.update_layout(
		title=dict(text='Token Distribution', font=dict(size=25), automargin=True, yref='paper'),
		xaxis_title='Number of Tokens',
		yaxis_title='Density',
		height=500,
	)
	st.plotly_chart(fig, use_container_width=True)


	# Bar chart: the 7 languages with the shortest and the 7 with the longest median token counts
	shortest = val_data.groupby('lang')[tokenizer_name].median().sort_values().head(7).reset_index()
	shortest["type"] = "shortest"
	longest = val_data.groupby('lang')[tokenizer_name].median().sort_values().tail(7).reset_index()
	longest["type"] = "longest"
	combined = pd.concat([shortest, longest]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)
	color_sequence = px.colors.qualitative.D3  # You can choose other built-in sequences or define your own
	fig = px.bar(combined, x=tokenizer_name, y="lang", orientation='h', color='type', color_discrete_sequence=color_sequence)
	fig.update_traces(hovertemplate='%{y}: %{x} tokens')
	fig.update_layout(
		title=dict(
			text='Languages with the Shortest and Longest Median Token Lengths',
			font=dict(size=25), automargin=True, yref='paper', pad=dict(b=20),  # extra padding below the title
		),
		xaxis=dict(
			title='Number of Tokens',
			showgrid=True,  # show vertical gridlines
			gridwidth=1,
			gridcolor='LightGrey',
		),
		yaxis=dict(title=''),
		height=400,
		showlegend=False,  # the shortest/longest grouping is clear from the bar ordering
	)
	st.plotly_chart(fig, use_container_width=True)



	st.subheader('Example Texts')
	# Every rerun draws a fresh random utterance; the button simply forces a rerun.
	reload_example_text_data(subset_df, tokenizer_name)
	if st.button('🔄 Randomly sample'):
		reload_example_text_data(subset_df, tokenizer_name)
	st.dataframe(st.session_state.examplesdf)


	# val_median_data = val_data.groupby('lang')[tokenizer_name].apply(np.median)
	# val_median_data = val_median_data.sort_values(ascending=False)
	# val_median_data = val_median_data.reset_index()
	# # val_median_data = val_median_data[val_median_data.lang.isin(languages)]
	# val_median_data[tokenizer_name] = val_median_data[tokenizer_name].astype(int)
	# val_median_data.columns = ['Language', 'Median Number of Tokens']
	# # st.write(val_median_data.head())
	# bar_fig = px.bar(
	# 	val_median_data, 
	# 	y='Language', 
	# 	x='Median Number of Tokens', 
	# 	text_auto='d', 
	# 	orientation='h',
	# 	hover_data=val_median_data.columns,
	# 	height=1000,
	# 	)
	# bar_fig.update_traces(textfont_size=12, textangle=0, cliponaxis=False)
	# bar_fig.update_layout(
	# 			title=dict(text='Comparison of median token lengths', 
	# 				font=dict(size=20), 
	# 				automargin=True, yref='paper', ),
	# 			)
	# st.plotly_chart(bar_fig, use_container_width=True)