yenniejun committed on
Commit
29daff8
1 Parent(s): 0a9ef68

update code to match updated data with pre-calculated token lengths

Browse files
Files changed (2) hide show
  1. app.py +5 -4
  2. main.py +0 -71
app.py CHANGED
@@ -11,6 +11,7 @@ import plotly.figure_factory as ff
11
  import plotly.express as px
12
 
13
  tokenizer_names_to_test = [
 
14
  "xlm-roberta-base", # old style
15
  "bert-base-uncased", # old style
16
  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
@@ -51,11 +52,11 @@ with st.sidebar:
51
  # st.success('Completed.')
52
 
53
  with st.container():
 
 
 
54
 
55
- tokenizer_name = 'num_tokens_openai'
56
-
57
- subset_df = val_data[val_data.lang.isin(languages)]
58
- subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
59
  fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
60
  st.plotly_chart(fig, use_container_width=True)
61
 
 
11
  import plotly.express as px
12
 
13
  tokenizer_names_to_test = [
14
+ "openai/gpt4",
15
  "xlm-roberta-base", # old style
16
  "bert-base-uncased", # old style
17
  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
 
52
  # st.success('Completed.')
53
 
54
  with st.container():
55
+ if tokenizer_name in val_data.columns:
56
+ subset_df = val_data[val_data.lang.isin(languages)]
57
+ subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
58
 
59
+
 
 
 
60
  fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
61
  st.plotly_chart(fig, use_container_width=True)
62
 
main.py DELETED
@@ -1,71 +0,0 @@
1
- import streamlit as st
2
- from collections import defaultdict
3
- import tqdm
4
- import transformers
5
- from transformers import AutoTokenizer
6
- import pandas as pd
7
- import matplotlib.pyplot as plt
8
- import seaborn as sns
9
- import numpy as np
10
- import plotly.figure_factory as ff
11
- import plotly.express as px
12
-
13
- tokenizer_names_to_test = [
14
- "xlm-roberta-base", # old style
15
- "bert-base-uncased", # old style
16
- "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
17
- "bigscience/bloom", # HuggingFace
18
- "StabilityAI/stablelm-base-alpha-7b", # StableLM with Open Assistant
19
- "google/flan-t5-base", # Flan T5 (better than T5), Google
20
- "facebook/mbart-large-50", # Facebook
21
- "facebook/nllb-200-distilled-600M", # Facebook
22
- "EleutherAI/gpt-neox-20b", # same as Pythia
23
- ]
24
-
25
- with st.sidebar:
26
- with st.spinner('Loading dataset...'):
27
- val_data = pd.read_csv('MassiveDatasetValidationData.csv')
28
- st.success(f'Data loaded: {len(val_data)}')
29
-
30
- languages = st.multiselect(
31
- 'Select languages',
32
- options=sorted(val_data.lang.unique()),
33
- default=['English', 'Spanish' ,'Chinese'],
34
- max_selections=5
35
- )
36
-
37
- # TODO multi-select tokenizers
38
- # TODO add openai to this options
39
- tokenizer_name = st.sidebar.selectbox('Tokenizers', options=tokenizer_names_to_test)
40
- st.write('You selected:', tokenizer_name)
41
-
42
- # with st.spinner('Loading tokenizer...'):
43
- # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
44
- # st.success(f'Tokenizer loaded: {tokenizer_name}')
45
-
46
- # # TODO - preload the tokenized versions ... much easier!
47
- # # TODO - add the metadata data as well??? later on maybe
48
- # with st.spinner('Calculating tokenization for data...'):
49
- # if tokenizer_name not in val_data.columns:
50
- # val_data[f'{tokenizer_name}'] = val_data.text.apply(lambda x: len(tokenizer.encode(x)))
51
- # st.success('Completed.')
52
-
53
- with st.container():
54
-
55
- tokenizer_name = 'num_tokens_openai'
56
-
57
- subset_df = val_data[val_data.lang.isin(languages)]
58
- subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
59
- fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
60
- st.plotly_chart(fig, use_container_width=True)
61
-
62
-
63
- # for _lang in languages:
64
- # subset = val_data[val_data.lang==_lang]
65
-
66
- # fig = ff.create_distplot(val_data, bin_size=.5,
67
- # curve_type='normal', # override default 'kde'
68
- # colors=colors)
69
-
70
-
71
-