yenniejun committed on
Commit
87987fe
1 Parent(s): d0ab3b0

rename to app.py

Files changed (1)
  1. app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
+ import streamlit as st
+ from collections import defaultdict
+ import tqdm
+ import transformers
+ from transformers import AutoTokenizer
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import numpy as np
+ import plotly.figure_factory as ff
+ import plotly.express as px
+
+ tokenizer_names_to_test = [
+     "xlm-roberta-base",  # old style
+     "bert-base-uncased",  # old style
+     "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+     "bigscience/bloom",  # HuggingFace
+     "StabilityAI/stablelm-base-alpha-7b",  # StableLM with Open Assistant
+     "google/flan-t5-base",  # Flan T5 (better than T5), Google
+     "facebook/mbart-large-50",  # Facebook
+     "facebook/nllb-200-distilled-600M",  # Facebook
+     "EleutherAI/gpt-neox-20b",  # same as Pythia
+ ]
+
+ with st.sidebar:
+     with st.spinner('Loading dataset...'):
+         val_data = pd.read_csv('MassiveDatasetValidationData.csv')
+         st.success(f'Data loaded: {len(val_data)}')
+
+     languages = st.multiselect(
+         'Select languages',
+         options=sorted(val_data.lang.unique()),
+         default=['English', 'Spanish', 'Chinese'],
+         max_selections=5
+     )
+
+     # TODO multi-select tokenizers
+     # TODO add openai to the options
+     tokenizer_name = st.sidebar.selectbox('Tokenizers', options=tokenizer_names_to_test)
+     st.write('You selected:', tokenizer_name)
+
+ # with st.spinner('Loading tokenizer...'):
+ #     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+ #     st.success(f'Tokenizer loaded: {tokenizer_name}')
+
+ # # TODO - preload the tokenized versions ... much easier!
+ # # TODO - add the metadata as well??? later on maybe
+ # with st.spinner('Calculating tokenization for data...'):
+ #     if tokenizer_name not in val_data.columns:
+ #         val_data[f'{tokenizer_name}'] = val_data.text.apply(lambda x: len(tokenizer.encode(x)))
+ #     st.success('Completed.')
+
+ with st.container():
+     # For now, plot the precomputed OpenAI token counts for the selected languages
+     tokenizer_name = 'num_tokens_openai'
+
+     subset_df = val_data[val_data.lang.isin(languages)]
+     subset_data = [val_data[val_data.lang == _lang][tokenizer_name] for _lang in languages]
+     fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
+     st.plotly_chart(fig, use_container_width=True)
+
+ # for _lang in languages:
+ #     subset = val_data[val_data.lang == _lang]
+
+ # fig = ff.create_distplot(val_data, bin_size=.5,
+ #                          curve_type='normal',  # override default 'kde'
+ #                          colors=colors)
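Note: the commented-out TODO about preloading the tokenized versions suggests precomputing token counts offline rather than tokenizing on every app run. A minimal preprocessing sketch of that idea follows; it is not part of this commit, and it assumes the CSV has a `text` column and uses one token-count column per tokenizer, as the app code implies.

    import pandas as pd
    from transformers import AutoTokenizer

    # Hypothetical offline step: add one token-count column per tokenizer
    # so the Streamlit app can read precomputed values instead of
    # tokenizing live on each interaction.
    val_data = pd.read_csv('MassiveDatasetValidationData.csv')

    for name in ["xlm-roberta-base", "bert-base-uncased"]:  # extend with the full list above
        tokenizer = AutoTokenizer.from_pretrained(name)
        if name not in val_data.columns:
            val_data[name] = val_data.text.apply(lambda x: len(tokenizer.encode(x)))

    val_data.to_csv('MassiveDatasetValidationData.csv', index=False)

With the columns precomputed this way, the commented spinner block in the app could be dropped and the selectbox value used directly as the column name when plotting.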