Kamaljp committed on
Commit
21e0900
1 Parent(s): 13b23e1

included up to model architectures

Files changed (2)
  1. app.py +260 -16
  2. task_arch.json +1 -0
app.py CHANGED
@@ -1,24 +1,268 @@
 import streamlit as st
-from transformers import pipeline
-from PIL import Image
-
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
-
-pipeline = pipeline(task="image-classification",
-                    model="julien-c/hotdog-not-hotdog")
-
-st.title("Hot Dog? Or Not?")
-
-file_name = st.file_uploader("Upload a hot dog candidate image")
-
-if file_name is not None:
-    col1, col2 = st.columns(2)
-
-    image = Image.open(file_name)
-    col1.image(image, use_column_width=True)
-    predictions = pipeline(image)
-
-    col2.header("Probabilities")
-    for p in predictions:
-        col2.subheader(f"{p['label']}: {round(p['score'] * 100, 1)}%")
 import streamlit as st
+from transformers import (
+    AutoTokenizer,
+    XLNetTokenizer
+)
+import pathlib
+import json
+
+st.set_page_config(layout='wide')
+
+st.title("Transformers Library for NLP Tasks: Structured by Topics")
+
+st.write("Let's start with the architectures of models")
+
+neural_net_models = dict({
+    'encoder': "responsible for understanding the input text.",
+    'decoder': "designed to generate new text, answering queries.",
+    'encoder-decoder': "able to both understand and generate text, and show emergent behaviour",
+    'convolution': "used for image recognition and processing.",
+})
+model_types = list(neural_net_models.keys())
+
+archs = st.radio("model architectures".capitalize(), model_types)
+
+st.write(f"{archs.capitalize()} models are {neural_net_models[archs]}")
+
+domains = dict({
+    "computer_vision": {
+        "encoder": ['vit', 'swin', 'segformer', 'beit'],
+        "decoder": ['imagegpt'],
+        "encoder-decoder": ['detr'],
+        "convolution": ['convnext']
+    },
+    "nlp": {
+        "encoder": ["bert", "roberta", "albert", "distilbert",
+                    "deberta", "longformer"],
+        "decoder": ["gpt-2", "xlnet", "gpt-j", "opt", "bloom"],
+        "encoder-decoder": ["bart", "pegasus", "t5"],
+    },
+    "audio": {
+        "encoder": ["wav2vec2", "hubert"],
+        "encoder-decoder": ["speech2text", "whisper"]
+    },
+    "multimodal": {
+        "encoder": ["visualbert", "vilt", "clip", "owl-vit"],
+        "encoder-decoder": ["trocr", "donut"]
+    },
+    "reinforcement": {
+        "decoder": ["trajectory transformer", "decision transformer"]
+    }
+})
+
+st.write("Let's look at the individual domains")
+
+domain_list = list(domains.keys())
+
+doms = st.radio("domains of ai".capitalize(), domain_list)
+
+st.write(domains[doms])
+
+st.write("Now come the Tokenizers, the entry points")
+
+tokenizer_algos = {
+    "byte_pair": {
+        "base": ['gpt', 'gpt-2 (byte-level)'],
+        "intro": "https://arxiv.org/abs/1508.07909"
+    },
+    "wordpiece": {
+        "base": ['bert', 'distilbert', 'electra'],
+        "intro": "https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf"
+    },
+    "unigram": {
+        "base": ['not_used'],
+        "intro": "https://arxiv.org/pdf/1804.10959.pdf"
+    },
+    "sentencepiece": {
+        "base": ["xlm", "albert", "xlnet", "marian", "t5"],
+        "intro": "https://arxiv.org/pdf/1808.06226.pdf"
+    }
+}
+
+tokenizer_items = list(tokenizer_algos.keys())
+
+algos = st.radio("tokenizer algos".capitalize(), tokenizer_items)
+
+st.write(tokenizer_algos[algos])
+
+st.write("""We will run 3 types of tokenizers on a single sentence
+to see how their outputs differ, encoding and then decoding the text.""")
+
+st.markdown("""### Models in Review:
+- gpt2
+- bert-base-uncased
+- xlnet-base-cased""")
+
+input_sentence = "This is a sample sentence for testing tokenizers"
+
+gpt2_model = "gpt2"
+bert_model = "bert-base-uncased"
+xlm_model = "xlnet-base-cased"
+
+gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model)
+bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
+xlm_tokenizer = XLNetTokenizer.from_pretrained(xlm_model)
+
+st.markdown("#### The input sentence is")
+st.write("The sample sentence: ", input_sentence)
+
+gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)
+bert_tokenize = bert_tokenizer.tokenize(input_sentence)
+xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)
+
+with st.expander(label="Byte-Pair Tokenizer", expanded=False):
+    st.write("gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)")
+    st.write(gpt2_tokenize)
+with st.expander(label="WordPiece Tokenizer", expanded=False):
+    st.write("bert_tokenize = bert_tokenizer.tokenize(input_sentence)")
+    st.write(bert_tokenize)
+with st.expander(label="SentencePiece Tokenizer", expanded=False):
+    st.write("xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)")
+    st.write(xlm_tokenize)
+
+st.markdown("""#### Tokenizer Options:
+The following Tokenizer parameters are the most used:
+- padding = 'longest' (True), 'max_length', 'do_not_pad' (False)
+- truncation = 'longest_first' (True), 'only_second', 'only_first',
+  'do_not_truncate' (False)
+- max_length = int (<= model_max_length)""")
+# Refer to https://huggingface.co/docs/transformers/pad_truncation
+gpt2_max_length = gpt2_tokenizer.model_max_length
+bert_max_length = bert_tokenizer.model_max_length
+xlm_max_length = "Not specified"
+
+st.markdown("""We also need the model max length, which is
+what the model is configured with.""")
+st.write("GPT: ", gpt2_max_length)
+st.write("Bert: ", bert_max_length)
+st.write("XLM: ", xlm_max_length)
+
+sent1 = "This app is talking about the variety of Tokenizers and their outputs"
+sent2 = """Tokenizers do one thing: bring out numbers from text. The better the numbers,
+the better the results"""
+
+st.write("We will be working with the following sentences.")
+st.write("Sentence1: ", sent1)
+st.write("Sentence2: ", sent2)
+
+st.markdown("#### Tokenization in Action, Using the GPT-2 Tokenizer")
+st.markdown("""##### Trial-1:
+> No parameters provided
+> Sentences are given with comma separation""")
+gpt2_encode = gpt2_tokenizer(sent1, sent2)
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-2:
+> No parameters provided
+> Sentences are made into a list""")
+gpt2_encode = gpt2_tokenizer([sent1, sent2])
+st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2])")
+st.write(gpt2_encode)
+
+# gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+gpt2_tokenizer.pad_token_id = gpt2_tokenizer.eos_token_id
+
+st.markdown("""##### Trial-3:
+> Need to add a pad token to the tokenizer if the model doesn't have one.
+> padding = True
+> Sentences are made into a list""")
+gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)
+st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)")
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-4:
+> Need to add a pad token to the tokenizer if the model doesn't have one.
+> padding = 'max_length' (requires max_length = int)
+> Sentences are made into a list""")
+gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             padding='max_length',
+                             max_length=15)
+st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             padding='max_length',
+                             max_length=15)""")
+
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-5:
+> truncation = True (requires max_length = int)
+> Sentences are separated by a comma
+Will see a total output of 12 tokens, 6 per sentence""")
+
+gpt2_encode = gpt2_tokenizer(sent1, sent2,
+                             truncation=True,
+                             max_length=12)
+st.write("""gpt2_encode = gpt2_tokenizer(sent1, sent2,
+                             truncation=True,
+                             max_length=12)""")
+
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-6:
+> truncation = True (requires max_length = int)
+> Sentences are made into a list
+Each sentence is truncated to 12 tokens ('longest_first' acts per sequence here)""")
+
+gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation=True,
+                             max_length=12)
+st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation=True,
+                             max_length=12)""")
+
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-7:
+> truncation = 'only_first'
+> Sentences are made into a list
+Each sentence is truncated to 8 tokens""")
+
+gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation='only_first',
+                             max_length=8)
+st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation='only_first',
+                             max_length=8)""")
+
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-8:
+> truncation = False ('only_second' errors out here)
+> Sentences are made into a list
+No truncation, so two full input_ids lists""")
+
+gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation=False,
+                             max_length=7)
+st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation=False,
+                             max_length=7)""")
+
+st.write(gpt2_encode)
+
+curr_dir = pathlib.Path(__file__).parent.resolve()
+file_loc = curr_dir / "task_arch.json"
+file_loc = file_loc.resolve()
+
+with open(file_loc, 'r') as arch:
+    data = json.load(arch)
+
+tasks = list(data.keys())
+st.markdown("#### Let's dive into the model architectures...")
+
+task = st.radio("The NLP tasks", tasks)
+
+task_data = data[task]
+
+num_models = len(task_data['architectures'])
+
+show_archs = st.slider("How many archs to show",
+                       min_value=4, max_value=num_models)
+
+pruned_data = {
+    "architectures": task_data['architectures'][:show_archs],
+    "AutoModelClass": task_data["AutoModelClass"],
+    "dataset": task_data["dataset"],
+    "model_used": task_data["model_used"]
+}
+
+st.write(pruned_data)
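
The tokenizer walkthrough above promises an encode-then-decode round trip but the app only ever calls tokenize(). A minimal sketch of that round trip, assuming the same three checkpoints the app loads:

from transformers import AutoTokenizer

sentence = "This is a sample sentence for testing tokenizers"

for checkpoint in ("gpt2", "bert-base-uncased", "xlnet-base-cased"):
    tok = AutoTokenizer.from_pretrained(checkpoint)
    ids = tok.encode(sentence)                        # text -> token ids
    text = tok.decode(ids, skip_special_tokens=True)  # ids -> text
    print(checkpoint, "ids:", ids)
    print(checkpoint, "round trip:", repr(text))

skip_special_tokens drops markers like BERT's [CLS]/[SEP], so the decoded string comes back close to the original input.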
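The length claims in Trials 4, 5, and 8 can be checked by reading the lengths back from the returned input_ids. A sketch under the same assumptions as the diff (the gpt2 checkpoint; sent2 shortened here for brevity):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # GPT-2 ships without a pad token

sent1 = "This app is talking about the variety of Tokenizers and their outputs"
sent2 = "Tokenizers do one thing: bring out numbers from text."

# Trial-4: padding='max_length' pads each sequence up to max_length
enc = tok([sent1, sent2], padding='max_length', max_length=15)
print([len(ids) for ids in enc['input_ids']])  # [15, 15] if neither exceeds 15

# Trial-5: truncation=True on a sentence *pair* trims the pair to 12 tokens total
enc = tok(sent1, sent2, truncation=True, max_length=12)
print(len(enc['input_ids']))  # 12

# Trial-8: truncation=False ignores max_length; sequences keep their full length
enc = tok([sent1, sent2], truncation=False, max_length=7)
print([len(ids) for ids in enc['input_ids']])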
task_arch.json ADDED
@@ -0,0 +1 @@
+{"text_classification": {"architectures": ["ALBERT", "BART", "BERT", "BigBird", "BigBird-Pegasus", "BioGpt", "BLOOM", "CamemBERT", "CANINE", "CodeLlama", "ConvBERT", "CTRL", "Data2VecText", "DeBERTa", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ErnieM", "ESM", "Falcon", "FlauBERT", "FNet", "Funnel Transformer", "GPT-Sw3", "OpenAI GPT-2", "GPTBigCode", "GPT Neo", "GPT NeoX", "GPT-J", "I-BERT", "LayoutLM", "LayoutLMv2", "LayoutLMv3", "LED", "LiLT", "LLaMA", "Longformer", "LUKE", "MarkupLM", "mBART", "MEGA", "Megatron-BERT", "Mistral", "Mixtral", "MobileBERT", "MPNet", "MPT", "MRA", "MT5", "MVP", "Nezha", "Nystr\u00f6mformer", "OpenLlama", "OpenAI GPT", "OPT", "Perceiver", "Persimmon", "Phi", "PLBart", "QDQBert", "Qwen2", "Reformer", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "SqueezeBERT", "T5", "TAPAS", "Transformer-XL", "UMT5", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForSequenceClassification", "dataset": "imdb", "model_used": "distilbert-base-uncased"}, "token_classification": {"architectures": ["ALBERT", "BERT", "BigBird", "BioGpt", "BLOOM", "BROS", "CamemBERT", "CANINE", "ConvBERT", "Data2VecText", "DeBERTa", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ErnieM", "ESM", "Falcon", "FlauBERT", "FNet", "Funnel Transformer", "GPT-Sw3", "OpenAI GPT-2", "GPTBigCode", "GPT Neo", "GPT NeoX", "I-BERT", "LayoutLM", "LayoutLMv2", "LayoutLMv3", "LiLT", "Longformer", "LUKE", "MarkupLM", "MEGA", "Megatron-BERT", "MobileBERT", "MPNet", "MPT", "MRA", "MT5", "Nezha", "Nystr\u00f6mformer", "Phi", "QDQBert", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "SqueezeBERT", "T5", "UMT5", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForTokenClassification", "dataset": "wnut_17", "model_used": "distilbert-base-uncased"}, "question_answering": {"architectures": ["ALBERT", "BART", "BERT", "BigBird", "BigBird-Pegasus", "BLOOM", "CamemBERT", "CANINE", "ConvBERT", "Data2VecText", "DeBERTa", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ErnieM", "Falcon", "FlauBERT", "FNet", "Funnel Transformer", "OpenAI GPT-2", "GPT Neo", "GPT NeoX", "GPT-J", "I-BERT", "LayoutLMv2", "LayoutLMv3", "LED", "LiLT", "LLaMA", "Longformer", "LUKE", "LXMERT", "MarkupLM", "mBART", "MEGA", "Megatron-BERT", "MobileBERT", "MPNet", "MPT", "MRA", "MT5", "MVP", "Nezha", "Nystr\u00f6mformer", "OPT", "QDQBert", "Reformer", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "Splinter", "SqueezeBERT", "T5", "UMT5", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForQuestionAnswering", "dataset": "squad", "model_used": "distilbert-base-uncased"}, "causal_lm": {"architectures": ["BART", "BERT", "Bert Generation", "BigBird", "BigBird-Pegasus", "BioGpt", "Blenderbot", "BlenderbotSmall", "BLOOM", "CamemBERT", "CodeLlama", "CodeGen", "CPM-Ant", "CTRL", "Data2VecText", "ELECTRA", "ERNIE", "Falcon", "Fuyu", "GIT", "GPT-Sw3", "OpenAI GPT-2", "GPTBigCode", "GPT Neo", "GPT NeoX", "GPT NeoX Japanese", "GPT-J", "LLaMA", "Marian", "mBART", "MEGA", "Megatron-BERT", "Mistral", "Mixtral", "MPT", "MusicGen", "MVP", "OpenLlama", "OpenAI GPT", "OPT", "Pegasus", "Persimmon", "Phi", "PLBart", "ProphetNet", "QDQBert", "Qwen2", "Reformer", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "RWKV", "Speech2Text2", "Transformer-XL", "TrOCR", "Whisper", "XGLM", "XLM", "XLM-ProphetNet", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD"], "AutoModelClass": "AutoModelForCausalLM", "dataset": "eli5_category", "model_used": "distilgpt2"}, "masked_lm": {"architectures": ["ALBERT", "BART", "BERT", "BigBird", "CamemBERT", "ConvBERT", "Data2VecText", "DeBERTa", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ESM", "FlauBERT", "FNet", "Funnel Transformer", "I-BERT", "LayoutLM", "Longformer", "LUKE", "mBART", "MEGA", "Megatron-BERT", "MobileBERT", "MPNet", "MRA", "MVP", "Nezha", "Nystr\u00f6mformer", "Perceiver", "QDQBert", "Reformer", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "SqueezeBERT", "TAPAS", "Wav2Vec2", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForMaskedLM", "dataset": "eli-5", "model_used": "distilroberta-base"}, "translation": {"architectures": ["BART", "BigBird-Pegasus", "Blenderbot", "BlenderbotSmall", "Encoder decoder", "FairSeq Machine-Translation", "GPTSAN-japanese", "LED", "LongT5", "M2M100", "Marian", "mBART", "MT5", "MVP", "NLLB", "NLLB-MOE", "Pegasus", "PEGASUS-X", "PLBart", "ProphetNet", "SeamlessM4T", "SeamlessM4Tv2", "SwitchTransformers", "T5", "UMT5", "XLM-ProphetNet"], "AutoModelClass": "AutoModelForSeq2SeqLM", "dataset": "opus_books", "model_used": "t5-small"}, "summarization": {"architectures": ["BART", "BigBird-Pegasus", "Blenderbot", "BlenderbotSmall", "Encoder decoder", "FairSeq Machine-Translation", "GPTSAN-japanese", "LED", "LongT5", "M2M100", "Marian", "mBART", "MT5", "MVP", "NLLB", "NLLB-MOE", "Pegasus", "PEGASUS-X", "PLBart", "ProphetNet", "SeamlessM4T", "SeamlessM4Tv2", "SwitchTransformers", "T5", "UMT5", "XLM-ProphetNet"], "AutoModelClass": "AutoModelForSeq2SeqLM", "dataset": "billsum", "model_used": "t5-small"}, "multiple_choice": {"architectures": ["ALBERT", "BERT", "BigBird", "CamemBERT", "CANINE", "ConvBERT", "Data2VecText", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ErnieM", "FlauBERT", "FNet", "Funnel Transformer", "I-BERT", "Longformer", "LUKE", "MEGA", "Megatron-BERT", "MobileBERT", "MPNet", "MRA", "Nezha", "Nystr\u00f6mformer", "QDQBert", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "SqueezeBERT", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForMultipleChoice", "dataset": "swag", "model_used": "bert-base-uncased"}}