Kamaljp committed on
Commit
21e0900
1 Parent(s): 13b23e1

included up to model architectures

Files changed (2)
  1. app.py +260 -16
  2. task_arch.json +1 -0
app.py CHANGED
@@ -1,24 +1,268 @@
 import streamlit as st
-from transformers import pipeline
-from PIL import Image
-
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
-
-pipeline = pipeline(task="image-classification",
-                    model="julien-c/hotdog-not-hotdog")
-
-st.title("Hot Dog? Or Not?")
-
-file_name = st.file_uploader("Upload a hot dog candidate image")
-
-if file_name is not None:
-    col1, col2 = st.columns(2)
-
-    image = Image.open(file_name)
-    col1.image(image, use_column_width=True)
-    predictions = pipeline(image)
-
-    col2.header("Probabilities")
-    for p in predictions:
-        col2.subheader(f"{p['label']}: {round(p['score'] * 100, 1)}%")
 import streamlit as st
+from transformers import (
+    AutoTokenizer,
+    XLNetTokenizer
+)
+import pathlib
+import json
+
+st.set_page_config(layout='wide')
+
+st.title("Transformers Library for NLP Tasks: Structured by Topics")
+
+st.write("Let's start with the architectures of models")
+
+neural_net_models = dict({
+    'encoder': "responsible for understanding the input text.",
+    'decoder': "designed to generate new text, answering queries.",
+    'encoder-decoder': "able to both understand and generate text, and show emergent behaviour",
+    'convolution': "used for image recognition and processing.",
+})
+model_types = list(neural_net_models.keys())
+
+archs = st.radio("model architectures".capitalize(), model_types)
+
+st.write(f"{archs.capitalize()} models are {neural_net_models[archs]}")
+
+domains = dict({
+    "computer_vision": {
+        "encoder": ['vit', 'swin', 'segformer', 'beit'],
+        "decoder": ['imagegpt'],
+        "encoder-decoder": ['detr'],
+        "convolution": ['convnext']
+    },
+    "nlp": {
+        "encoder": ["bert", "roberta", "albert", "distilbert",
+                    "deberta", "longformer"],
+        "decoder": ["gpt-2", "xlnet", "gpt-j", "opt", "bloom"],
+        "encoder-decoder": ["bart", "pegasus", "t5"],
+    },
+    "audio": {
+        "encoder": ["wav2vec2", "hubert"],
+        "encoder-decoder": ["speech2text", "whisper"]
+    },
+    "multimodal": {
+        "encoder": ["visualbert", "vilt", "clip", "owl-vit"],
+        "encoder-decoder": ["trocr", "donut"]
+    },
+    "reinforcement": {
+        "decoder": ["trajectory transformer", "decision transformer"]
+    }
+})
+
+st.write("Let's look at the individual domains")
+
+domain_list = list(domains.keys())
+
+doms = st.radio("domains of ai".capitalize(), domain_list)
+
+st.write(domains[doms])
+
+st.write("Now come the Tokenizers, the entry points")
+
+tokenizer_algos = {
+    "byte_pair": {
+        "base": ['gpt', 'gpt-2 (byte-level)'],
+        "intro": "https://arxiv.org/abs/1508.07909"
+    },
+    "wordpiece": {
+        "base": ['bert', 'distilbert', 'electra'],
+        "intro": "https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf"
+    },
+    "unigram": {
+        "base": ['not_used'],
+        "intro": "https://arxiv.org/pdf/1804.10959.pdf"
+    },
+    "sentencepiece": {
+        "base": ["xlm", "albert", "xlnet", "marian", "t5"],
+        "intro": "https://arxiv.org/pdf/1808.06226.pdf"
+    }
+}
+
+tokenizer_items = list(tokenizer_algos.keys())
+
+algos = st.radio("tokenizer algos".capitalize(), tokenizer_items)
+
+st.write(tokenizer_algos[algos])
+
+st.write("""We will run 3 types of tokenizers on a single sentence
+to see how their outputs differ, encoding and then decoding the text.""")
+
+st.markdown("""### Models in Review:
+- gpt2
+- bert-base-uncased
+- xlnet-base-cased""")
+
+input_sentence = "This is a sample sentence for testing tokenizers"
+
+gpt2_model = "gpt2"
+bert_model = "bert-base-uncased"
+xlm_model = "xlnet-base-cased"
+
+gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model)
+bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
+xlm_tokenizer = XLNetTokenizer.from_pretrained(xlm_model)
+
+st.markdown("#### The input sentence is")
+st.write("The sample sentence: ", input_sentence)
+
+gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)
+bert_tokenize = bert_tokenizer.tokenize(input_sentence)
+xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)
+
+with st.expander(label="Byte-Pair Tokenizer", expanded=False):
+    st.write("gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)")
+    st.write(gpt2_tokenize)
+with st.expander(label="WordPiece Tokenizer", expanded=False):
+    st.write("bert_tokenize = bert_tokenizer.tokenize(input_sentence)")
+    st.write(bert_tokenize)
+with st.expander(label="SentencePiece Tokenizer", expanded=False):
+    st.write("xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)")
+    st.write(xlm_tokenize)
+
+st.markdown("""#### Tokenizer Options:
+The following Tokenizer parameters are the most used:
+- padding = 'longest' (True), 'max_length', 'do_not_pad' (False)
+- truncation = 'longest_first' (True), 'only_second', 'only_first',
+  'do_not_truncate' (False)
+- max_length = int (<= model_max_length)""")
+# Refer to https://huggingface.co/docs/transformers/pad_truncation
+gpt2_max_length = gpt2_tokenizer.model_max_length
+bert_max_length = bert_tokenizer.model_max_length
+xlm_max_length = "Not specified"
+
+st.markdown("""We also need the model max length, which is
+what the model is configured with.""")
+st.write("GPT: ", gpt2_max_length)
+st.write("Bert: ", bert_max_length)
+st.write("XLM: ", xlm_max_length)
+
+sent1 = "This app is talking about the variety of Tokenizers and their outputs"
+sent2 = """Tokenizers do one thing: bring out numbers from text. The better the numbers,
+the better the results"""
+
+st.write("We will be working with the following sentences.")
+st.write("Sentence1: ", sent1)
+st.write("Sentence2: ", sent2)
+
+st.markdown("#### Tokenization in Action, Using the GPT-2 Tokenizer")
+st.markdown("""##### Trial-1:
+> No parameters provided
+> Sentences are given with comma separation""")
+gpt2_encode = gpt2_tokenizer(sent1, sent2)
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-2:
+> No parameters provided
+> Sentences are made into a list""")
+gpt2_encode = gpt2_tokenizer([sent1, sent2])
+st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2])")
+st.write(gpt2_encode)
+
+# gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+gpt2_tokenizer.pad_token_id = gpt2_tokenizer.eos_token_id
+
+st.markdown("""##### Trial-3:
+> Need to add a pad token to the tokenizer if the model doesn't have one.
+> padding = True
+> Sentences are made into a list""")
+gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)
+st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)")
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-4:
+> Need to add a pad token to the tokenizer if the model doesn't have one.
+> padding = 'max_length' (requires max_length = int)
+> Sentences are made into a list""")
+gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             padding='max_length',
+                             max_length=15)
+st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             padding='max_length',
+                             max_length=15)""")
+
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-5:
+> truncation = True (requires max_length = int)
+> Sentences are separated by a comma
+Will see a total output of 12 tokens, 6 per sentence""")
+
+gpt2_encode = gpt2_tokenizer(sent1, sent2,
+                             truncation=True,
+                             max_length=12)
+st.write("""gpt2_encode = gpt2_tokenizer(sent1, sent2,
+                             truncation=True,
+                             max_length=12)""")
+
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-6:
+> truncation = True (requires max_length = int)
+> Sentences are made into a list
+Each sentence is truncated to 12 tokens ('longest_first' acts per sequence here)""")
+
+gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation=True,
+                             max_length=12)
+st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation=True,
+                             max_length=12)""")
+
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-7:
+> truncation = 'only_first'
+> Sentences are made into a list
+Each sentence is truncated to 8 tokens""")
+
+gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation='only_first',
+                             max_length=8)
+st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation='only_first',
+                             max_length=8)""")
+
+st.write(gpt2_encode)
+
+st.markdown("""##### Trial-8:
+> truncation = False ('only_second' errors out here)
+> Sentences are made into a list
+No truncation, so two full input_ids lists""")
+
+gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation=False,
+                             max_length=7)
+st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
+                             truncation=False,
+                             max_length=7)""")
+
+st.write(gpt2_encode)
+
+curr_dir = pathlib.Path(__file__).parent.resolve()
+file_loc = curr_dir / "task_arch.json"
+file_loc = file_loc.resolve()
+
+with open(file_loc, 'r') as arch:
+    data = json.load(arch)
+
+tasks = list(data.keys())
+st.markdown("#### Let's dive into the model architectures...")
+
+task = st.radio("The NLP tasks", tasks)
+
+task_data = data[task]
+
+num_models = len(task_data['architectures'])
+
+show_archs = st.slider("How many archs to show",
+                       min_value=4, max_value=num_models)
+
+pruned_data = {
+    "architectures": task_data['architectures'][:show_archs],
+    "AutoModelClass": task_data["AutoModelClass"],
+    "dataset": task_data["dataset"],
+    "model_used": task_data["model_used"]
+}
+
+st.write(pruned_data)
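
The tokenizer walkthrough above promises an encode-then-decode round trip but the app only ever calls tokenize(). A minimal sketch of that round trip, assuming the same three checkpoints the app loads:

from transformers import AutoTokenizer

sentence = "This is a sample sentence for testing tokenizers"

for checkpoint in ("gpt2", "bert-base-uncased", "xlnet-base-cased"):
    tok = AutoTokenizer.from_pretrained(checkpoint)
    ids = tok.encode(sentence)                        # text -> token ids
    text = tok.decode(ids, skip_special_tokens=True)  # ids -> text
    print(checkpoint, "ids:", ids)
    print(checkpoint, "round trip:", repr(text))

skip_special_tokens drops markers like BERT's [CLS]/[SEP], so the decoded string comes back close to the original input.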
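The length claims in Trials 4, 5, and 8 can be checked by reading the lengths back from the returned input_ids. A sketch under the same assumptions as the diff (the gpt2 checkpoint; sent2 shortened here for brevity):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # GPT-2 ships without a pad token

sent1 = "This app is talking about the variety of Tokenizers and their outputs"
sent2 = "Tokenizers do one thing: bring out numbers from text."

# Trial-4: padding='max_length' pads each sequence up to max_length
enc = tok([sent1, sent2], padding='max_length', max_length=15)
print([len(ids) for ids in enc['input_ids']])  # [15, 15] if neither exceeds 15

# Trial-5: truncation=True on a sentence *pair* trims the pair to 12 tokens total
enc = tok(sent1, sent2, truncation=True, max_length=12)
print(len(enc['input_ids']))  # 12

# Trial-8: truncation=False ignores max_length; sequences keep their full length
enc = tok([sent1, sent2], truncation=False, max_length=7)
print([len(ids) for ids in enc['input_ids']])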
task_arch.json ADDED
@@ -0,0 +1 @@
+{"text_classification": {"architectures": ["ALBERT", "BART", "BERT", "BigBird", "BigBird-Pegasus", "BioGpt", "BLOOM", "CamemBERT", "CANINE", "CodeLlama", "ConvBERT", "CTRL", "Data2VecText", "DeBERTa", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ErnieM", "ESM", "Falcon", "FlauBERT", "FNet", "Funnel Transformer", "GPT-Sw3", "OpenAI GPT-2", "GPTBigCode", "GPT Neo", "GPT NeoX", "GPT-J", "I-BERT", "LayoutLM", "LayoutLMv2", "LayoutLMv3", "LED", "LiLT", "LLaMA", "Longformer", "LUKE", "MarkupLM", "mBART", "MEGA", "Megatron-BERT", "Mistral", "Mixtral", "MobileBERT", "MPNet", "MPT", "MRA", "MT5", "MVP", "Nezha", "Nystr\u00f6mformer", "OpenLlama", "OpenAI GPT", "OPT", "Perceiver", "Persimmon", "Phi", "PLBart", "QDQBert", "Qwen2", "Reformer", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "SqueezeBERT", "T5", "TAPAS", "Transformer-XL", "UMT5", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForSequenceClassification", "dataset": "imdb", "model_used": "distilbert-base-uncased"}, "token_classification": {"architectures": ["ALBERT", "BERT", "BigBird", "BioGpt", "BLOOM", "BROS", "CamemBERT", "CANINE", "ConvBERT", "Data2VecText", "DeBERTa", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ErnieM", "ESM", "Falcon", "FlauBERT", "FNet", "Funnel Transformer", "GPT-Sw3", "OpenAI GPT-2", "GPTBigCode", "GPT Neo", "GPT NeoX", "I-BERT", "LayoutLM", "LayoutLMv2", "LayoutLMv3", "LiLT", "Longformer", "LUKE", "MarkupLM", "MEGA", "Megatron-BERT", "MobileBERT", "MPNet", "MPT", "MRA", "MT5", "Nezha", "Nystr\u00f6mformer", "Phi", "QDQBert", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "SqueezeBERT", "T5", "UMT5", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForTokenClassification", "dataset": "wnut_17", "model_used": "distilbert-base-uncased"}, "question_answering": {"architectures": ["ALBERT", "BART", "BERT", "BigBird", "BigBird-Pegasus", "BLOOM", "CamemBERT", "CANINE", "ConvBERT", "Data2VecText", "DeBERTa", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ErnieM", "Falcon", "FlauBERT", "FNet", "Funnel Transformer", "OpenAI GPT-2", "GPT Neo", "GPT NeoX", "GPT-J", "I-BERT", "LayoutLMv2", "LayoutLMv3", "LED", "LiLT", "LLaMA", "Longformer", "LUKE", "LXMERT", "MarkupLM", "mBART", "MEGA", "Megatron-BERT", "MobileBERT", "MPNet", "MPT", "MRA", "MT5", "MVP", "Nezha", "Nystr\u00f6mformer", "OPT", "QDQBert", "Reformer", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "Splinter", "SqueezeBERT", "T5", "UMT5", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForQuestionAnswering", "dataset": "squad", "model_used": "distilbert-base-uncased"}, "causal_lm": {"architectures": ["BART", "BERT", "Bert Generation", "BigBird", "BigBird-Pegasus", "BioGpt", "Blenderbot", "BlenderbotSmall", "BLOOM", "CamemBERT", "CodeLlama", "CodeGen", "CPM-Ant", "CTRL", "Data2VecText", "ELECTRA", "ERNIE", "Falcon", "Fuyu", "GIT", "GPT-Sw3", "OpenAI GPT-2", "GPTBigCode", "GPT Neo", "GPT NeoX", "GPT NeoX Japanese", "GPT-J", "LLaMA", "Marian", "mBART", "MEGA", "Megatron-BERT", "Mistral", "Mixtral", "MPT", "MusicGen", "MVP", "OpenLlama", "OpenAI GPT", "OPT", "Pegasus", "Persimmon", "Phi", "PLBart", "ProphetNet", "QDQBert", "Qwen2", "Reformer", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "RWKV", "Speech2Text2", "Transformer-XL", "TrOCR", "Whisper", "XGLM", "XLM", "XLM-ProphetNet", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD"], "AutoModelClass": "AutoModelForCausalLM", "dataset": "eli5_category", "model_used": "distilgpt2"}, "masked_lm": {"architectures": ["ALBERT", "BART", "BERT", "BigBird", "CamemBERT", "ConvBERT", "Data2VecText", "DeBERTa", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ESM", "FlauBERT", "FNet", "Funnel Transformer", "I-BERT", "LayoutLM", "Longformer", "LUKE", "mBART", "MEGA", "Megatron-BERT", "MobileBERT", "MPNet", "MRA", "MVP", "Nezha", "Nystr\u00f6mformer", "Perceiver", "QDQBert", "Reformer", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "SqueezeBERT", "TAPAS", "Wav2Vec2", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForMaskedLM", "dataset": "eli-5", "model_used": "distilroberta-base"}, "translation": {"architectures": ["BART", "BigBird-Pegasus", "Blenderbot", "BlenderbotSmall", "Encoder decoder", "FairSeq Machine-Translation", "GPTSAN-japanese", "LED", "LongT5", "M2M100", "Marian", "mBART", "MT5", "MVP", "NLLB", "NLLB-MOE", "Pegasus", "PEGASUS-X", "PLBart", "ProphetNet", "SeamlessM4T", "SeamlessM4Tv2", "SwitchTransformers", "T5", "UMT5", "XLM-ProphetNet"], "AutoModelClass": "AutoModelForSeq2SeqLM", "dataset": "opus_books", "model_used": "t5-small"}, "summarization": {"architectures": ["BART", "BigBird-Pegasus", "Blenderbot", "BlenderbotSmall", "Encoder decoder", "FairSeq Machine-Translation", "GPTSAN-japanese", "LED", "LongT5", "M2M100", "Marian", "mBART", "MT5", "MVP", "NLLB", "NLLB-MOE", "Pegasus", "PEGASUS-X", "PLBart", "ProphetNet", "SeamlessM4T", "SeamlessM4Tv2", "SwitchTransformers", "T5", "UMT5", "XLM-ProphetNet"], "AutoModelClass": "AutoModelForSeq2SeqLM", "dataset": "billsum", "model_used": "t5-small"}, "multiple_choice": {"architectures": ["ALBERT", "BERT", "BigBird", "CamemBERT", "CANINE", "ConvBERT", "Data2VecText", "DeBERTa-v2", "DistilBERT", "ELECTRA", "ERNIE", "ErnieM", "FlauBERT", "FNet", "Funnel Transformer", "I-BERT", "Longformer", "LUKE", "MEGA", "Megatron-BERT", "MobileBERT", "MPNet", "MRA", "Nezha", "Nystr\u00f6mformer", "QDQBert", "RemBERT", "RoBERTa", "RoBERTa-PreLayerNorm", "RoCBert", "RoFormer", "SqueezeBERT", "XLM", "XLM-RoBERTa", "XLM-RoBERTa-XL", "XLNet", "X-MOD", "YOSO"], "AutoModelClass": "AutoModelForMultipleChoice", "dataset": "swag", "model_used": "bert-base-uncased"}}