import streamlit as st
from transformers import (
    AutoTokenizer,
    XLNetTokenizer
)
import pathlib
import json

st.set_page_config(layout='wide')

st.title("Transformers Library for NLP Tasks: Structured by Topics")
st.write("Let's start with the architectures of models")

neural_net_models = {
    'encoder': "responsible for understanding the input text.",
    'decoder': "designed to generate new texts answering queries.",
    'encoder-decoder': "able to understand and generate text & have emergent behaviour",
    'convolution': "used for image recognition and processing.",
}
model_types = list(neural_net_models.keys())

archs = st.radio("model architectures".capitalize(), model_types)
st.write(f"{archs.capitalize()} models are {neural_net_models[archs]}")

domains = {
    "computer_vision": {
        "encoder": ['vit', 'swin', 'segformer', 'beit'],
        "decoder": ['imagegpt'],
        "encoder-decoder": ['detr'],
        "convolution": ['convnext']
    },
    "nlp": {
        "encoder": ["bert", "roberta", "albert", "distilbert",
                    "deberta", "longformer"],
        "decoder": ["gpt-2", "xlnet", "gpt-j", "opt", "bloom"],
        "encoder-decoder": ["bart", "pegasus", "t5"],
    },
    "audio": {
        "encoder": ["wav2vec2", "hubert"],
        "encoder-decoder": ["speech2text", "whisper"]
    },
    "multimodal": {
        "encoder": ["visualbert", "vilt", "clip", "owl-vit"],
        "encoder-decoder": ["trocr", "donut"]
    },
    "reinforcement": {
        "decoder": ["trajectory transformer", "decision transformer"]
    }
}

st.write("Let's look at the individual domains")
domain_list = list(domains.keys())
doms = st.radio("domains of ai".capitalize(), domain_list)
st.write(domains[doms])

st.write("Now come the Tokenizers, the entry points")

tokenizer_algos = {
    "byte_pair": {
        "base": ['gpt', 'gpt-2(byte_level)'],
        "intro": "https://arxiv.org/abs/1508.07909"
    },
    "wordpiece": {
        "base": ['bert', 'distilbert', 'electra'],
        "intro": "https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf"
    },
    "unigram": {
        "base": ['not_used'],
        "intro": "https://arxiv.org/pdf/1804.10959.pdf"
    },
    "sentencepiece": {
        "base": ["xlm", "albert", "xlnet", "marian", "t5"],
        "intro": "https://arxiv.org/pdf/1808.06226.pdf"
    }
}
tokenizer_items = list(tokenizer_algos.keys())

algos = st.radio("tokenizer algos".capitalize(), tokenizer_items)
st.write(tokenizer_algos[algos])

st.write("""We will work with 3 types of tokenizers on a single sentence
to see how their output differs, by encoding and decoding it as well.""")

st.markdown("""### Models in Review:
- gpt2
- bert-base-uncased
- xlnet-base-cased""")

input_sentence = "This is a sample sentence for testing tokenizers"

gpt2_model = "gpt2"
bert_model = "bert-base-uncased"
xlm_model = "xlnet-base-cased"

gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
xlm_tokenizer = XLNetTokenizer.from_pretrained(xlm_model)

st.markdown("#### The input sentence is")
st.write("The Sample Sentence: ", input_sentence)

gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)
bert_tokenize = bert_tokenizer.tokenize(input_sentence)
xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)

with st.expander(label="Byte Pair Tokenizer", expanded=False):
    st.write("gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)")
    st.write(gpt2_tokenize)

with st.expander(label="WordPiece Tokenizer", expanded=False):
    st.write("bert_tokenize = bert_tokenizer.tokenize(input_sentence)")
    st.write(bert_tokenize)

with st.expander(label="SentencePiece Tokenizer", expanded=False):
    st.write("xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)")
    st.write(xlm_tokenize)
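# --- Illustrative addition (not part of the original app): the intro above promises an
# encode/decode round trip, so this sketch shows one for the GPT-2 tokenizer. encode()
# and decode() are standard tokenizer methods; the expander mirrors the app's style.
gpt2_ids = gpt2_tokenizer.encode(input_sentence)
with st.expander(label="Byte Pair Encode/Decode Round Trip", expanded=False):
    st.write("gpt2_ids = gpt2_tokenizer.encode(input_sentence)")
    st.write(gpt2_ids)
    st.write("gpt2_tokenizer.decode(gpt2_ids)")
    st.write(gpt2_tokenizer.decode(gpt2_ids))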
st.markdown("""#### Tokenizer Options:
The following tokenizer parameters are the most used:
- padding = 'longest' (True), 'max_length', 'do_not_pad' (False)
- truncation = 'longest_first' (True), 'only_second', 'only_first', 'do_not_truncate' (False)
- max_length = <= model_max_length
""")
# Refer to https://huggingface.co/docs/transformers/pad_truncation

gpt2_max_length = gpt2_tokenizer.model_max_length
bert_max_length = bert_tokenizer.model_max_length
xlm_max_length = "Not specified"

st.markdown("""We also need the model max length, which is what the model is configured with.""")
st.write("GPT-2: ", gpt2_max_length)
st.write("BERT: ", bert_max_length)
st.write("XLNet: ", xlm_max_length)

sent1 = "This app is talking about the variety of Tokenizers and their outputs"
sent2 = """Tokenizers do one thing, bring out numbers from text.
The better the numbers, the better the results"""

st.write("We will be working with the following sentences.")
st.write("Sentence1: ", sent1)
st.write("Sentence2: ", sent2)

st.markdown("#### Tokenization in Action, Using the GPT-2 Tokenizer")

st.markdown("""##### Trial-1:
> No parameter provided
> Sentences are given with comma separation""")
gpt2_encode = gpt2_tokenizer(sent1, sent2)
st.write(gpt2_encode)

st.markdown("""##### Trial-2:
> No parameter provided
> Sentences are made into a list""")
gpt2_encode = gpt2_tokenizer([sent1, sent2])
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2])")
st.write(gpt2_encode)

# GPT-2 has no pad token by default, so reuse the EOS token for padding.
# gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
gpt2_tokenizer.pad_token_id = gpt2_tokenizer.eos_token_id

st.markdown("""##### Trial-3:
> Need to add a pad token to the tokenizer, if the model doesn't have one.
> padding = True
> Sentences are made into a list""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)")
st.write(gpt2_encode)
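# --- Illustrative addition (not part of the original app): a quick sketch that converts
# the padded ids from Trial-3 back to tokens, so the reused EOS token ('<|endoftext|>')
# is visible as padding and the attention_mask shows real vs. padded positions.
with st.expander(label="What padding=True produced", expanded=False):
    for ids, mask in zip(gpt2_encode["input_ids"], gpt2_encode["attention_mask"]):
        st.write(gpt2_tokenizer.convert_ids_to_tokens(ids))
        st.write("attention_mask: ", mask)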
st.markdown("""##### Trial-4:
> Need to add a pad token to the tokenizer, if the model doesn't have one.
> padding = 'max_length' (requires max_length = int)
> Sentences are made into a list""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], padding='max_length', max_length=15)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
                                padding='max_length', max_length=15)""")
st.write(gpt2_encode)

st.markdown("""##### Trial-5:
> truncation = True (requires max_length = int)
> Sentences are separated by a comma
Will see a total output of 12 tokens, 6 per sentence""")
gpt2_encode = gpt2_tokenizer(sent1, sent2, truncation=True, max_length=12)
st.write("""gpt2_encode = gpt2_tokenizer(sent1, sent2,
                                truncation=True, max_length=12)""")
st.write(gpt2_encode)

st.markdown("""##### Trial-6:
> truncation = True (requires max_length = int)
> Sentences are made into a list
Uses the longest_first strategy; each sentence is truncated to 12 tokens""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], truncation=True, max_length=12)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
                                truncation=True, max_length=12)""")
st.write(gpt2_encode)

st.markdown("""##### Trial-7:
> truncation = 'only_first'
> Sentences are made into a list
Each sentence will have only 8 tokens""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], truncation='only_first', max_length=8)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
                                truncation='only_first', max_length=8)""")
st.write(gpt2_encode)

st.markdown("""##### Trial-8:
> truncation = False ('only_second' errors out here)
> Sentences are made into a list
No truncation, two input_ids lists""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], truncation=False, max_length=7)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
                                truncation=False, max_length=7)""")
st.write(gpt2_encode)

curr_dir = pathlib.Path(__file__).parent.resolve()
file_loc = curr_dir / "task_arch.json"
file_loc = file_loc.resolve()

with open(file_loc, 'r') as arch:
    data = json.load(arch)

tasks = list(data.keys())

st.markdown("#### Let's dive into the model architectures...")
task = st.radio("The NLP tasks", tasks)
task_data = data[task]
num_models = len(task_data['architectures'])
show_archs = st.slider("How many archs to show", min_value=4, max_value=num_models)

pruned_data = {
    "architectures": task_data['architectures'][:show_archs],
    "AutoModelClass": task_data["AutoModelClass"],
    "dataset": task_data["dataset"],
    "model_used": task_data["model_used"]
}
st.write(pruned_data)
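# --- Illustrative addition (not part of the original app): assuming task_arch.json stores
# AutoModelClass as a plain string (e.g. "AutoModelForSequenceClassification"), the actual
# class can be resolved from the transformers namespace. The hasattr guard keeps the app
# from breaking if that assumption does not hold.
import transformers

auto_cls_name = task_data["AutoModelClass"]
if isinstance(auto_cls_name, str) and hasattr(transformers, auto_cls_name):
    st.write(f"{auto_cls_name} resolves to: ", getattr(transformers, auto_cls_name))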