import streamlit as st
from transformers import (
    AutoTokenizer,
    XLNetTokenizer
)
import pathlib
import json
st.set_page_config(layout='wide')
st.title("Transformers library For NLP Tasks : Structured by Topics")
st.write("lets start with the architectures of models")
neural_net_models = dict({
    'encoder': "responsible for understanding the input text.",
    'decoder': "designed to generate new text, e.g. for answering queries.",
    'encoder-decoder': "able to both understand and generate text, and show emergent behaviour.",
    'convolution': "used for image recognition and processing.",
})
model_types = list(neural_net_models.keys())
archs = st.radio("model architectures".capitalize(), model_types)
st.write(f"{archs.capitalize()} are {neural_net_models[archs]}")
domains = dict({
    "computer_vision": {
        "encoder": ['vit', 'swin', 'segformer', 'beit'],
        "decoder": ['imagegpt'],
        "encoder-decoder": ['detr'],
        "convolution": ['convnext']
    },
    "nlp": {
        "encoder": ["bert", "roberta", "albert", "distilbert",
                    "deberta", "longformer"],
        "decoder": ["gpt-2", "xlnet", "gpt-j", "opt", "bloom"],
        "encoder-decoder": ["bart", "pegasus", "t5"],
    },
    "audio": {
        "encoder": ["wav2vec2", "hubert"],
        "encoder-decoder": ["speech2text", "whisper"]
    },
    "multimodal": {
        "encoder": ["visualbert", "vilt", "clip", "owl-vit"],
        "encoder-decoder": ["trocr", "donut"]
    },
    "reinforcement": {
        "decoder": ["trajectory transformer", "decision transformer"]
    }
})
st.write("Lets look at the Individual domains")
domain_list = list(domains.keys())
doms = st.radio("domains of ai".capitalize(), domain_list)
st.write(domains[doms])
st.write("Now comes the Tokenizers, the Entry Points")
tokenizer_algos = {
    "byte_pair": {
        "base": ['gpt', 'gpt-2 (byte-level)'],
        "intro": "https://arxiv.org/abs/1508.07909"
    },
    "wordpiece": {
        "base": ['bert', 'distilbert', 'electra'],
        "intro": "https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf"
    },
    "unigram": {
        "base": ['not_used'],
        "intro": "https://arxiv.org/pdf/1804.10959.pdf"
    },
    "sentencepiece": {
        "base": ["xlm", "albert", "xlnet", "marian", "t5"],
        "intro": "https://arxiv.org/pdf/1808.06226.pdf"
    }
}
tokenizer_items = list(tokenizer_algos.keys())
algos = st.radio("tokenizer algos".capitalize(), tokenizer_items)
st.write(tokenizer_algos[algos])
st.write("""We will work on 3 types of tokenizers on a single sentence
to see how their output differs, by first encoding and decoding them too.""")
st.markdown("""### Models in Review:
- gpt2
- bert-base-uncased
- xlm""")
input_sentence = "This is a sample sentence for testing tokenizers"
gpt2_model = "gpt2"
bert_model = "bert-base-uncased"
xlm_model = "xlnet-base-cased"
gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
xlm_tokenizer = XLNetTokenizer.from_pretrained(xlm_model)
st.markdown("#### The input sentence is")
st.write("The Sample Sentence: ", input_sentence)
gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)
bert_tokenize = bert_tokenizer.tokenize(input_sentence)
xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)
with st.expander(label="Byte Pair Tokenizer", expanded=False):
st.write("gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)")
st.write(gpt2_tokenize)
with st.expander(label="Word Piece Tokenizer", expanded=False):
st.write("bert_tokenize = bert_tokenizer.tokenize(input_sentence)")
st.write(bert_tokenize)
with st.expander(label="SentencePiece Tokenizer", expanded=False):
st.write("xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)")
st.write(xlm_tokenize)
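# The text above promises encoding and decoding as well as plain tokenizing.
# A minimal sketch of that round trip, assuming the tokenizers defined above:
# encode() maps text to token ids, decode() maps the ids back to a string.
with st.expander(label="Encode / Decode Round Trip (GPT-2)", expanded=False):
    gpt2_ids = gpt2_tokenizer.encode(input_sentence)
    st.write("gpt2_ids = gpt2_tokenizer.encode(input_sentence)")
    st.write(gpt2_ids)
    st.write("gpt2_tokenizer.decode(gpt2_ids)")
    st.write(gpt2_tokenizer.decode(gpt2_ids))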
st.markdown("""#### Tokenizer Options:
There are following parameters in Tokenizer object are most used
- padding = 'longest'(True), 'max_length', 'do_not_pad'(False)
- truncation = 'longest_first'(True), 'only_second', 'only_first',
'do_not_truncate'(False)
- max_length = <= model_max_length """)
## Refer to https://huggingface.co/docs/transformers/pad_truncation
gpt2_max_length = gpt2_tokenizer.model_max_length
bert_max_length = bert_tokenizer.model_max_length
xlm_max_length = "Not Speced"
st.markdown("""We also need the model max length, which is the
what the model is configured with.""")
st.write("GPT: ", gpt2_max_length)
st.write("Bert: ", bert_max_length)
st.write("XLM: ", xlm_max_length)
sent1 = "This app is talking about the variety of Tokenizers and their outputs"
sent2 = """Tokenizers do one thing, bring out numbers from text. The better numbers far better
the results"""
st.write("We will be working with the following sentences.")
st.write("Sentence1: ", sent1)
st.write("Sentence2: ", sent2)
st.markdown("#### Tokenization in Action. Using GPT Tokenizer")
st.markdown("""##### Trial-1:
> No parameter provided
> Sentences are given with comma seperation""")
gpt2_encode = gpt2_tokenizer(sent1, sent2)
st.write(gpt2_encode)
st.markdown("""##### Trial-2:
> No parameter provided
> Sentences are made into a List""")
gpt2_encode = gpt2_tokenizer([sent1, sent2])
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2])")
st.write(gpt2_encode)
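# GPT-2 ships without a pad token, so the padding used below would fail unless one
# is defined; here the EOS token is reused as padding (the commented-out line below
# shows the alternative of adding a dedicated [PAD] token instead).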
# gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
gpt2_tokenizer.pad_token_id = gpt2_tokenizer.eos_token_id
st.markdown("""##### Trial-3:
> Need to add pad token to tokenizer, if the model doesn't have.
> padding = True
> Sentences are made into a List""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)")
st.write(gpt2_encode)
st.markdown("""##### Trial-4:
> Need to add pad token to tokenizer, if the model doesn't have.
> padding = max_length (requires max_length = int)
> Sentences are made into a List""")
gpt2_encode = gpt2_tokenizer([sent1, sent2],
padding=True,
max_length=15)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
padding=True,
max_length=15""")
st.write(gpt2_encode)
st.markdown("""##### Trial-5:
> truncate = True (requires max_length = int)
> Sentences are seperated by a comma
Will see total output of 12 token, 6 per sentence""")
gpt2_encode = gpt2_tokenizer(sent1, sent2,
truncation=True,
max_length=12)
st.write("""gpt2_encode = gpt2_tokenizer(sent1, sent2,
truncation=True,
max_length=12)""")
st.write(gpt2_encode)
st.markdown("""##### Trial-6:
> truncate = True (requires max_length = int)
> Sentences are made into a list
Will have longest first""")
gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation=True,
max_length=12)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation=True,
max_length=12)""")
st.write(gpt2_encode)
st.markdown("""##### Trial-7:
> truncate = only_first
> Sentences are made into a list
Will have only 8 tokens """)
gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation='only_first',
max_length=8)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation='only_first',
max_length=8)""")
st.write(gpt2_encode)
st.markdown("""##### Trial-8:
> truncate = False (only_second, is erroring out)
> Sentences are made into a list
No Truncation, 2 ids list""")
gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation=False,
max_length=7)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation=False,
max_length=7)""")
st.write(gpt2_encode)
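# A minimal sketch (illustrative addition): decoding ids back to text closes the
# text -> ids -> text loop; on a padded or truncated batch, the pad tokens and any
# cut-off words would show up directly in the decoded strings.
with st.expander(label="Decode the last batch back to text", expanded=False):
    for ids in gpt2_encode["input_ids"]:
        st.write(gpt2_tokenizer.decode(ids))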
curr_dir = pathlib.Path(__file__).parent.resolve()
file_loc = curr_dir / "task_arch.json"
file_loc = file_loc.resolve()
with open(file_loc, 'r') as arch:
    data = json.load(arch)
tasks = list(data.keys())
st.markdown("#### Lets dive into the model architectures...")
task = st.radio("The NLP tasks", tasks)
task_data = data[task]
num_models = len(task_data['architectures'])
show_archs = st.slider("How many archs to Show",
                       min_value=4, max_value=num_models)
pruned_data = {
    "architectures": task_data['architectures'][:show_archs],
    "AutoModelClass": task_data["AutoModelClass"],
    "dataset": task_data["dataset"],
    "model_used": task_data["model_used"]
}
st.write(pruned_data)
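# A hedged sketch (kept as a comment, since the contents of task_arch.json are not
# shown here): if "AutoModelClass" holds a class name such as
# "AutoModelForSequenceClassification" and "model_used" holds a checkpoint id,
# the model for the selected task could be loaded generically with:
#
#     import transformers
#     auto_cls = getattr(transformers, task_data["AutoModelClass"])
#     model = auto_cls.from_pretrained(task_data["model_used"])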