import streamlit as st
from transformers import (
    AutoTokenizer,
    XLNetTokenizer
)
import pathlib
import json
st.set_page_config(layout='wide')
st.title("Transformers library For NLP Tasks : Structured by Topics")
st.write("lets start with the architectures of models")
neural_net_models = dict({
    'encoder': "responsible for understanding the input text.",
    'decoder': "designed to generate new text, e.g. for answering queries.",
    'encoder-decoder': "able to both understand and generate text, and show emergent behaviour.",
    'convolution': "used for image recognition and processing.",
})
model_types = list(neural_net_models.keys())
archs = st.radio("model architectures".capitalize(), model_types)
st.write(f"{archs.capitalize()} are {neural_net_models[archs]}")
domains = dict({
    "computer_vision": {
        "encoder": ['vit', 'swin', 'segformer', 'beit'],
        "decoder": ['imagegpt'],
        "encoder-decoder": ['detr'],
        "convolution": ['convnext']
    },
    "nlp": {
        "encoder": ["bert", "roberta", "albert", "distilbert",
                    "deberta", "longformer"],
        "decoder": ["gpt-2", "xlnet", "gpt-j", "opt", "bloom"],
        "encoder-decoder": ["bart", "pegasus", "t5"],
    },
    "audio": {
        "encoder": ["wav2vec2", "hubert"],
        "encoder-decoder": ["speech2text", "whisper"]
    },
    "multimodal": {
        "encoder": ["visualbert", "vilt", "clip", "owl-vit"],
        "encoder-decoder": ["trocr", "donut"]
    },
    "reinforcement": {
        "decoder": ["trajectory transformer", "decision transformer"]
    }
})
st.write("Lets look at the Individual domains")
domain_list = list(domains.keys())
doms = st.radio("domains of ai".capitalize(), domain_list)
st.write(domains[doms])
st.write("Now comes the Tokenizers, the Entry Points")
tokenizer_algos = {
    "byte_pair": {
        "base": ['gpt', 'gpt-2 (byte-level)'],
        "intro": "https://arxiv.org/abs/1508.07909"
    },
    "wordpiece": {
        "base": ['bert', 'distilbert', 'electra'],
        "intro": "https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf"
    },
    "unigram": {
        "base": ['not_used'],
        "intro": "https://arxiv.org/pdf/1804.10959.pdf"
    },
    "sentencepiece": {
        "base": ["xlm", "albert", "xlnet", "marian", "t5"],
        "intro": "https://arxiv.org/pdf/1808.06226.pdf"
    }
}
tokenizer_items = list(tokenizer_algos.keys())
algos = st.radio("tokenizer algos".capitalize(), tokenizer_items)
st.write(tokenizer_algos[algos])
st.write("""We will work on 3 types of tokenizers on a single sentence
to see how their output differs, by first encoding and decoding them too.""")
st.markdown("""### Models in Review:
- gpt2
- bert-base-uncased
- xlm""")
input_sentence = "This is a sample sentence for testing tokenizers"
gpt2_model = "gpt2"
bert_model = "bert-base-uncased"
xlm_model = "xlnet-base-cased"
gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
xlm_tokenizer = XLNetTokenizer.from_pretrained(xlm_model)
st.markdown("#### The input sentence is")
st.write("The Sample Sentence: ", input_sentence)
gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)
bert_tokenize = bert_tokenizer.tokenize(input_sentence)
xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)
with st.expander(label="Byte Pair Tokenizer", expanded=False):
st.write("gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)")
st.write(gpt2_tokenize)
with st.expander(label="Word Piece Tokenizer", expanded=False):
st.write("bert_tokenize = bert_tokenizer.tokenize(input_sentence)")
st.write(bert_tokenize)
with st.expander(label="SentencePiece Tokenizer", expanded=False):
st.write("xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)")
st.write(xlm_tokenize)
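# The text above promises encoding and decoding as well as plain tokenizing.
# A minimal sketch of that round trip, assuming the tokenizers defined above:
# encode() maps text to token ids, decode() maps the ids back to a string.
with st.expander(label="Encode / Decode Round Trip (GPT-2)", expanded=False):
    gpt2_ids = gpt2_tokenizer.encode(input_sentence)
    st.write("gpt2_ids = gpt2_tokenizer.encode(input_sentence)")
    st.write(gpt2_ids)
    st.write("gpt2_tokenizer.decode(gpt2_ids)")
    st.write(gpt2_tokenizer.decode(gpt2_ids))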
st.markdown("""#### Tokenizer Options:
There are following parameters in Tokenizer object are most used
- padding = 'longest'(True), 'max_length', 'do_not_pad'(False)
- truncation = 'longest_first'(True), 'only_second', 'only_first',
'do_not_truncate'(False)
- max_length = <= model_max_length """)
## Refer to https://huggingface.co/docs/transformers/pad_truncation
gpt2_max_length = gpt2_tokenizer.model_max_length
bert_max_length = bert_tokenizer.model_max_length
xlm_max_length = "Not Speced"
st.markdown("""We also need the model max length, which is the
what the model is configured with.""")
st.write("GPT: ", gpt2_max_length)
st.write("Bert: ", bert_max_length)
st.write("XLM: ", xlm_max_length)
sent1 = "This app is talking about the variety of Tokenizers and their outputs"
sent2 = """Tokenizers do one thing, bring out numbers from text. The better numbers far better
the results"""
st.write("We will be working with the following sentences.")
st.write("Sentence1: ", sent1)
st.write("Sentence2: ", sent2)
st.markdown("#### Tokenization in Action. Using GPT Tokenizer")
st.markdown("""##### Trial-1:
> No parameter provided
> Sentences are given with comma seperation""")
gpt2_encode = gpt2_tokenizer(sent1, sent2)
st.write(gpt2_encode)
st.markdown("""##### Trial-2:
> No parameter provided
> Sentences are made into a List""")
gpt2_encode = gpt2_tokenizer([sent1, sent2])
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2])")
st.write(gpt2_encode)
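# GPT-2 ships without a pad token, so the padding used below would fail unless one
# is defined; here the EOS token is reused as padding (the commented-out line below
# shows the alternative of adding a dedicated [PAD] token instead).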
# gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
gpt2_tokenizer.pad_token_id = gpt2_tokenizer.eos_token_id
st.markdown("""##### Trial-3:
> Need to add pad token to tokenizer, if the model doesn't have.
> padding = True
> Sentences are made into a List""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)")
st.write(gpt2_encode)
st.markdown("""##### Trial-4:
> Need to add pad token to tokenizer, if the model doesn't have.
> padding = max_length (requires max_length = int)
> Sentences are made into a List""")
gpt2_encode = gpt2_tokenizer([sent1, sent2],
padding=True,
max_length=15)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
padding=True,
max_length=15""")
st.write(gpt2_encode)
st.markdown("""##### Trial-5:
> truncate = True (requires max_length = int)
> Sentences are seperated by a comma
Will see total output of 12 token, 6 per sentence""")
gpt2_encode = gpt2_tokenizer(sent1, sent2,
truncation=True,
max_length=12)
st.write("""gpt2_encode = gpt2_tokenizer(sent1, sent2,
truncation=True,
max_length=12)""")
st.write(gpt2_encode)
st.markdown("""##### Trial-6:
> truncate = True (requires max_length = int)
> Sentences are made into a list
Will have longest first""")
gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation=True,
max_length=12)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation=True,
max_length=12)""")
st.write(gpt2_encode)
st.markdown("""##### Trial-7:
> truncate = only_first
> Sentences are made into a list
Will have only 8 tokens """)
gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation='only_first',
max_length=8)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation='only_first',
max_length=8)""")
st.write(gpt2_encode)
st.markdown("""##### Trial-8:
> truncate = False (only_second, is erroring out)
> Sentences are made into a list
No Truncation, 2 ids list""")
gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation=False,
max_length=7)
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
truncation=False,
max_length=7)""")
st.write(gpt2_encode)
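# A minimal sketch (illustrative addition): decoding ids back to text closes the
# text -> ids -> text loop; on a padded or truncated batch, the pad tokens and any
# cut-off words would show up directly in the decoded strings.
with st.expander(label="Decode the last batch back to text", expanded=False):
    for ids in gpt2_encode["input_ids"]:
        st.write(gpt2_tokenizer.decode(ids))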
curr_dir = pathlib.Path(__file__).parent.resolve()
file_loc = curr_dir / "task_arch.json"
file_loc = file_loc.resolve()
with open(file_loc, 'r') as arch:
    data = json.load(arch)
tasks = list(data.keys())
st.markdown("#### Lets dive into the model architectures...")
task = st.radio("The NLP tasks", tasks)
task_data = data[task]
num_models = len(task_data['architectures'])
show_archs = st.slider("How many archs to Show",
                       min_value=4, max_value=num_models)
pruned_data = {
    "architectures": task_data['architectures'][:show_archs],
    "AutoModelClass": task_data["AutoModelClass"],
    "dataset": task_data["dataset"],
    "model_used": task_data["model_used"]
}
st.write(pruned_data)
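# A hedged sketch (kept as a comment, since the contents of task_arch.json are not
# shown here): if "AutoModelClass" holds a class name such as
# "AutoModelForSequenceClassification" and "model_used" holds a checkpoint id,
# the model for the selected task could be loaded generically with:
#
#     import transformers
#     auto_cls = getattr(transformers, task_data["AutoModelClass"])
#     model = auto_cls.from_pretrained(task_data["model_used"])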