import gradio as gr
from transformers import AutoTokenizer


def tokenize_text(text, tokenizer_name):
    """Tokenize `text` with the selected tokenizer and report each step of the round trip."""
    # Load the tokenizer for the chosen checkpoint (downloaded from the Hub on first use).
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Split the raw text into subword tokens.
    tokenized_text = tokenizer.tokenize(text)
    # Map each token to its integer ID in the tokenizer's vocabulary.
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Decode the IDs back to a string to verify the round trip.
    decoded_text = tokenizer.decode(input_ids)
    return f"Tokenized Text: {tokenized_text}\nInput IDs: {input_ids}\nDecoded Text: {decoded_text}"
# Hugging Face checkpoints whose tokenizers can be selected in the demo.
tokenizer_names = [
    "riotu-lab/ArabianGPT-01B",
    "riotu-lab/ArabianGPT-03B",
    "riotu-lab/ArabianGPT-08B",
    "FreedomIntelligence/AceGPT-13B",
    "FreedomIntelligence/AceGPT-7B",
    "inception-mbzuai/jais-13b",
    "aubmindlab/aragpt2-base",
    "aubmindlab/aragpt2-medium",
    "aubmindlab/aragpt2-large",
    "aubmindlab/aragpt2-mega",
]
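# Example usage (requires network access to the Hub on first load); the sample
# Arabic text here is illustrative, not from the original script:
#
#     tokenize_text("مرحبا بالعالم", "aubmindlab/aragpt2-base")
#
# returns a single string listing the tokens, their input IDs, and the decoded text.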
# Wire the tokenizer function to a simple web UI: a text box and a dropdown of
# tokenizers in, a plain-text report out.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer"),
    ],
    outputs="text",
    title="Kalemat: Explore Arabic Tokenizers",
    description=(
        "Experiment with different Arabic tokenizers and see how they break text "
        "into subword units. Select a tokenizer to inspect the resulting tokens, "
        "their input IDs, and the decoded round-trip text."
    ),
)
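# launch() starts a local web server for the demo; passing share=True would
# additionally create a temporary public Gradio link.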
iface.launch()