"""Gradio demo comparing the output of all available AraNizer tokenizers."""

import gradio as gr
import aranizer


def load_tokenizers():
    """Instantiate every known AraNizer tokenizer.

    Returns:
        dict mapping method name (e.g. ``"aranizer_bpe32k"``) to a tokenizer
        instance. Tokenizers that fail to load are skipped with a warning
        printed to stdout (best-effort loading).
    """
    tokenizer_methods = [
        "aranizer_bpe32k",
        "aranizer_bpe50k",
        "aranizer_bpe64k",
        "aranizer_bpe86k",
        "aranizer_sp32k",
        "aranizer_sp50k",
        "aranizer_sp64k",
        "aranizer_sp86k",
    ]
    tokenizers = {}
    for method in tokenizer_methods:
        # "aranizer_bpe32k" -> submodule "aranizer_bpe", factory "get_tokenizer_32k"
        module, function = method.rsplit("_", 1)
        try:
            tokenizer = getattr(getattr(aranizer, module), f"get_tokenizer_{function}")()
            tokenizers[method] = tokenizer
        except AttributeError as e:
            # Skip tokenizers missing from the installed aranizer build
            # instead of failing the whole app at startup.
            print(f"Error loading tokenizer {method}: {e}")
    return tokenizers


# Load all available tokenizers once at startup.
tokenizers = load_tokenizers()


def compare_tokenizers(text):
    """Tokenize, encode, and round-trip *text* with every loaded tokenizer.

    Args:
        text: Arabic input string entered in the UI.

    Returns:
        List of ``(name, tokens, encoded_output, decoded_text)`` rows, one per
        tokenizer; on failure the tokens column carries the error message and
        the remaining columns are empty strings.
    """
    results = []
    for name, tokenizer in tokenizers.items():
        try:
            tokens = tokenizer.tokenize(text)
            encoded_output = tokenizer.encode(text, add_special_tokens=True)
            decoded_text = tokenizer.decode(encoded_output)
            results.append((name, tokens, encoded_output, decoded_text))
        except Exception as e:  # surface per-tokenizer failures as a table row
            results.append((name, f"Error: {e}", "", ""))
    return results


# FIX: gr.inputs.* / gr.outputs.* were deprecated in Gradio 3 and removed in
# Gradio 4 — use the top-level components (Dataframe replaces outputs.Table).
inputs_component = gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
outputs_component = gr.Dataframe(
    label="Results",
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
)

iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Tokenizer Comparison Tool",
)

# Guard the launch so importing this module doesn't start a server.
if __name__ == "__main__":
    iface.launch()