File size: 980 Bytes
bee3802
 
 
 
fcbfd45
bee3802
 
 
 
 
 
 
ab010ed
bee3802
 
 
c21432d
 
 
bee3802
 
ab010ed
bee3802
 
fcbfd45
bee3802
fcbfd45
bee3802
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from gradio.inputs import Textbox
from gradio.outputs import Textbox, Table
from gradio import Interface
import aranizer

# Load your tokenizers
tokenizers = {
    "aranizer_bpe50k": aranizer.aranizer_bpe50k.get_tokenizer(),
    "aranizer_bpe64k": aranizer.aranizer_bpe64k.get_tokenizer(),
    "aranizer_sp32k": aranizer.aranizer_sp32k.get_tokenizer(),
    # Add more tokenizers as needed
}

def compare_tokenizers(text):
    results = []
    for name, tokenizer in tokenizers.items():
        tokens = tokenizer.tokenize(text)
        encoded_output = tokenizer.encode(text, add_special_tokens=True)
        decoded_text = tokenizer.decode(encoded_output)
        results.append((name, tokens, encoded_output, decoded_text))
    return results

inputs = Textbox(label="Enter Arabic text")
outputs = Table(label="Results", columns=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"])

iface = Interface(fn=compare_tokenizers, inputs=inputs, outputs=outputs)

iface.launch()