HeshamHaroon committed
Commit bee3802
1 Parent(s): ab010ed

Update app.py

Files changed (1)
  1. app.py +20 -43
app.py CHANGED
@@ -1,51 +1,28 @@
-import gradio as gr
-from random import random
-from aranizer import aranizer_bpe32k, aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
-
-def load_tokenizer(tokenizer_choice):
-    # Dictionary mapping tokenizer choice to actual tokenizer initializer
-    tokenizer_map = {
-        "aranizer_bpe32k": aranizer_bpe32k.get_tokenizer(),
-        "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer(),
-        "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer(),
-        "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer(),
-        "aranizer_sp32k": aranizer_sp32k.get_tokenizer(),
-        "aranizer_sp50k": aranizer_sp50k.get_tokenizer(),
-        "aranizer_sp64k": aranizer_sp64k.get_tokenizer(),
-        "aranizer_sp86k": aranizer_sp86k.get_tokenizer(),
-    }
-
-    return tokenizer_map.get(tokenizer_choice, None)
-
-def tokenize_and_encode_and_embed(text, tokenizer_choice):
-    tokenizer = load_tokenizer(tokenizer_choice)  # Use load_tokenizer here
-    if tokenizer:
-        # Example methods. Replace with actual methods from your tokenizer
         tokens = tokenizer.tokenize(text)
         encoded_output = tokenizer.encode(text, add_special_tokens=True)
         decoded_text = tokenizer.decode(encoded_output)
-
-        # Example embedding (replace with actual embedding generation from your model)
-        embeddings = [random() for _ in range(10)]  # Example 10-dimensional embedding vector
-
-        return " ".join(tokens), str(encoded_output), decoded_text, embeddings
-    else:
-        return "Tokenizer not loaded correctly", "", "", []
-
-demo = gr.Interface(
-    fn=tokenize_and_encode_and_embed,
-    inputs=[
-        gr.Textbox(lines=5, label="النص العربي"),
-        gr.Dropdown(choices=["aranizer_bpe32k", "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k", "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k"], label="اختر المحلل اللفظي")
-    ],
-    outputs=[
-        gr.Text(label="Tokens"),
-        gr.Text(label="Encoded Output"),
-        gr.Text(label="Decoded Text"),
-        gr.Text(label="Embeddings (Example Vector)")
-    ],
-    title="مقارنة المحللات اللفظية وعمليات التضمين للنص العربي",
-    description="حدد نوع المحلل اللفظي وأدخل نصًا لرؤية النتائج ومتجه التضمين.",
-)
-
-demo.launch()
+import gradio as gr
+import aranizer
+
+# Load your tokenizers
+tokenizers = {
+    "aranizer_bpe50k": aranizer.aranizer_bpe50k.get_tokenizer(),
+    "aranizer_bpe64k": aranizer.aranizer_bpe64k.get_tokenizer(),
+    "aranizer_sp32k": aranizer.aranizer_sp32k.get_tokenizer(),
+    # Add more tokenizers as needed
+}
+
+def compare_tokenizers(text):
+    # Run every loaded tokenizer on the same input, producing one table row per tokenizer
+    results = []
+    for name, tokenizer in tokenizers.items():
         tokens = tokenizer.tokenize(text)
         encoded_output = tokenizer.encode(text, add_special_tokens=True)
         decoded_text = tokenizer.decode(encoded_output)
+        results.append((name, " ".join(tokens), str(encoded_output), decoded_text))
+    return results
+
+inputs = gr.Textbox(label="Enter Arabic text")
+outputs = gr.Dataframe(label="Results", headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"])
+
+iface = gr.Interface(fn=compare_tokenizers, inputs=inputs, outputs=outputs)
+
+iface.launch()
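
For a quick sanity check of the comparison loop outside the Gradio UI, the same tokenize/encode/decode round trip can be run directly. A minimal sketch, assuming the aranizer package from the diff is installed; the input text and the two tokenizer choices are illustrative, not part of the commit:

# Minimal sketch: run the tokenize/encode/decode round trip without launching the UI.
# Assumes aranizer is installed; input text and tokenizer choices are illustrative.
import aranizer

tokenizers = {
    "aranizer_bpe50k": aranizer.aranizer_bpe50k.get_tokenizer(),
    "aranizer_sp32k": aranizer.aranizer_sp32k.get_tokenizer(),
}

text = "مرحبا بالعالم"  # "Hello, world" in Arabic
for name, tokenizer in tokenizers.items():
    tokens = tokenizer.tokenize(text)
    encoded = tokenizer.encode(text, add_special_tokens=True)
    decoded = tokenizer.decode(encoded)
    # A well-behaved tokenizer should round-trip: decoded matches the original text.
    print(f"{name}: {' '.join(tokens)} -> {encoded} -> {decoded}")

Each printed line corresponds to one row of the Results table in the app above.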