Spaces:
Sleeping
Sleeping
jatingocodeo
committed on
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from src.hindi_bpe import HindiBPE
|
3 |
+
|
4 |
+
# Module-level tokenizer instance that process_text encodes and decodes with.
tokenizer = HindiBPE(
    max_vocab_size=5000,
    target_compression=3.2,
)
|
6 |
+
|
7 |
+
def process_text(text: str, mode: str) -> str:
    """Run the tokenizer on *text* and describe the outcome as a string.

    Args:
        text: Text to tokenize. ``None``, empty, or whitespace-only input is
            answered with a prompt instead of being passed to the tokenizer.
        mode: ``"Encode"`` returns the encoded tokens; any other value runs
            an encode/decode round trip and reports whether the decoded
            text equals the input.

    Returns:
        A human-readable message intended for display in the result box.
    """
    # Treat a missing value (None) like an empty box rather than failing
    # with AttributeError on ``.strip()``.
    if text is None or not text.strip():
        return "Please enter some text."

    # Both operations start from the encoded form, so encode once up front.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Any other mode: decode again so the user can compare against the input.
    decoded = tokenizer.decode(encoded)
    verdict = "✓" if text == decoded else "✗"
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {verdict}"
|
21 |
+
|
22 |
+
# Build the Gradio UI: a text box and an operation selector feed
# process_text, and the returned string is shown in a single result box.
_ROUND_TRIP = "Encode & Decode"

_DESCRIPTION = """This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.
Features:
- Vocabulary size: < 5000 tokens
- Compression ratio: ≥ 3.2
- Proper handling of Hindi Unicode characters and combining marks"""

# Sample sentences offered as one-click examples; each runs the round trip.
_EXAMPLE_SENTENCES = [
    "नमस्ते भारत",
    "मैं हिंदी सीख रहा हूं",
    "यह एक परीक्षण वाक्य है",
    "भारत एक विशाल देश है",
    "मुझे हिंदी भाषा बहुत पसंद है",
]

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत"),
        gr.Radio(["Encode", _ROUND_TRIP], label="Operation", value=_ROUND_TRIP),
    ],
    outputs=gr.Textbox(label="Result"),
    title="Hindi BPE Tokenizer",
    description=_DESCRIPTION,
    examples=[[sentence, _ROUND_TRIP] for sentence in _EXAMPLE_SENTENCES],
)

# Start the web app only when this file is executed directly.
if __name__ == "__main__":
    iface.launch()
|