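"""Gradio demo for the Saiteja/telugu-bpe tokenizer.

Downloads tokenizer.json and examples.json from the Hugging Face Hub and
serves a small web UI that reports the BPE tokens, token ids, and the
characters-per-token compression ratio for any Telugu input text.
"""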
import gradio as gr
from tokenizers import Tokenizer
import json
from huggingface_hub import hf_hub_download

# Download tokenizer files from HF Hub
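# (hf_hub_download caches files locally, under ~/.cache/huggingface by
# default, so repeated launches reuse the downloaded copies.)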
def get_tokenizer():
    try:
        # Download tokenizer.json
        tokenizer_path = hf_hub_download(
            repo_id="Saiteja/telugu-bpe",
            filename="tokenizer.json",
            repo_type="model"
        )
        # Download examples.json
        examples_path = hf_hub_download(
            repo_id="Saiteja/telugu-bpe",
            filename="examples.json",
            repo_type="model"
        )
        return tokenizer_path, examples_path
    except Exception as e:
        print(f"Error downloading files: {e}")
        return None, None

# Get tokenizer and examples; fail loudly rather than crash later with an
# opaque TypeError from Tokenizer.from_file(None)
tokenizer_path, examples_path = get_tokenizer()
if tokenizer_path is None or examples_path is None:
    raise RuntimeError("Failed to download tokenizer files from the Hugging Face Hub.")

# Load the tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Load examples
with open(examples_path, "r", encoding="utf-8") as f:
    examples_data = json.load(f)

# Extract example texts
example_texts = [example["text"] for example in examples_data]
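# examples.json is expected to hold a list of objects with a "text" field,
# e.g. [{"text": "నమస్కారం"}, ...]; only that field is used here.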

def tokenize_text(text):
    """Tokenize the input text and return tokens, ids and compression ratio."""
    if not text.strip():
        return "Please enter some text."

    try:
        encoding = tokenizer.encode(text)
        if not encoding.ids:
            return "The tokenizer produced no tokens for this input."
        # Characters per token: higher means the text is encoded more compactly.
        compression_ratio = len(text) / len(encoding.ids)

        result = f"""Tokens: {encoding.tokens}

Token IDs: {encoding.ids}

Number of tokens: {len(encoding.ids)}

Text length: {len(text)}

Compression ratio: {compression_ratio:.2f}"""

        return result
    except Exception as e:
        return f"Error: {str(e)}"

# Create the Gradio interface
iface = gr.Interface(
    fn=tokenize_text,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter Telugu text here...",
        label="Input Text"
    ),
    outputs=gr.Textbox(
        label="Tokenization Results",
        lines=10
    ),
    title="Telugu Tokenizer Demo",
    description="""This demo uses a custom Telugu tokenizer trained on a large corpus of Telugu text.

    The tokenizer has a vocabulary size of 50,000+ tokens and achieves a compression ratio of >3.0.

    Try entering some Telugu text to see how it's tokenized!



    Tokenizer: https://huggingface.co/Saiteja/telugu-bpe""",
    examples=example_texts,
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
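# Optional Gradio launch settings (standard options, not part of the original
# script): iface.launch(share=True) creates a temporary public link, and
# iface.launch(server_name="0.0.0.0") listens on all network interfaces.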