"""Gradio demo that learns byte-pair-encoding (BPE) merges on Hindi text."""

import re
from collections import Counter

import gradio as gr


def preprocess_text(text):
    """Keep only Devanagari code points (U+0900-U+097F) and whitespace.

    Every other character is removed and runs of whitespace are collapsed
    to single spaces, with leading/trailing whitespace dropped.
    """
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    return ' '.join(text.split())


def get_stats(vocab):
    """Count adjacent symbol pairs across the vocabulary.

    Args:
        vocab: Mapping of a word, written as space-separated symbols, to
            its frequency.

    Returns:
        A Counter keyed by ``(left, right)`` symbol tuples, each weighted by
        the frequency of the words it occurs in.
    """
    pairs = Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for left, right in zip(symbols, symbols[1:]):
            pairs[left, right] += freq
    return pairs


def _merge_pattern(pair):
    """Compile a regex matching ``pair`` only as two whole adjacent symbols."""
    # The lookarounds require whitespace (or the string edge) on both sides,
    # so the pair ('b', 'c') does not match inside 'ab c'.
    return re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')


def merge_vocab(pair, v_in):
    """Return a copy of ``v_in`` with every occurrence of ``pair`` merged.

    Args:
        pair: The ``(left, right)`` symbols to fuse into one symbol.
        v_in: Mapping of space-separated-symbol words to frequencies.

    Returns:
        A new dict; ``v_in`` is not modified.
    """
    pattern = _merge_pattern(pair)
    replacement = ''.join(pair)
    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps backslashes in symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Accumulate rather than overwrite in case two keys become equal.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out


def apply_bpe(text, bpe_codes):
    """Tokenize ``text`` by replaying the learned merges in order.

    NOTE(review): the source of this function was truncated right after
    ``re.compile(r'(?``; everything from the regex onwards is reconstructed
    and should be checked against the original file.

    Args:
        text: Whitespace-separated words.
        bpe_codes: Iterable of ``(pair, count)`` entries in the order the
            merges were learned; only ``pair`` is used.

    Returns:
        A flat list of tokens.
    """
    # Start from one symbol per code point, the same representation the
    # vocabulary uses, so each merge acts here exactly as in merge_vocab.
    words = [' '.join(word) for word in text.split()]
    # The visible original guarded each merge with ``if ' ' in pair``. For
    # the tuple pairs produced by get_stats that test is never true, so it
    # is dropped here; otherwise no merge would ever be applied.
    for pair, _ in bpe_codes:
        pattern = _merge_pattern(pair)
        merged = ''.join(pair)
        words = [pattern.sub(lambda _match: merged, word) for word in words]
    return [token for word in words for token in word.split()]


def perform_bpe(text):
    """Learn BPE merges on ``text`` and report compression statistics.

    NOTE(review): the head of this function (everything before
    ``... > 0 else 0``) was lost from the source and is reconstructed here.
    In particular the definitions of ``original_size`` (code points in the
    cleaned text, excluding whitespace) and ``compressed_size`` (tokens
    after merging) are assumptions to confirm against the original.

    Merging stops when no adjacent pair is left, or as soon as the
    vocabulary holds at least 5000 entries and the compression ratio is at
    least 3.

    Returns:
        A ``(report, encoded)`` tuple: the multi-line statistics string and
        the encoded tokens joined by single spaces.
    """
    cleaned = preprocess_text(text)
    word_counts = Counter(cleaned.split())
    vocab = {' '.join(word): freq for word, freq in word_counts.items()}

    original_size = sum(len(word) * freq for word, freq in word_counts.items())
    compressed_size = original_size
    compression_ratio = original_size / compressed_size if compressed_size > 0 else 0

    bpe_codes = []
    while True:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))

        # The vocabulary already reflects every merge so far, so the token
        # count can be read from it without re-encoding the whole text.
        compressed_size = sum(
            len(word.split()) * freq for word, freq in vocab.items()
        )
        compression_ratio = original_size / compressed_size if compressed_size > 0 else 0
        if len(vocab) >= 5000 and compression_ratio >= 3:
            break

    # Assigned unconditionally, so it is defined even when no merge happened.
    encoded_text = apply_bpe(cleaned, bpe_codes)

    result = f"Vocabulary size: {len(vocab)}\n"
    result += f"Original size: {original_size}\n"
    result += f"Compressed size: {compressed_size}\n"
    result += f"Compression ratio: {compression_ratio:.2f}X\n\n"

    if len(vocab) >= 5000 and compression_ratio >= 3:
        result += "Both criteria are met!"
    elif len(vocab) >= 5000:
        result += "Vocabulary size criterion is met, but compression ratio is below 3."
    elif compression_ratio >= 3:
        result += "Compression ratio criterion is met, but vocabulary size is below 5000."
    else:
        result += "Neither criterion is met."

    return result, ' '.join(encoded_text)


def bpe_app(input_text):
    """Gradio callback: run BPE on the input, or on a sample if it is empty."""
    if not input_text:
        input_text = "नमस्ते! यह एक उदाहरण हिंदी वाक्य है। आप अपना खुद का पाठ यहां दर्ज कर सकते हैं।"
    stats, encoded_text = perform_bpe(input_text)
    return stats, encoded_text


# Custom CSS
custom_css = """ """

# HTML Template
html_template = """

🇮🇳 Byte Pair Encoding for Hindi

Compress and tokenize Hindi text using the BPE algorithm. Enter your text below or use one of the examples provided.

"""

# Create Gradio interface with custom theme
# NOTE(review): the nesting of the layout below is inferred; the source had
# lost its indentation.
with gr.Blocks(css=custom_css) as iface:
    gr.HTML(html_template)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                lines=5,
                label="Input Hindi Text",
                placeholder="Enter Hindi text here or leave blank for an example",
            )

    with gr.Row():
        submit_btn = gr.Button("Process", variant="primary")

    with gr.Row():
        with gr.Column():
            output_stats = gr.Textbox(label="BPE Statistics")
        with gr.Column():
            output_encoded = gr.Textbox(label="Encoded Text")

    gr.Markdown("The algorithm continues until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above.")

    examples = gr.Examples(
        examples=[
            ["नमस्ते दुनिया! यह एक छोटा सा उदाहरण है।"],
            ["भारत एक विशाल और विविधतापूर्ण देश है, जहाँ कई भाषाएँ बोली जाती हैं।"],
            ["आज का मौसम बहुत सुहावना है। आकाश में बादल छाए हुए हैं और हल्की बारिश हो रही है।"]
        ],
        inputs=[input_text],
    )

    submit_btn.click(bpe_app, inputs=[input_text], outputs=[output_stats, output_encoded])

    gr.HTML('')

# Launch the app
iface.launch(inbrowser=True, share=True)