import re from collections import Counter import gradio as gr def preprocess_text(text): text = re.sub(r'[^\u0900-\u097F\s]', '', text) text = ' '.join(text.split()) return text def get_stats(vocab): pairs = Counter() for word, freq in vocab.items(): symbols = word.split() for i in range(len(symbols) - 1): pairs[symbols[i], symbols[i + 1]] += freq return pairs def merge_vocab(pair, v_in): v_out = {} bigram = ' '.join(pair) replacement = ''.join(pair) for word in v_in: w_out = word.replace(bigram, replacement) v_out[w_out] = v_in[word] return v_out def apply_bpe(text, bpe_codes): word_list = text.split() for pair, _ in bpe_codes: if ' ' in pair: p = re.compile(r'(? 0 else 0 if len(vocab) >= 5000 and compression_ratio >= 3: break result = f"Vocabulary size: {len(vocab)}\n" result += f"Original size: {original_size}\n" result += f"Compressed size: {compressed_size}\n" result += f"Compression ratio: {compression_ratio:.2f}X\n\n" if len(vocab) >= 5000 and compression_ratio >= 3: result += "Both criteria are met!" elif len(vocab) >= 5000: result += "Vocabulary size criterion is met, but compression ratio is below 3." elif compression_ratio >= 3: result += "Compression ratio criterion is met, but vocabulary size is below 5000." else: result += "Neither criterion is met." return result, ' '.join(encoded_text) def bpe_app(input_text): if not input_text: input_text = "नमस्ते! यह एक उदाहरण हिंदी वाक्य है। आप अपना खुद का पाठ यहां दर्ज कर सकते हैं।" stats, encoded_text = perform_bpe(input_text) return stats, encoded_text # Custom CSS custom_css = """ """ # HTML Template html_template = """
Compress and tokenize Hindi text using the BPE algorithm. Enter your text below or use one of the examples provided.