Spaces:

sagar007
/

BPE

Sleeping

App Files Files Community

sagar007 committed on Jun 21, 2024

Commit

0f0dabd

verified ·

1 Parent(s): 86e7a4b

Create app.py

Browse files

Files changed (1) hide show

app.py +95 -0

app.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import gradio as gr
+import re
+from collections import Counter
+# Define preprocessing and BPE functions
+def preprocess_text(text):
+    """Reduce *text* to Devanagari-block characters separated by single spaces.
+
+    Every non-whitespace character outside U+0900-U+097F is deleted; the
+    remaining whitespace runs are collapsed to one space and the ends trimmed.
+    """
+    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
+    pieces = devanagari_only.split()
+    return ' '.join(pieces)
+def get_stats(vocab):
+    """Count adjacent symbol pairs over the keys of *vocab*.
+
+    Each key is split on whitespace into symbols; every neighbouring
+    ``(left, right)`` pair is credited with that key's frequency.
+    Returns a ``Counter`` keyed by the pair tuples.
+    """
+    pair_counts = Counter()
+    for entry, count in vocab.items():
+        parts = entry.split()
+        # zip against the list shifted by one yields each neighbouring pair.
+        for left, right in zip(parts, parts[1:]):
+            pair_counts[left, right] += count
+    return pair_counts
+def merge_vocab(pair, v_in):
+    """Fuse every occurrence of the symbol pair *pair* in the keys of *v_in*.
+
+    Args:
+        pair: Two symbols that currently appear space-separated in the keys.
+        v_in: Mapping from space-separated symbol strings to frequencies.
+
+    Returns:
+        A new dict whose keys have the pair joined into a single symbol.
+        Keys that become identical after merging have their frequencies
+        summed. *v_in* is not modified.
+    """
+    bigram = ' '.join(pair)
+    replacement = ''.join(pair)
+    # The lookarounds require whitespace (or a string edge) on both sides, so
+    # the pair only matches whole symbols: 'a b' must not match inside 'xa b'.
+    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')
+    v_out = {}
+    for word, freq in v_in.items():
+        # A callable replacement keeps backslashes in symbols literal.
+        w_out = pattern.sub(lambda _match: replacement, word)
+        # Accumulate instead of assigning so colliding keys keep their counts.
+        v_out[w_out] = v_out.get(w_out, 0) + freq
+    return v_out
+def apply_bpe(text, bpe_codes):
+    """Tokenize *text* by replaying the learned merges inside every word.
+
+    Each whitespace-separated word starts as a list of single characters
+    (code points). The merges in *bpe_codes* are then applied in the order
+    given, fusing every adjacent occurrence of the pair into one symbol.
+
+    Args:
+        text: Whitespace-separated input text.
+        bpe_codes: Iterable of ``(pair, count)`` entries, where ``pair`` holds
+            the two symbols to merge; ``count`` is ignored.
+
+    Returns:
+        A flat list of subword tokens in reading order (empty for empty text).
+    """
+    merges = [tuple(pair) for pair, _ in bpe_codes]
+    # Words repeat often, so segment each distinct word only once.
+    segmented = {}
+    tokens = []
+    for word in text.split():
+        if word not in segmented:
+            symbols = list(word)
+            for left, right in merges:
+                if len(symbols) < 2:
+                    break  # a single symbol cannot be merged any further
+                fused = []
+                i = 0
+                while i < len(symbols):
+                    # The slice comparison is safe at the last index.
+                    if symbols[i] == left and symbols[i + 1:i + 2] == [right]:
+                        fused.append(left + right)
+                        i += 2
+                    else:
+                        fused.append(symbols[i])
+                        i += 1
+                symbols = fused
+            segmented[word] = symbols
+        tokens.extend(segmented[word])
+    return tokens
+def bpe_process(text, target_vocab_size):
+    """Run the demo BPE pipeline on *text* and report summary statistics.
+
+    Args:
+        text: Raw input text; ``None`` is treated as an empty string.
+        target_vocab_size: Merging is attempted only while the vocabulary
+            holds fewer entries than this value.
+
+    Returns:
+        A 4-tuple ``(encoded_output, vocab_size, compression_ratio,
+        criteria_met)``: the space-joined tokens, the number of vocabulary
+        entries after merging, the average number of characters per token
+        (0 when there are no tokens), and a dict of pass/fail flags.
+    """
+    # Preprocess the text (None, e.g. from an empty form field, becomes '').
+    preprocessed_text = preprocess_text(text or '')
+    # Seed the vocabulary with whole words plus every character bigram and
+    # trigram of the text; n-grams that span a space are the only entries
+    # that split into more than one symbol.
+    vocab = Counter(preprocessed_text.split())
+    vocab.update(preprocessed_text[i:i + 2] for i in range(len(preprocessed_text) - 1))
+    vocab.update(preprocessed_text[i:i + 3] for i in range(len(preprocessed_text) - 2))
+    # Perform BPE merges.
+    # NOTE(review): merge_vocab maps every key to exactly one key, so
+    # len(vocab) never grows; once entered, this loop ends only when no
+    # pairs remain. Confirm whether the target was meant to cap the merges.
+    bpe_codes = []
+    while len(vocab) < target_vocab_size:
+        pairs = get_stats(vocab)
+        if not pairs:
+            break
+        best = max(pairs, key=pairs.get)
+        vocab = merge_vocab(best, vocab)
+        bpe_codes.append((best, pairs[best]))
+    # Apply BPE to the preprocessed text.
+    encoded_text = apply_bpe(preprocessed_text, bpe_codes)
+    encoded_output = ' '.join(encoded_text)
+    # Compression ratio = characters per emitted token. Dividing by
+    # len(encoded_output) instead cannot exceed 1.0 when the tokens are
+    # pieces of the text re-joined with spaces.
+    original_size = len(preprocessed_text)
+    token_count = len(encoded_text)
+    compression_ratio = original_size / token_count if token_count else 0
+    vocab_size = len(vocab)
+    # Determine criteria status.
+    criteria_met = {
+        "Vocabulary Size Criterion": vocab_size >= 5000,
+        "Compression Ratio Criterion": compression_ratio >= 3,
+    }
+    return encoded_output, vocab_size, compression_ratio, criteria_met
+# Gradio UI: a text box and a vocabulary-size slider feed bpe_process, whose
+# four return values fill the four output components in order.
+input_components = [
+    gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here..."),
+    gr.Slider(label="Target Vocabulary Size", minimum=1000, maximum=10000, step=100, value=6000),
+]
+output_components = [
+    gr.Textbox(label="Encoded Text"),
+    gr.Number(label="Vocabulary Size"),
+    gr.Number(label="Compression Ratio"),
+    gr.JSON(label="Criteria Met"),
+]
+iface = gr.Interface(
+    fn=bpe_process,
+    inputs=input_components,
+    outputs=output_components,
+    title="Byte Pair Encoding (BPE) Gradio App",
+    description="Encode text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio.",
+)
+# Start serving the app as soon as this module is executed.
+iface.launch()