Spaces:

sagar007
/

BPE

Sleeping

App Files Files Community

sagar007 committed on Jun 21, 2024

Commit

00ea787

verified ·

1 Parent(s): ebd3c95

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -29

app.py CHANGED Viewed

@@ -2,13 +2,9 @@ import gradio as gr
 import re
 from collections import Counter
-# Define preprocessing and BPE functions
 def preprocess_text(text):
-    # Remove punctuation and special characters, keep Hindi characters and spaces
     text = re.sub(r'[^\u0900-\u097F\s]', '', text)
-    # Remove extra whitespace
-    text = ' '.join(text.split())
-    return text
 def get_stats(vocab):
     pairs = Counter()
@@ -35,15 +31,14 @@ def apply_bpe(text, bpe_codes):
             word_list = [p.sub(''.join(pair), word) for word in word_list]
     return word_list
-def bpe_process(text, target_vocab_size):
-    # Preprocess the text
-    preprocessed_text = preprocess_text(text)
-    # Initialize vocabulary with character-level tokens and common subwords
     vocab = Counter(preprocessed_text.split())
     vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
     vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
     # Perform BPE merges
     bpe_codes = []
     while len(vocab) < target_vocab_size:
@@ -53,26 +48,27 @@ def bpe_process(text, target_vocab_size):
         best = max(pairs, key=pairs.get)
         vocab = merge_vocab(best, vocab)
         bpe_codes.append((best, pairs[best]))
     # Apply BPE to the original text
     encoded_text = apply_bpe(preprocessed_text, bpe_codes)
     # Calculate compression ratio
-    original_size = len(preprocessed_text)
-    compressed_size = len(' '.join(encoded_text))
-    compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
-    # Create output text
-    encoded_output = ' '.join(encoded_text)
-    vocab_size = len(vocab)
-    # Determine criteria status
     criteria_met = {
-        "Vocabulary Size Criterion": vocab_size >= 5000,
-        "Compression Ratio Criterion": compression_ratio >= 3
     }
-    return encoded_output, vocab_size, compression_ratio, criteria_met
 # Define the Gradio interface
 iface = gr.Interface(
@@ -87,9 +83,9 @@ iface = gr.Interface(
         gr.Number(label="Compression Ratio"),
         gr.JSON(label="Criteria Met")
     ],
-    title="Byte Pair Encoding (BPE) Gradio App",
-    description="Encode text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio."
 )
 # Launch the Gradio app
-iface.launch()

 import re
 from collections import Counter
 def preprocess_text(text):
     text = re.sub(r'[^\u0900-\u097F\s]', '', text)
+    return ' '.join(text.split())
 def get_stats(vocab):
     pairs = Counter()
             word_list = [p.sub(''.join(pair), word) for word in word_list]
     return word_list
+def bpe_process(input_text, target_vocab_size):
+    preprocessed_text = preprocess_text(input_text)
+    # Initialize vocabulary
     vocab = Counter(preprocessed_text.split())
     vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
     vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
     # Perform BPE merges
     bpe_codes = []
     while len(vocab) < target_vocab_size:
         best = max(pairs, key=pairs.get)
         vocab = merge_vocab(best, vocab)
         bpe_codes.append((best, pairs[best]))
     # Apply BPE to the original text
     encoded_text = apply_bpe(preprocessed_text, bpe_codes)
     # Calculate compression ratio
+    original_size = len(preprocessed_text.split())
+    compressed_size = len(encoded_text)
+    compression_ratio = original_size / compressed_size
+    # Check if criteria are met
     criteria_met = {
+        "vocab_size_met": len(vocab) >= 5000,
+        "compression_ratio_met": compression_ratio >= 3
     }
+    return (
+        " ".join(encoded_text),
+        len(vocab),
+        compression_ratio,
+        criteria_met
+    )
 # Define the Gradio interface
 iface = gr.Interface(
         gr.Number(label="Compression Ratio"),
         gr.JSON(label="Criteria Met")
     ],
+    title="Byte Pair Encoding (BPE) for Hindi",
+    description="Encode Hindi text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio."
 )
 # Launch the Gradio app
+iface.launch(share=True)