sagar007 committed on
Commit
0f0dabd
·
verified ·
1 Parent(s): 86e7a4b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ from collections import Counter
4
+
5
+ # Define preprocessing and BPE functions
6
def preprocess_text(text):
    """Reduce *text* to Devanagari-block characters separated by single spaces.

    Every character outside U+0900-U+097F that is not whitespace is deleted
    (not replaced by a space), so words separated only by such a character
    are glued together. Runs of whitespace are then collapsed to one space
    and leading/trailing whitespace is dropped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty if nothing survives the filter.
    """
    # Delete anything that is neither in the Devanagari block nor whitespace.
    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    # split() with no argument discards empty fields, which normalises spacing.
    return ' '.join(devanagari_only.split())
12
+
13
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs across *vocab*.

    Args:
        vocab: Mapping from a whitespace-separated symbol string to its
            frequency.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` symbol tuples; each
        occurrence of a pair contributes the frequency of the entry it
        appears in. Entries with fewer than two symbols contribute nothing.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        tokens = entry.split()
        # Walk neighbouring symbols: (t0, t1), (t1, t2), ...
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts
20
+
21
def merge_vocab(pair, v_in):
    """Merge every occurrence of *pair* in the vocabulary into one symbol.

    Args:
        pair: Tuple ``(left, right)`` of symbols to fuse.
        v_in: Mapping from a space-separated symbol string to its frequency.

    Returns:
        A new dict in which each standalone ``"left right"`` occurrence has
        been replaced by ``"leftright"``. If several input entries collapse
        to the same key, their frequencies are summed.
    """
    # Anchor the bigram on symbol boundaries: a plain str.replace would also
    # rewrite "xa b" for the pair ('a', 'b'), corrupting the symbol "xa".
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    replacement = ''.join(pair)

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps backslashes in symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Accumulate rather than overwrite so colliding entries keep their counts.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
29
+
30
def apply_bpe(text, bpe_codes):
    """Segment each word of *text* by replaying learned BPE merges.

    Every whitespace-separated word starts as a sequence of single
    characters; the merges in *bpe_codes* are then applied in order, each
    fusing adjacent occurrences of its pair left to right.

    Args:
        text: Input string; words are separated by whitespace.
        bpe_codes: Iterable of ``(pair, frequency)`` entries in the order the
            merges were learned, where ``pair`` is a ``(left, right)`` tuple
            of symbols. The frequency is ignored here.

    Returns:
        A list with one string per input word, holding that word's symbols
        separated by single spaces (e.g. ``"x ab"``).
    """
    # The previous guard `' ' in pair` tested tuple membership and was never
    # true, so no merge was ever applied; merges are now replayed on symbols.
    merges = [pair for pair, _ in bpe_codes]
    segmented = {}  # word -> encoded form, so repeated words are encoded once
    encoded_words = []

    for word in text.split():
        if word not in segmented:
            symbols = list(word)
            for left, right in merges:
                if len(symbols) < 2:
                    break  # a single symbol cannot be merged any further
                merged = []
                i = 0
                while i < len(symbols):
                    if (
                        i + 1 < len(symbols)
                        and symbols[i] == left
                        and symbols[i + 1] == right
                    ):
                        merged.append(left + right)
                        i += 2
                    else:
                        merged.append(symbols[i])
                        i += 1
                symbols = merged
            segmented[word] = ' '.join(symbols)
        encoded_words.append(segmented[word])

    return encoded_words
37
+
38
def bpe_process(text, target_vocab_size):
    """Learn BPE merges on *text*, encode it, and report summary metrics.

    Args:
        text: Raw input text; it is cleaned with ``preprocess_text`` first.
        target_vocab_size: Merging stops once the symbol vocabulary (distinct
            characters plus learned merged symbols) reaches this size, or
            earlier when no adjacent symbol pair is left to merge.

    Returns:
        A 4-tuple ``(encoded_output, vocab_size, compression_ratio,
        criteria_met)``:

        * ``encoded_output`` - the result of ``apply_bpe`` joined by spaces.
        * ``vocab_size`` - number of distinct symbols (characters + merges).
        * ``compression_ratio`` - characters of preprocessed text per
          whitespace-separated token of ``encoded_output``; 0 when there are
          no tokens.
        * ``criteria_met`` - dict of booleans for the fixed thresholds
          (vocabulary size >= 5000, compression ratio >= 3).
    """
    preprocessed_text = preprocess_text(text)
    words = preprocessed_text.split()

    # get_stats/merge_vocab operate on space-separated symbols, so each word
    # is seeded as its characters joined by spaces. Seeding with whole words
    # and raw character slices (as before) only exposed pairs that straddled
    # a word boundary.
    vocab = Counter(' '.join(word) for word in words)
    symbols = {char for word in words for char in word}

    # Learn merges. The stop condition tracks the symbol vocabulary, which
    # grows with each new merge; the number of word entries never does.
    bpe_codes = []
    while len(symbols) < target_vocab_size:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))
        symbols.add(''.join(best))

    # Encode the preprocessed text with the learned merges.
    encoded_text = apply_bpe(preprocessed_text, bpe_codes)
    encoded_output = ' '.join(encoded_text)

    # Characters per token. Comparing character lengths of input and output
    # can never exceed 1, which made the ">= 3" criterion unreachable.
    original_size = len(preprocessed_text)
    token_count = len(encoded_output.split())
    compression_ratio = original_size / token_count if token_count else 0

    vocab_size = len(symbols)

    # Determine criteria status
    criteria_met = {
        "Vocabulary Size Criterion": vocab_size >= 5000,
        "Compression Ratio Criterion": compression_ratio >= 3
    }

    return encoded_output, vocab_size, compression_ratio, criteria_met
76
+
77
# Gradio UI: the input components supply bpe_process's two arguments in
# order, and the output components display its four return values in order.
_input_components = [
    gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here..."),
    gr.Slider(minimum=1000, maximum=10000, step=100, value=6000, label="Target Vocabulary Size"),
]
_output_components = [
    gr.Textbox(label="Encoded Text"),
    gr.Number(label="Vocabulary Size"),
    gr.Number(label="Compression Ratio"),
    gr.JSON(label="Criteria Met"),
]

iface = gr.Interface(
    fn=bpe_process,
    inputs=_input_components,
    outputs=_output_components,
    title="Byte Pair Encoding (BPE) Gradio App",
    description="Encode text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio.",
)

# Start serving the app
iface.launch()