sagar007 committed on
Commit
00ea787
·
verified ·
1 Parent(s): ebd3c95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -29
app.py CHANGED
@@ -2,13 +2,9 @@ import gradio as gr
2
  import re
3
  from collections import Counter
4
 
5
- # Define preprocessing and BPE functions
6
  def preprocess_text(text):
7
- # Remove punctuation and special characters, keep Hindi characters and spaces
8
  text = re.sub(r'[^\u0900-\u097F\s]', '', text)
9
- # Remove extra whitespace
10
- text = ' '.join(text.split())
11
- return text
12
 
13
  def get_stats(vocab):
14
  pairs = Counter()
@@ -35,15 +31,14 @@ def apply_bpe(text, bpe_codes):
35
  word_list = [p.sub(''.join(pair), word) for word in word_list]
36
  return word_list
37
 
38
- def bpe_process(text, target_vocab_size):
39
- # Preprocess the text
40
- preprocessed_text = preprocess_text(text)
41
-
42
- # Initialize vocabulary with character-level tokens and common subwords
43
  vocab = Counter(preprocessed_text.split())
44
  vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
45
  vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
46
-
47
  # Perform BPE merges
48
  bpe_codes = []
49
  while len(vocab) < target_vocab_size:
@@ -53,26 +48,27 @@ def bpe_process(text, target_vocab_size):
53
  best = max(pairs, key=pairs.get)
54
  vocab = merge_vocab(best, vocab)
55
  bpe_codes.append((best, pairs[best]))
56
-
57
  # Apply BPE to the original text
58
  encoded_text = apply_bpe(preprocessed_text, bpe_codes)
59
-
60
  # Calculate compression ratio
61
- original_size = len(preprocessed_text)
62
- compressed_size = len(' '.join(encoded_text))
63
- compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
64
-
65
- # Create output text
66
- encoded_output = ' '.join(encoded_text)
67
- vocab_size = len(vocab)
68
-
69
- # Determine criteria status
70
  criteria_met = {
71
- "Vocabulary Size Criterion": vocab_size >= 5000,
72
- "Compression Ratio Criterion": compression_ratio >= 3
73
  }
74
-
75
- return encoded_output, vocab_size, compression_ratio, criteria_met
 
 
 
 
 
76
 
77
  # Define the Gradio interface
78
  iface = gr.Interface(
@@ -87,9 +83,9 @@ iface = gr.Interface(
87
  gr.Number(label="Compression Ratio"),
88
  gr.JSON(label="Criteria Met")
89
  ],
90
- title="Byte Pair Encoding (BPE) Gradio App",
91
- description="Encode text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio."
92
  )
93
 
94
  # Launch the Gradio app
95
- iface.launch()
 
2
  import re
3
  from collections import Counter
4
 
 
5
  def preprocess_text(text):
 
6
  text = re.sub(r'[^\u0900-\u097F\s]', '', text)
7
+ return ' '.join(text.split())
 
 
8
 
9
  def get_stats(vocab):
10
  pairs = Counter()
 
31
  word_list = [p.sub(''.join(pair), word) for word in word_list]
32
  return word_list
33
 
34
+ def bpe_process(input_text, target_vocab_size):
35
+ preprocessed_text = preprocess_text(input_text)
36
+
37
+ # Initialize vocabulary
 
38
  vocab = Counter(preprocessed_text.split())
39
  vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
40
  vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
41
+
42
  # Perform BPE merges
43
  bpe_codes = []
44
  while len(vocab) < target_vocab_size:
 
48
  best = max(pairs, key=pairs.get)
49
  vocab = merge_vocab(best, vocab)
50
  bpe_codes.append((best, pairs[best]))
51
+
52
  # Apply BPE to the original text
53
  encoded_text = apply_bpe(preprocessed_text, bpe_codes)
54
+
55
  # Calculate compression ratio
56
+ original_size = len(preprocessed_text.split())
57
+ compressed_size = len(encoded_text)
58
+ compression_ratio = original_size / compressed_size
59
+
60
+ # Check if criteria are met
 
 
 
 
61
  criteria_met = {
62
+ "vocab_size_met": len(vocab) >= 5000,
63
+ "compression_ratio_met": compression_ratio >= 3
64
  }
65
+
66
+ return (
67
+ " ".join(encoded_text),
68
+ len(vocab),
69
+ compression_ratio,
70
+ criteria_met
71
+ )
72
 
73
  # Define the Gradio interface
74
  iface = gr.Interface(
 
83
  gr.Number(label="Compression Ratio"),
84
  gr.JSON(label="Criteria Met")
85
  ],
86
+ title="Byte Pair Encoding (BPE) for Hindi",
87
+ description="Encode Hindi text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio."
88
  )
89
 
90
  # Launch the Gradio app
91
+ iface.launch(share=True)