Update app.py
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import string
 from collections import Counter
 from typing import List, Tuple, Dict
 
@@ -7,7 +8,7 @@ import nltk
 
 # ---------- NLTK bootstrap ----------
 def _ensure_nltk():
-    # NLTK 3.9+
+    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
@@ -18,7 +19,7 @@ def _ensure_nltk():
         try:
             nltk.download("punkt_tab", quiet=True)
         except Exception:
-            pass #
+            pass # older NLTK doesn't have punkt_tab
     try:
         nltk.data.find("corpora/stopwords")
     except LookupError:
@@ -61,10 +62,23 @@ def read_text_input(text: str, file_obj) -> Tuple[str, str]:
 
 
 def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
+    """
+    Clean mode:
+      - lowercase
+      - remove English stopwords
+      - remove punctuation tokens (.,?!;:"'()[]{}- etc.)
+    Raw mode (clean=False):
+      - return tokens unchanged
+    """
     if not clean:
         return tokens
     stops = set(stopwords.words("english"))
-
+    punct = set(string.punctuation)
+    return [
+        t.lower()
+        for t in tokens
+        if t not in punct and t.lower() not in stops
+    ]
 
 
 def tokenize_pipeline(
@@ -73,7 +87,7 @@ def tokenize_pipeline(
     """
     - Split text into sentences
     - Tokenize each sentence into words
-    - (Optionally)
+    - (Optionally) apply cleaning (lowercase, stopwords, punctuation removal)
    - Build Bag of Words across the full text
     Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
     """
@@ -96,6 +110,10 @@
 def build_sentence_vector(
     tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
 ) -> Dict[str, int]:
+    """
+    Count occurrences of each vocab term inside the selected sentence.
+    Returns {word: count} for non-zero entries, sorted by count desc then word.
+    """
     if not tokenized_sentences or not vocabulary:
         return {}
     if idx < 0 or idx >= len(tokenized_sentences):
@@ -131,7 +149,7 @@ Type/paste text or drop a **.txt** / **.docx** file.
 3) Count word occurrences (Bag of Words)
 4) Build a word-frequency vector for any selected sentence
 
-**
+**Clean option:** lowercasing + stopword removal **+ punctuation removal** (like scikit-learn defaults).
 
 > Tip: Legacy `.doc` files are not supported—please convert to `.docx`.
 """
@@ -151,9 +169,9 @@ Type/paste text or drop a **.txt** / **.docx** file.
     )
 
     clean_opt = gr.Checkbox(
-        label="Stopword
+        label="Stopword + lowercase + punctuation removal",
         value=True,
-        info='Removes common English stopwords (e.g., "
+        info='Removes common English stopwords, lowercases tokens, and strips punctuation tokens (e.g., ".", ",", "!").',
     )
 
     process_btn = gr.Button("Process", variant="primary")
@@ -219,7 +237,7 @@ Type/paste text or drop a **.txt** / **.docx** file.
         vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
         vector_rows = [[w, c] for w, c in vec_map.items()]
 
-        status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}."
+        status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. Clean={'ON' if clean else 'OFF'}."
         return (
             gr.update(choices=dd_choices, value=dd_value),
             tokenized_json,
@@ -231,7 +249,6 @@ Type/paste text or drop a **.txt** / **.docx** file.
             status,
         )
     except LookupError as e:
-        # Common NLTK resource errors (e.g., punkt_tab)
         return (
             gr.update(choices=[], value=None),
             {},
@@ -287,3 +304,4 @@ Type/paste text or drop a **.txt** / **.docx** file.
 
 if __name__ == "__main__":
     demo.launch()
+
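A note on the NLTK bootstrap touched in the hunks above: newer NLTK releases (3.9 and later) ship the Punkt sentence-tokenizer data as a separate `punkt_tab` package, which is why the code tries to download both `punkt` and `punkt_tab` and tolerates a failed `punkt_tab` download on older versions. The sketch below shows that pattern in isolation; the middle lines of `_ensure_nltk()` are not visible in this diff, so the exact ordering of the hidden region is assumed.

import nltk

def ensure_nltk():
    # Classic Punkt models (sufficient for NLTK before 3.9).
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    # NLTK 3.9+ looks for the tab-delimited 'punkt_tab' data instead.
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass  # older NLTK has no punkt_tab package
    # Stopword list used by the clean option.
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)

ensure_nltk()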
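To make the new clean option concrete, here is a small self-contained sketch of the cleaning rule introduced in this commit (punctuation tokens dropped, English stopwords dropped, remaining tokens lowercased). The helper name, token list, and printed output are illustrative, not taken from app.py.

import string
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)

def clean_tokens(tokens):
    # Mirrors the clean branch of preprocess_tokens(): drop punctuation tokens,
    # drop English stopwords, and lowercase whatever remains.
    stops = set(stopwords.words("english"))
    punct = set(string.punctuation)
    return [t.lower() for t in tokens if t not in punct and t.lower() not in stops]

tokens = ["The", "quick", "brown", "fox", ",", "surprisingly", ",", "jumps", "!"]
print(clean_tokens(tokens))
# ['quick', 'brown', 'fox', 'surprisingly', 'jumps']

With clean=False the app returns the token list unchanged, so punctuation and stopwords still show up in the Bag of Words.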
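The new docstring on build_sentence_vector() pins down its contract: count each vocabulary term inside the selected sentence and return only the non-zero entries, sorted by count descending and then alphabetically. The function body sits outside this diff, so the sketch below is an assumed implementation of that documented behaviour, not the file's actual code.

from collections import Counter
from typing import Dict, List

def sentence_vector(
    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
) -> Dict[str, int]:
    # Guard clauses mirroring the ones visible in the diff.
    if not tokenized_sentences or not vocabulary:
        return {}
    if idx < 0 or idx >= len(tokenized_sentences):
        return {}
    counts = Counter(tokenized_sentences[idx])
    # Keep vocab terms that occur in the sentence: count desc, then word asc.
    pairs = [(w, counts[w]) for w in vocabulary if counts[w] > 0]
    return dict(sorted(pairs, key=lambda wc: (-wc[1], wc[0])))

sentences = [["quick", "brown", "fox"], ["fox", "fox", "jumps"]]
vocab = ["brown", "fox", "jumps", "quick"]
print(sentence_vector(sentences, vocab, 1))
# {'fox': 2, 'jumps': 1}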