Update app.py
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import string
 from collections import Counter
 from typing import List, Tuple, Dict
 
@@ -7,7 +8,7 @@ import nltk
 
 # ---------- NLTK bootstrap ----------
 def _ensure_nltk():
-    # NLTK 3.9+
+    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
@@ -18,7 +19,7 @@ def _ensure_nltk():
         try:
             nltk.download("punkt_tab", quiet=True)
         except Exception:
-            pass #
+            pass # older NLTK doesn't have punkt_tab
     try:
         nltk.data.find("corpora/stopwords")
     except LookupError:
@@ -61,10 +62,23 @@ def read_text_input(text: str, file_obj) -> Tuple[str, str]:
 
 
 def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
+    """
+    Clean mode:
+      - lowercase
+      - remove English stopwords
+      - remove punctuation tokens (.,?!;:"'()[]{}- etc.)
+    Raw mode (clean=False):
+      - return tokens unchanged
+    """
     if not clean:
         return tokens
     stops = set(stopwords.words("english"))
-
+    punct = set(string.punctuation)
+    return [
+        t.lower()
+        for t in tokens
+        if t not in punct and t.lower() not in stops
+    ]
 
 
 def tokenize_pipeline(
@@ -73,7 +87,7 @@ def tokenize_pipeline(
     """
     - Split text into sentences
     - Tokenize each sentence into words
-    - (Optionally)
+    - (Optionally) apply cleaning (lowercase, stopwords, punctuation removal)
    - Build Bag of Words across the full text
     Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
     """
@@ -96,6 +110,10 @@
 def build_sentence_vector(
     tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
 ) -> Dict[str, int]:
+    """
+    Count occurrences of each vocab term inside the selected sentence.
+    Returns {word: count} for non-zero entries, sorted by count desc then word.
+    """
     if not tokenized_sentences or not vocabulary:
         return {}
     if idx < 0 or idx >= len(tokenized_sentences):
@@ -131,7 +149,7 @@ Type/paste text or drop a **.txt** / **.docx** file.
 3) Count word occurrences (Bag of Words)
 4) Build a word-frequency vector for any selected sentence
 
-**
+**Clean option:** lowercasing + stopword removal **+ punctuation removal** (like scikit-learn defaults).
 
 > Tip: Legacy `.doc` files are not supported—please convert to `.docx`.
 """
@@ -151,9 +169,9 @@ Type/paste text or drop a **.txt** / **.docx** file.
     )
 
     clean_opt = gr.Checkbox(
-        label="Stopword
+        label="Stopword + lowercase + punctuation removal",
         value=True,
-        info='Removes common English stopwords (e.g., "
+        info='Removes common English stopwords, lowercases tokens, and strips punctuation tokens (e.g., ".", ",", "!").',
     )
 
     process_btn = gr.Button("Process", variant="primary")
@@ -219,7 +237,7 @@ Type/paste text or drop a **.txt** / **.docx** file.
         vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
         vector_rows = [[w, c] for w, c in vec_map.items()]
 
-        status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}."
+        status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. Clean={'ON' if clean else 'OFF'}."
         return (
             gr.update(choices=dd_choices, value=dd_value),
             tokenized_json,
@@ -231,7 +249,6 @@ Type/paste text or drop a **.txt** / **.docx** file.
             status,
         )
     except LookupError as e:
-        # Common NLTK resource errors (e.g., punkt_tab)
         return (
             gr.update(choices=[], value=None),
             {},
@@ -287,3 +304,4 @@ Type/paste text or drop a **.txt** / **.docx** file.
 
 if __name__ == "__main__":
     demo.launch()
+
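A note on the NLTK bootstrap touched in the hunks above: newer NLTK releases (3.9 and later) ship the Punkt sentence-tokenizer data as a separate `punkt_tab` package, which is why the code tries to download both `punkt` and `punkt_tab` and tolerates a failed `punkt_tab` download on older versions. The sketch below shows that pattern in isolation; the middle lines of `_ensure_nltk()` are not visible in this diff, so the exact ordering of the hidden region is assumed.

import nltk

def ensure_nltk():
    # Classic Punkt models (sufficient for NLTK before 3.9).
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    # NLTK 3.9+ looks for the tab-delimited 'punkt_tab' data instead.
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass  # older NLTK has no punkt_tab package
    # Stopword list used by the clean option.
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)

ensure_nltk()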
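To make the new clean option concrete, here is a small self-contained sketch of the cleaning rule introduced in this commit (punctuation tokens dropped, English stopwords dropped, remaining tokens lowercased). The helper name, token list, and printed output are illustrative, not taken from app.py.

import string
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)

def clean_tokens(tokens):
    # Mirrors the clean branch of preprocess_tokens(): drop punctuation tokens,
    # drop English stopwords, and lowercase whatever remains.
    stops = set(stopwords.words("english"))
    punct = set(string.punctuation)
    return [t.lower() for t in tokens if t not in punct and t.lower() not in stops]

tokens = ["The", "quick", "brown", "fox", ",", "surprisingly", ",", "jumps", "!"]
print(clean_tokens(tokens))
# ['quick', 'brown', 'fox', 'surprisingly', 'jumps']

With clean=False the app returns the token list unchanged, so punctuation and stopwords still show up in the Bag of Words.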
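The new docstring on build_sentence_vector() pins down its contract: count each vocabulary term inside the selected sentence and return only the non-zero entries, sorted by count descending and then alphabetically. The function body sits outside this diff, so the sketch below is an assumed implementation of that documented behaviour, not the file's actual code.

from collections import Counter
from typing import Dict, List

def sentence_vector(
    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
) -> Dict[str, int]:
    # Guard clauses mirroring the ones visible in the diff.
    if not tokenized_sentences or not vocabulary:
        return {}
    if idx < 0 or idx >= len(tokenized_sentences):
        return {}
    counts = Counter(tokenized_sentences[idx])
    # Keep vocab terms that occur in the sentence: count desc, then word asc.
    pairs = [(w, counts[w]) for w in vocabulary if counts[w] > 0]
    return dict(sorted(pairs, key=lambda wc: (-wc[1], wc[0])))

sentences = [["quick", "brown", "fox"], ["fox", "fox", "jumps"]]
vocab = ["brown", "fox", "jumps", "quick"]
print(sentence_vector(sentences, vocab, 1))
# {'fox': 2, 'jumps': 1}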