eaglelandsonce committed
Commit 2d115e8 · verified · 1 Parent(s): c0c4087

Update app.py

Files changed (1):
  1. app.py +298 -203

app.py CHANGED
@@ -1,226 +1,321 @@
 import os
-from typing import List, Tuple, Optional
 
 import gradio as gr
-import pandas as pd
-
-# --- NLTK setup ----
 import nltk
-from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.corpus import stopwords
-
-def ensure_nltk() -> None:
-    """Download required NLTK data if missing (safe to call repeatedly)."""
-    try:
-        nltk.data.find("tokenizers/punkt")
-    except LookupError:
-        nltk.download("punkt", quiet=True)
-    try:
-        nltk.data.find("corpora/stopwords")
-    except LookupError:
-        nltk.download("stopwords", quiet=True)
-
-ensure_nltk()
 
-# Optional .docx support
-HAS_DOCX = False
-try:
-    from docx import Document  # python-docx
-    HAS_DOCX = True
-except Exception:
-    HAS_DOCX = False
-
-SAMPLE_TEXT = (
-    "NLTK is a powerful library for text processing. "
-    "Text processing is essential for NLP tasks. "
-    'Bag of Words is a fundamental concept in NLP. '
-    "Tokenization splits sentences into words. "
-    "We can count word occurrences in text. "
-    "Word frequency vectors represent sentences numerically. "
-    "Vectorization helps in transforming text for machine learning."
-)
-
-# --------- Helpers ----------
-def read_uploaded_files(files: Optional[List]) -> str:
-    """Read text from uploaded .txt and .docx files."""
-    if not files:
-        return ""
-
-    chunks = []
-    for f in files:
-        # Gradio v4 provides a dict-like object; support both path & name
-        path = getattr(f, "name", None) or (f.get("name") if isinstance(f, dict) else None)
-        if not path:
-            continue
-
-        ext = os.path.splitext(path)[1].lower()
-        if ext == ".txt":
-            with open(path, "r", encoding="utf-8", errors="ignore") as fh:
-                chunks.append(fh.read())
-
-        elif ext == ".docx" and HAS_DOCX:
-            try:
-                doc = Document(path)
-                chunks.append("\n".join(p.text for p in doc.paragraphs if p.text))
-            except Exception as e:
-                chunks.append(f"[Error reading {os.path.basename(path)}: {e}]")
-
-        elif ext == ".docx" and not HAS_DOCX:
-            chunks.append(f"[Install python-docx to read {os.path.basename(path)}]")
 
-        elif ext == ".doc":
-            chunks.append(f"[Unsupported legacy .doc: {os.path.basename(path)}]")
 
-        else:
-            chunks.append(f"[Skipped unsupported file: {os.path.basename(path)}]")
-
-    return "\n\n".join(chunks)
-
-
-def normalize_tokens(tokens: List[str], clean: bool) -> List[str]:
-    """Lowercase + stopword filter when clean=True; keep alphabetic tokens."""
-    if not clean:
-        return tokens
-    stops = set(stopwords.words("english"))
-    out = []
-    for t in tokens:
-        t = t.lower()
-        if t.isalpha() and t not in stops:
-            out.append(t)
-    return out
-
-
-def tokenize_text_to_sentences(text: str, clean: bool) -> List[List[str]]:
-    """Sentence tokenize, then word tokenize each sentence; optional cleaning."""
-    sents = sent_tokenize(text)
-    tokenized = [word_tokenize(s) for s in sents]
-    if clean:
-        tokenized = [normalize_tokens(toks, clean=True) for toks in tokenized]
-    return tokenized
-
-
-def build_bow(tokenized_sentences: List[List[str]]) -> pd.DataFrame:
-    """Bag of Words as DataFrame (word, count), sorted by count desc."""
-    from collections import Counter
-    if not tokenized_sentences:
-        return pd.DataFrame(columns=["word", "count"])
-    all_words = [w for sent in tokenized_sentences for w in sent]
-    bow = Counter(all_words)
-    df = pd.DataFrame(sorted(bow.items(), key=lambda x: (-x[1], x[0])),
-                      columns=["word", "count"])
-    return df
-
-
-def build_vector_for_sentence(
-    tokenized_sentences: List[List[str]], vocab: List[str], idx: int
-) -> pd.DataFrame:
-    if not tokenized_sentences or not vocab:
-        return pd.DataFrame(columns=["word", "count"])
-    idx = max(0, min(idx, len(tokenized_sentences) - 1))
-    tokens = tokenized_sentences[idx]
-    counts = [tokens.count(w) for w in vocab]
-    return pd.DataFrame({"word": vocab, "count": counts})
-
-
-ACTIONS = [
-    "Install NLTK",
-    "Tokenize sentences into words",
-    "Count word occurrences (Bag of Words)",
-    "Build a word frequency vector for any selected sentence",
-]
 
 
-def process(
-    action: str,
-    text: str,
-    files,  # avoid strict typing to prevent runtime issues
-    clean: bool,
-    sentence_index_ui: float,  # comes in as float from Number component
-):
     """
-    Returns: status_msg, tokens_df, bow_df, vector_df
     """
-    ensure_nltk()
-
-    # Combine text areas + files
-    incoming = []
-    if text and text.strip():
-        incoming.append(text.strip())
-    file_text = read_uploaded_files(files)
-    if file_text.strip():
-        incoming.append(file_text.strip())
-
-    full_text = "\n\n".join(incoming).strip() or SAMPLE_TEXT
-
-    # Always tokenize once; later steps reuse results
-    tokenized = tokenize_text_to_sentences(full_text, clean=clean)
-
-    # Prepare tables (avoid None to keep Gradio happy)
-    tokens_df = pd.DataFrame(
-        {
-            "sentence #": list(range(1, len(tokenized) + 1)),
-            "tokens": [" ".join(toks) if toks else "" for toks in tokenized],
-        }
-    )
-    bow_df = pd.DataFrame(columns=["word", "count"])
-    vector_df = pd.DataFrame(columns=["word", "count"])
-
-    # Route per action
-    if action == "Install NLTK":
-        status = "NLTK is ready (punkt + stopwords ensured)."
-
-    elif action == "Tokenize sentences into words":
-        status = f"Tokenized {len(tokenized)} sentences."
-
-    elif action == "Count word occurrences (Bag of Words)":
-        bow_df = build_bow(tokenized)
-        status = f"Bag of Words built with {len(bow_df)} unique terms."
 
-    elif action == "Build a word frequency vector for any selected sentence":
-        bow_df = build_bow(tokenized)
-        vocab = bow_df["word"].tolist()
-        # Gradio Number is float; UI is 1-based
-        idx = int(max(1, sentence_index_ui)) - 1
-        vector_df = build_vector_for_sentence(tokenized, vocab, idx)
-        status = f"Vector built for sentence #{idx+1} over {len(vocab)}-term vocabulary."
 
     else:
-        status = "Unknown action."
-
-    return status, tokens_df, bow_df, vector_df
 
 
-with gr.Blocks(title="NLTK BoW & Vectors") as demo:
     gr.Markdown(
-        "# 🧰 NLP Mini-Workbench (NLTK)\n"
-        "Type/paste text or drop **.txt**/**.docx** files. Choose an action from the menu.\n"
-        "Toggle cleaning to lowercase + remove English stopwords."
-    )
-
-    text_in = gr.Textbox(label="Input Text", lines=10, value=SAMPLE_TEXT)
-    files_in = gr.File(
-        label="Upload .txt / .docx (optional)",
-        file_count="multiple",
-        file_types=[".txt", ".docx"] if HAS_DOCX else [".txt"],
     )
 
     with gr.Row():
-        action = gr.Dropdown(choices=ACTIONS, value=ACTIONS[1], label="Menu")
-        clean = gr.Checkbox(value=True, label="Apply stopword removal + lowercasing (recommended)")
-        sentence_index = gr.Number(value=1, precision=0, label="Sentence # for vector (1-based)")
-
-    run_btn = gr.Button("Run")
-
-    status_out = gr.Textbox(label="Status", interactive=False)
-    tokens_out = gr.Dataframe(headers=["sentence #", "tokens"], label="Tokens per Sentence")
-    bow_out = gr.Dataframe(label="Bag of Words (word, count)")
-    vector_out = gr.Dataframe(label="Word Frequency Vector for Selected Sentence")
-
-    run_btn.click(
-        process,
-        inputs=[action, text_in, files_in, clean, sentence_index],
-        outputs=[status_out, tokens_out, bow_out, vector_out],
-    )
 
 if __name__ == "__main__":
     demo.launch()
+import io
 import os
+from typing import List, Tuple, Union
 
 import gradio as gr
 import nltk
 
+# -----------------------------------------------------------------------------
+# Force NLTK data into a local folder to avoid permissions/network issues
+# -----------------------------------------------------------------------------
+NLTK_DATA_DIR = os.path.join(os.path.dirname(__file__), "nltk_data")
+os.makedirs(NLTK_DATA_DIR, exist_ok=True)
+os.environ["NLTK_DATA"] = NLTK_DATA_DIR
+if NLTK_DATA_DIR not in nltk.data.path:
+    nltk.data.path.insert(0, NLTK_DATA_DIR)
+
+# Cover old/new resource names across recent NLTK releases
+NLTK_PACKAGES = [
+    # Tokenizers
+    "punkt", "punkt_tab",
+    # Stopwords / Lemmas
+    "stopwords", "wordnet", "omw-1.4",
+    # POS taggers (old and new english-specific)
+    "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",
+    # NE chunkers (old and new)
+    "maxent_ne_chunker", "maxent_ne_chunker_tab",
+    # Word lists used by NE chunker
+    "words",
+]
 
+def ensure_nltk_resources() -> str:
+    msgs = []
+    for pkg in NLTK_PACKAGES:
+        try:
+            # idempotent; will skip if already present
+            ok = nltk.download(pkg, download_dir=NLTK_DATA_DIR, quiet=True)
+            msgs.append(f"OK: {pkg}" if ok else f"Skipped: {pkg}")
+        except Exception as e:
+            msgs.append(f"Failed {pkg}: {e}")
+    return " | ".join(msgs) if msgs else "Resources checked."
+
+# Import after setting up data path
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer, WordNetLemmatizer
+from nltk import pos_tag
+from nltk.chunk import ne_chunk
 
 
+# -----------------------------------------------------------------------------
+# File reading helpers
+# -----------------------------------------------------------------------------
+def _read_bytes(path: str) -> bytes:
+    with open(path, "rb") as f:
+        return f.read()
 
+def _extract_from_docx_bytes(b: bytes) -> str:
+    try:
+        import docx  # python-docx
+    except ImportError:
+        return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
+    f = io.BytesIO(b)
+    doc = docx.Document(f)
+    return "\n".join(p.text for p in doc.paragraphs)
+
+def _extract_from_doc_bytes(b: bytes) -> str:
     """
+    Best-effort .doc (binary) support:
+    - If 'textract' is installed, use it.
+    - Otherwise, return a clear message telling the user to convert to .docx.
     """
+    try:
+        import textract  # optional
+    except Exception:
+        return ("ERROR: .doc files require optional dependency 'textract' "
+                "and system tools. Either `pip install textract` or convert "
+                "the file to .docx and try again.")
+    try:
+        text = textract.process(io.BytesIO(b))  # may still fail if system tools missing
+        return text.decode("utf-8", errors="replace")
+    except Exception as e:
+        return (f"ERROR: Could not extract text from .doc with textract: {e}. "
+                "Please convert the file to .docx and try again.")
 
+def read_file(upload: Union[str, dict, "gr.File", None]) -> str:
+    """
+    Reads text from Gradio's File input. Supports .txt, .docx, and (optionally) .doc.
+    Works if `upload` is a path (str), a dict, or a file-like with .name/.read().
+    """
+    if upload is None:
+        return ""
 
+    # Normalize to name/path/bytes
+    name, path, content = None, None, None
+
+    if isinstance(upload, str):
+        path = upload
+        name = os.path.basename(path)
+        content = _read_bytes(path)
+    elif isinstance(upload, dict):
+        # gradio sometimes passes {'name': '/tmp/..', 'orig_name': 'foo.txt', ...}
+        path = upload.get("name") or upload.get("path")
+        name = upload.get("orig_name") or (os.path.basename(path) if path else "")
+        if path and os.path.exists(path):
+            content = _read_bytes(path)
     else:
+        # file-like
+        name = getattr(upload, "name", "") or ""
+        path = getattr(upload, "name", None)
+        try:
+            if path and os.path.exists(path):
+                content = _read_bytes(path)
+            else:
+                content = upload.read()
+        except Exception:
+            if path and os.path.exists(path):
+                content = _read_bytes(path)
+
+    if not name:
+        name = "(uploaded)"
+    if content is None:
+        return "ERROR: Could not read uploaded file."
+
+    ext = os.path.splitext(name)[1].lower()
+
+    if ext == ".txt":
+        # try common encodings
+        for enc in ("utf-8", "utf-16", "latin-1"):
+            try:
+                return content.decode(enc)
+            except UnicodeDecodeError:
+                continue
+        return "ERROR: Could not decode text file. Try UTF-8/plain text."
+
+    if ext == ".docx":
+        return _extract_from_docx_bytes(content)
+
+    if ext == ".doc":
+        return _extract_from_doc_bytes(content)
+
+    return f"Unsupported file type: {ext}. Please upload .txt, .docx, or .doc."
+
+
+# -----------------------------------------------------------------------------
+# NLP helpers
+# -----------------------------------------------------------------------------
+def extract_ner(ne_tree) -> List[Tuple[str, str]]:
+    entities = []
+    for subtree in ne_tree:
+        if hasattr(subtree, "label"):
+            label = subtree.label()
+            text = " ".join(token for token, _ in subtree.leaves())
+            entities.append((text, label))
+    return entities
+
+def process_text(raw_text: str, steps: List[str]) -> str:
+    if not raw_text or raw_text.strip() == "":
+        return "⚠️ No text provided."
+
+    # Ensure data locally (quiet)
+    ensure_nltk_resources()
+
+    report_lines = []
+    text = raw_text
+
+    # 1) Tokenize (required by later steps)
+    tokens = None
+    if "Tokenize text." in steps or any(
+        s in steps for s in [
+            "Remove stopwords.", "Stem words.", "Lemmatize words.",
+            "Tag parts of speech.", "Extract named entities."
+        ]
+    ):
+        tokens = word_tokenize(text)
+        if "Tokenize text." in steps:
+            report_lines.append("### Tokens")
+            report_lines.append(f"`{tokens}`\n")
+
+    # 2) Stopwords
+    filtered_tokens = tokens
+    if "Remove stopwords." in steps:
+        sw = set(stopwords.words("english"))
+        filtered_tokens = [w for w in (tokens or []) if w.lower() not in sw]
+        report_lines.append("### After Stopword Removal")
+        report_lines.append(f"`{filtered_tokens}`\n")
+
+    # 3) Stemming
+    stemmed_tokens = filtered_tokens
+    if "Stem words." in steps:
+        stemmer = PorterStemmer()
+        stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
+        report_lines.append("### Stemmed Tokens (Porter)")
+        report_lines.append(f"`{stemmed_tokens}`\n")
+
+    # 4) Lemmatization
+    lemmatized_tokens = stemmed_tokens if stemmed_tokens is not None else filtered_tokens
+    if "Lemmatize words." in steps:
+        lemmatizer = WordNetLemmatizer()
+        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
+        report_lines.append("### Lemmatized Tokens (WordNet)")
+        report_lines.append(f"`{lemmatized_tokens}`\n")
+
+    # 5) POS Tagging
+    pos_tags_val = None
+    if "Tag parts of speech." in steps or "Extract named entities." in steps:
+        base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
+        pos_tags_val = pos_tag(base_for_tagging)
+        if "Tag parts of speech." in steps:
+            report_lines.append("### Part-of-Speech Tags")
+            rows = ["| Token | POS |", "|---|---|"]
+            rows += [f"| {t} | {p} |" for (t, p) in pos_tags_val]
+            report_lines.append("\n".join(rows) + "\n")
+
+    # 6) NER
+    if "Extract named entities." in steps:
+        if not pos_tags_val:
+            base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
+            pos_tags_val = pos_tag(base_for_tagging)
+        ne_tree = ne_chunk(pos_tags_val, binary=False)
+        ner_pairs = extract_ner(ne_tree)
+
+        report_lines.append("### Named Entities")
+        if ner_pairs:
+            rows = ["| Entity | Label |", "|---|---|"]
+            rows += [f"| {ent} | {lbl} |" for (ent, lbl) in ner_pairs]
+            report_lines.append("\n".join(rows) + "\n")
+        else:
+            report_lines.append("_No named entities found._\n")
+
+    return "\n".join(report_lines).strip() or "No steps selected."
+
+
+# -----------------------------------------------------------------------------
+# Gradio UI
+# -----------------------------------------------------------------------------
+MENU = [
+    "Install and download required resources.",
+    "Tokenize text.",
+    "Remove stopwords.",
+    "Stem words.",
+    "Lemmatize words.",
+    "Tag parts of speech.",
+    "Extract named entities.",
+]
 
+DEFAULT_TEXT = (
+    "NLTK is a powerful library for text processing. "
+    "Barack Obama served as the 44th President of the United States and lived in Washington, D.C."
+)
 
+with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
+    gr.Markdown("# NLTK Text Processing Toolkit")
     gr.Markdown(
+        "Type or paste text, or drop a `.txt`/`.docx`/`.doc` file. "
+        "Select steps and click **Process**. Use **Install/Download Resources** first if needed."
     )
 
     with gr.Row():
+        with gr.Column():
+            text_in = gr.Textbox(
+                label="Text Input",
+                lines=10,
+                value=DEFAULT_TEXT,
+                placeholder="Type or paste text here..."
+            )
+            file_in = gr.File(
+                label="...or drop a .txt / .docx / .doc file",
+                file_types=[".txt", ".docx", ".doc"]
+            )
+            steps_in = gr.CheckboxGroup(
+                choices=MENU,
+                value=[
+                    "Tokenize text.",
+                    "Remove stopwords.",
+                    "Lemmatize words.",
+                    "Tag parts of speech.",
+                    "Extract named entities.",
+                ],
+                label="Menu (choose one or more)"
+            )
+            with gr.Row():
+                install_btn = gr.Button("Install/Download Resources")
+                process_btn = gr.Button("Process", variant="primary")
+                clear_btn = gr.Button("Clear")
+
+        with gr.Column():
+            status_out = gr.Textbox(label="Status / Logs", interactive=False)
+            result_out = gr.Markdown(label="Results")
+
+    # Button callbacks
+    def on_install():
+        try:
+            return ensure_nltk_resources()
+        except Exception as e:
+            return f"Install error: {e}"
+
+    def on_process(text, file, steps):
+        try:
+            text = (text or "").strip()
+            file_text = read_file(file) if file is not None else ""
+            if not text and file_text:
+                text = file_text
+
+            if file_text.startswith("ERROR:") or file_text.startswith("Unsupported file type:"):
+                return file_text
+
+            return process_text(text, steps or [])
+        except Exception:
+            import traceback
+            return "### Error\n```\n" + "".join(traceback.format_exc()) + "\n```"
+
+    def on_clear():
+        return "", ""
+
+    install_btn.click(fn=on_install, inputs=None, outputs=status_out)
+    process_btn.click(fn=on_process, inputs=[text_in, file_in, steps_in], outputs=result_out)
+    clear_btn.click(fn=on_clear, inputs=None, outputs=[status_out, result_out])
 
 if __name__ == "__main__":
+    # If you need external access, set server_name="0.0.0.0"
     demo.launch()
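
A quick way to sanity-check the new pipeline outside the Gradio UI is to call process_text directly. The snippet below is a minimal sketch, not part of the commit; it assumes app.py is importable from the working directory, that gradio and nltk are installed, and that NLTK can download its data into ./nltk_data on first run.

    # Hypothetical smoke test (not part of this commit).
    from app import ensure_nltk_resources, process_text

    print(ensure_nltk_resources())  # fetches punkt, stopwords, taggers, chunkers into ./nltk_data
    print(process_text(
        "Barack Obama served as the 44th President of the United States.",
        ["Tokenize text.", "Remove stopwords.", "Tag parts of speech.", "Extract named entities."],
    ))  # prints the Markdown report: tokens, POS table, named entities

The step strings passed to process_text must match the MENU entries exactly (including the trailing periods), since the function routes on string membership in the steps list.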