eaglelandsonce committed
Commit 2d115e8 · verified · 1 Parent(s): c0c4087

Update app.py

Files changed (1):
  1. app.py +298 -203

app.py CHANGED
@@ -1,226 +1,321 @@
 import os
-from typing import List, Tuple, Optional
 
 import gradio as gr
-import pandas as pd
-
-# --- NLTK setup ----
 import nltk
-from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.corpus import stopwords
-
-def ensure_nltk() -> None:
-    """Download required NLTK data if missing (safe to call repeatedly)."""
-    try:
-        nltk.data.find("tokenizers/punkt")
-    except LookupError:
-        nltk.download("punkt", quiet=True)
-    try:
-        nltk.data.find("corpora/stopwords")
-    except LookupError:
-        nltk.download("stopwords", quiet=True)
-
-ensure_nltk()
 
-# Optional .docx support
-HAS_DOCX = False
-try:
-    from docx import Document  # python-docx
-    HAS_DOCX = True
-except Exception:
-    HAS_DOCX = False
-
-SAMPLE_TEXT = (
-    "NLTK is a powerful library for text processing. "
-    "Text processing is essential for NLP tasks. "
-    'Bag of Words is a fundamental concept in NLP. '
-    "Tokenization splits sentences into words. "
-    "We can count word occurrences in text. "
-    "Word frequency vectors represent sentences numerically. "
-    "Vectorization helps in transforming text for machine learning."
-)
-
-# --------- Helpers ----------
-def read_uploaded_files(files: Optional[List]) -> str:
-    """Read text from uploaded .txt and .docx files."""
-    if not files:
-        return ""
-
-    chunks = []
-    for f in files:
-        # Gradio v4 provides a dict-like object; support both path & name
-        path = getattr(f, "name", None) or (f.get("name") if isinstance(f, dict) else None)
-        if not path:
-            continue
-
-        ext = os.path.splitext(path)[1].lower()
-        if ext == ".txt":
-            with open(path, "r", encoding="utf-8", errors="ignore") as fh:
-                chunks.append(fh.read())
-
-        elif ext == ".docx" and HAS_DOCX:
-            try:
-                doc = Document(path)
-                chunks.append("\n".join(p.text for p in doc.paragraphs if p.text))
-            except Exception as e:
-                chunks.append(f"[Error reading {os.path.basename(path)}: {e}]")
-
-        elif ext == ".docx" and not HAS_DOCX:
-            chunks.append(f"[Install python-docx to read {os.path.basename(path)}]")
 
-        elif ext == ".doc":
-            chunks.append(f"[Unsupported legacy .doc: {os.path.basename(path)}]")
 
-        else:
-            chunks.append(f"[Skipped unsupported file: {os.path.basename(path)}]")
-
-    return "\n\n".join(chunks)
-
-
-def normalize_tokens(tokens: List[str], clean: bool) -> List[str]:
-    """Lowercase + stopword filter when clean=True; keep alphabetic tokens."""
-    if not clean:
-        return tokens
-    stops = set(stopwords.words("english"))
-    out = []
-    for t in tokens:
-        t = t.lower()
-        if t.isalpha() and t not in stops:
-            out.append(t)
-    return out
-
-
-def tokenize_text_to_sentences(text: str, clean: bool) -> List[List[str]]:
-    """Sentence tokenize, then word tokenize each sentence; optional cleaning."""
-    sents = sent_tokenize(text)
-    tokenized = [word_tokenize(s) for s in sents]
-    if clean:
-        tokenized = [normalize_tokens(toks, clean=True) for toks in tokenized]
-    return tokenized
-
-
-def build_bow(tokenized_sentences: List[List[str]]) -> pd.DataFrame:
-    """Bag of Words as DataFrame (word, count), sorted by count desc."""
-    from collections import Counter
-    if not tokenized_sentences:
-        return pd.DataFrame(columns=["word", "count"])
-    all_words = [w for sent in tokenized_sentences for w in sent]
-    bow = Counter(all_words)
-    df = pd.DataFrame(sorted(bow.items(), key=lambda x: (-x[1], x[0])),
-                      columns=["word", "count"])
-    return df
-
-
-def build_vector_for_sentence(
-    tokenized_sentences: List[List[str]], vocab: List[str], idx: int
-) -> pd.DataFrame:
-    if not tokenized_sentences or not vocab:
-        return pd.DataFrame(columns=["word", "count"])
-    idx = max(0, min(idx, len(tokenized_sentences) - 1))
-    tokens = tokenized_sentences[idx]
-    counts = [tokens.count(w) for w in vocab]
-    return pd.DataFrame({"word": vocab, "count": counts})
-
-
-ACTIONS = [
-    "Install NLTK",
-    "Tokenize sentences into words",
-    "Count word occurrences (Bag of Words)",
-    "Build a word frequency vector for any selected sentence",
-]
 
 
-def process(
-    action: str,
-    text: str,
-    files,  # avoid strict typing to prevent runtime issues
-    clean: bool,
-    sentence_index_ui: float,  # comes in as float from Number component
-):
     """
-    Returns: status_msg, tokens_df, bow_df, vector_df
     """
-    ensure_nltk()
-
-    # Combine text areas + files
-    incoming = []
-    if text and text.strip():
-        incoming.append(text.strip())
-    file_text = read_uploaded_files(files)
-    if file_text.strip():
-        incoming.append(file_text.strip())
-
-    full_text = "\n\n".join(incoming).strip() or SAMPLE_TEXT
-
-    # Always tokenize once; later steps reuse results
-    tokenized = tokenize_text_to_sentences(full_text, clean=clean)
-
-    # Prepare tables (avoid None to keep Gradio happy)
-    tokens_df = pd.DataFrame(
-        {
-            "sentence #": list(range(1, len(tokenized) + 1)),
-            "tokens": [" ".join(toks) if toks else "" for toks in tokenized],
-        }
-    )
-    bow_df = pd.DataFrame(columns=["word", "count"])
-    vector_df = pd.DataFrame(columns=["word", "count"])
-
-    # Route per action
-    if action == "Install NLTK":
-        status = "NLTK is ready (punkt + stopwords ensured)."
-
-    elif action == "Tokenize sentences into words":
-        status = f"Tokenized {len(tokenized)} sentences."
-
-    elif action == "Count word occurrences (Bag of Words)":
-        bow_df = build_bow(tokenized)
-        status = f"Bag of Words built with {len(bow_df)} unique terms."
 
-    elif action == "Build a word frequency vector for any selected sentence":
-        bow_df = build_bow(tokenized)
-        vocab = bow_df["word"].tolist()
-        # Gradio Number is float; UI is 1-based
-        idx = int(max(1, sentence_index_ui)) - 1
-        vector_df = build_vector_for_sentence(tokenized, vocab, idx)
-        status = f"Vector built for sentence #{idx+1} over {len(vocab)}-term vocabulary."
 
     else:
-        status = "Unknown action."
-
-    return status, tokens_df, bow_df, vector_df
 
 
-with gr.Blocks(title="NLTK BoW & Vectors") as demo:
     gr.Markdown(
-        "# 🧰 NLP Mini-Workbench (NLTK)\n"
-        "Type/paste text or drop **.txt**/**.docx** files. Choose an action from the menu.\n"
-        "Toggle cleaning to lowercase + remove English stopwords."
-    )
-
-    text_in = gr.Textbox(label="Input Text", lines=10, value=SAMPLE_TEXT)
-    files_in = gr.File(
-        label="Upload .txt / .docx (optional)",
-        file_count="multiple",
-        file_types=[".txt", ".docx"] if HAS_DOCX else [".txt"],
     )
 
     with gr.Row():
-        action = gr.Dropdown(choices=ACTIONS, value=ACTIONS[1], label="Menu")
-        clean = gr.Checkbox(value=True, label="Apply stopword removal + lowercasing (recommended)")
-        sentence_index = gr.Number(value=1, precision=0, label="Sentence # for vector (1-based)")
-
-    run_btn = gr.Button("Run")
-
-    status_out = gr.Textbox(label="Status", interactive=False)
-    tokens_out = gr.Dataframe(headers=["sentence #", "tokens"], label="Tokens per Sentence")
-    bow_out = gr.Dataframe(label="Bag of Words (word, count)")
-    vector_out = gr.Dataframe(label="Word Frequency Vector for Selected Sentence")
-
-    run_btn.click(
-        process,
-        inputs=[action, text_in, files_in, clean, sentence_index],
-        outputs=[status_out, tokens_out, bow_out, vector_out],
-    )
 
 if __name__ == "__main__":
     demo.launch()
+import io
 import os
+from typing import List, Tuple, Union
 
 import gradio as gr
 import nltk
 
+# -----------------------------------------------------------------------------
+# Force NLTK data into a local folder to avoid permissions/network issues
+# -----------------------------------------------------------------------------
+NLTK_DATA_DIR = os.path.join(os.path.dirname(__file__), "nltk_data")
+os.makedirs(NLTK_DATA_DIR, exist_ok=True)
+os.environ["NLTK_DATA"] = NLTK_DATA_DIR
+if NLTK_DATA_DIR not in nltk.data.path:
+    nltk.data.path.insert(0, NLTK_DATA_DIR)
+
+# Cover old/new resource names across recent NLTK releases
+NLTK_PACKAGES = [
+    # Tokenizers
+    "punkt", "punkt_tab",
+    # Stopwords / Lemmas
+    "stopwords", "wordnet", "omw-1.4",
+    # POS taggers (old and new english-specific)
+    "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",
+    # NE chunkers (old and new)
+    "maxent_ne_chunker", "maxent_ne_chunker_tab",
+    # Word lists used by NE chunker
+    "words",
+]
 
+def ensure_nltk_resources() -> str:
+    msgs = []
+    for pkg in NLTK_PACKAGES:
+        try:
+            # idempotent; will skip if already present
+            ok = nltk.download(pkg, download_dir=NLTK_DATA_DIR, quiet=True)
+            msgs.append(f"OK: {pkg}" if ok else f"Skipped: {pkg}")
+        except Exception as e:
+            msgs.append(f"Failed {pkg}: {e}")
+    return " | ".join(msgs) if msgs else "Resources checked."
+
+# Import after setting up data path
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer, WordNetLemmatizer
+from nltk import pos_tag
+from nltk.chunk import ne_chunk
 
 
+# -----------------------------------------------------------------------------
+# File reading helpers
+# -----------------------------------------------------------------------------
+def _read_bytes(path: str) -> bytes:
+    with open(path, "rb") as f:
+        return f.read()
 
+def _extract_from_docx_bytes(b: bytes) -> str:
+    try:
+        import docx  # python-docx
+    except ImportError:
+        return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
+    f = io.BytesIO(b)
+    doc = docx.Document(f)
+    return "\n".join(p.text for p in doc.paragraphs)
+
+def _extract_from_doc_bytes(b: bytes) -> str:
     """
+    Best-effort .doc (binary) support:
+    - If 'textract' is installed, use it.
+    - Otherwise, return a clear message telling the user to convert to .docx.
     """
+    try:
+        import textract  # optional
+    except Exception:
+        return ("ERROR: .doc files require optional dependency 'textract' "
+                "and system tools. Either `pip install textract` or convert "
+                "the file to .docx and try again.")
+    try:
+        text = textract.process(io.BytesIO(b))  # may still fail if system tools missing
+        return text.decode("utf-8", errors="replace")
+    except Exception as e:
+        return (f"ERROR: Could not extract text from .doc with textract: {e}. "
+                "Please convert the file to .docx and try again.")
 
+def read_file(upload: Union[str, dict, "gr.File", None]) -> str:
+    """
+    Reads text from Gradio's File input. Supports .txt, .docx, and (optionally) .doc.
+    Works if `upload` is a path (str), a dict, or a file-like with .name/.read().
+    """
+    if upload is None:
+        return ""
 
+    # Normalize to name/path/bytes
+    name, path, content = None, None, None
+
+    if isinstance(upload, str):
+        path = upload
+        name = os.path.basename(path)
+        content = _read_bytes(path)
+    elif isinstance(upload, dict):
+        # gradio sometimes passes {'name': '/tmp/..', 'orig_name': 'foo.txt', ...}
+        path = upload.get("name") or upload.get("path")
+        name = upload.get("orig_name") or (os.path.basename(path) if path else "")
+        if path and os.path.exists(path):
+            content = _read_bytes(path)
     else:
+        # file-like
+        name = getattr(upload, "name", "") or ""
+        path = getattr(upload, "name", None)
+        try:
+            if path and os.path.exists(path):
+                content = _read_bytes(path)
+            else:
+                content = upload.read()
+        except Exception:
+            if path and os.path.exists(path):
+                content = _read_bytes(path)
+
+    if not name:
+        name = "(uploaded)"
+    if content is None:
+        return "ERROR: Could not read uploaded file."
+
+    ext = os.path.splitext(name)[1].lower()
+
+    if ext == ".txt":
+        # try common encodings
+        for enc in ("utf-8", "utf-16", "latin-1"):
+            try:
+                return content.decode(enc)
+            except UnicodeDecodeError:
+                continue
+        return "ERROR: Could not decode text file. Try UTF-8/plain text."
+
+    if ext == ".docx":
+        return _extract_from_docx_bytes(content)
+
+    if ext == ".doc":
+        return _extract_from_doc_bytes(content)
+
+    return f"Unsupported file type: {ext}. Please upload .txt, .docx, or .doc."
+
+
+# -----------------------------------------------------------------------------
+# NLP helpers
+# -----------------------------------------------------------------------------
+def extract_ner(ne_tree) -> List[Tuple[str, str]]:
+    entities = []
+    for subtree in ne_tree:
+        if hasattr(subtree, "label"):
+            label = subtree.label()
+            text = " ".join(token for token, _ in subtree.leaves())
+            entities.append((text, label))
+    return entities
+
+def process_text(raw_text: str, steps: List[str]) -> str:
+    if not raw_text or raw_text.strip() == "":
+        return "⚠️ No text provided."
+
+    # Ensure data locally (quiet)
+    ensure_nltk_resources()
+
+    report_lines = []
+    text = raw_text
+
+    # 1) Tokenize (required by later steps)
+    tokens = None
+    if "Tokenize text." in steps or any(
+        s in steps for s in [
+            "Remove stopwords.", "Stem words.", "Lemmatize words.",
+            "Tag parts of speech.", "Extract named entities."
+        ]
+    ):
+        tokens = word_tokenize(text)
+        if "Tokenize text." in steps:
+            report_lines.append("### Tokens")
+            report_lines.append(f"`{tokens}`\n")
+
+    # 2) Stopwords
+    filtered_tokens = tokens
+    if "Remove stopwords." in steps:
+        sw = set(stopwords.words("english"))
+        filtered_tokens = [w for w in (tokens or []) if w.lower() not in sw]
+        report_lines.append("### After Stopword Removal")
+        report_lines.append(f"`{filtered_tokens}`\n")
+
+    # 3) Stemming
+    stemmed_tokens = filtered_tokens
+    if "Stem words." in steps:
+        stemmer = PorterStemmer()
+        stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
+        report_lines.append("### Stemmed Tokens (Porter)")
+        report_lines.append(f"`{stemmed_tokens}`\n")
+
+    # 4) Lemmatization
+    lemmatized_tokens = stemmed_tokens if stemmed_tokens is not None else filtered_tokens
+    if "Lemmatize words." in steps:
+        lemmatizer = WordNetLemmatizer()
+        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
+        report_lines.append("### Lemmatized Tokens (WordNet)")
+        report_lines.append(f"`{lemmatized_tokens}`\n")
+
+    # 5) POS Tagging
+    pos_tags_val = None
+    if "Tag parts of speech." in steps or "Extract named entities." in steps:
+        base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
+        pos_tags_val = pos_tag(base_for_tagging)
+        if "Tag parts of speech." in steps:
+            report_lines.append("### Part-of-Speech Tags")
+            rows = ["| Token | POS |", "|---|---|"]
+            rows += [f"| {t} | {p} |" for (t, p) in pos_tags_val]
+            report_lines.append("\n".join(rows) + "\n")
+
+    # 6) NER
+    if "Extract named entities." in steps:
+        if not pos_tags_val:
+            base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
+            pos_tags_val = pos_tag(base_for_tagging)
+        ne_tree = ne_chunk(pos_tags_val, binary=False)
+        ner_pairs = extract_ner(ne_tree)
+
+        report_lines.append("### Named Entities")
+        if ner_pairs:
+            rows = ["| Entity | Label |", "|---|---|"]
+            rows += [f"| {ent} | {lbl} |" for (ent, lbl) in ner_pairs]
+            report_lines.append("\n".join(rows) + "\n")
+        else:
+            report_lines.append("_No named entities found._\n")
+
+    return "\n".join(report_lines).strip() or "No steps selected."
+
+
+# -----------------------------------------------------------------------------
+# Gradio UI
+# -----------------------------------------------------------------------------
+MENU = [
+    "Install and download required resources.",
+    "Tokenize text.",
+    "Remove stopwords.",
+    "Stem words.",
+    "Lemmatize words.",
+    "Tag parts of speech.",
+    "Extract named entities.",
+]
 
+DEFAULT_TEXT = (
+    "NLTK is a powerful library for text processing. "
+    "Barack Obama served as the 44th President of the United States and lived in Washington, D.C."
+)
 
+with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
+    gr.Markdown("# NLTK Text Processing Toolkit")
     gr.Markdown(
+        "Type or paste text, or drop a `.txt`/`.docx`/`.doc` file. "
+        "Select steps and click **Process**. Use **Install/Download Resources** first if needed."
     )
 
     with gr.Row():
+        with gr.Column():
+            text_in = gr.Textbox(
+                label="Text Input",
+                lines=10,
+                value=DEFAULT_TEXT,
+                placeholder="Type or paste text here..."
+            )
+            file_in = gr.File(
+                label="...or drop a .txt / .docx / .doc file",
+                file_types=[".txt", ".docx", ".doc"]
+            )
+            steps_in = gr.CheckboxGroup(
+                choices=MENU,
+                value=[
+                    "Tokenize text.",
+                    "Remove stopwords.",
+                    "Lemmatize words.",
+                    "Tag parts of speech.",
+                    "Extract named entities.",
+                ],
+                label="Menu (choose one or more)"
+            )
+            with gr.Row():
+                install_btn = gr.Button("Install/Download Resources")
+                process_btn = gr.Button("Process", variant="primary")
+                clear_btn = gr.Button("Clear")
+
+        with gr.Column():
+            status_out = gr.Textbox(label="Status / Logs", interactive=False)
+            result_out = gr.Markdown(label="Results")
+
+    # Button callbacks
+    def on_install():
+        try:
+            return ensure_nltk_resources()
+        except Exception as e:
+            return f"Install error: {e}"
+
+    def on_process(text, file, steps):
+        try:
+            text = (text or "").strip()
+            file_text = read_file(file) if file is not None else ""
+            if not text and file_text:
+                text = file_text
+
+            if file_text.startswith("ERROR:") or file_text.startswith("Unsupported file type:"):
+                return file_text
+
+            return process_text(text, steps or [])
+        except Exception:
+            import traceback
+            return "### Error\n```\n" + "".join(traceback.format_exc()) + "\n```"
+
+    def on_clear():
+        return "", ""
+
+    install_btn.click(fn=on_install, inputs=None, outputs=status_out)
+    process_btn.click(fn=on_process, inputs=[text_in, file_in, steps_in], outputs=result_out)
+    clear_btn.click(fn=on_clear, inputs=None, outputs=[status_out, result_out])
 
 if __name__ == "__main__":
+    # If you need external access, set server_name="0.0.0.0"
     demo.launch()
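
A quick way to sanity-check the new pipeline outside the Gradio UI is to call process_text directly. The snippet below is a minimal sketch, not part of the commit; it assumes app.py is importable from the working directory, that gradio and nltk are installed, and that NLTK can download its data into ./nltk_data on first run.

    # Hypothetical smoke test (not part of this commit).
    from app import ensure_nltk_resources, process_text

    print(ensure_nltk_resources())  # fetches punkt, stopwords, taggers, chunkers into ./nltk_data
    print(process_text(
        "Barack Obama served as the 44th President of the United States.",
        ["Tokenize text.", "Remove stopwords.", "Tag parts of speech.", "Extract named entities."],
    ))  # prints the Markdown report: tokens, POS table, named entities

The step strings passed to process_text must match the MENU entries exactly (including the trailing periods), since the function routes on string membership in the steps list.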