tomerz14 committed on
Commit
cfaed4d
·
verified ·
1 Parent(s): 4d68a43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -17
app.py CHANGED
@@ -5,11 +5,12 @@ Gradio App — AI vs Human Document Classifier (Chunked Inference)
5
  ----------------------------------------------------------------
6
  Features:
7
  - Upload a document (TXT/MD/HTML/PDF), chunk if needed, classify each chunk, aggregate to document.
8
- - Shows:
9
  1) Probability bars with raw numbers (AI generated / Human written)
10
  2) Confidence badge ("Likely AI" / "Likely Human") with traffic-light color
11
  3) Tabs for Basic / Advanced controls
12
  4) Chunk details accordion with per-chunk probabilities
 
13
  """
14
 
15
  import os
@@ -25,7 +26,7 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
25
  # -----------------------------
26
  # Config
27
  # -----------------------------
28
- MODEL_ID = os.getenv("MODEL_ID", "bert-base-uncased") # e.g., "username/bert-binclass"
29
  MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
30
  STRIDE = int(os.getenv("STRIDE", "128"))
31
 
@@ -92,7 +93,8 @@ def read_text_from_file(file_obj) -> str:
92
  def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: str = "mean") -> Dict[str, Any]:
93
  """
94
  Chunk the document using tokenizer overflow, run classifier on each chunk,
95
- aggregate probabilities, and return both doc-level and chunk-level results.
 
96
  """
97
  if not text or not text.strip():
98
  return {"error": "Empty document."}
@@ -105,6 +107,7 @@ def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: st
105
  return_overflowing_tokens=True,
106
  stride=stride,
107
  padding=True,
 
108
  return_tensors="pt",
109
  )
110
 
@@ -131,16 +134,39 @@ def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: st
131
  prob_human = float(doc_probs[0])
132
  prob_ai = float(doc_probs[1])
133
 
134
- # Per-chunk table rows
135
- chunk_rows = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  for i, p in enumerate(probs):
137
- chunk_rows.append([i + 1, float(p[1]), float(p[0])]) # [chunk, AI, Human]
 
 
138
 
139
  return {
140
  "ai_prob": prob_ai,
141
  "human_prob": prob_human,
142
  "num_chunks": num_chunks,
143
- "chunk_rows": chunk_rows, # list of [chunk, AI, Human]
144
  "max_length": max_length,
145
  "stride": stride,
146
  }
@@ -194,7 +220,7 @@ def format_outputs(result: Dict[str, Any], threshold: float = 0.5):
194
  probs_html += probability_bar_html("AI generated", ai)
195
  probs_html += probability_bar_html("Human written", human)
196
 
197
- # Chunk table rows
198
  table_data = result["chunk_rows"]
199
 
200
  details_md = (
@@ -217,9 +243,11 @@ CSS = """
217
  .prob-bar {flex:1; background:#e5e7eb; height:12px; border-radius:6px; overflow:hidden;}
218
  .prob-fill {height:12px; background:#6366f1;}
219
  .small-note {font-size:0.9rem; color:#6b7280;}
220
- #chunkbox {max-height:260px; overflow:auto;}
 
 
 
221
  #details_note { font-size: 0.9rem; color: #6b7280; }
222
- .gr-group { max-height: 260px; overflow: auto; }
223
  """
224
 
225
  DESCRIPTION = """
@@ -244,17 +272,17 @@ with gr.Blocks(
244
  probs_html = gr.HTML(label="Probabilities")
245
 
246
  with gr.Accordion("Chunk details", open=False):
247
- with gr.Group():
248
  chunk_table = gr.Dataframe(
249
- headers=["Chunk", "AI generated", "Human written"],
250
- datatype=["number", "number", "number"],
251
  label="Per-chunk probabilities",
252
  wrap=True,
253
  interactive=False,
254
- row_count=(0, "dynamic"), # let rows grow
255
- col_count=(3, "fixed"), # 3 fixed columns
256
- )
257
- details_md = gr.Markdown("", elem_id="details_note") # use elem_id for broad compatibility
258
 
259
  with gr.Tab("Advanced"):
260
  gr.Markdown("Adjust chunking parameters below.")
 
5
  ----------------------------------------------------------------
6
  Features:
7
  - Upload a document (TXT/MD/HTML/PDF), chunk if needed, classify each chunk, aggregate to document.
8
+ - UI includes:
9
  1) Probability bars with raw numbers (AI generated / Human written)
10
  2) Confidence badge ("Likely AI" / "Likely Human") with traffic-light color
11
  3) Tabs for Basic / Advanced controls
12
  4) Chunk details accordion with per-chunk probabilities
13
+ 5) NEW: Per-chunk **snippet** extracted using tokenizer offset_mapping
14
  """
15
 
16
  import os
 
26
  # -----------------------------
27
  # Config
28
  # -----------------------------
29
+ MODEL_ID = os.getenv("MODEL_ID", "bert-base-uncased")
30
  MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
31
  STRIDE = int(os.getenv("STRIDE", "128"))
32
 
 
93
  def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: str = "mean") -> Dict[str, Any]:
94
  """
95
  Chunk the document using tokenizer overflow, run classifier on each chunk,
96
+ aggregate probabilities, and return both doc-level and chunk-level results,
97
+ including a short snippet per chunk derived from offset_mapping.
98
  """
99
  if not text or not text.strip():
100
  return {"error": "Empty document."}
 
107
  return_overflowing_tokens=True,
108
  stride=stride,
109
  padding=True,
110
+ return_offsets_mapping=True, # NEW: get character offsets per token
111
  return_tensors="pt",
112
  )
113
 
 
134
  prob_human = float(doc_probs[0])
135
  prob_ai = float(doc_probs[1])
136
 
137
+ # --- Build snippets per chunk from offset mapping ---
138
+ offsets = enc["offset_mapping"] # tensor of pairs
139
+ attn = enc["attention_mask"] # [num_chunks, seq_len]
140
+ snippets: List[str] = []
141
+ PREVIEW = 120
142
+
143
+ for i in range(offsets.shape[0]):
144
+ offs = offsets[i].tolist()
145
+ mask = attn[i].tolist()
146
+ spans = [(s, e) for (s, e), m in zip(offs, mask) if m == 1 and not (s == 0 and e == 0)]
147
+ if spans:
148
+ s0 = min(s for s, _ in spans)
149
+ e0 = max(e for _, e in spans)
150
+ raw = text[s0:e0].strip()
151
+ raw = " ".join(raw.split())
152
+ if len(raw) > PREVIEW:
153
+ raw = raw[:PREVIEW].rstrip() + "…"
154
+ snippets.append(raw)
155
+ else:
156
+ snippets.append("")
157
+
158
+ # Per-chunk rows: [chunk#, AI prob, Human prob, Snippet]
159
+ chunk_rows: List[List[Any]] = []
160
  for i, p in enumerate(probs):
161
+ ai_p = float(p[1])
162
+ hu_p = float(p[0])
163
+ chunk_rows.append([i + 1, ai_p, hu_p, snippets[i]])
164
 
165
  return {
166
  "ai_prob": prob_ai,
167
  "human_prob": prob_human,
168
  "num_chunks": num_chunks,
169
+ "chunk_rows": chunk_rows, # list of [chunk, AI, Human, Snippet]
170
  "max_length": max_length,
171
  "stride": stride,
172
  }
 
220
  probs_html += probability_bar_html("AI generated", ai)
221
  probs_html += probability_bar_html("Human written", human)
222
 
223
+ # Chunk table rows (already built server-side)
224
  table_data = result["chunk_rows"]
225
 
226
  details_md = (
 
243
  .prob-bar {flex:1; background:#e5e7eb; height:12px; border-radius:6px; overflow:hidden;}
244
  .prob-fill {height:12px; background:#6366f1;}
245
  .small-note {font-size:0.9rem; color:#6b7280;}
246
+ /* Wrap long snippet text within the DataFrame cells */
247
+ .gr-dataframe table td { white-space: normal; }
248
+ /* Scrollable chunk table container */
249
+ #chunkgroup { max-height: 260px; overflow: auto; }
250
  #details_note { font-size: 0.9rem; color: #6b7280; }
 
251
  """
252
 
253
  DESCRIPTION = """
 
272
  probs_html = gr.HTML(label="Probabilities")
273
 
274
  with gr.Accordion("Chunk details", open=False):
275
+ with gr.Group(elem_id="chunkgroup"):
276
  chunk_table = gr.Dataframe(
277
+ headers=["Chunk", "AI generated", "Human written", "Snippet"],
278
+ datatype=["number", "number", "number", "str"],
279
  label="Per-chunk probabilities",
280
  wrap=True,
281
  interactive=False,
282
+ row_count=(0, "dynamic"),
283
+ col_count=(4, "fixed"),
284
+ )
285
+ details_md = gr.Markdown("", elem_id="details_note")
286
 
287
  with gr.Tab("Advanced"):
288
  gr.Markdown("Adjust chunking parameters below.")