tbitai committed on
Commit
cc21853
1 Parent(s): 9bd0746

Bayesian model

Browse files
Files changed (1) hide show
  1. app.py +52 -5
app.py CHANGED
@@ -1,19 +1,66 @@
1
  import gradio as gr
 
 
 
 
2
 
3
def predict(model, input_txt):
    """Return a fixed spam probability, ignoring both arguments.

    The constant is the worldwide spam rate in 2023.
    Source: https://securelist.com/spam-phishing-report-2023/112015/
    """
    worldwide_spam_rate_2023 = 0.46
    return worldwide_spam_rate_2023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  demo = gr.Interface(
7
  fn=predict,
8
  inputs=[
9
- gr.Dropdown(choices=["spam-rate"], value="spam-rate", label="Model"),
 
 
 
10
  gr.TextArea(label="Email"),
11
  ],
12
  outputs=[gr.Number(label="Spam probability")],
13
  title="Bayes or Spam?",
14
- description="Choose your model, and predict if your email is a spam! 📨<br>COMING SOON: Bayesian, NN and LLM models.",
15
  examples=[
16
- ["spam-rate", "revision #1 - hpl noms for november 3, 2000 (see attached file: hplnl 103.xls) - hplnl 103.xls"],
 
17
  ],
18
  article="This is a demo of the models in the [Bayes or Spam?](https://github.com/tbitai/bayes-or-spam) project.",
19
  )
 
1
  import gradio as gr
2
+ from huggingface_hub import hf_hub_download
3
+ import json
4
+ import tensorflow as tf
5
+ import numpy as np
6
 
7
# Fetch the per-word spam probability table (probs.json) from the
# "tbitai/bayes-enron1-spam" repository on the Hugging Face Hub.
model_probs_path = hf_hub_download(repo_id="tbitai/bayes-enron1-spam", filename="probs.json")
# Read the JSON as UTF-8 explicitly so that loading does not depend on the
# locale's default encoding of the host running the app.
with open(model_probs_path, encoding="utf-8") as f:
    model_probs = json.load(f)

# Key in model_probs whose value is used for words the table does not contain.
UNK = '[UNK]'
12
+
13
def tokenize(text):
    """Split *text* into lowercase word tokens.

    Standard-library replacement for the deprecated
    ``tf.keras.preprocessing.text.text_to_word_sequence`` call, intended to
    reproduce that function's default behaviour: lowercase the text, turn
    every character in the filter set (punctuation, tab, newline) into a
    space, split on single spaces and drop empty pieces. The apostrophe is
    not filtered, so a word like "don't" stays one token.

    Args:
        text: The raw email text.

    Returns:
        A list of token strings; empty when *text* has no word characters.
    """
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    to_space = str.maketrans(dict.fromkeys(filters, " "))
    # Split on a literal space (not arbitrary whitespace) and discard the
    # empty strings that consecutive separators produce.
    return [token for token in text.lower().translate(to_space).split(" ") if token]
15
+
16
def combine(probs):
    """Merge per-word spam probabilities into a single spam probability.

    Computes ``prod(p) / (prod(p) + prod(1 - p))`` over *probs*.

    Args:
        probs: Sequence of individual probabilities.

    Returns:
        0 if any probability is exactly zero, 0.5 if both products
        underflow to zero, otherwise the combined probability.
    """
    values = np.asarray(probs)
    if (values == 0).any():
        return 0
    spam_product = np.prod(values)
    ham_product = np.prod(1 - values)
    denominator = spam_product + ham_product
    if denominator == 0:
        # Both products underflowed in floating point; treat them as
        # equally small.
        return 0.5
    return spam_product / denominator
24
+
25
def get_interesting_probs(probs, intr_threshold):
    """Return the probabilities that lie farthest from the neutral 0.5.

    Args:
        probs: Iterable of probabilities.
        intr_threshold: Maximum number of probabilities to keep. A float
            with an integral value (e.g. ``15.0``, as a UI slider may
            deliver) is accepted and truncated to an int.

    Returns:
        A new list of at most ``intr_threshold`` items, ordered by
        decreasing distance from 0.5; ties keep their input order.
    """
    # Slice bounds must be integers; a float here would raise TypeError.
    limit = int(intr_threshold)
    by_interest = sorted(probs, key=lambda p: abs(p - 0.5), reverse=True)
    return by_interest[:limit]
29
+
30
def unbias(p):
    """Map a probability *p* to ``2p / (p + 1)``.

    The mapping keeps 0 at 0 and 1 at 1 and raises every value in between.
    """
    doubled = p + p
    return doubled / (1 + p)
32
+
33
def predict_bayes(text, intr_threshold, unbiased=False):
    """Estimate the spam probability of *text* from the word table.

    Each token of *text* is looked up in ``model_probs``; tokens that are
    not present use the ``UNK`` entry instead. The most interesting
    probabilities are then combined into one value.

    Args:
        text: The email text to classify.
        intr_threshold: How many of the most interesting word
            probabilities take part in the combination.
        unbiased: When true, pass every word probability through
            ``unbias`` before selecting the interesting ones.

    Returns:
        The combined spam probability.
    """
    word_probs = []
    for word in tokenize(text):
        prob = model_probs.get(word, model_probs[UNK])
        word_probs.append(unbias(prob) if unbiased else prob)
    selected = get_interesting_probs(word_probs, intr_threshold)
    return combine(selected)
40
+
41
# Display name of the Bayesian model, used both as the dropdown choice and
# as the dispatch key in predict().
BAYES = "Bayes Enron1 spam"

# All model names offered in the UI.
MODELS = [BAYES]
44
+
45
def predict(model, unbiased, intr_threshold, input_txt):
    """Dispatch a prediction request to the selected model.

    Args:
        model: Name of the model to run; only ``BAYES`` is handled.
        unbiased: Forwarded to ``predict_bayes`` as ``unbiased``.
        intr_threshold: Forwarded to ``predict_bayes`` as ``intr_threshold``.
        input_txt: The email text to classify.

    Returns:
        The spam probability from the selected model, or ``None`` when
        *model* is not a handled name.
    """
    if model != BAYES:
        # No other model is handled here, so there is nothing to predict.
        return None
    return predict_bayes(input_txt, intr_threshold, unbiased=unbiased)
48
 
49
  demo = gr.Interface(
50
  fn=predict,
51
  inputs=[
52
+ gr.Dropdown(choices=MODELS, value=BAYES, label="Model"),
53
+ gr.Checkbox(label="Unbias", info="Correct Graham's bias?"),
54
+ gr.Slider(minimum=1, maximum=20, step=1, value=15, label="Interestingness threshold",
55
+ info="How many of the most interesting words to select in the probability calculation?"),
56
  gr.TextArea(label="Email"),
57
  ],
58
  outputs=[gr.Number(label="Spam probability")],
59
  title="Bayes or Spam?",
60
+ description="Choose and configure your model, and predict if your email is a spam! 📨<br>COMING SOON: NN and LLM models.",
61
  examples=[
62
+ [BAYES, "enron actuals for june 26, 2000"],
63
+ [BAYES, "stop the aging clock nerissa"],
64
  ],
65
  article="This is a demo of the models in the [Bayes or Spam?](https://github.com/tbitai/bayes-or-spam) project.",
66
  )