Spaces:

tbitai
/

bayes-or-spam

Sleeping

App Files Files Community

tbitai committed on Sep 26, 2024

Commit

cc21853

verified ·

1 Parent(s): 9bd0746

Bayesian model

Browse files

Files changed (1) hide show

app.py +52 -5

app.py CHANGED Viewed

@@ -1,19 +1,66 @@
 import gradio as gr
-def predict(model, input_txt):
-    return 0.46  # Worldwide spam rate in 2023. Source: https://securelist.com/spam-phishing-report-2023/112015/
 demo = gr.Interface(
     fn=predict,
     inputs=[
-        gr.Dropdown(choices=["spam-rate"], value="spam-rate", label="Model"),
         gr.TextArea(label="Email"),
     ],
     outputs=[gr.Number(label="Spam probability")],
     title="Bayes or Spam?",
-    description="Choose your model, and predict if your email is a spam! 📨<br>COMING SOON: Bayesian, NN and LLM models.",
     examples=[
-        ["spam-rate", "revision #1 - hpl noms for november 3, 2000 (see attached file: hplnl 103.xls) - hplnl 103.xls"],
     ],
     article="This is a demo of the models in the [Bayes or Spam?](https://github.com/tbitai/bayes-or-spam) project.",
 )

 import gradio as gr
+from huggingface_hub import hf_hub_download
+import json
+import tensorflow as tf
+import numpy as np
+# Fetch probs.json from the "tbitai/bayes-enron1-spam" Hub repository; the call returns a local file path.
+model_probs_path = hf_hub_download(repo_id="tbitai/bayes-enron1-spam", filename="probs.json")
+# Read the file as UTF-8 explicitly so decoding does not depend on the platform's default encoding.
+with open(model_probs_path, encoding="utf-8") as f:
+    # Mapping used below as: word -> probability (looked up per token in predict_bayes).
+    model_probs = json.load(f)
+# Key of the fallback entry in model_probs used for words the model does not contain.
+UNK = '[UNK]'
+def tokenize(text):
+    """Split ``text`` into a sequence of word tokens.
+
+    Delegates to Keras' ``text_to_word_sequence`` with its default settings
+    (no arguments other than the text are passed).
+    """
+    word_sequence = tf.keras.preprocessing.text.text_to_word_sequence(text)
+    return word_sequence
+def combine(probs):
+    """Combine individual probabilities into a single probability.
+
+    Computes ``prod(p) / (prod(p) + prod(1 - p))`` over ``probs``.
+
+    Special cases:
+      * any probability equal to 0 makes the result 0;
+      * if both products underflow to 0, the result is 0.5.
+    """
+    for p in probs:
+        if p == 0:
+            return 0
+    positive = np.prod(probs)
+    negative = np.prod([1 - p for p in probs])
+    denominator = positive + negative
+    if denominator == 0:
+        # Reachable through floating point underflow; treat both products as equally small.
+        return 0.5
+    return positive / denominator
+def get_interesting_probs(probs, intr_threshold):
+    """Return the probabilities that lie farthest from 0.5, most extreme first.
+
+    Args:
+        probs: Iterable of probabilities.
+        intr_threshold: Maximum number of probabilities to keep. Integral
+            floats such as ``15.0`` are accepted (non-integral values are
+            truncated); ``None`` keeps everything.
+
+    Returns:
+        A new list with at most ``intr_threshold`` items, ordered by
+        decreasing distance from 0.5.
+    """
+    # The value comes from a UI slider and may arrive as a float; a float
+    # slice bound raises TypeError, so coerce it to int first.
+    limit = None if intr_threshold is None else int(intr_threshold)
+    return sorted(probs,
+                  key=lambda p: abs(p - 0.5),
+                  reverse=True)[:limit]
+def unbias(p):
+    """Map a probability ``p`` to ``2p / (p + 1)``.
+
+    The UI describes this option as correcting "Graham's bias".
+    NOTE(review): algebraically this turns b / (2g + b) into b / (g + b),
+    i.e. it undoes a doubling of the non-spam weight; confirm against how
+    the model's probabilities were trained.
+    """
+    doubled = 2 * p
+    return doubled / (p + 1)
+def predict_bayes(text, intr_threshold, unbiased=False):
+    """Compute the spam probability of ``text`` from the loaded word probabilities.
+
+    Args:
+        text: The email text to classify.
+        intr_threshold: How many of the most extreme word probabilities
+            (farthest from 0.5) take part in the final combination.
+        unbiased: When true, every word probability is passed through
+            ``unbias`` before selection.
+
+    Returns:
+        The combined probability produced by ``combine``.
+    """
+    word_probs = []
+    for word in tokenize(text):
+        # Words missing from the model fall back to the '[UNK]' entry.
+        word_probs.append(model_probs.get(word, model_probs[UNK]))
+    if unbiased:
+        word_probs = list(map(unbias, word_probs))
+    return combine(get_interesting_probs(word_probs, intr_threshold))
+# Display name of the Bayesian model; also the value compared against in predict().
+BAYES = "Bayes Enron1 spam"
+# Every model offered in the "Model" dropdown.
+MODELS = [BAYES]
+def predict(model, unbiased, intr_threshold, input_txt):
+    """Dispatch a prediction request from the UI to the selected model.
+
+    Args:
+        model: Name of the model to use; must be one of ``MODELS``.
+        unbiased: Whether to apply the bias correction to word probabilities.
+        intr_threshold: Number of most interesting words to use.
+        input_txt: The email text.
+
+    Returns:
+        The spam probability computed by the selected model.
+
+    Raises:
+        ValueError: If ``model`` is not a known model name.
+    """
+    if model == BAYES:
+        return predict_bayes(input_txt, unbiased=unbiased, intr_threshold=intr_threshold)
+    # Fail loudly rather than silently returning None for an unrecognised model.
+    raise ValueError(f"Unknown model: {model!r}")
 demo = gr.Interface(
     fn=predict,
     inputs=[
+        # Order must match the parameters of predict(): model, unbiased, intr_threshold, input_txt.
+        gr.Dropdown(choices=MODELS, value=BAYES, label="Model"),
+        gr.Checkbox(label="Unbias", info="Correct Graham's bias?"),
+        gr.Slider(minimum=1, maximum=20, step=1, value=15, label="Interestingness threshold",
+                  info="How many of the most interesting words to select in the probability calculation?"),
         gr.TextArea(label="Email"),
     ],
     outputs=[gr.Number(label="Spam probability")],
     title="Bayes or Spam?",
+    description="Choose and configure your model, and predict if your email is a spam! 📨<br>COMING SOON: NN and LLM models.",
     examples=[
+        # One value per input component, in the same order as `inputs`
+        # (model, unbias, threshold, email). With only two values the email
+        # text would be assigned to the "Unbias" checkbox.
+        [BAYES, False, 15, "enron actuals for june 26, 2000"],
+        [BAYES, False, 15, "stop the aging clock nerissa"],
     ],
     article="This is a demo of the models in the [Bayes or Spam?](https://github.com/tbitai/bayes-or-spam) project.",
 )