File size: 3,714 Bytes
8f460b5
1840ab8
308314b
 
ed1b0c1
00b4891
4f6ca42
 
ed1b0c1
 
 
 
 
d481617
00b4891
 
308314b
00b4891
308314b
00b4891
 
 
7c07748
00b4891
 
4f6ca42
00b4891
 
 
 
 
 
 
 
 
 
 
 
 
 
308314b
00b4891
308314b
 
00b4891
308314b
 
 
 
 
 
00b4891
 
 
308314b
 
00b4891
308314b
ed1b0c1
 
 
 
 
 
 
 
 
 
 
 
00b4891
308314b
 
 
 
00b4891
483b677
ed1b0c1
 
 
 
 
 
 
 
 
 
 
 
 
308314b
ed1b0c1
 
 
 
 
f4ba322
00b4891
 
 
ef88b24
 
00b4891
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import re
import string

import google.generativeai as genai
import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# --- Page setup ---
# Kept as the first Streamlit call in the script so the page title/layout
# apply before anything else is rendered.
st.set_page_config(page_title="SMS Spam Detection", layout="centered")

# --- Set Gemini API Key ---
# The key is read from the environment instead of being committed in source.
# SECURITY: a key was previously hard-coded on this line; treat it as leaked
# and revoke it.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-pro")

# Title & Intro
st.title("📩 SMS Spam Detection App")
st.markdown("🔍 Enter an SMS message below to check if it's **Spam** or **Not Spam (Ham)**")

# Tell the user up front when the Gemini fallback cannot work.
if not GEMINI_API_KEY:
    st.warning("⚠️ GEMINI_API_KEY is not set; the Gemini fallback is unavailable.")

# --- Load CSV Dataset ---
@st.cache_data
def load_data():
    """Download the SMS spam CSV and return it as a two-column DataFrame.

    The file is read with latin-1 encoding. Only the ``v1`` and ``v2``
    columns are kept, and they are exposed as ``label`` and ``message``.
    The result is cached through ``st.cache_data``.
    """
    source = "https://huggingface.co/spaces/MLDeveloper/Spam_SMS_Detection/resolve/main/spam.csv"
    raw = pd.read_csv(source, encoding='latin-1')
    # Select the two useful columns, then give them descriptive names.
    return raw[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

df = load_data()

# --- Preprocessing ---
# Encode the string labels as integers: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})


# --- Train Model ---
@st.cache_resource
def _train_model(messages, labels):
    """Fit the TF-IDF vectorizer and the Naive Bayes classifier once.

    Streamlit re-executes this script on every user interaction; caching the
    fitted objects avoids refitting the model on each rerun.

    Args:
        messages: Raw message texts.
        labels: Integer labels aligned with ``messages`` (0 = ham, 1 = spam).

    Returns:
        A ``(vectorizer, train_matrix, classifier)`` tuple: the fitted
        ``TfidfVectorizer``, the TF-IDF matrix of the training split, and the
        fitted ``MultinomialNB``.
    """
    # Same 80/20 split and seed as before; the held-out part is not used.
    train_messages, _, train_labels, _ = train_test_split(
        messages, labels, test_size=0.2, random_state=42
    )
    fitted_vectorizer = TfidfVectorizer()
    train_matrix = fitted_vectorizer.fit_transform(train_messages)

    classifier = MultinomialNB()
    classifier.fit(train_matrix, train_labels)
    return fitted_vectorizer, train_matrix, classifier


# NOTE(review): the vectorizer is fitted on raw messages, while predict_spam
# and the input handler pass text through clean_text before transforming it.
# Confirm whether this train/inference asymmetry is intended.
vectorizer, X_train_tfidf, model = _train_model(df['message'], df['label'])

# --- Clean Text Function ---
# Patterns are compiled once instead of being looked up on every call.
_URL_RE = re.compile(r"http\S+|www\S+")
# "@name" mentions are dropped entirely; for hashtags only the "#" is removed.
_MENTION_OR_HASH_RE = re.compile(r"@\w+|#")
# Anything that is not a word character or whitespace, plus digits and "_".
_NON_LETTER_RE = re.compile(r"[^\w\s]|[\d_]")


def clean_text(text):
    """Normalize a message for vectorization.

    Steps, in order: lowercase, remove URLs (``http...`` / ``www...``),
    remove ``@mentions`` and ``#`` signs, remove punctuation, digits and
    underscores, then trim surrounding whitespace. Inner whitespace is left
    as is, so removed tokens may leave consecutive spaces behind.

    Args:
        text: The message to clean. ``None`` is treated as an empty message;
            other non-string values are converted with ``str()``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    text = "" if text is None else str(text)
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_OR_HASH_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    return text.strip()

# --- Predict Function ---
def predict_spam(text):
    """Classify ``text`` with the local Naive Bayes model.

    The text is cleaned with ``clean_text``, transformed by the fitted
    vectorizer, and passed to ``model.predict``.

    Returns:
        "Spam" when the predicted label is 1, otherwise "Not Spam (Ham)".
    """
    features = vectorizer.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam (Ham)"

# --- Gemini Fallback ---
def ask_gemini(text):
    """Ask the Gemini model to label ``text`` as spam or ham.

    Returns:
        The model's reply with surrounding whitespace stripped. If anything
        goes wrong while calling the model or reading its reply, a string
        starting with "Error using Gemini: " is returned instead of raising.
    """
    # Assembled line by line; the resulting prompt text is unchanged.
    prompt = (
        "You are an expert SMS spam detector.\n"
        "Classify the following message as 'Spam' or 'Not Spam (Ham)'.\n"
        f'Message: "{text}"\n'
        "Reply with only: Spam or Not Spam (Ham)."
    )
    try:
        reply = gemini_model.generate_content(prompt)
        answer = reply.text.strip()
    except Exception as err:
        return f"Error using Gemini: {err!s}"
    return answer

# --- Input ---
# Below this best cosine similarity to any training message, the input is
# treated as unfamiliar and Gemini is consulted first.
SIMILARITY_THRESHOLD = 0.3


def _interpret_gemini_reply(reply):
    """Map the text returned by ask_gemini to a label.

    Returns "Spam", "Not Spam (Ham)", or None when the reply is an error
    message or cannot be recognized. "Not spam" is checked before "spam"
    because the ham label itself contains the word "spam".
    """
    normalized = reply.strip().lower()
    if normalized.startswith("error using gemini"):
        return None
    if "not spam" in normalized or normalized.startswith("ham"):
        return "Not Spam (Ham)"
    if "spam" in normalized:
        return "Spam"
    return None


user_input = st.text_area("✉️ Enter your SMS message here:")

if st.button("Check Message"):
    if not user_input.strip():
        st.warning("⚠️ Please enter a message.")
    else:
        # Compare the input with every training message in TF-IDF space.
        cleaned = clean_text(user_input)
        input_vector = vectorizer.transform([cleaned])
        max_similarity = cosine_similarity(input_vector, X_train_tfidf).max()

        gemini_verdict = None
        if max_similarity < SIMILARITY_THRESHOLD:
            st.info("🧠 Message not found in training data. Using Gemini for prediction...")
            gemini_verdict = _interpret_gemini_reply(ask_gemini(user_input))
            if gemini_verdict is None:
                # Do not present a failed or unreadable reply as a verdict.
                st.warning("⚠️ Gemini could not classify this message. Falling back to the local model.")

        if gemini_verdict == "Spam":
            st.error("🚫 Gemini says: This message is **SPAM**.")
        elif gemini_verdict == "Not Spam (Ham)":
            st.success("✅ Gemini says: This message is **NOT SPAM (HAM)**.")
        elif predict_spam(user_input) == "Spam":
            st.error("🚫 This message is classified as **SPAM**.")
        else:
            st.success("✅ This message is classified as **NOT SPAM (HAM)**.")

# --- Dataset preview ---
# Collapsible panel showing the first rows of the loaded dataset.
preview_panel = st.expander("📄 View sample dataset")
sample_rows = df.head()
preview_panel.dataframe(sample_rows)

# Horizontal rule followed by the disclaimer.
st.markdown("---")
st.markdown("🔒 *Note: This app is for educational purposes only.*")