# Spam_SMS_Detection
#
# Sleeping
#
# File size: 3,714 Bytes

import os
import re
import string

import google.generativeai as genai
import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# --- Set Gemini API Key ---
# The key is read from the environment so that no credential is stored in the
# source file. Provide it as GEMINI_API_KEY (or GOOGLE_API_KEY) through the
# deployment's secret/environment settings.
# NOTE(review): the key that was previously hard-coded on this line should be
# treated as leaked and revoked.
genai.configure(
    api_key=os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
)
gemini_model = genai.GenerativeModel("gemini-pro")

# Title & Intro
st.set_page_config(page_title="SMS Spam Detection", layout="centered")
st.title("📩 SMS Spam Detection App")
st.markdown("🔍 Enter an SMS message below to check if it's **Spam** or **Not Spam (Ham)**")

# --- Load CSV Dataset ---
@st.cache_data
def load_data():
    """Download the spam CSV and return a frame with two renamed columns.

    The file is read with latin-1 encoding. Only its ``v1`` and ``v2``
    columns are kept; they are returned as ``label`` and ``message``.
    """
    source = "https://huggingface.co/spaces/MLDeveloper/Spam_SMS_Detection/resolve/main/spam.csv"
    raw = pd.read_csv(source, encoding='latin-1')
    return raw[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

df = load_data()

# --- Preprocessing ---
# Encode the string labels as integers: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})


# --- Train Model ---
@st.cache_resource
def _fit_classifier(_frame):
    """Split the data, fit the TF-IDF vectorizer and train Naive Bayes once.

    Streamlit re-executes this script on every widget interaction; without
    caching, the split, the TF-IDF fit and the classifier fit would all be
    repeated on each button click. The leading underscore on ``_frame`` keeps
    Streamlit from hashing the DataFrame, so the result is computed a single
    time and the same fitted objects are reused afterwards.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, vectorizer,
        X_train_tfidf, model)``.
    """
    msgs_train, msgs_test, labels_train, labels_test = train_test_split(
        _frame['message'], _frame['label'], test_size=0.2, random_state=42
    )
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(msgs_train)

    classifier = MultinomialNB()
    classifier.fit(train_matrix, labels_train)
    return (msgs_train, msgs_test, labels_train, labels_test,
            tfidf, train_matrix, classifier)


# NOTE(review): the vectorizer is fitted on the raw messages, while the
# prediction path passes clean_text() output (digits and URLs removed) to
# vectorizer.transform(). Consider fitting on cleaned text so that training
# and inference see the same preprocessing.
(X_train, X_test, y_train, y_test,
 vectorizer, X_train_tfidf, model) = _fit_classifier(df)

# --- Clean Text Function ---
# "http\S+" already covers "https...", so no separate alternative is needed.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
# "@" followed by word characters (a mention), or a bare "#".
_MENTION_OR_HASH_PATTERN = re.compile(r"@\w+|#")
# Anything that is neither a word character nor whitespace, plus digits and
# underscores (the only ASCII punctuation that counts as a word character).
_NON_LETTER_PATTERN = re.compile(r"[^\w\s]|[\d_]")


def clean_text(text):
    """Normalise a message for vectorisation.

    Steps, in order: lowercase; drop URLs (``http...`` / ``www...``); drop
    ``@mentions`` and ``#`` signs; drop punctuation, digits and underscores;
    trim leading/trailing whitespace. Interior whitespace is left untouched.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = _URL_PATTERN.sub('', text.lower())
    # Mentions must go before punctuation stripping, otherwise the "@" is
    # removed on its own and the user name is left behind as a word.
    text = _MENTION_OR_HASH_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)
    return text.strip()

# --- Predict Function ---
def predict_spam(text):
    """Classify *text* with the local TF-IDF + Naive Bayes model.

    The message is cleaned, vectorised with the fitted vectorizer and passed
    to the trained model. Returns ``"Spam"`` when the predicted label is 1,
    otherwise ``"Not Spam (Ham)"``.
    """
    features = vectorizer.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam (Ham)"

# --- Gemini Fallback ---
def ask_gemini(text):
    """Ask the Gemini model to label *text* as spam or ham.

    Returns the model's reply with surrounding whitespace removed. If the
    request (or reading the reply) raises, no exception propagates; instead a
    string of the form ``"Error using Gemini: <details>"`` is returned.
    """
    prompt = f"""You are an expert SMS spam detector.
Classify the following message as 'Spam' or 'Not Spam (Ham)'.
Message: "{text}"
Reply with only: Spam or Not Spam (Ham)."""
    try:
        reply = gemini_model.generate_content(prompt)
        verdict = reply.text.strip()
    except Exception as exc:
        # Failures are reported through the return value, not raised.
        verdict = f"Error using Gemini: {exc}"
    return verdict

# --- Input ---
def _parse_gemini_verdict(reply):
    """Translate Gemini's free-text reply into ``"spam"``, ``"ham"`` or ``None``.

    A plain ``"spam" in reply`` test is not usable because the ham answer,
    "Not Spam (Ham)", also contains the word "spam". The ham forms are
    therefore checked first. ``None`` is returned for the error string that
    ask_gemini() produces on failure and for any reply that matches neither
    expected answer.
    """
    # Drop whitespace and decoration (quotes, markdown asterisks, full stops)
    # that the model may wrap around its one-word answer.
    normalized = reply.strip().strip("'\"*. ").lower()
    if normalized.startswith("error using gemini"):
        return None
    if normalized.startswith(("not spam", "ham")):
        return "ham"
    if normalized.startswith("spam"):
        return "spam"
    return None


user_input = st.text_area("✉️ Enter your SMS message here:")

if st.button("Check Message"):
    if not user_input.strip():
        st.warning("⚠️ Please enter a message.")
    else:
        cleaned = clean_text(user_input)
        input_vector = vectorizer.transform([cleaned])
        # Highest cosine similarity between the input and any training message.
        max_similarity = cosine_similarity(input_vector, X_train_tfidf).max()

        verdict = None
        # Check similarity threshold (e.g., < 0.3 = unknown message)
        if max_similarity < 0.3:
            st.info("🧠 Message not found in training data. Using Gemini for prediction...")
            gemini_result = ask_gemini(user_input)
            verdict = _parse_gemini_verdict(gemini_result)
            if verdict == "spam":
                st.error("🚫 Gemini says: This message is **SPAM**.")
            elif verdict == "ham":
                st.success("✅ Gemini says: This message is **NOT SPAM (HAM)**.")
            else:
                # Gemini failed or answered in an unexpected form: say so
                # rather than presenting the failure as a "not spam" verdict.
                st.warning(
                    f"⚠️ Gemini could not classify this message ({gemini_result}). "
                    "Falling back to the local model."
                )

        # Local model: used for messages similar to the training data and as
        # the fallback when Gemini gave no usable answer.
        if verdict is None:
            result = predict_spam(user_input)
            if result == "Spam":
                st.error("🚫 This message is classified as **SPAM**.")
            else:
                st.success("✅ This message is classified as **NOT SPAM (HAM)**.")

# --- Dataset preview ---
# Collapsible panel showing the first rows of the loaded dataset.
sample_panel = st.expander("📄 View sample dataset")
with sample_panel:
    st.dataframe(df.head())

# Footer: a horizontal rule followed by the disclaimer.
for footer_line in ("---", "🔒 *Note: This app is for educational purposes only.*"):
    st.markdown(footer_line)