# Spaces: Sleeping  (appears to be page-header residue from the hosting site's file viewer; not part of the program)
import os
import re
import string

import google.generativeai as genai
import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# --- Configure Gemini ---
# The API key is read from the environment (e.g. a Space secret or a local
# shell export) instead of being embedded in the source, so no credential is
# committed to the repository.  Any key that was previously published in this
# file should be treated as compromised and revoked.
#
# GEMINI_API_KEY is checked first, then GOOGLE_API_KEY.  If neither is set,
# None is passed through; Gemini requests are then expected to fail at call
# time, and ask_gemini() reports such failures as an error string.
# NOTE: no Streamlit call is made here on purpose, so st.set_page_config()
# below remains the first Streamlit command of the script.
genai.configure(
    api_key=os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
)
gemini_model = genai.GenerativeModel("gemini-pro")
# --- Page setup: page title/layout, main heading and intro line ---
st.set_page_config(layout="centered", page_title="SMS Spam Detection")
st.title("📩 SMS Spam Detection App")
intro_text = "🔍 Enter an SMS message below to check if it's **Spam** or **Not Spam (Ham)**"
st.markdown(intro_text)
# --- Load CSV Dataset ---
@st.cache_data
def load_data():
    """Download the SMS spam dataset and return it as a two-column DataFrame.

    The CSV is fetched from the Hugging Face Space that hosts this app and is
    decoded as latin-1.  Only the ``v1`` and ``v2`` columns are kept and they
    are renamed to ``label`` and ``message`` respectively.

    The function is wrapped in ``st.cache_data`` so the file is not downloaded
    again on every script rerun (Streamlit reruns the script on each widget
    interaction).

    Returns:
        pandas.DataFrame with columns ``label`` and ``message``.
    """
    url = "https://huggingface.co/spaces/MLDeveloper/Spam_SMS_Detection/resolve/main/spam.csv"
    raw = pd.read_csv(url, encoding='latin-1')
    # rename() returns a new frame, so callers that add or overwrite columns
    # later are not assigning into a column-slice of the raw frame.
    return raw[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
# --- Load and encode the dataset ---
# Numeric targets for the classifier: 0 = ham, 1 = spam.
LABEL_CODES = {'ham': 0, 'spam': 1}

df = load_data()
df['label'] = df['label'].map(LABEL_CODES)
# --- Clean Text Function ---
# Defined before training so that the exact same normalisation is applied to
# the training messages and to user input at prediction time.
def clean_text(text):
    """Normalise an SMS message before vectorisation.

    Steps, in order: lowercase; drop URL-like tokens starting with ``http`` or
    ``www``; drop ``@mentions`` and ``#`` signs; drop every character that is
    neither a word character nor whitespace; drop digit runs; drop any
    remaining ASCII punctuation (this is what removes ``_``, which counts as a
    word character for the earlier step); strip surrounding whitespace.

    Args:
        text: the raw message string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # "http\S+" already covers "https...", so one alternative is enough.
    text = re.sub(r"http\S+|www\S+", '', text)
    # "@\w+" removes whole mentions; the previous pattern "\@w+" only matched
    # a literal "@w", leaving user names in the text.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.strip()


# --- Train Model ---
# Fit on cleaned messages: predictions and similarity checks vectorise
# clean_text(input), so fitting on raw text would make the training features
# (digits, URL fragments, ...) differ from what inference can ever produce.
cleaned_messages = df['message'].map(clean_text)
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_messages, df['label'], test_size=0.2, random_state=42
)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
# --- Predict Function ---
def predict_spam(text):
    """Classify a message with the locally trained model.

    The text is cleaned with clean_text(), vectorised with the fitted
    vectorizer and passed to the fitted model.

    Returns:
        "Spam" when the predicted label is 1, otherwise "Not Spam (Ham)".
    """
    features = vectorizer.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam (Ham)"
# --- Gemini Fallback ---
def ask_gemini(text):
    """Ask the Gemini model to classify a message.

    Args:
        text: the message to classify; it is embedded verbatim in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped, or a string
        starting with "Error using Gemini:" when the request (or reading the
        reply text) raises.  This function itself never raises for such
        failures, so callers must inspect the returned string.
    """
    prompt = f"""You are an expert SMS spam detector.
Classify the following message as 'Spam' or 'Not Spam (Ham)'.
Message: "{text}"
Reply with only: Spam or Not Spam (Ham)."""
    try:
        reply_text = gemini_model.generate_content(prompt).text.strip()
    except Exception as err:
        # Best-effort boundary: report the failure as text instead of raising.
        return f"Error using Gemini: {err}"
    return reply_text
# --- Input ---
# Below this TF-IDF cosine similarity to every training message, the input is
# treated as unfamiliar and Gemini is consulted instead of the local model.
SIMILARITY_THRESHOLD = 0.3

# Prefix ask_gemini() puts on the string it returns when the request fails.
GEMINI_ERROR_PREFIX = "Error using Gemini:"


def _parse_gemini_verdict(reply):
    """Interpret the string returned by ask_gemini().

    Returns:
        True for a spam verdict, False for a ham verdict, and None when the
        reply is an error message or contains neither verdict.
    """
    normalized = reply.strip().lower()
    if normalized.startswith(GEMINI_ERROR_PREFIX.lower()):
        return None
    # The negative form must be tested first: "not spam (ham)" contains the
    # word "spam", so a plain substring test would flag every reply as spam.
    if re.search(r"\bnot\s+spam\b|\bham\b", normalized):
        return False
    if re.search(r"\bspam\b", normalized):
        return True
    return None


user_input = st.text_area("✉️ Enter your SMS message here:")
if st.button("Check Message"):
    if not user_input.strip():
        st.warning("⚠️ Please enter a message.")
    else:
        cleaned = clean_text(user_input)
        input_vector = vectorizer.transform([cleaned])
        similarities = cosine_similarity(input_vector, X_train_tfidf)
        max_similarity = similarities.max()

        gemini_verdict = None
        if max_similarity < SIMILARITY_THRESHOLD:
            st.info("🧠 Message not found in training data. Using Gemini for prediction...")
            gemini_verdict = _parse_gemini_verdict(ask_gemini(user_input))
            if gemini_verdict is None:
                # Never present a failed or unparseable reply as a verdict.
                st.warning("⚠️ Gemini did not return a usable answer. Falling back to the local model.")

        if gemini_verdict is None:
            # Either the message resembles the training data, or Gemini could
            # not be used: classify with the local model.
            if predict_spam(user_input) == "Spam":
                st.error("🚫 This message is classified as **SPAM**.")
            else:
                st.success("✅ This message is classified as **NOT SPAM (HAM)**.")
        elif gemini_verdict:
            st.error("🚫 Gemini says: This message is **SPAM**.")
        else:
            st.success("✅ Gemini says: This message is **NOT SPAM (HAM)**.")
# --- Dataset preview ---
# Expander holding the first rows of the dataset as loaded above
# (labels are already encoded as 0/1 at this point).
preview_panel = st.expander("📄 View sample dataset")
preview_panel.dataframe(df.head())

# Separator line followed by the disclaimer footer.
st.markdown("---")
st.markdown("🔒 *Note: This app is for educational purposes only.*")