"""Streamlit app that classifies an SMS message as spam or ham.

A TF-IDF + Multinomial Naive Bayes model is trained on a public CSV dataset.
When the entered message has low cosine similarity to every training message,
the app asks Gemini for a verdict instead and falls back to the local model
if Gemini does not return a usable answer.
"""
import os
import re
import string

import google.generativeai as genai
import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# --- Constants ---
DATA_URL = "https://huggingface.co/spaces/MLDeveloper/Spam_SMS_Detection/resolve/main/spam.csv"
LABEL_SPAM = "Spam"
LABEL_HAM = "Not Spam (Ham)"
# Inputs whose best cosine similarity to the training set is below this value
# are treated as "unknown" and routed to Gemini.
SIMILARITY_THRESHOLD = 0.3
# Prefix of the string ask_gemini() returns when the API call fails.
GEMINI_ERROR_PREFIX = "Error using Gemini:"

# Patterns used by clean_text(), compiled once at import time.
_URL_RE = re.compile(r"http\S+|www\S+|https\S+")
_MENTION_OR_HASH_RE = re.compile(r"@\w+|#")
_NON_WORD_RE = re.compile(r"[^\w\s]")
_DIGITS_RE = re.compile(r"\d+")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# --- Set Gemini API Key ---
# SECURITY: the key is read from the environment instead of being hard-coded.
# Any key that was previously committed in this file should be treated as
# compromised and revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
# NOTE(review): the model id can be overridden via GEMINI_MODEL; confirm that
# the default "gemini-pro" is still served for your account.
gemini_model = genai.GenerativeModel(os.environ.get("GEMINI_MODEL", "gemini-pro"))

# Title & Intro
st.set_page_config(page_title="SMS Spam Detection", layout="centered")
st.title("📩 SMS Spam Detection App")
st.markdown("🔍 Enter an SMS message below to check if it's **Spam** or **Not Spam (Ham)**")


# --- Load CSV Dataset ---
@st.cache_data
def load_data():
    """Download the dataset and return it with columns ``label`` and ``message``.

    Only the ``v1`` and ``v2`` columns of the CSV are kept and renamed.
    """
    df = pd.read_csv(DATA_URL, encoding="latin-1")
    df = df[["v1", "v2"]]
    df.columns = ["label", "message"]
    return df


# --- Clean Text Function ---
def clean_text(text):
    """Normalize *text* for vectorization.

    Lower-cases the text, then removes URLs, ``@mentions``, ``#`` signs,
    non-word characters, digits and ASCII punctuation, and finally strips
    surrounding whitespace.
    """
    text = text.lower()
    text = _URL_RE.sub("", text)
    text = _MENTION_OR_HASH_RE.sub("", text)
    text = _NON_WORD_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    # [^\w\s] keeps underscores; the punctuation table removes them as well.
    text = text.translate(_PUNCTUATION_TABLE)
    return text.strip()


# --- Train Model ---
@st.cache_resource
def train_model(_data):
    """Fit the TF-IDF vectorizer and Naive Bayes classifier on *_data*.

    The leading underscore tells Streamlit not to hash the argument, so the
    model is fitted once and reused across script reruns.

    Args:
        _data: DataFrame with a text ``message`` column and a numeric
            ``label`` column (0 = ham, 1 = spam).

    Returns:
        Tuple ``(vectorizer, model, train_matrix)`` where ``train_matrix`` is
        the TF-IDF matrix of the training split.
    """
    # Clean the training text exactly like user input so that both sides of
    # transform()/predict() see the same kind of tokens.
    messages = _data["message"].fillna("").astype(str).map(clean_text)
    train_messages, _test_messages, train_labels, _test_labels = train_test_split(
        messages, _data["label"], test_size=0.2, random_state=42
    )
    fitted_vectorizer = TfidfVectorizer()
    train_matrix = fitted_vectorizer.fit_transform(train_messages)
    classifier = MultinomialNB()
    classifier.fit(train_matrix, train_labels)
    return fitted_vectorizer, classifier, train_matrix


df = load_data()

# --- Preprocessing ---
df["label"] = df["label"].map({"ham": 0, "spam": 1})

vectorizer, model, X_train_tfidf = train_model(df)


# --- Predict Function ---
def predict_spam(text):
    """Classify *text* with the local model.

    Returns:
        ``"Spam"`` or ``"Not Spam (Ham)"``.
    """
    features = vectorizer.transform([clean_text(text)])
    return LABEL_SPAM if model.predict(features)[0] == 1 else LABEL_HAM


# --- Gemini Fallback ---
def ask_gemini(text):
    """Ask Gemini to classify *text* and return its raw reply.

    Returns:
        The stripped reply text, or a string starting with
        ``"Error using Gemini:"`` when no key is configured or the call fails.
    """
    if not GEMINI_API_KEY:
        return f"{GEMINI_ERROR_PREFIX} no API key configured (set GEMINI_API_KEY)."
    prompt = (
        "You are an expert SMS spam detector.\n"
        "Classify the following message as 'Spam' or 'Not Spam (Ham)'.\n"
        f'Message: "{text}"\n'
        "Reply with only: Spam or Not Spam (Ham)."
    )
    try:
        response = gemini_model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:  # service boundary: report any failure as text
        return f"{GEMINI_ERROR_PREFIX} {str(e)}"


def _parse_gemini_verdict(reply):
    """Map a raw Gemini reply to a label, or ``None`` if it is unusable.

    "Not Spam (Ham)" contains the word "spam", so the ham wording must be
    checked before the spam wording.
    """
    if reply.startswith(GEMINI_ERROR_PREFIX):
        return None
    lowered = reply.lower()
    if re.search(r"\bnot\s+spam\b|\bham\b", lowered):
        return LABEL_HAM
    if re.search(r"\bspam\b", lowered):
        return LABEL_SPAM
    return None


def _show_local_verdict(result):
    """Render the local classifier's *result* label in the page."""
    if result == LABEL_SPAM:
        st.error("🚫 This message is classified as **SPAM**.")
    else:
        st.success("✅ This message is classified as **NOT SPAM (HAM)**.")


# --- Input ---
user_input = st.text_area("✉️ Enter your SMS message here:")

if st.button("Check Message"):
    if not user_input.strip():
        st.warning("⚠️ Please enter a message.")
    else:
        cleaned = clean_text(user_input)
        input_vector = vectorizer.transform([cleaned])
        max_similarity = cosine_similarity(input_vector, X_train_tfidf).max()

        # Check similarity threshold (below it = unknown message)
        if max_similarity < SIMILARITY_THRESHOLD:
            st.info("🧠 Message not found in training data. Using Gemini for prediction...")
            gemini_result = ask_gemini(user_input)
            verdict = _parse_gemini_verdict(gemini_result)
            if verdict == LABEL_SPAM:
                st.error("🚫 Gemini says: This message is **SPAM**.")
            elif verdict == LABEL_HAM:
                st.success("✅ Gemini says: This message is **NOT SPAM (HAM)**.")
            else:
                # Do not present an error or an unparseable reply as a verdict.
                st.warning(
                    f"⚠️ Gemini did not return a usable answer ({gemini_result}). "
                    "Falling back to the local model."
                )
                _show_local_verdict(predict_spam(user_input))
        else:
            _show_local_verdict(predict_spam(user_input))

# --- Dataset preview ---
with st.expander("📄 View sample dataset"):
    st.dataframe(df.head())

st.markdown("---")
st.markdown("🔒 *Note: This app is for educational purposes only.*")