Spaces:

jeevitha-app
/

Sentiment_analyzer

Sleeping

App Files Files Community

jeevitha-app committed on Oct 21

Commit

6e629f4

verified ·

1 Parent(s): 0426dfa

Update app.py

Browse files

Files changed (1) hide show

app.py +204 -91

app.py CHANGED Viewed

@@ -1,108 +1,221 @@
-import gradio as gr
 import joblib
-import shap
 import numpy as np
-# ------------------------------------------------------------
-# 1️⃣ Load pre-trained models and vectorizers
-# ------------------------------------------------------------
-eng_model = joblib.load("best_model.pkl")
-eng_vectorizer = joblib.load("tfidf_vectorizer.pkl")
-per_model = joblib.load("logistic_regression.pkl")
-per_vectorizer = joblib.load("tfidf_vectorizer_persian.pkl")
-# ------------------------------------------------------------
-# 2️⃣ Define class labels
-# ------------------------------------------------------------
-class_names = ["Negative", "Neutral", "Positive"]
-# ------------------------------------------------------------
-# 3️⃣ Prediction Function
-# ------------------------------------------------------------
-def predict_sentiment(text, language):
-    if not text.strip():
-        return "⚠️ Please enter some text!", None, None
     if language == "English":
         model = eng_model
         vectorizer = eng_vectorizer
     else:
         model = per_model
         vectorizer = per_vectorizer
-    # Vectorize input
     vec = vectorizer.transform([text])
     probs = model.predict_proba(vec)[0]
-    pred_class = np.argmax(probs)
     label = class_names[pred_class]
-    confidence = probs[pred_class]
-    # Interpret top words with SHAP
-    explainer = shap.Explainer(model, vectorizer.transform(["sample"]))
-    feature_names = vectorizer.get_feature_names_out()
-    shap_values = explainer(vec)
-    # top contributing words
-    shap_vals = shap_values[0].values[:, pred_class]
-    top_indices = np.argsort(-np.abs(shap_vals))[:10]
-    top_words = [feature_names[i] for i in top_indices]
-    top_contribs = shap_vals[top_indices]
-    interpretation = {
-        "words": top_words,
-        "contributions": top_contribs.tolist()
-    }
-    return f"🎯 **{label}** (confidence: {confidence:.2f})", probs.tolist(), interpretation
-# ------------------------------------------------------------
-# 4️⃣ Gradio Interface
-# ------------------------------------------------------------
-def gradio_ui(text, language):
-    pred, probs, interp = predict_sentiment(text, language)
-    if not probs:
-        return pred, None, None
-    # Confidence Bar Plot
-    bar_plot = {cls: float(p) for cls, p in zip(class_names, probs)}
-    # Word Contribution Table
-    word_table = None
-    if interp:
-        word_table = {
-            "Word": interp["words"],
-            "SHAP Impact": interp["contributions"]
-        }
-    return pred, bar_plot, word_table
-# ------------------------------------------------------------
-# 5️⃣ Gradio Layout
-# ------------------------------------------------------------
-with gr.Blocks(theme=gr.themes.Soft()) as interface:
-    gr.Markdown("
-    Select language, enter text, and view predictions with interpretable SHAP insights!")
-    lang = gr.Radio(["English", "Persian"], label="Select Dataset", value="English")
-    text = gr.Textbox(label="Enter your text here", placeholder="Type an English or Persian comment...")
-    output_pred = gr.Markdown(label="Prediction")
-    output_bar = gr.BarPlot(label="Confidence per Class")
-    output_table = gr.Dataframe(label="Top Influential Words", headers=["Word", "SHAP Impact"])
-    btn = gr.Button("🔍 Analyze Sentiment")
-    btn.click(fn=gradio_ui, inputs=[text, lang], outputs=[output_pred, output_bar, output_table])
-# ------------------------------------------------------------
-# 6️⃣ Launch App
-# ------------------------------------------------------------
 if __name__ == "__main__":
-    interface.launch(share=True)

+# app.py
+# Gradio app: English + Persian sentiment with SHAP-based interpretability and word highlighting
 import joblib
 import numpy as np
+import pandas as pd
+import shap
+import matplotlib.pyplot as plt
+import io
+import base64
+import html
+from typing import Tuple, Dict, List
+import math
+import gradio as gr
# --------- Model artifacts (edit the paths if your files are named differently) ----------
# One fitted classifier plus its matching text vectorizer per language.
ENG_MODEL_PATH = "models/english_lr_model.pkl"
ENG_VEC_PATH   = "models/english_vectorizer.pkl"
PER_MODEL_PATH = "models/persian_lr_model.pkl"
PER_VEC_PATH   = "models/persian_vectorizer.pkl"

# NOTE(review): joblib.load unpickles arbitrary objects — only point these
# paths at artifacts you produced yourself.
# Loaded once at import time, in the same order as the path constants above.
eng_model, eng_vectorizer, per_model, per_vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in (ENG_MODEL_PATH, ENG_VEC_PATH, PER_MODEL_PATH, PER_VEC_PATH)
)
# Display labels, indexed by the position of each class in the probability vector.
# NOTE(review): assumes the models emit classes in Negative/Neutral/Positive
# order — confirm against model.classes_.
CLASS_NAMES_EN = ["Negative", "Neutral", "Positive"]
CLASS_NAMES_PER = ["منفی", "خنثی", "مثبت"]
# --------- Utility: create bar data for gradio BarPlot ----------
def probs_to_bar(probs: List[float], lang: str) -> Dict[str, float]:
    """Map a class-probability vector to a ``{label: probability}`` dict.

    Args:
        probs: Probabilities, one per class, in class-index order.
        lang: ``"English"`` selects the English labels; any other value
            selects the Persian labels.

    Returns:
        Dict from display label to probability as a plain ``float``.

    Raises:
        IndexError: If ``probs`` has more entries than there are labels.
    """
    names = CLASS_NAMES_EN if lang == "English" else CLASS_NAMES_PER
    # Index into `names` (rather than zip) so a too-long vector fails loudly
    # instead of being silently truncated.
    return {names[i]: float(p) for i, p in enumerate(probs)}
# --------- Utility: create HTML highlight from SHAP values ----------
def _lookup_shap(token, shap_values_feature, vectorizer_vocab):
    """Return the SHAP value for *token*, or ``None`` if it is out of vocabulary."""
    key = token.lower()
    idx = vectorizer_vocab.get(key)
    if idx is None:
        # Retry with every non-alphanumeric character removed, e.g. "Good," -> "good".
        # This strips characters anywhere in the token, not only at its ends.
        idx = vectorizer_vocab.get("".join(ch for ch in key if ch.isalnum()))
    if idx is None:
        return None
    return float(shap_values_feature[idx])


def make_html_highlight(original_text: str,
                        feature_names: np.ndarray,
                        shap_values_feature: np.ndarray,
                        vectorizer_vocab: dict,
                        max_display: int = 30) -> str:
    """Render *original_text* as HTML with per-token SHAP colouring.

    The text is split on whitespace, so punctuation stays attached to the
    displayed token. Each token is looked up in ``vectorizer_vocab``
    (lower-cased, then with non-alphanumerics removed). Tokens with a
    positive SHAP value get a red background, negative ones blue, and the
    background opacity scales with the magnitude relative to the largest
    magnitude found in this text. Unknown or near-zero tokens are left
    uncoloured. Only single whitespace-delimited tokens are looked up.

    Args:
        original_text: Raw user text; every token is HTML-escaped.
        feature_names: Unused; kept for interface compatibility.
        shap_values_feature: Per-feature SHAP values, indexable by the
            indices stored in ``vectorizer_vocab``.
        vectorizer_vocab: Mapping ``token -> feature index``.
        max_display: Unused; kept for interface compatibility.

    Returns:
        An HTML ``<div>`` string containing one ``<span>`` per token.
    """
    tokens = original_text.split()
    impacts = [_lookup_shap(tok, shap_values_feature, vectorizer_vocab) for tok in tokens]

    # Largest magnitude in this text, used to normalise opacity; fall back to
    # 1.0 when nothing matched so the division below is always safe.
    max_mag = max((abs(v) for v in impacts if v is not None), default=0.0) or 1.0

    html_tokens = []
    for tok, impact in zip(tokens, impacts):
        display = html.escape(tok)
        if impact is None or abs(impact) < 1e-6:
            html_tokens.append(f"<span style='padding:2px'>{display}</span>")
            continue
        mag = min(1.0, abs(impact) / max_mag)  # scale 0..1
        opacity = 0.15 + 0.85 * mag  # avoid fully transparent
        rgb = "220,20,60" if impact > 0 else "30,144,255"
        html_tokens.append(
            f"<span style='background:rgba({rgb},{opacity});padding:2px;margin:1px;"
            f"border-radius:4px;display:inline-block;border:1px solid rgba(0,0,0,0.04)'>"
            f"{display}</span>"
        )
    return "<div style='line-height:1.6;font-size:16px'>" + " ".join(html_tokens) + "</div>"
# --------- Core function: predict + interpret ----------
def _class_shap_vector(shap_vals, pred_class, n_features):
    """Reduce a SHAP explanation to a 1-D per-feature vector of length *n_features*."""
    values = np.asarray(shap_vals.values)
    if values.ndim == 3:
        # (n_samples, n_features, n_classes): sample 0, predicted class.
        vector = values[0, :, pred_class]
    elif values.ndim == 2:
        # (n_samples, n_features): single-output explanation, sample 0.
        vector = values[0, :]
    else:
        vector = np.ravel(values)[:n_features]
    vector = np.asarray(vector, dtype=float)
    if len(vector) != n_features:
        # Length mismatch: copy what fits and zero-fill the remainder so that
        # indexing by vocabulary index stays in range.
        padded = np.zeros(n_features)
        keep = min(len(vector), n_features)
        padded[:keep] = vector[:keep]
        vector = padded
    return vector


def explain_and_predict(text: str, language: str):
    """Classify *text* and explain the prediction with SHAP.

    Args:
        text: User input; ``None`` is treated as empty.
        language: ``"English"`` selects the English model, vectorizer and
            labels; any other value selects the Persian ones.

    Returns:
        A 4-tuple ``(markdown_label, probabilities, word_table, html)``:
        the predicted label with its confidence, the dict produced by
        ``probs_to_bar``, a ``{"Word": [...], "SHAP Impact": [...]}`` table
        of up to 10 features with non-zero impact, and the HTML produced by
        ``make_html_highlight``. For blank input a placeholder tuple is
        returned and no model is called.
    """
    import logging

    logger = logging.getLogger(__name__)

    text = text or ""
    if not text.strip():
        return "⚠️ Please enter text.", {}, {"Word": [], "SHAP Impact": []}, "<i>No input</i>"

    if language == "English":
        model = eng_model
        vectorizer = eng_vectorizer
        class_names = CLASS_NAMES_EN
    else:
        model = per_model
        vectorizer = per_vectorizer
        class_names = CLASS_NAMES_PER

    # vectorize
    vec = vectorizer.transform([text])
    probs = model.predict_proba(vec)[0]
    pred_class = int(np.argmax(probs))
    label = class_names[pred_class]
    confidence = float(probs[pred_class])

    feature_names = np.array(vectorizer.get_feature_names_out())
    n_features = len(feature_names)

    # NOTE: the explainer is rebuilt on every request; it could be built once
    # per language at import time if this becomes a bottleneck.
    try:
        # An all-zero background row is cheap and needs no training data.
        background = np.zeros((1, vec.shape[1]))
        explainer = shap.LinearExplainer(model, background, feature_names=feature_names)
        shap_vals = explainer(vec.toarray())  # returns shap.Explanation
    except Exception:
        # Best-effort fallback to a model-agnostic explainer (slower); log the
        # original failure instead of hiding it.
        logger.warning("LinearExplainer failed; falling back to shap.Explainer", exc_info=True)
        explainer = shap.Explainer(model.predict_proba, vec)
        shap_vals = explainer(vec)

    try:
        shap_per_feature = _class_shap_vector(shap_vals, pred_class, n_features)
    except (AttributeError, IndexError, TypeError, ValueError):
        # Unrecognised explanation layout: show the prediction without attributions.
        logger.warning("Could not extract SHAP values; showing no attributions", exc_info=True)
        shap_per_feature = np.zeros(n_features)

    # Top features by absolute impact. Zero-impact entries are dropped so the
    # table is not padded with vocabulary words that never occur in the input.
    top_idx = np.argsort(-np.abs(shap_per_feature))[:10]
    top_idx = top_idx[shap_per_feature[top_idx] != 0]
    word_table = {
        "Word": feature_names[top_idx].tolist(),
        "SHAP Impact": shap_per_feature[top_idx].tolist(),
    }

    # Token-level highlight (approximation: unigram lookups only).
    vocab = getattr(vectorizer, "vocabulary_", {})
    highlight_html = make_html_highlight(text, feature_names, shap_per_feature, vocab)

    return (
        f"🎯 **{label}** (confidence: {confidence:.2f})",
        probs_to_bar(probs.tolist(), language),
        word_table,
        highlight_html,
    )
# --------- Gradio UI build ----------
# Layout: language selector + text box, an Analyze button, then the prediction,
# a class-probability bar chart, the top-word table and the highlighted text.
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Multilingual Sentiment Analysis (English 🇬🇧 & Persian 🇮🇷) — Interpretable")
    with gr.Row():
        language = gr.Radio(["English", "Persian"], value="English", label="Choose language")
        text_input = gr.Textbox(lines=4, placeholder="Type comment here...", label="Input text")
    with gr.Row():
        btn = gr.Button("Analyze")
    with gr.Row():
        pred_out = gr.Markdown()
    with gr.Row():
        # BarPlot renders a DataFrame; x/y name the columns built in run() below.
        bar = gr.BarPlot(x="Class", y="Probability", label="Class probabilities")
        table = gr.Dataframe(headers=["Word", "SHAP Impact"], label="Top contributing words")
    with gr.Row():
        html_out = gr.HTML(label="Word-level Highlight (red = pushes toward prediction, blue = pushes away)")

    def run(text, lang):
        """Button callback: run the analysis and shape its results for the widgets."""
        label, probs, word_table, html_highlight = explain_and_predict(text, lang)
        # The {label: probability} dict becomes a two-column frame for the bar
        # chart; blank input yields an empty frame with the same columns.
        prob_frame = pd.DataFrame(
            {"Class": list(probs.keys()), "Probability": list(probs.values())}
        )
        return label, prob_frame, pd.DataFrame(word_table), html_highlight

    btn.click(fn=run, inputs=[text_input, language], outputs=[pred_out, bar, table, html_out])

if __name__ == "__main__":
    # NOTE(review): share=True requests a public tunnel; it is presumably
    # unnecessary when hosted on Spaces — confirm before removing.
    demo.launch(server_name="0.0.0.0", share=True)