File size: 3,288 Bytes
f585727
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
import joblib
import re
import numpy as np

# ==========================================================
# 🔹 1️⃣ Load Models and Vectorizers
# ==========================================================
# Each language has its own classifier and the vectorizer that builds its
# input features. Everything is deserialized once, at import time, and the
# files are resolved relative to the current working directory.
# NOTE(review): joblib.load unpickles its input — presumably these artifacts
# are shipped with the app and trusted; confirm before loading foreign files.
english_model, english_vectorizer = (
    joblib.load("logistic_regression_english.pkl"),
    joblib.load("tfidf_vectorizer_english.pkl"),
)

persian_model, persian_vectorizer = (
    joblib.load("logistic_regression_persian.pkl"),
    joblib.load("tfidf_vectorizer_persian.pkl"),
)

# Human-readable names for the integer class ids used during training.
label_map = dict(enumerate(("Negative", "Neutral", "Positive")))

# ==========================================================
# 🔹 2️⃣ Preprocessing (must match training exactly)
# ==========================================================
def clean_english_text(text):
    """Normalize English input into lowercase, letters-only text.

    The steps run in this order:
      1. lowercase the whole string;
      2. delete every run of non-whitespace characters that starts at
         ``http`` or ``www`` (URLs);
      3. delete every character that is not an ASCII letter or whitespace
         (digits, punctuation and apostrophes disappear, so ``don't``
         becomes ``dont``);
      4. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; it may be empty.
    """
    # Applied sequentially: each substitution sees the previous one's output.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"[^a-zA-Z\s]", ""),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def clean_persian_text(text):
    """Strip everything outside the U+0600–U+06FF range from Persian input.

    Only characters in U+0600–U+06FF and whitespace survive. Latin letters,
    ASCII digits and ASCII punctuation are removed, and so is the zero-width
    non-joiner (U+200C), because it lies outside that range and is not
    whitespace. Whitespace runs are then collapsed to one space and the
    result is trimmed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; it may be empty.
    """
    in_range_only = re.sub(r"[^\u0600-\u06FF\s]", "", text)
    single_spaced = re.sub(r"\s+", " ", in_range_only)
    return single_spaced.strip()

# ==========================================================
# 🔹 3️⃣ Prediction Function
# ==========================================================
def predict_sentiment(text, language):
    """Classify the sentiment of ``text`` with the model for ``language``.

    Args:
        text: User-supplied text. ``None``, empty and whitespace-only values
            are all treated as "no input".
        language: ``"English"`` or ``"Persian"``; selects the cleaning
            function, vectorizer and model to use.

    Returns:
        A display string: either the predicted label with its probability
        formatted to two decimals, or a warning/error message when the input
        is missing, the language is unknown, or nothing remains of the text
        after cleaning.
    """
    # The UI layer may hand over None instead of "" for an absent value;
    # guard before calling .strip() so that case does not raise.
    if not text or not text.strip():
        return "⚠ Please enter some text."

    # Pick the language-specific pipeline once instead of duplicating the
    # predict/format logic in every branch.
    if language == "English":
        clean, vectorizer, model = (
            clean_english_text, english_vectorizer, english_model
        )
    elif language == "Persian":
        clean, vectorizer, model = (
            clean_persian_text, persian_vectorizer, persian_model
        )
    else:
        return "❌ Invalid language option selected."

    cleaned = clean(text)
    if not cleaned:
        # Nothing survived preprocessing (e.g. digits only, or text written
        # in the other language). Vectorizing "" would still produce a
        # prediction, so report the problem instead.
        return (
            f"⚠ No {language} text found after cleaning. "
            "Please check the selected language."
        )

    probs = model.predict_proba(vectorizer.transform([cleaned]))[0]
    # NOTE(review): argmax yields a column index of predict_proba; using it
    # as a label_map key assumes the model's classes are exactly 0, 1, 2 in
    # that order — confirm against the training code.
    pred = int(np.argmax(probs))
    return f"Prediction: {label_map[pred]} ({probs[pred]:.2f} confidence)"

# ==========================================================
# 🔹 4️⃣ Debug Info (Optional - Check Vocabulary Size)
# ==========================================================
# Startup diagnostic: report how many features each loaded vectorizer exposes.
english_vocab_size = len(english_vectorizer.get_feature_names_out())
print(f"✅ English vectorizer vocabulary size: {english_vocab_size}")
persian_vocab_size = len(persian_vectorizer.get_feature_names_out())
print(f"✅ Persian vectorizer vocabulary size: {persian_vocab_size}")

# ==========================================================
# 🔹 5️⃣ Gradio Interface
# ==========================================================
# Web UI: a text box plus a language selector feed predict_sentiment, whose
# returned string is shown in a single output text box. Each example row is
# [text, language], matching the order of `inputs`.
iface = gr.Interface(
    fn=predict_sentiment,
    inputs=[
        gr.Textbox(lines=3, label="Enter Text"),
        gr.Radio(["English", "Persian"], label="Select Language", value="English"),
    ],
    outputs=gr.Textbox(label="Predicted Sentiment"),
    title="🌍 Multilingual Sentiment Classifier (English & Persian)",
    description="Choose your language and get sentiment prediction with confidence score.",
    examples=[
        ["This movie was amazing!", "English"],
        ["The worst experience ever", "English"],
        ["این فیلم خیلی بد بود", "Persian"],
        ["من این محصول را دوست دارم", "Persian"],
    ],
)

# Start the server only when this file is executed as a script, so importing
# the module (e.g. to reuse predict_sentiment) does not launch a blocking
# web server as a side effect.
if __name__ == "__main__":
    iface.launch()