import gradio as gr
from transformers import pipeline
import pandas as pd
import os
import re
from filelock import FileLock

# -----------------------------
# Load Transformer Models
# -----------------------------
english_model = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english"
)

urdu_model = pipeline(
    "sentiment-analysis",
    model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
)

# Urdu and Roman Urdu use the same fine-tuned checkpoint, so reuse the loaded
# pipeline rather than loading the weights a second time.
roman_urdu_model = urdu_model

# -----------------------------
# CSV Setup
# -----------------------------
SAVE_FILE = "sentiment_logs.csv"
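# FileLock uses this sidecar file to serialize concurrent writes to the CSV log.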
LOCK_FILE = SAVE_FILE + ".lock"

if not os.path.exists(SAVE_FILE):
    pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"]).to_csv(
        SAVE_FILE, index=False, encoding="utf-8-sig"
    )

# -----------------------------
# Improved Language Detection
# -----------------------------
def detect_language(text):
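    # Script-based heuristic: any Arabic-script character => Urdu; a few common
    # Roman Urdu tokens => Roman Urdu; otherwise default to English.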
    urdu_script = re.compile(r"[\u0600-\u06FF]")
    if urdu_script.search(text):
        return "Urdu"

    roman_urdu_patterns = [
        r"\b(hai|hain|tha|thi|parhta|parhai|acha|bura|bohot|zabardast)\b",
        r"\b(sir|madam|ustad|class|parh|samajh)\b",
    ]

    text_l = text.lower()
    for p in roman_urdu_patterns:
        if re.search(p, text_l):
            return "Roman Urdu"

    return "English"

# -----------------------------
# Roman Urdu Normalization
# -----------------------------
def normalize_roman_urdu(text):
    # Map common Roman Urdu spelling variants onto the forms the model expects.
    # Word boundaries are required here: a bare str.replace("h", "hai") would
    # corrupt every word containing the letter "h".
    text = text.lower()
    text = re.sub(r"\bhy\b|\bh\b", "hai", text)      # "hy" / bare "h" -> "hai"
    text = re.sub(r"\bnhi\b|\bnai\b", "nahi", text)  # "nhi" / "nai" -> "nahi"
    return text

# -----------------------------
# Normalize Labels
# -----------------------------
def normalize_label(label):
    # Collapse model-specific label strings onto Positive / Negative / Neutral.
    label = label.lower()
    if "pos" in label:
        return "Positive"
    elif "neg" in label:
        return "Negative"
    else:
        return "Neutral"

# -----------------------------
# Polarity Explanation
# -----------------------------
def polarity_explanation(text, sentiment):
    explanations = {
        "Positive": "Contains praise words or positive evaluation.",
        "Negative": "Contains criticism or negative expressions.",
        "Neutral": "Factual statement or balanced observation."
    }
    return explanations.get(sentiment, "")

# -----------------------------
# Ensemble Roman Urdu + Urdu
# -----------------------------
def ensemble_roman_urdu(text):
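    # Score the text with both pipelines. If they agree, keep the higher-confidence
    # result; if they disagree, weight the Roman Urdu prediction 25% higher.
    # (Both names are currently bound to the same checkpoint, so the results coincide.)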
    ru = roman_urdu_model(text)[0]
    ur = urdu_model(text)[0]

    ru_sent, ur_sent = normalize_label(ru["label"]), normalize_label(ur["label"])

    if ru_sent == ur_sent:
        return ru if ru["score"] >= ur["score"] else ur

    # Weight Roman Urdu higher for Roman Urdu input
    weight_ru = ru["score"] * 1.25
    weight_ur = ur["score"]
    return ru if weight_ru >= weight_ur else ur

# -----------------------------
# Adjust sentiment if low intensity
# -----------------------------
def adjust_for_neutral(text, sentiment, score):
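    # Treat low-confidence Positive/Negative calls (score < 0.7) as Neutral
    # instead of committing to a weak polarity.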
    if sentiment in ["Positive", "Negative"] and score < 0.7:
        return "Neutral", score
    return sentiment, score

# -----------------------------
# Main Analysis Function
# -----------------------------
def analyze_sentiment(text, lang_hint):
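    # Route the sentence to the matching model, normalize the label, apply the
    # low-confidence Neutral fallback, and append the result to the CSV log.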
    if not text.strip():
        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE

    lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)

    if lang == "English":
        result = english_model(text)[0]
    elif lang == "Urdu":
        result = urdu_model(text)[0]
    else:
        # Normalize Roman Urdu spellings for scoring, but keep the original
        # sentence intact so the log records what the user actually typed.
        result = ensemble_roman_urdu(normalize_roman_urdu(text))

    sentiment = normalize_label(result["label"])
    score = round(float(result["score"]), 3)
    sentiment, score = adjust_for_neutral(text, sentiment, score)
    explanation = polarity_explanation(text, sentiment)

    # Save logs
    with FileLock(LOCK_FILE):
        df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
            if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
        new_row = pd.DataFrame([[text, lang, sentiment, score]],
                               columns=["Sentence", "Language", "Sentiment", "Confidence"])
        df = pd.concat([df, new_row], ignore_index=True)
        df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

    return sentiment, str(score), explanation, SAVE_FILE

# -----------------------------
# Show Logs
# -----------------------------
def show_logs():
    if os.path.exists(SAVE_FILE):
        return pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
    else:
        return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])

# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        "## 🌍 Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)\n"
        "Detect **Positive**, **Negative**, or **Neutral** tone with confidence score.\n\n"
        "🪶 Improved Roman Urdu normalization + ensemble + polarity explanation.\n"
    )

    with gr.Row():
        with gr.Column():
            user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type English, Urdu, or Roman Urdu...")
            lang_dropdown = gr.Dropdown(
                ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect", label="🌐 Language"
            )
            btn_analyze = gr.Button("🔍 Analyze Sentiment")
            btn_show = gr.Button("📂 Show Saved Logs")

        with gr.Column():
            out_sent = gr.Textbox(label="Sentiment")
            out_conf = gr.Textbox(label="Confidence (0–1)")
            out_exp = gr.Textbox(label="Polarity Explanation")
            out_file = gr.File(label="⬇️ Download Logs (.csv)", type="filepath")

    logs_df = gr.Dataframe(
        headers=["Sentence", "Language", "Sentiment", "Confidence"],
        label="🧾 Sentiment Logs", interactive=False
    )

    btn_analyze.click(analyze_sentiment,
                      inputs=[user_text, lang_dropdown],
                      outputs=[out_sent, out_conf, out_exp, out_file])

    btn_show.click(show_logs, outputs=[logs_df])

if __name__ == "__main__":
    demo.launch()
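
# Run this file directly (e.g. `python app.py`; filename assumed) and Gradio
# prints a local URL for the interface.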