Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,18 +6,18 @@ import re
|
|
| 6 |
from filelock import FileLock
|
| 7 |
|
| 8 |
# -----------------------------
|
| 9 |
-
# Load Models
|
| 10 |
# -----------------------------
|
| 11 |
english_model = pipeline(
|
| 12 |
"sentiment-analysis",
|
| 13 |
model="siebert/sentiment-roberta-large-english"
|
| 14 |
)
|
| 15 |
|
|
|
|
| 16 |
urdu_model = pipeline(
|
| 17 |
"sentiment-analysis",
|
| 18 |
model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
|
| 19 |
)
|
| 20 |
-
|
| 21 |
roman_urdu_model = pipeline(
|
| 22 |
"sentiment-analysis",
|
| 23 |
model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
|
|
@@ -35,19 +35,47 @@ if not os.path.exists(SAVE_FILE):
|
|
| 35 |
)
|
| 36 |
|
| 37 |
# -----------------------------
|
| 38 |
-
# Language Detection
|
| 39 |
# -----------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def detect_language(text):
|
| 41 |
urdu_chars = set("ابتثجحخدذرزسشصضطظعغفقکلمنوہیءآؤئۀ")
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
return "Urdu"
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
| 46 |
return "Roman Urdu"
|
|
|
|
| 47 |
return "English"
|
| 48 |
|
| 49 |
# -----------------------------
|
| 50 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# -----------------------------
|
| 52 |
def normalize_label(label):
|
| 53 |
label = label.lower()
|
|
@@ -64,11 +92,34 @@ def normalize_label(label):
|
|
| 64 |
def sentiment_with_tips(sentiment):
|
| 65 |
tips = {
|
| 66 |
"Positive": "😊 Great! Keep spreading positivity.",
|
| 67 |
-
"Negative": "😞
|
| 68 |
-
"Neutral":
|
| 69 |
}
|
| 70 |
return tips.get(sentiment, "")
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
# -----------------------------
|
| 73 |
# Main Sentiment Function
|
| 74 |
# -----------------------------
|
|
@@ -77,29 +128,28 @@ def analyze_sentiment(text, lang_hint):
|
|
| 77 |
if not text.strip():
|
| 78 |
return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
|
| 79 |
|
| 80 |
-
#
|
| 81 |
lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
|
| 82 |
|
| 83 |
-
#
|
| 84 |
if lang == "English":
|
| 85 |
result = english_model(text)[0]
|
| 86 |
elif lang == "Urdu":
|
| 87 |
result = urdu_model(text)[0]
|
| 88 |
-
else:
|
| 89 |
-
|
|
|
|
| 90 |
|
| 91 |
-
#
|
| 92 |
sentiment = normalize_label(result["label"])
|
| 93 |
score = round(float(result["score"]), 3)
|
|
|
|
| 94 |
explanation = sentiment_with_tips(sentiment)
|
| 95 |
|
| 96 |
-
#
|
| 97 |
with FileLock(LOCK_FILE):
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
else:
|
| 101 |
-
df = pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
| 102 |
-
|
| 103 |
new_row = pd.DataFrame([[text, lang, sentiment, score]],
|
| 104 |
columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
| 105 |
df = pd.concat([df, new_row], ignore_index=True)
|
|
@@ -111,12 +161,11 @@ def analyze_sentiment(text, lang_hint):
|
|
| 111 |
return f"⚠️ Error: {str(e)}", "", "", SAVE_FILE
|
| 112 |
|
| 113 |
# -----------------------------
|
| 114 |
-
#
|
| 115 |
# -----------------------------
|
| 116 |
def show_logs():
|
| 117 |
if os.path.exists(SAVE_FILE):
|
| 118 |
-
|
| 119 |
-
return df
|
| 120 |
else:
|
| 121 |
return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
| 122 |
|
|
@@ -125,31 +174,31 @@ def show_logs():
|
|
| 125 |
# -----------------------------
|
| 126 |
with gr.Blocks() as demo:
|
| 127 |
gr.Markdown(
|
| 128 |
-
"## 🌍 Multilingual Sentiment Analysis (English
|
| 129 |
-
"
|
| 130 |
-
"
|
|
|
|
| 131 |
)
|
| 132 |
|
| 133 |
with gr.Row():
|
| 134 |
with gr.Column():
|
| 135 |
-
user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type in English, Urdu, or Roman
|
| 136 |
lang_dropdown = gr.Dropdown(
|
| 137 |
["Auto Detect", "English", "Urdu", "Roman Urdu"],
|
| 138 |
-
|
| 139 |
)
|
| 140 |
btn_analyze = gr.Button("🔍 Analyze Sentiment")
|
| 141 |
btn_show = gr.Button("📂 Show Saved Logs")
|
| 142 |
|
| 143 |
with gr.Column():
|
| 144 |
out_sent = gr.Textbox(label="Sentiment")
|
| 145 |
-
out_conf = gr.Textbox(label="Confidence
|
| 146 |
out_exp = gr.Textbox(label="Explanation")
|
| 147 |
-
out_file = gr.File(label="⬇️ Download
|
| 148 |
|
| 149 |
logs_df = gr.Dataframe(
|
| 150 |
headers=["Sentence", "Language", "Sentiment", "Confidence"],
|
| 151 |
-
label="🧾
|
| 152 |
-
interactive=False
|
| 153 |
)
|
| 154 |
|
| 155 |
btn_analyze.click(analyze_sentiment,
|
|
|
|
| 6 |
from filelock import FileLock
|
| 7 |
|
| 8 |
# -----------------------------
# Load Transformer Models
# -----------------------------
# Sentiment pipeline for English text.
english_model = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english"
)

# same model but we'll ensemble results for Roman+Urdu
# NOTE(review): this checkpoint is also loaded below as roman_urdu_model, so
# the downstream "ensemble" compares two runs of the same model — confirm
# whether a distinct Roman Urdu checkpoint was intended.
urdu_model = pipeline(
    "sentiment-analysis",
    model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
)
|
|
|
|
| 21 |
roman_urdu_model = pipeline(
|
| 22 |
"sentiment-analysis",
|
| 23 |
model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
|
|
|
|
| 35 |
)
|
| 36 |
|
| 37 |
# -----------------------------
# Improved Language Detection
# -----------------------------
# Frequent Roman Urdu tokens used as detection cues. Some entries (e.g.
# "din", "ka") are also valid English letter sequences, so a single hit
# flags Roman Urdu aggressively — by design.
roman_urdu_keywords = {
    "acha", "bura", "ganda", "din", "zabardast", "bohot", "pyar",
    "parhai", "ustad", "kyun", "nahi", "hai", "tha", "karta", "kar",
    "mera", "tera", "tum", "ka", "kaisa", "raha", "guzra", "galat"
}

def detect_language(text):
    """Classify *text* as "Urdu", "Roman Urdu", or "English".

    Rule 1: any Arabic-script Urdu character -> "Urdu".
    Rule 2: any Roman Urdu keyword among the tokens -> "Roman Urdu".
    Otherwise -> "English" (including empty input).
    """
    urdu_chars = set("ابتثجحخدذرزسشصضطظعغفقکلمنوہیءآؤئۀ")
    # Keep only Latin letters and the Arabic Unicode block; everything
    # else collapses to single spaces so tokenization is clean.
    clean = re.sub(r"[^A-Za-z\u0600-\u06FF]+", " ", text)

    # rule 1: actual Urdu characters
    if any(ch in urdu_chars for ch in clean):
        return "Urdu"

    # rule 2: Roman Urdu keyword presence.
    # Fix: the original condition `hits / max(len(tokens), 1) > 0.2 or
    # hits > 0` made the ratio test dead code — any positive hit count
    # already satisfies `hits > 0` — so it reduces to a presence check.
    tokens = clean.lower().split()
    roman_hits = sum(w in roman_urdu_keywords for w in tokens)
    if roman_hits > 0:
        return "Roman Urdu"

    return "English"
|
| 60 |
|
| 61 |
# -----------------------------
|
| 62 |
+
# Roman Urdu Normalization
|
| 63 |
+
# -----------------------------
|
| 64 |
+
def normalize_roman_urdu(text):
|
| 65 |
+
replacements = {
|
| 66 |
+
"acha ni": "acha nahi",
|
| 67 |
+
"acha nai": "acha nahi",
|
| 68 |
+
"ganda hy": "ganda hai",
|
| 69 |
+
"bura hy": "bura hai",
|
| 70 |
+
"ni": "nahi",
|
| 71 |
+
"nai": "nahi",
|
| 72 |
+
}
|
| 73 |
+
for k, v in replacements.items():
|
| 74 |
+
text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
|
| 75 |
+
return text
|
| 76 |
+
|
| 77 |
+
# -----------------------------
|
| 78 |
+
# Label Normalization
|
| 79 |
# -----------------------------
|
| 80 |
def normalize_label(label):
|
| 81 |
label = label.lower()
|
|
|
|
| 92 |
def sentiment_with_tips(sentiment):
    """Return a short coaching tip for a normalized sentiment label.

    Unknown labels yield an empty string.
    """
    if sentiment == "Positive":
        return "😊 Great! Keep spreading positivity."
    if sentiment == "Negative":
        return "😞 Looks negative — maybe reflect and improve things."
    if sentiment == "Neutral":
        return "😐 Neutral observation — balanced view."
    return ""
|
| 99 |
|
| 100 |
+
# -----------------------------
# Neutral Adjuster (Urdu/Descriptive)
# -----------------------------
def adjust_for_neutral(text, sentiment, score):
    """Downgrade low-confidence results on descriptive sentences to Neutral.

    Sentences containing plain Urdu copula phrases ("ہے", "ہو رہی ہے", ...)
    or the Roman Urdu past-tense markers "tha"/"thi" are treated as
    descriptive rather than emotional; when the model's confidence is
    below 0.9, the result is forced to ("Neutral", 0.7).

    Args:
        text: the analyzed sentence.
        sentiment: normalized label ("Positive"/"Negative"/"Neutral").
        score: model confidence in [0, 1].
    Returns:
        (sentiment, score) tuple, possibly adjusted.
    """
    urdu_triggers = ["ہورہی ہے", "ہو رہی ہے", "ہے"]
    # Bug fix: the original matched "tha"/"thi" as bare substrings, which
    # falsely fired on English words such as "that" or "think". Match them
    # as whole words instead; Urdu phrases keep substring matching since
    # they already include spacing context.
    descriptive = (
        any(p in text for p in urdu_triggers)
        or re.search(r"\b(?:tha|thi)\b", text) is not None
    )
    if sentiment != "Neutral" and descriptive and score < 0.9:
        # descriptive statements, low emotional intensity
        return "Neutral", 0.7
    return sentiment, score
|
| 109 |
+
|
| 110 |
+
# -----------------------------
# Combine Roman Urdu & Urdu Models (Ensemble)
# -----------------------------
def ensemble_roman_urdu(text):
    """Run both Roman Urdu and Urdu pipelines and keep the stronger result.

    When both models agree on the normalized label, the raw result with the
    higher confidence wins. When they disagree, the Roman Urdu result wins
    only if its score, discounted by 10%, still beats the Urdu score — i.e.
    the Urdu model gets a slight tie-breaking advantage.

    NOTE(review): roman_urdu_model and urdu_model are loaded from the same
    checkpoint above, so both calls may return identical results — confirm
    whether a second, distinct model was intended.
    """
    ru = roman_urdu_model(text)[0]
    ur = urdu_model(text)[0]
    ru_sent, ur_sent = normalize_label(ru["label"]), normalize_label(ur["label"])
    if ru_sent == ur_sent:
        result = ru if ru["score"] >= ur["score"] else ur
    else:
        # disagreement: penalize the Roman Urdu score by 10% before comparing
        result = ru if ru["score"] * 0.9 >= ur["score"] else ur
    return result
|
| 122 |
+
|
| 123 |
# -----------------------------
|
| 124 |
# Main Sentiment Function
|
| 125 |
# -----------------------------
|
|
|
|
| 128 |
if not text.strip():
|
| 129 |
return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
|
| 130 |
|
| 131 |
+
# auto detect if needed
|
| 132 |
lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
|
| 133 |
|
| 134 |
+
# select & possibly normalize
|
| 135 |
if lang == "English":
|
| 136 |
result = english_model(text)[0]
|
| 137 |
elif lang == "Urdu":
|
| 138 |
result = urdu_model(text)[0]
|
| 139 |
+
else: # Roman Urdu
|
| 140 |
+
text = normalize_roman_urdu(text)
|
| 141 |
+
result = ensemble_roman_urdu(text)
|
| 142 |
|
| 143 |
+
# get normalized sentiment
|
| 144 |
sentiment = normalize_label(result["label"])
|
| 145 |
score = round(float(result["score"]), 3)
|
| 146 |
+
sentiment, score = adjust_for_neutral(text, sentiment, score)
|
| 147 |
explanation = sentiment_with_tips(sentiment)
|
| 148 |
|
| 149 |
+
# store results (thread-safe)
|
| 150 |
with FileLock(LOCK_FILE):
|
| 151 |
+
df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
|
| 152 |
+
if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
|
|
|
|
|
|
|
|
|
| 153 |
new_row = pd.DataFrame([[text, lang, sentiment, score]],
|
| 154 |
columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
| 155 |
df = pd.concat([df, new_row], ignore_index=True)
|
|
|
|
| 161 |
return f"⚠️ Error: {str(e)}", "", "", SAVE_FILE
|
| 162 |
|
| 163 |
# -----------------------------
# Show Logs
# -----------------------------
def show_logs(path=None):
    """Load the saved sentiment log as a DataFrame.

    Args:
        path: CSV file to read; defaults to the module-level SAVE_FILE
              (backward-compatible — existing callers pass nothing).
    Returns:
        DataFrame with columns Sentence/Language/Sentiment/Confidence;
        empty when the file is missing or has no parsable content.
    """
    if path is None:
        path = SAVE_FILE
    columns = ["Sentence", "Language", "Sentiment", "Confidence"]
    if not os.path.exists(path):
        return pd.DataFrame(columns=columns)
    try:
        return pd.read_csv(path, encoding="utf-8-sig")
    except pd.errors.EmptyDataError:
        # Robustness fix: a freshly created/zero-byte CSV would otherwise
        # crash read_csv; present it as an empty log instead.
        return pd.DataFrame(columns=columns)
|
| 171 |
|
|
|
|
| 174 |
# -----------------------------
|
| 175 |
with gr.Blocks() as demo:
|
| 176 |
gr.Markdown(
|
| 177 |
+
"## 🌍 Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)\n"
|
| 178 |
+
"Detect **Positive**, **Negative**, or **Neutral** tone with confidence score.\n\n"
|
| 179 |
+
"🪶 **Improvements:** refined Urdu/Roman Urdu detection, better Roman Urdu normalization, ensemble correction, and neutral balancing.\n\n"
|
| 180 |
+
"💾 All analyzed text is stored permanently in the same CSV, even across shared sessions."
|
| 181 |
)
|
| 182 |
|
| 183 |
with gr.Row():
|
| 184 |
with gr.Column():
|
| 185 |
+
user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type in English, Urdu, or Roman Urdu...")
|
| 186 |
lang_dropdown = gr.Dropdown(
|
| 187 |
["Auto Detect", "English", "Urdu", "Roman Urdu"],
|
| 188 |
+
value="Auto Detect", label="🌐 Language"
|
| 189 |
)
|
| 190 |
btn_analyze = gr.Button("🔍 Analyze Sentiment")
|
| 191 |
btn_show = gr.Button("📂 Show Saved Logs")
|
| 192 |
|
| 193 |
with gr.Column():
|
| 194 |
out_sent = gr.Textbox(label="Sentiment")
|
| 195 |
+
out_conf = gr.Textbox(label="Confidence (0–1)")
|
| 196 |
out_exp = gr.Textbox(label="Explanation")
|
| 197 |
+
out_file = gr.File(label="⬇️ Download Logs (.csv)", type="filepath")
|
| 198 |
|
| 199 |
logs_df = gr.Dataframe(
|
| 200 |
headers=["Sentence", "Language", "Sentiment", "Confidence"],
|
| 201 |
+
label="🧾 Sentiment Logs", interactive=False
|
|
|
|
| 202 |
)
|
| 203 |
|
| 204 |
btn_analyze.click(analyze_sentiment,
|