Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,18 +6,18 @@ import re
|
|
| 6 |
from filelock import FileLock
|
| 7 |
|
| 8 |
# -----------------------------
|
| 9 |
-
# Load Models
|
| 10 |
# -----------------------------
|
| 11 |
english_model = pipeline(
|
| 12 |
"sentiment-analysis",
|
| 13 |
model="siebert/sentiment-roberta-large-english"
|
| 14 |
)
|
| 15 |
|
|
|
|
| 16 |
urdu_model = pipeline(
|
| 17 |
"sentiment-analysis",
|
| 18 |
model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
|
| 19 |
)
|
| 20 |
-
|
| 21 |
roman_urdu_model = pipeline(
|
| 22 |
"sentiment-analysis",
|
| 23 |
model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
|
|
@@ -35,19 +35,47 @@ if not os.path.exists(SAVE_FILE):
|
|
| 35 |
)
|
| 36 |
|
| 37 |
# -----------------------------
|
| 38 |
-
# Language Detection
|
| 39 |
# -----------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def detect_language(text):
|
| 41 |
urdu_chars = set("ابتثجحخدذرزسشصضطظعغفقکلمنوہیءآؤئۀ")
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
return "Urdu"
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
| 46 |
return "Roman Urdu"
|
|
|
|
| 47 |
return "English"
|
| 48 |
|
| 49 |
# -----------------------------
|
| 50 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# -----------------------------
|
| 52 |
def normalize_label(label):
|
| 53 |
label = label.lower()
|
|
@@ -64,11 +92,34 @@ def normalize_label(label):
|
|
| 64 |
def sentiment_with_tips(sentiment):
|
| 65 |
tips = {
|
| 66 |
"Positive": "😊 Great! Keep spreading positivity.",
|
| 67 |
-
"Negative": "😞
|
| 68 |
-
"Neutral":
|
| 69 |
}
|
| 70 |
return tips.get(sentiment, "")
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
# -----------------------------
|
| 73 |
# Main Sentiment Function
|
| 74 |
# -----------------------------
|
|
@@ -77,29 +128,28 @@ def analyze_sentiment(text, lang_hint):
|
|
| 77 |
if not text.strip():
|
| 78 |
return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
|
| 79 |
|
| 80 |
-
#
|
| 81 |
lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
|
| 82 |
|
| 83 |
-
#
|
| 84 |
if lang == "English":
|
| 85 |
result = english_model(text)[0]
|
| 86 |
elif lang == "Urdu":
|
| 87 |
result = urdu_model(text)[0]
|
| 88 |
-
else:
|
| 89 |
-
|
|
|
|
| 90 |
|
| 91 |
-
#
|
| 92 |
sentiment = normalize_label(result["label"])
|
| 93 |
score = round(float(result["score"]), 3)
|
|
|
|
| 94 |
explanation = sentiment_with_tips(sentiment)
|
| 95 |
|
| 96 |
-
#
|
| 97 |
with FileLock(LOCK_FILE):
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
else:
|
| 101 |
-
df = pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
| 102 |
-
|
| 103 |
new_row = pd.DataFrame([[text, lang, sentiment, score]],
|
| 104 |
columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
| 105 |
df = pd.concat([df, new_row], ignore_index=True)
|
|
@@ -111,12 +161,11 @@ def analyze_sentiment(text, lang_hint):
|
|
| 111 |
return f"⚠️ Error: {str(e)}", "", "", SAVE_FILE
|
| 112 |
|
| 113 |
# -----------------------------
|
| 114 |
-
#
|
| 115 |
# -----------------------------
|
| 116 |
def show_logs():
|
| 117 |
if os.path.exists(SAVE_FILE):
|
| 118 |
-
|
| 119 |
-
return df
|
| 120 |
else:
|
| 121 |
return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
| 122 |
|
|
@@ -125,31 +174,31 @@ def show_logs():
|
|
| 125 |
# -----------------------------
|
| 126 |
with gr.Blocks() as demo:
|
| 127 |
gr.Markdown(
|
| 128 |
-
"## 🌍 Multilingual Sentiment Analysis (English
|
| 129 |
-
"
|
| 130 |
-
"
|
|
|
|
| 131 |
)
|
| 132 |
|
| 133 |
with gr.Row():
|
| 134 |
with gr.Column():
|
| 135 |
-
user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type in English, Urdu, or Roman
|
| 136 |
lang_dropdown = gr.Dropdown(
|
| 137 |
["Auto Detect", "English", "Urdu", "Roman Urdu"],
|
| 138 |
-
|
| 139 |
)
|
| 140 |
btn_analyze = gr.Button("🔍 Analyze Sentiment")
|
| 141 |
btn_show = gr.Button("📂 Show Saved Logs")
|
| 142 |
|
| 143 |
with gr.Column():
|
| 144 |
out_sent = gr.Textbox(label="Sentiment")
|
| 145 |
-
out_conf = gr.Textbox(label="Confidence
|
| 146 |
out_exp = gr.Textbox(label="Explanation")
|
| 147 |
-
out_file = gr.File(label="⬇️ Download
|
| 148 |
|
| 149 |
logs_df = gr.Dataframe(
|
| 150 |
headers=["Sentence", "Language", "Sentiment", "Confidence"],
|
| 151 |
-
label="🧾
|
| 152 |
-
interactive=False
|
| 153 |
)
|
| 154 |
|
| 155 |
btn_analyze.click(analyze_sentiment,
|
|
|
|
| 6 |
from filelock import FileLock
|
| 7 |
|
| 8 |
# -----------------------------
# Load Transformer Models
# -----------------------------
# Sentiment pipeline for English text.
english_model = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english"
)

# same model but we'll ensemble results for Roman+Urdu
# NOTE(review): this checkpoint is also loaded below as roman_urdu_model, so
# the downstream "ensemble" compares two runs of the same model — confirm
# whether a distinct Roman Urdu checkpoint was intended.
urdu_model = pipeline(
    "sentiment-analysis",
    model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
)
|
|
|
|
| 21 |
roman_urdu_model = pipeline(
|
| 22 |
"sentiment-analysis",
|
| 23 |
model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
|
|
|
|
| 35 |
)
|
| 36 |
|
| 37 |
# -----------------------------
# Improved Language Detection
# -----------------------------
# Frequent Roman Urdu tokens used as detection cues. Some entries (e.g.
# "din", "ka") are also valid English letter sequences, so a single hit
# flags Roman Urdu aggressively — by design.
roman_urdu_keywords = {
    "acha", "bura", "ganda", "din", "zabardast", "bohot", "pyar",
    "parhai", "ustad", "kyun", "nahi", "hai", "tha", "karta", "kar",
    "mera", "tera", "tum", "ka", "kaisa", "raha", "guzra", "galat"
}

def detect_language(text):
    """Classify *text* as "Urdu", "Roman Urdu", or "English".

    Rule 1: any Arabic-script Urdu character -> "Urdu".
    Rule 2: any Roman Urdu keyword among the tokens -> "Roman Urdu".
    Otherwise -> "English" (including empty input).
    """
    urdu_chars = set("ابتثجحخدذرزسشصضطظعغفقکلمنوہیءآؤئۀ")
    # Keep only Latin letters and the Arabic Unicode block; everything
    # else collapses to single spaces so tokenization is clean.
    clean = re.sub(r"[^A-Za-z\u0600-\u06FF]+", " ", text)

    # rule 1: actual Urdu characters
    if any(ch in urdu_chars for ch in clean):
        return "Urdu"

    # rule 2: Roman Urdu keyword presence.
    # Fix: the original condition `hits / max(len(tokens), 1) > 0.2 or
    # hits > 0` made the ratio test dead code — any positive hit count
    # already satisfies `hits > 0` — so it reduces to a presence check.
    tokens = clean.lower().split()
    roman_hits = sum(w in roman_urdu_keywords for w in tokens)
    if roman_hits > 0:
        return "Roman Urdu"

    return "English"
|
| 60 |
|
| 61 |
# -----------------------------
|
| 62 |
+
# Roman Urdu Normalization
|
| 63 |
+
# -----------------------------
|
| 64 |
+
def normalize_roman_urdu(text):
|
| 65 |
+
replacements = {
|
| 66 |
+
"acha ni": "acha nahi",
|
| 67 |
+
"acha nai": "acha nahi",
|
| 68 |
+
"ganda hy": "ganda hai",
|
| 69 |
+
"bura hy": "bura hai",
|
| 70 |
+
"ni": "nahi",
|
| 71 |
+
"nai": "nahi",
|
| 72 |
+
}
|
| 73 |
+
for k, v in replacements.items():
|
| 74 |
+
text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
|
| 75 |
+
return text
|
| 76 |
+
|
| 77 |
+
# -----------------------------
|
| 78 |
+
# Label Normalization
|
| 79 |
# -----------------------------
|
| 80 |
def normalize_label(label):
|
| 81 |
label = label.lower()
|
|
|
|
| 92 |
def sentiment_with_tips(sentiment):
    """Return a short coaching tip for a normalized sentiment label.

    Unknown labels yield an empty string.
    """
    if sentiment == "Positive":
        return "😊 Great! Keep spreading positivity."
    if sentiment == "Negative":
        return "😞 Looks negative — maybe reflect and improve things."
    if sentiment == "Neutral":
        return "😐 Neutral observation — balanced view."
    return ""
|
| 99 |
|
| 100 |
+
# -----------------------------
# Neutral Adjuster (Urdu/Descriptive)
# -----------------------------
def adjust_for_neutral(text, sentiment, score):
    """Downgrade low-confidence results on descriptive sentences to Neutral.

    Sentences containing plain Urdu copula phrases ("ہے", "ہو رہی ہے", ...)
    or the Roman Urdu past-tense markers "tha"/"thi" are treated as
    descriptive rather than emotional; when the model's confidence is
    below 0.9, the result is forced to ("Neutral", 0.7).

    Args:
        text: the analyzed sentence.
        sentiment: normalized label ("Positive"/"Negative"/"Neutral").
        score: model confidence in [0, 1].
    Returns:
        (sentiment, score) tuple, possibly adjusted.
    """
    urdu_triggers = ["ہورہی ہے", "ہو رہی ہے", "ہے"]
    # Bug fix: the original matched "tha"/"thi" as bare substrings, which
    # falsely fired on English words such as "that" or "think". Match them
    # as whole words instead; Urdu phrases keep substring matching since
    # they already include spacing context.
    descriptive = (
        any(p in text for p in urdu_triggers)
        or re.search(r"\b(?:tha|thi)\b", text) is not None
    )
    if sentiment != "Neutral" and descriptive and score < 0.9:
        # descriptive statements, low emotional intensity
        return "Neutral", 0.7
    return sentiment, score
|
| 109 |
+
|
| 110 |
+
# -----------------------------
# Combine Roman Urdu & Urdu Models (Ensemble)
# -----------------------------
def ensemble_roman_urdu(text):
    """Run both Roman Urdu and Urdu pipelines and keep the stronger result.

    When both models agree on the normalized label, the raw result with the
    higher confidence wins. When they disagree, the Roman Urdu result wins
    only if its score, discounted by 10%, still beats the Urdu score — i.e.
    the Urdu model gets a slight tie-breaking advantage.

    NOTE(review): roman_urdu_model and urdu_model are loaded from the same
    checkpoint above, so both calls may return identical results — confirm
    whether a second, distinct model was intended.
    """
    ru = roman_urdu_model(text)[0]
    ur = urdu_model(text)[0]
    ru_sent, ur_sent = normalize_label(ru["label"]), normalize_label(ur["label"])
    if ru_sent == ur_sent:
        result = ru if ru["score"] >= ur["score"] else ur
    else:
        # disagreement: penalize the Roman Urdu score by 10% before comparing
        result = ru if ru["score"] * 0.9 >= ur["score"] else ur
    return result
|
| 122 |
+
|
| 123 |
# -----------------------------
|
| 124 |
# Main Sentiment Function
|
| 125 |
# -----------------------------
|
|
|
|
| 128 |
if not text.strip():
|
| 129 |
return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
|
| 130 |
|
| 131 |
+
# auto detect if needed
|
| 132 |
lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
|
| 133 |
|
| 134 |
+
# select & possibly normalize
|
| 135 |
if lang == "English":
|
| 136 |
result = english_model(text)[0]
|
| 137 |
elif lang == "Urdu":
|
| 138 |
result = urdu_model(text)[0]
|
| 139 |
+
else: # Roman Urdu
|
| 140 |
+
text = normalize_roman_urdu(text)
|
| 141 |
+
result = ensemble_roman_urdu(text)
|
| 142 |
|
| 143 |
+
# get normalized sentiment
|
| 144 |
sentiment = normalize_label(result["label"])
|
| 145 |
score = round(float(result["score"]), 3)
|
| 146 |
+
sentiment, score = adjust_for_neutral(text, sentiment, score)
|
| 147 |
explanation = sentiment_with_tips(sentiment)
|
| 148 |
|
| 149 |
+
# store results (thread-safe)
|
| 150 |
with FileLock(LOCK_FILE):
|
| 151 |
+
df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
|
| 152 |
+
if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
|
|
|
|
|
|
|
|
|
| 153 |
new_row = pd.DataFrame([[text, lang, sentiment, score]],
|
| 154 |
columns=["Sentence", "Language", "Sentiment", "Confidence"])
|
| 155 |
df = pd.concat([df, new_row], ignore_index=True)
|
|
|
|
| 161 |
return f"⚠️ Error: {str(e)}", "", "", SAVE_FILE
|
| 162 |
|
| 163 |
# -----------------------------
# Show Logs
# -----------------------------
def show_logs(path=None):
    """Load the saved sentiment log as a DataFrame.

    Args:
        path: CSV file to read; defaults to the module-level SAVE_FILE
              (backward-compatible — existing callers pass nothing).
    Returns:
        DataFrame with columns Sentence/Language/Sentiment/Confidence;
        empty when the file is missing or has no parsable content.
    """
    if path is None:
        path = SAVE_FILE
    columns = ["Sentence", "Language", "Sentiment", "Confidence"]
    if not os.path.exists(path):
        return pd.DataFrame(columns=columns)
    try:
        return pd.read_csv(path, encoding="utf-8-sig")
    except pd.errors.EmptyDataError:
        # Robustness fix: a freshly created/zero-byte CSV would otherwise
        # crash read_csv; present it as an empty log instead.
        return pd.DataFrame(columns=columns)
|
| 171 |
|
|
|
|
| 174 |
# -----------------------------
|
| 175 |
with gr.Blocks() as demo:
|
| 176 |
gr.Markdown(
|
| 177 |
+
"## 🌍 Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)\n"
|
| 178 |
+
"Detect **Positive**, **Negative**, or **Neutral** tone with confidence score.\n\n"
|
| 179 |
+
"🪶 **Improvements:** refined Urdu/Roman Urdu detection, better Roman Urdu normalization, ensemble correction, and neutral balancing.\n\n"
|
| 180 |
+
"💾 All analyzed text is stored permanently in the same CSV, even across shared sessions."
|
| 181 |
)
|
| 182 |
|
| 183 |
with gr.Row():
|
| 184 |
with gr.Column():
|
| 185 |
+
user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type in English, Urdu, or Roman Urdu...")
|
| 186 |
lang_dropdown = gr.Dropdown(
|
| 187 |
["Auto Detect", "English", "Urdu", "Roman Urdu"],
|
| 188 |
+
value="Auto Detect", label="🌐 Language"
|
| 189 |
)
|
| 190 |
btn_analyze = gr.Button("🔍 Analyze Sentiment")
|
| 191 |
btn_show = gr.Button("📂 Show Saved Logs")
|
| 192 |
|
| 193 |
with gr.Column():
|
| 194 |
out_sent = gr.Textbox(label="Sentiment")
|
| 195 |
+
out_conf = gr.Textbox(label="Confidence (0–1)")
|
| 196 |
out_exp = gr.Textbox(label="Explanation")
|
| 197 |
+
out_file = gr.File(label="⬇️ Download Logs (.csv)", type="filepath")
|
| 198 |
|
| 199 |
logs_df = gr.Dataframe(
|
| 200 |
headers=["Sentence", "Language", "Sentiment", "Confidence"],
|
| 201 |
+
label="🧾 Sentiment Logs", interactive=False
|
|
|
|
| 202 |
)
|
| 203 |
|
| 204 |
btn_analyze.click(analyze_sentiment,
|