tahamueed23 committed on
Commit
7373e67
·
verified ·
1 Parent(s): 23ba503

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -32
app.py CHANGED
@@ -6,18 +6,18 @@ import re
6
  from filelock import FileLock
7
 
8
  # -----------------------------
9
- # Load Models
10
  # -----------------------------
11
  english_model = pipeline(
12
  "sentiment-analysis",
13
  model="siebert/sentiment-roberta-large-english"
14
  )
15
 
 
16
  urdu_model = pipeline(
17
  "sentiment-analysis",
18
  model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
19
  )
20
-
21
  roman_urdu_model = pipeline(
22
  "sentiment-analysis",
23
  model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
@@ -35,19 +35,47 @@ if not os.path.exists(SAVE_FILE):
35
  )
36
 
37
  # -----------------------------
38
- # Language Detection (rule-based)
39
  # -----------------------------
 
 
 
 
 
 
40
  def detect_language(text):
41
  urdu_chars = set("ابتثجحخدذرزسشصضطظعغفقکلمنوہیءآؤئۀ")
42
- if any(ch in urdu_chars for ch in text):
 
 
43
  return "Urdu"
44
- roman_urdu_pattern = r"\b(hai|kia|kyun|nahi|bohot|acha|galat|sahi|parhai|ustad|pyar|dil|insaan)\b"
45
- if re.search(roman_urdu_pattern, text.lower()):
 
 
 
46
  return "Roman Urdu"
 
47
  return "English"
48
 
49
  # -----------------------------
50
- # Normalize Sentiment Labels
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # -----------------------------
52
  def normalize_label(label):
53
  label = label.lower()
@@ -64,11 +92,34 @@ def normalize_label(label):
64
  def sentiment_with_tips(sentiment):
65
  tips = {
66
  "Positive": "😊 Great! Keep spreading positivity.",
67
- "Negative": "😞 It seems negative. Try to focus on solutions.",
68
- "Neutral": "😐 Neutral feeling — balanced perspective."
69
  }
70
  return tips.get(sentiment, "")
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  # -----------------------------
73
  # Main Sentiment Function
74
  # -----------------------------
@@ -77,29 +128,28 @@ def analyze_sentiment(text, lang_hint):
77
  if not text.strip():
78
  return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
79
 
80
- # Auto detect if language hint not selected
81
  lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
82
 
83
- # Select correct model
84
  if lang == "English":
85
  result = english_model(text)[0]
86
  elif lang == "Urdu":
87
  result = urdu_model(text)[0]
88
- else:
89
- result = roman_urdu_model(text)[0]
 
90
 
91
- # Process results
92
  sentiment = normalize_label(result["label"])
93
  score = round(float(result["score"]), 3)
 
94
  explanation = sentiment_with_tips(sentiment)
95
 
96
- # Thread-safe CSV append
97
  with FileLock(LOCK_FILE):
98
- if os.path.exists(SAVE_FILE):
99
- df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
100
- else:
101
- df = pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
102
-
103
  new_row = pd.DataFrame([[text, lang, sentiment, score]],
104
  columns=["Sentence", "Language", "Sentiment", "Confidence"])
105
  df = pd.concat([df, new_row], ignore_index=True)
@@ -111,12 +161,11 @@ def analyze_sentiment(text, lang_hint):
111
  return f"⚠️ Error: {str(e)}", "", "", SAVE_FILE
112
 
113
  # -----------------------------
114
- # View Logs Function
115
  # -----------------------------
116
  def show_logs():
117
  if os.path.exists(SAVE_FILE):
118
- df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
119
- return df
120
  else:
121
  return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
122
 
@@ -125,31 +174,31 @@ def show_logs():
125
  # -----------------------------
126
  with gr.Blocks() as demo:
127
  gr.Markdown(
128
- "## 🌍 Multilingual Sentiment Analysis (English Urdu Roman Urdu)\n"
129
- "Analyze text sentiment as **Positive**, **Neutral**, or **Negative** with confidence scores.\n\n"
130
- "💾 Sentiments are stored permanently visible to everyone sharing this Space!"
 
131
  )
132
 
133
  with gr.Row():
134
  with gr.Column():
135
- user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type in English, Urdu, or Roman Urdu...")
136
  lang_dropdown = gr.Dropdown(
137
  ["Auto Detect", "English", "Urdu", "Roman Urdu"],
138
- label="🌐 Language", value="Auto Detect"
139
  )
140
  btn_analyze = gr.Button("🔍 Analyze Sentiment")
141
  btn_show = gr.Button("📂 Show Saved Logs")
142
 
143
  with gr.Column():
144
  out_sent = gr.Textbox(label="Sentiment")
145
- out_conf = gr.Textbox(label="Confidence (0–1)")
146
  out_exp = gr.Textbox(label="Explanation")
147
- out_file = gr.File(label="⬇️ Download Logs (.csv)", type="filepath")
148
 
149
  logs_df = gr.Dataframe(
150
  headers=["Sentence", "Language", "Sentiment", "Confidence"],
151
- label="🧾 Sentiment Logs",
152
- interactive=False
153
  )
154
 
155
  btn_analyze.click(analyze_sentiment,
 
6
  from filelock import FileLock
7
 
8
  # -----------------------------
9
+ # Load Transformer Models
10
  # -----------------------------
11
# Hugging Face Hub checkpoint ids used by the sentiment pipelines below.
_ENGLISH_CHECKPOINT = "siebert/sentiment-roberta-large-english"
_URDU_CHECKPOINT = "tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"

# English sentiment classifier.
english_model = pipeline("sentiment-analysis", model=_ENGLISH_CHECKPOINT)

# Urdu classifier; its prediction is also combined with the Roman Urdu
# pipeline's prediction when Roman Urdu text is analysed.
urdu_model = pipeline("sentiment-analysis", model=_URDU_CHECKPOINT)
 
21
  roman_urdu_model = pipeline(
22
  "sentiment-analysis",
23
  model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
 
35
  )
36
 
37
  # -----------------------------
38
+ # Improved Language Detection
39
  # -----------------------------
40
# Lower-case Roman Urdu words; a single hit marks the text as Roman Urdu.
roman_urdu_keywords = {
    "acha", "bura", "ganda", "din", "zabardast", "bohot", "pyar",
    "parhai", "ustad", "kyun", "nahi", "hai", "tha", "karta", "kar",
    "mera", "tera", "tum", "ka", "kaisa", "raha", "guzra", "galat",
}


def _is_urdu_letter(ch):
    """Return True if *ch* is a letter in the Arabic Unicode block (U+0600-U+06FF)."""
    # isalpha() filters out the digits, punctuation and combining marks that
    # share the block, so only real letters count as Urdu script.
    return "\u0600" <= ch <= "\u06FF" and ch.isalpha()


def detect_language(text):
    """Classify *text* as "Urdu", "Roman Urdu" or "English" using simple rules.

    Args:
        text: The sentence to classify.

    Returns:
        "Urdu" if the text contains any Arabic-script letter, otherwise
        "Roman Urdu" if any Latin-script word is in ``roman_urdu_keywords``,
        otherwise "English" (which is also the result for empty input).
    """
    # Rule 1: any Arabic-script letter. A range check is used instead of a
    # hand-written letter list, which missed letters such as پ چ ڈ ٹ ڑ ژ گ ں ھ ے.
    if any(_is_urdu_letter(ch) for ch in text):
        return "Urdu"

    # Rule 2: at least one Roman Urdu keyword. The earlier hit-ratio test was
    # always subsumed by "one hit is enough", so only that check remains.
    # Only Latin letters form tokens, so adjacent punctuation cannot hide a word.
    latin_words = re.findall(r"[A-Za-z]+", text)
    if any(word.lower() in roman_urdu_keywords for word in latin_words):
        return "Roman Urdu"

    return "English"
60
 
61
  # -----------------------------
62
+ # Roman Urdu Normalization
63
+ # -----------------------------
64
# Informal Roman Urdu spellings mapped to their canonical form. Multi-word
# phrases come first so they win over the single-word rules at the same spot.
_ROMAN_URDU_REPLACEMENTS = {
    "acha ni": "acha nahi",
    "acha nai": "acha nahi",
    "ganda hy": "ganda hai",
    "bura hy": "bura hai",
    "ni": "nahi",
    "nai": "nahi",
}
_ROMAN_URDU_CANONICAL = tuple(_ROMAN_URDU_REPLACEMENTS.values())

# One capture group per variant: the index of the group that matched selects
# the replacement, so no case-folded dictionary lookup is needed.
_ROMAN_URDU_VARIANT_RE = re.compile(
    r"\b(?:"
    + "|".join(f"({re.escape(variant)})" for variant in _ROMAN_URDU_REPLACEMENTS)
    + r")\b",
    flags=re.IGNORECASE,
)


def normalize_roman_urdu(text):
    """Rewrite common informal Roman Urdu spellings to a canonical form.

    Matching is case-insensitive and limited to whole words; the replacement
    is always the lower-case canonical spelling.

    Args:
        text: Roman Urdu input text.

    Returns:
        The text with every known variant replaced. Text without variants is
        returned unchanged.
    """
    # Single pass over the input, so a replacement's output is never
    # re-examined by a later rule.
    return _ROMAN_URDU_VARIANT_RE.sub(
        lambda match: _ROMAN_URDU_CANONICAL[match.lastindex - 1], text
    )
76
+
77
+ # -----------------------------
78
+ # Label Normalization
79
  # -----------------------------
80
  def normalize_label(label):
81
  label = label.lower()
 
92
def sentiment_with_tips(sentiment):
    """Return the short advice message shown for a normalized sentiment label.

    Args:
        sentiment: One of "Positive", "Negative" or "Neutral".

    Returns:
        The matching tip, or an empty string for any other label.
    """
    advice_by_sentiment = dict(
        Positive="😊 Great! Keep spreading positivity.",
        Negative="😞 Looks negative maybe reflect and improve things.",
        Neutral="😐 Neutral observation — balanced view.",
    )
    try:
        return advice_by_sentiment[sentiment]
    except KeyError:
        # Unknown label: no tip to show.
        return ""
99
 
100
+ # -----------------------------
101
+ # Neutral Adjuster (Urdu/Descriptive)
102
+ # -----------------------------
103
# Phrases that typically mark a plain descriptive statement.
_NEUTRAL_TRIGGERS = ("ہورہی ہے", "ہو رہی ہے", "ہے", "tha", "thi")

# Whole-word match only: the lookarounds stop "tha"/"thi" from firing inside
# unrelated words such as "thank", "that", "this" or "think".
_NEUTRAL_TRIGGER_RE = re.compile(
    r"(?<!\w)(?:" + "|".join(re.escape(t) for t in _NEUTRAL_TRIGGERS) + r")(?!\w)",
    flags=re.IGNORECASE,
)


def adjust_for_neutral(text, sentiment, score, *, threshold=0.9, neutral_score=0.7):
    """Downgrade a low-confidence prediction on descriptive text to Neutral.

    Args:
        text: The analysed sentence.
        sentiment: Normalized label predicted by the model.
        score: Model confidence for ``sentiment``.
        threshold: Predictions with a score below this value may be
            overridden; scores at or above it are always kept.
        neutral_score: Confidence reported when the label is overridden.

    Returns:
        A ``(sentiment, score)`` tuple: ``("Neutral", neutral_score)`` when the
        label is not already Neutral, the text contains a trigger phrase as a
        whole word and ``score < threshold``; otherwise the inputs unchanged.
    """
    if sentiment == "Neutral" or score >= threshold:
        return sentiment, score
    if _NEUTRAL_TRIGGER_RE.search(text):
        # Descriptive statement with low emotional intensity.
        return "Neutral", neutral_score
    return sentiment, score
109
+
110
+ # -----------------------------
111
+ # Combine Roman Urdu & Urdu Models (Ensemble)
112
+ # -----------------------------
113
def ensemble_roman_urdu(text):
    """Pick one prediction for *text* from the Roman Urdu and Urdu pipelines.

    Both pipelines are run on the same text. When their normalized labels
    agree, the higher-scoring prediction is returned. When they disagree, the
    Roman Urdu score is scaled by 0.9 before the comparison. Ties go to the
    Roman Urdu prediction.

    Args:
        text: The sentence to classify.

    Returns:
        The winning raw prediction dict as produced by the pipeline
        (read here through its "label" and "score" keys).
    """
    roman_pred = roman_urdu_model(text)[0]
    urdu_pred = urdu_model(text)[0]

    roman_label = normalize_label(roman_pred["label"])
    urdu_label = normalize_label(urdu_pred["label"])

    if roman_label == urdu_label:
        roman_wins = roman_pred["score"] >= urdu_pred["score"]
    else:
        # Disagreement: discount the Roman Urdu score by 10% first.
        roman_wins = roman_pred["score"] * 0.9 >= urdu_pred["score"]

    if roman_wins:
        return roman_pred
    return urdu_pred
122
+
123
  # -----------------------------
124
  # Main Sentiment Function
125
  # -----------------------------
 
128
  if not text.strip():
129
  return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
130
 
131
+ # auto detect if needed
132
  lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
133
 
134
+ # select & possibly normalize
135
  if lang == "English":
136
  result = english_model(text)[0]
137
  elif lang == "Urdu":
138
  result = urdu_model(text)[0]
139
+ else: # Roman Urdu
140
+ text = normalize_roman_urdu(text)
141
+ result = ensemble_roman_urdu(text)
142
 
143
+ # get normalized sentiment
144
  sentiment = normalize_label(result["label"])
145
  score = round(float(result["score"]), 3)
146
+ sentiment, score = adjust_for_neutral(text, sentiment, score)
147
  explanation = sentiment_with_tips(sentiment)
148
 
149
+ # store results (thread-safe)
150
  with FileLock(LOCK_FILE):
151
+ df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
152
+ if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
 
 
 
153
  new_row = pd.DataFrame([[text, lang, sentiment, score]],
154
  columns=["Sentence", "Language", "Sentiment", "Confidence"])
155
  df = pd.concat([df, new_row], ignore_index=True)
 
161
  return f"⚠️ Error: {str(e)}", "", "", SAVE_FILE
162
 
163
  # -----------------------------
164
+ # Show Logs
165
  # -----------------------------
166
def show_logs():
    """Return the saved sentiment log as a DataFrame.

    Returns:
        The contents of ``SAVE_FILE`` read as UTF-8 (with BOM) CSV, or an
        empty frame with the four log columns when the file does not exist.
    """
    if not os.path.exists(SAVE_FILE):
        # Nothing logged yet: hand back an empty table with the expected headers.
        return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
    return pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
171
 
 
174
  # -----------------------------
175
  with gr.Blocks() as demo:
176
  gr.Markdown(
177
+ "## 🌍 Multilingual Sentiment Analysis (EnglishUrduRomanUrdu)\n"
178
+ "Detect **Positive**, **Negative**, or **Neutral** tone with confidence score.\n\n"
179
+ "🪶 **Improvements:** refined Urdu/Roman Urdu detection, better Roman Urdu normalization, ensemble correction, and neutral balancing.\n\n"
180
+ "💾 All analyzed text is stored permanently in the same CSV, even across shared sessions."
181
  )
182
 
183
  with gr.Row():
184
  with gr.Column():
185
+ user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type in English, Urdu, or RomanUrdu...")
186
  lang_dropdown = gr.Dropdown(
187
  ["Auto Detect", "English", "Urdu", "Roman Urdu"],
188
+ value="Auto Detect", label="🌐 Language"
189
  )
190
  btn_analyze = gr.Button("🔍 Analyze Sentiment")
191
  btn_show = gr.Button("📂 Show Saved Logs")
192
 
193
  with gr.Column():
194
  out_sent = gr.Textbox(label="Sentiment")
195
+ out_conf = gr.Textbox(label="Confidence(0–1)")
196
  out_exp = gr.Textbox(label="Explanation")
197
+ out_file = gr.File(label="⬇️ DownloadLogs(.csv)", type="filepath")
198
 
199
  logs_df = gr.Dataframe(
200
  headers=["Sentence", "Language", "Sentiment", "Confidence"],
201
+ label="🧾SentimentLogs", interactive=False
 
202
  )
203
 
204
  btn_analyze.click(analyze_sentiment,