RaymondWongWL committed on
Commit
c96129a
·
1 Parent(s): da9228c

feat: add sentiment score for top entities for summary page

Browse files
Files changed (1) hide show
  1. app/controllers/summary/utils.py +78 -36
app/controllers/summary/utils.py CHANGED
@@ -8,6 +8,47 @@ from collections import defaultdict
8
 
9
  from database import article_collection, entity_collection
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def _get_latest_publish_date_from_collection(collection) -> datetime:
13
  """Return the latest publish date found in the specified collection.
@@ -114,7 +155,7 @@ def get_content_flow_data(time_filter: str) -> Dict[str, Any]:
114
 
115
 
116
  def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
117
- """Return *Entity Analysis* data for the given period.
118
 
119
  Uses rolling window approach:
120
  - today: only the latest date
@@ -129,59 +170,60 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
129
  Returns
130
  -------
131
  Dict[str, Any]
132
- Dictionary containing title, dateRange, and aggregated entity analysis data.
133
  """
134
  start, end = _time_range(time_filter, entity_collection)
135
 
136
- pipeline = [
 
137
  {"$match": {"publishDate": {"$gte": start, "$lte": end}}},
138
  {"$group": {"_id": {"entity": "$entity", "type": "$entityType"},
139
  "mentions": {"$sum": "$occurrence"}}},
140
  {"$sort": {"mentions": -1}},
141
  ]
142
 
143
- results = list(entity_collection.aggregate(pipeline))
144
-
145
- type_full_names = {
146
- "GPE": "Geopolitical Entities (Countries/Cities)",
147
- "LOC": "Locations (Non-political)",
148
- "ORG": "Organizations",
149
- "PER": "People",
150
- "PERSON": "People",
151
- "PROD": "Products",
152
- "PRODUCT": "Products",
153
- "PRODCAT": "Product Categories",
154
- "PRODUCT_CATEGORY": "Product Categories",
155
- "COM": "Companies",
156
- "EVENT": "Events",
157
- "LANGUAGE": "Languages",
158
- "NORP": "Nationalities/Religious/Political Groups",
159
- "LAW": "Laws/Legal Documents",
160
- "FAC": "Facilities/Landmarks",
161
- "INS": "Industry Institutions",
162
- }
163
 
164
  entity_types: Dict[str, Any] = {}
165
- for r in results:
166
- e_type = r["_id"]["type"]
167
- entity_name = r["_id"]["entity"].replace("_", " ")
168
- if e_type not in entity_types:
169
- entity_types[e_type] = {
170
- "fullName": type_full_names.get(e_type, e_type),
171
  "entities": [],
172
  }
173
- entity_types[e_type]["entities"].append(
174
- {"entityName": entity_name, "mentions": r["mentions"]}
 
175
  )
176
 
177
- # keep only the top 10 per type
178
- for type_data in entity_types.values():
179
- type_data["entities"] = sorted(
180
- type_data["entities"], key=lambda x: -x["mentions"]
181
  )[:10]
182
 
183
  return {
184
- "title": f"Top Entities {time_filter.capitalize()}",
185
  "dateRange": {"start": start, "end": end},
186
  "data": entity_types,
187
  }
 
8
 
9
  from database import article_collection, entity_collection
10
 
11
+ # Maps entity-type codes to the display names returned as "fullName"; codes not listed here fall back to the raw code at the .get() lookup site.
12
+ ENTITY_TYPE_FULL_NAMES = {
13
+ "GPE": "Geopolitical Entities (Countries/Cities)",
14
+ "LOC": "Locations (Non-political)",
15
+ "ORG": "Organizations",
16
+ "PER": "People",
17
+ "PERSON": "People",
18
+ "PROD": "Products",
19
+ "PRODUCT": "Products",
20
+ "PRODCAT": "Product Categories",
21
+ "PRODUCT_CATEGORY": "Product Categories",
22
+ "COM": "Companies",
23
+ "EVENT": "Events",
24
+ "LANGUAGE": "Languages",
25
+ "NORP": "Nationalities/Religious/Political Groups",
26
+ "LAW": "Laws/Legal Documents",
27
+ "FAC": "Facilities/Landmarks",
28
+ "INS": "Industry Institutions",
29
+ }
30
+
31
+
32
+ def _build_sentiment_lookup(sentiment_results: list) -> Dict:
33
+ """Build sentiment lookup dictionary from sentiment aggregation results."""
34
+ sentiment_lookup = {}
35
+ for result in sentiment_results:
36
+ key = (result["_id"]["entity"], result["_id"]["type"])
37
+ sentiment_lookup[key] = round(result["avgSentiment"], 3)
38
+ return sentiment_lookup
39
+
40
+
41
+ def _process_entity_with_sentiment(mentions_result: Dict, sentiment_lookup: Dict) -> Dict[str, Any]:
42
+ """Process a single entity result and add sentiment information."""
43
+ entity_id = mentions_result["_id"]
44
+ entity_key = (entity_id["entity"], entity_id["type"])
45
+
46
+ return {
47
+ "entityName": entity_id["entity"].replace("_", " "),
48
+ "mentions": mentions_result["mentions"],
49
+ "sentiment": sentiment_lookup.get(entity_key)
50
+ }
51
+
52
 
53
  def _get_latest_publish_date_from_collection(collection) -> datetime:
54
  """Return the latest publish date found in the specified collection.
 
155
 
156
 
157
  def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
158
+ """Return *Entity Analysis* data for the given period with sentiment information.
159
 
160
  Uses rolling window approach:
161
  - today: only the latest date
 
170
  Returns
171
  -------
172
  Dict[str, Any]
173
+ Dictionary containing title, dateRange, and aggregated entity analysis data with sentiment.
174
  """
175
  start, end = _time_range(time_filter, entity_collection)
176
 
177
+ # Get mentions count pipeline
178
+ mentions_pipeline = [
179
  {"$match": {"publishDate": {"$gte": start, "$lte": end}}},
180
  {"$group": {"_id": {"entity": "$entity", "type": "$entityType"},
181
  "mentions": {"$sum": "$occurrence"}}},
182
  {"$sort": {"mentions": -1}},
183
  ]
184
 
185
+ # Get sentiment data pipeline
186
+ sentiment_pipeline = [
187
+ {"$match": {
188
+ "publishDate": {"$gte": start, "$lte": end},
189
+ "sentimentScore": {"$exists": True, "$ne": None}
190
+ }},
191
+ {"$group": {
192
+ "_id": {"entity": "$entity", "type": "$entityType"},
193
+ "avgSentiment": {"$avg": "$sentimentScore"},
194
+
195
+ }}
196
+ ]
197
+
198
+ mentions_results = list(entity_collection.aggregate(mentions_pipeline))
199
+ sentiment_results = list(entity_collection.aggregate(sentiment_pipeline))
200
+
201
+ sentiment_lookup = _build_sentiment_lookup(sentiment_results)
202
+
203
+
 
204
 
205
  entity_types: Dict[str, Any] = {}
206
+ for mentions_result in mentions_results:
207
+ entity_type = mentions_result["_id"]["type"]
208
+
209
+ if entity_type not in entity_types:
210
+ entity_types[entity_type] = {
211
+ "fullName": ENTITY_TYPE_FULL_NAMES.get(entity_type, entity_type),
212
  "entities": [],
213
  }
214
+
215
+ entity_types[entity_type]["entities"].append(
216
+ _process_entity_with_sentiment(mentions_result, sentiment_lookup)
217
  )
218
 
219
+ # Keep only the top 10 per type
220
+ for entity_data in entity_types.values():
221
+ entity_data["entities"] = sorted(
222
+ entity_data["entities"], key=lambda x: -x["mentions"]
223
  )[:10]
224
 
225
  return {
226
+ "title": f"Top Entities - {time_filter.capitalize()}",
227
  "dateRange": {"start": start, "end": end},
228
  "data": entity_types,
229
  }