Commit
·
c96129a
1
Parent(s):
da9228c
feat: add sentiment score for top entities for summary page
Browse files
- app/controllers/summary/utils.py +78 -36
app/controllers/summary/utils.py
CHANGED
@@ -8,6 +8,47 @@ from collections import defaultdict
|
|
8 |
|
9 |
from database import article_collection, entity_collection
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
def _get_latest_publish_date_from_collection(collection) -> datetime:
|
13 |
"""Return the latest publish date found in the specified collection.
|
@@ -114,7 +155,7 @@ def get_content_flow_data(time_filter: str) -> Dict[str, Any]:
|
|
114 |
|
115 |
|
116 |
def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
|
117 |
-
"""Return *Entity Analysis* data for the given period.
|
118 |
|
119 |
Uses rolling window approach:
|
120 |
- today: only the latest date
|
@@ -129,59 +170,60 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
|
|
129 |
Returns
|
130 |
-------
|
131 |
Dict[str, Any]
|
132 |
-
Dictionary containing title, dateRange, and aggregated entity analysis data.
|
133 |
"""
|
134 |
start, end = _time_range(time_filter, entity_collection)
|
135 |
|
136 |
-
|
|
|
137 |
{"$match": {"publishDate": {"$gte": start, "$lte": end}}},
|
138 |
{"$group": {"_id": {"entity": "$entity", "type": "$entityType"},
|
139 |
"mentions": {"$sum": "$occurrence"}}},
|
140 |
{"$sort": {"mentions": -1}},
|
141 |
]
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
"
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
}
|
163 |
|
164 |
entity_types: Dict[str, Any] = {}
|
165 |
-
for
|
166 |
-
|
167 |
-
|
168 |
-
if
|
169 |
-
entity_types[
|
170 |
-
"fullName":
|
171 |
"entities": [],
|
172 |
}
|
173 |
-
|
174 |
-
|
|
|
175 |
)
|
176 |
|
177 |
-
#
|
178 |
-
for
|
179 |
-
|
180 |
-
|
181 |
)[:10]
|
182 |
|
183 |
return {
|
184 |
-
"title": f"Top Entities
|
185 |
"dateRange": {"start": start, "end": end},
|
186 |
"data": entity_types,
|
187 |
}
|
|
|
8 |
|
9 |
from database import article_collection, entity_collection
|
10 |
|
11 |
+
# Display names for entity-type codes. Looked up when building the
# "fullName" of each entity-type group; codes missing from this table
# fall back to the raw code at the lookup site. Some codes are aliases
# that map to the same label (e.g. "PER" / "PERSON").
ENTITY_TYPE_FULL_NAMES: Dict[str, str] = {
    "GPE": "Geopolitical Entities (Countries/Cities)",
    "LOC": "Locations (Non-political)",
    "ORG": "Organizations",
    "PER": "People",
    "PERSON": "People",
    "PROD": "Products",
    "PRODUCT": "Products",
    "PRODCAT": "Product Categories",
    "PRODUCT_CATEGORY": "Product Categories",
    "COM": "Companies",
    "EVENT": "Events",
    "LANGUAGE": "Languages",
    "NORP": "Nationalities/Religious/Political Groups",
    "LAW": "Laws/Legal Documents",
    "FAC": "Facilities/Landmarks",
    "INS": "Industry Institutions",
}
|
30 |
+
|
31 |
+
|
32 |
+
def _build_sentiment_lookup(sentiment_results: list) -> Dict:
|
33 |
+
"""Build sentiment lookup dictionary from sentiment aggregation results."""
|
34 |
+
sentiment_lookup = {}
|
35 |
+
for result in sentiment_results:
|
36 |
+
key = (result["_id"]["entity"], result["_id"]["type"])
|
37 |
+
sentiment_lookup[key] = round(result["avgSentiment"], 3)
|
38 |
+
return sentiment_lookup
|
39 |
+
|
40 |
+
|
41 |
+
def _process_entity_with_sentiment(mentions_result: Dict, sentiment_lookup: Dict) -> Dict[str, Any]:
|
42 |
+
"""Process a single entity result and add sentiment information."""
|
43 |
+
entity_id = mentions_result["_id"]
|
44 |
+
entity_key = (entity_id["entity"], entity_id["type"])
|
45 |
+
|
46 |
+
return {
|
47 |
+
"entityName": entity_id["entity"].replace("_", " "),
|
48 |
+
"mentions": mentions_result["mentions"],
|
49 |
+
"sentiment": sentiment_lookup.get(entity_key)
|
50 |
+
}
|
51 |
+
|
52 |
|
53 |
def _get_latest_publish_date_from_collection(collection) -> datetime:
|
54 |
"""Return the latest publish date found in the specified collection.
|
|
|
155 |
|
156 |
|
157 |
def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
|
158 |
+
"""Return *Entity Analysis* data for the given period with sentiment information.
|
159 |
|
160 |
Uses rolling window approach:
|
161 |
- today: only the latest date
|
|
|
170 |
Returns
|
171 |
-------
|
172 |
Dict[str, Any]
|
173 |
+
Dictionary containing title, dateRange, and aggregated entity analysis data with sentiment.
|
174 |
"""
|
175 |
start, end = _time_range(time_filter, entity_collection)
|
176 |
|
177 |
+
# Get mentions count pipeline
|
178 |
+
mentions_pipeline = [
|
179 |
{"$match": {"publishDate": {"$gte": start, "$lte": end}}},
|
180 |
{"$group": {"_id": {"entity": "$entity", "type": "$entityType"},
|
181 |
"mentions": {"$sum": "$occurrence"}}},
|
182 |
{"$sort": {"mentions": -1}},
|
183 |
]
|
184 |
|
185 |
+
# Get sentiment data pipeline
|
186 |
+
sentiment_pipeline = [
|
187 |
+
{"$match": {
|
188 |
+
"publishDate": {"$gte": start, "$lte": end},
|
189 |
+
"sentimentScore": {"$exists": True, "$ne": None}
|
190 |
+
}},
|
191 |
+
{"$group": {
|
192 |
+
"_id": {"entity": "$entity", "type": "$entityType"},
|
193 |
+
"avgSentiment": {"$avg": "$sentimentScore"},
|
194 |
+
|
195 |
+
}}
|
196 |
+
]
|
197 |
+
|
198 |
+
mentions_results = list(entity_collection.aggregate(mentions_pipeline))
|
199 |
+
sentiment_results = list(entity_collection.aggregate(sentiment_pipeline))
|
200 |
+
|
201 |
+
sentiment_lookup = _build_sentiment_lookup(sentiment_results)
|
202 |
+
|
203 |
+
|
|
|
204 |
|
205 |
entity_types: Dict[str, Any] = {}
|
206 |
+
for mentions_result in mentions_results:
|
207 |
+
entity_type = mentions_result["_id"]["type"]
|
208 |
+
|
209 |
+
if entity_type not in entity_types:
|
210 |
+
entity_types[entity_type] = {
|
211 |
+
"fullName": ENTITY_TYPE_FULL_NAMES.get(entity_type, entity_type),
|
212 |
"entities": [],
|
213 |
}
|
214 |
+
|
215 |
+
entity_types[entity_type]["entities"].append(
|
216 |
+
_process_entity_with_sentiment(mentions_result, sentiment_lookup)
|
217 |
)
|
218 |
|
219 |
+
# Keep only the top 10 per type
|
220 |
+
for entity_data in entity_types.values():
|
221 |
+
entity_data["entities"] = sorted(
|
222 |
+
entity_data["entities"], key=lambda x: -x["mentions"]
|
223 |
)[:10]
|
224 |
|
225 |
return {
|
226 |
+
"title": f"Top Entities - {time_filter.capitalize()}",
|
227 |
"dateRange": {"start": start, "end": end},
|
228 |
"data": entity_types,
|
229 |
}
|