Spaces:

Oxbridge-Economics
/

finfast-summary

Running

App Files Files Community

RaymondWongWL committed on Aug 6

Commit

c96129a

1 Parent(s): da9228c

feat: add sentiment score for top entities for summary page

Browse files

Files changed (1) hide show

app/controllers/summary/utils.py +78 -36

app/controllers/summary/utils.py CHANGED Viewed

@@ -8,6 +8,47 @@ from collections import defaultdict
 from database import article_collection, entity_collection
 def _get_latest_publish_date_from_collection(collection) -> datetime:
     """Return the latest publish date found in the specified collection.
@@ -114,7 +155,7 @@ def get_content_flow_data(time_filter: str) -> Dict[str, Any]:
 def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
-    """Return *Entity Analysis* data for the given period.
     Uses rolling window approach:
     - today: only the latest date
@@ -129,59 +170,60 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
     Returns
     -------
     Dict[str, Any]
-        Dictionary containing title, dateRange, and aggregated entity analysis data.
     """
     start, end = _time_range(time_filter, entity_collection)
-    pipeline = [
         {"$match": {"publishDate": {"$gte": start, "$lte": end}}},
         {"$group": {"_id": {"entity": "$entity", "type": "$entityType"},
                    "mentions": {"$sum": "$occurrence"}}},
         {"$sort": {"mentions": -1}},
     ]
-    results = list(entity_collection.aggregate(pipeline))
-    type_full_names = {
-        "GPE": "Geopolitical Entities (Countries/Cities)",
-        "LOC": "Locations (Non-political)",
-        "ORG": "Organizations",
-        "PER": "People",
-        "PERSON": "People",
-        "PROD": "Products",
-        "PRODUCT": "Products",
-        "PRODCAT": "Product Categories",
-        "PRODUCT_CATEGORY": "Product Categories",
-        "COM": "Companies",
-        "EVENT": "Events",
-        "LANGUAGE": "Languages",
-        "NORP": "Nationalities/Religious/Political Groups",
-        "LAW": "Laws/Legal Documents",
-        "FAC": "Facilities/Landmarks",
-        "INS": "Industry Institutions",
-    }
     entity_types: Dict[str, Any] = {}
-    for r in results:
-        e_type = r["_id"]["type"]
-        entity_name = r["_id"]["entity"].replace("_", " ")
-        if e_type not in entity_types:
-            entity_types[e_type] = {
-                "fullName": type_full_names.get(e_type, e_type),
                 "entities": [],
             }
-        entity_types[e_type]["entities"].append(
-            {"entityName": entity_name, "mentions": r["mentions"]}
         )
-    # keep only the top 10 per type
-    for type_data in entity_types.values():
-        type_data["entities"] = sorted(
-            type_data["entities"], key=lambda x: -x["mentions"]
         )[:10]
     return {
-        "title": f"Top Entities  {time_filter.capitalize()}",
         "dateRange": {"start": start, "end": end},
         "data": entity_types,
     }

 from database import article_collection, entity_collection
+# Maps entity type codes to human-readable display names.
+# NOTE(review): several display names are reachable through two codes
+# (e.g. "PER"/"PERSON", "PROD"/"PRODUCT") - presumably to accept both short
+# and long label forms; confirm against the producer of "entityType".
+ENTITY_TYPE_FULL_NAMES = {
+    "GPE": "Geopolitical Entities (Countries/Cities)",
+    "LOC": "Locations (Non-political)",
+    "ORG": "Organizations",
+    "PER": "People",
+    "PERSON": "People",
+    "PROD": "Products",
+    "PRODUCT": "Products",
+    "PRODCAT": "Product Categories",
+    "PRODUCT_CATEGORY": "Product Categories",
+    "COM": "Companies",
+    "EVENT": "Events",
+    "LANGUAGE": "Languages",
+    "NORP": "Nationalities/Religious/Political Groups",
+    "LAW": "Laws/Legal Documents",
+    "FAC": "Facilities/Landmarks",
+    "INS": "Industry Institutions",
+}
+def _build_sentiment_lookup(sentiment_results: list) -> Dict:
+    """Build a sentiment lookup dictionary from sentiment aggregation results.
+
+    Parameters
+    ----------
+    sentiment_results : list
+        Aggregation rows, each holding an ``_id`` sub-document with
+        ``entity`` and ``type`` keys plus an ``avgSentiment`` value.
+
+    Returns
+    -------
+    Dict
+        Mapping of ``(entity, type)`` to the average sentiment rounded to
+        three decimal places. Rows without a usable average are omitted.
+    """
+    sentiment_lookup = {}
+    for result in sentiment_results:
+        avg_sentiment = result.get("avgSentiment")
+        # ``$avg`` yields null when a group has no numeric score to average;
+        # ``round(None, 3)`` would raise TypeError, so skip the row and let
+        # ``dict.get`` on the lookup report ``None`` for that entity instead.
+        if avg_sentiment is None:
+            continue
+        key = (result["_id"]["entity"], result["_id"]["type"])
+        sentiment_lookup[key] = round(avg_sentiment, 3)
+    return sentiment_lookup
+def _process_entity_with_sentiment(mentions_result: Dict, sentiment_lookup: Dict) -> Dict[str, Any]:
+    """Convert one mentions aggregation row into an entity entry with sentiment.
+
+    The display name is the raw entity name with underscores replaced by
+    spaces. ``sentiment`` is ``None`` when ``sentiment_lookup`` has no entry
+    for the row's ``(entity, type)`` pair.
+    """
+    raw_name = mentions_result["_id"]["entity"]
+    type_code = mentions_result["_id"]["type"]
+    entry: Dict[str, Any] = {}
+    entry["entityName"] = raw_name.replace("_", " ")
+    entry["mentions"] = mentions_result["mentions"]
+    # The lookup is keyed by the raw (un-prettified) name together with the type.
+    entry["sentiment"] = sentiment_lookup.get((raw_name, type_code))
+    return entry
 def _get_latest_publish_date_from_collection(collection) -> datetime:
     """Return the latest publish date found in the specified collection.
 def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
+    """Return *Entity Analysis* data for the given period with sentiment information.
     Uses rolling window approach:
     - today: only the latest date
     Returns
     -------
     Dict[str, Any]
+        Dictionary containing title, dateRange, and aggregated entity analysis data with sentiment.
     """
     start, end = _time_range(time_filter, entity_collection)
+    # Mentions pipeline: sum "occurrence" per (entity, entityType) for
+    # documents whose publishDate lies in [start, end], most-mentioned first.
+    mentions_pipeline = [
         {"$match": {"publishDate": {"$gte": start, "$lte": end}}},
         {"$group": {"_id": {"entity": "$entity", "type": "$entityType"},
                     "mentions": {"$sum": "$occurrence"}}},
         {"$sort": {"mentions": -1}},
     ]
+    # Sentiment pipeline: average "sentimentScore" per (entity, entityType)
+    # over the same date range, restricted to documents where the score
+    # exists and is not null.
+    sentiment_pipeline = [
+        {"$match": {
+            "publishDate": {"$gte": start, "$lte": end},
+            "sentimentScore": {"$exists": True, "$ne": None}
+        }},
+        {"$group": {
+            "_id": {"entity": "$entity", "type": "$entityType"},
+            "avgSentiment": {"$avg": "$sentimentScore"},
+        }}
+    ]
+    # Both aggregations run against entity_collection and are materialised
+    # as lists before being joined in Python via the lookup below.
+    mentions_results = list(entity_collection.aggregate(mentions_pipeline))
+    sentiment_results = list(entity_collection.aggregate(sentiment_pipeline))
+    sentiment_lookup = _build_sentiment_lookup(sentiment_results)
     entity_types: Dict[str, Any] = {}
+    # Bucket entities by type code; "fullName" falls back to the code itself
+    # when ENTITY_TYPE_FULL_NAMES has no entry for it.
+    for mentions_result in mentions_results:
+        entity_type = mentions_result["_id"]["type"]
+        if entity_type not in entity_types:
+            entity_types[entity_type] = {
+                "fullName": ENTITY_TYPE_FULL_NAMES.get(entity_type, entity_type),
                 "entities": [],
             }
+        entity_types[entity_type]["entities"].append(
+            _process_entity_with_sentiment(mentions_result, sentiment_lookup)
         )
+    # Keep only the top 10 per type, ordered by mentions descending
+    for entity_data in entity_types.values():
+        entity_data["entities"] = sorted(
+            entity_data["entities"], key=lambda x: -x["mentions"]
         )[:10]
     return {
+        "title": f"Top Entities - {time_filter.capitalize()}",
         "dateRange": {"start": start, "end": end},
         "data": entity_types,
     }