Spaces:

Kawaiibuta
/

real_estate

Running

App Files Community

FrancisGOS committed on Jul 1

Commit

683f058

1 Parent(s): b2a4481

Fix chat message with RAG

Browse files

Files changed (6) hide show

app/configs/pinecone.py +2 -1
app/domains/chat_message/controller.py +35 -35
app/domains/chat_message/service.py +109 -26
app/domains/properties/service.py +15 -24
app/domains/user_action/service.py +7 -7
app/seed/factories/article.py +88 -111

app/configs/pinecone.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import pinecone
 pc = pinecone.Pinecone(os.getenv("PINECONE_API_KEY"))
-property_index = pc.Index("properties")

 import os
 import pinecone
 pc = pinecone.Pinecone(os.getenv("PINECONE_API_KEY"))
+property_index = pc.Index("properties")
+article_index = pc.Index("articles")

app/domains/chat_message/controller.py CHANGED Viewed

@@ -39,21 +39,34 @@ class ChatMessageController(Controller):
         if not user.device_token:
             return
         notify_service = NotificationService()
-        title = "You have a new message"
-        body = f"You have a new message from {user.name}. \n{message.content}"
         notify_service.send_to_token(
             token=user.device_token,
             title=title,
             body=body,
             data={
                 "type": "chat",
-                "content": message.content,
                 "sender_id": str(message.sender_id),
                 "chat_session_id": str(message.session_id),
                 "created_at": str(message.created_at.timestamp()),
             },
         )
     @post("")
     async def create_message(
         self,
@@ -64,42 +77,29 @@ class ChatMessageController(Controller):
         chat_service: ChatMessageService,
         chat_session_service: ChatSessionService,
     ) -> Response:
-        if not data.is_ai:
-            message = await chat_service.create_message(data, request.user.id)
-            return Response(
-                chat_service.to_schema(message, schema_type=ChatMessageSchema),
-                background=BackgroundTasks(
-                    [
-                        BackgroundTask(
-                            self.notify_message,
-                            request.user,
-                            message,
-                        ),
-                        BackgroundTask(
-                            chat_session_service.update_last_message,
-                            message.session_id,
-                            message,
-                        ),
-                    ]
                 ),
             )
-        message = await chat_service.ai_respond_to_user(data, request.user.id)
         return Response(
             chat_service.to_schema(message, schema_type=ChatMessageSchema),
-            background=BackgroundTasks(
-                [
-                    BackgroundTask(
-                        chat_session_service.update_last_message,
-                        message.session_id,
-                        message,
-                    ),
-                    BackgroundTask(
-                        self.notify_message,
-                        request.user,
-                        message,
-                    ),
-                ]
-            ),
         )
     @post("/ai", no_auth=True, status_code=HTTP_200_OK)

         if not user.device_token:
             return
         notify_service = NotificationService()
+        title = "AI Assistant"
+        body = (
+            f"You have a new message from {user.name}."
+            if message.sender_id
+            else "AI has the answer you need"
+        )
         notify_service.send_to_token(
             token=user.device_token,
             title=title,
             body=body,
             data={
                 "type": "chat",
+                "id": str(message.id),
                 "sender_id": str(message.sender_id),
                 "chat_session_id": str(message.session_id),
                 "created_at": str(message.created_at.timestamp()),
             },
         )
+    async def chat_with_ai(
+        self,
+        data: CreateMessageDTO,
+        user: User,
+        chat_service: ChatMessageService,
+    ):
+        message = await chat_service.ai_respond_to_user(data, user_id=user.id)
+        self.notify_message(user, message)
     @post("")
     async def create_message(
         self,
         chat_service: ChatMessageService,
         chat_session_service: ChatSessionService,
     ) -> Response:
+        message = await chat_service.create_message(data, request.user.id)
+        background_task_list = [
+            BackgroundTask(
+                chat_session_service.update_last_message,
+                message.session_id,
+                message,
+            ),
+        ]
+        if data.is_ai:
+            background_task_list.append(
+                BackgroundTask(self.chat_with_ai, data, request.user, chat_service)
+            )
+        else:
+            background_task_list.append(
+                BackgroundTask(
+                    self.notify_message,
+                    request.user,
+                    message,
                 ),
             )
         return Response(
             chat_service.to_schema(message, schema_type=ChatMessageSchema),
+            background=BackgroundTasks(background_task_list),
         )
     @post("/ai", no_auth=True, status_code=HTTP_200_OK)

app/domains/chat_message/service.py CHANGED Viewed

@@ -1,13 +1,15 @@
 from collections.abc import AsyncGenerator
 from datetime import datetime
-from typing import Dict, List
 import uuid
 from venv import logger
 from sqlalchemy.dialects import postgresql  # or mysql, sqlite depending on your DB
 from sqlalchemy import and_, desc, or_, select
 from sqlalchemy.orm import noload
-from transformers import pipeline
 from database.models.property import Property
 from domains.properties.service import PropertyService
 from domains.chat_session.service import ChatSessionService
@@ -21,6 +23,7 @@ from domains.supabase.service import SupabaseService, provide_supabase_service
 from sqlalchemy.ext.asyncio import AsyncSession
 from google.genai import types
 from configs.gemai import client
 import re
 from litestar.exceptions import ValidationException, InternalServerException
 import requests
@@ -100,7 +103,7 @@ class ChatMessageService(SQLAlchemyAsyncRepositoryService[ChatMessage]):
                             "model_id": message.id,
                         }
                         for image in data.image_list
-                    ],
                 )
             return message
         except Exception as e:
@@ -297,9 +300,8 @@ class ChatMessageService(SQLAlchemyAsyncRepositoryService[ChatMessage]):
     async def summarize_session(self, session_id: uuid.UUID) -> str:
         """
-        Summarize the entire chat session using a lightweight summarization model.
         """
-        # Fetch all messages ordered oldest first
         query = (
             select(ChatMessage)
             .where(ChatMessage.session_id == session_id)
@@ -307,17 +309,39 @@ class ChatMessageService(SQLAlchemyAsyncRepositoryService[ChatMessage]):
         )
         result = await self.repository.session.execute(query)
         messages: List[ChatMessage] = result.scalars().all()
-        # Concatenate speaker labels
-        transcript = "\n".join(
-            f"{ 'User' if msg.sender_id else 'Assistant' }: {msg.content}"
-            for msg in messages
-        )
-        # Load summarizer (T5-small) on CPU
-        summarizer = pipeline("summarization", model="t5-small", device=-1)
-        summary_out = summarizer(
-            transcript, max_length=150, min_length=50, do_sample=False
         )
-        return summary_out[0]["summary_text"]
     async def build_chat_context(
         self, session_id: uuid.UUID, window_size: int = 10
@@ -374,8 +398,10 @@ class ChatMessageService(SQLAlchemyAsyncRepositoryService[ChatMessage]):
                 context = await self.build_chat_context(data.session_id, window_size)
             else:
                 context = []
             context.append(UserContent(data.content))
-            system_instruction = """You are a real estate assistant that help user choose and find the best match properties. Detect if the user wants property suggestions in any language.
             Always respond helpfully. If suggestions are requested, at the very end append exactly one line with
             #PROPERTY_CRITERIA:<json>
             where <json> exactly matches the PropertySchema fields:
@@ -391,14 +417,19 @@ class ChatMessageService(SQLAlchemyAsyncRepositoryService[ChatMessage]):
                 "average_rating": number,
                 "status": boolean,
             }
-            If not, do not append the tag."""
-            system_instruction += f"Also, here is there summary of the conversation between you and this customer {summary}"
             try:
                 response = client.models.generate_content(
                     model="gemini-2.0-flash",
                     contents=context,
                     config=GenerateContentConfig(
-                        tools=[Tool(google_search=GoogleSearch())],
                         system_instruction=system_instruction,
                     ),
                 )
@@ -414,15 +445,11 @@ class ChatMessageService(SQLAlchemyAsyncRepositoryService[ChatMessage]):
                         pass
                 raise
             assistant_text = response.text
-            message = await self.create_message(
-                CreateMessageDTO(session_id=data.session_id, content=data.content),
-                user_id,
-                auto_commit=False,
-            )
             message = await self.create(
                 {
                     "content": assistant_text,
-                    "session_id": message.session_id,
                     "sender_id": None,
                 }
             )
@@ -434,6 +461,62 @@ class ChatMessageService(SQLAlchemyAsyncRepositoryService[ChatMessage]):
         finally:
             await self.repository.session.commit()
 async def provide_chat_message_service(
     db_session: AsyncSession,

 from collections.abc import AsyncGenerator
 from datetime import datetime
+from typing import Dict, List, Union
 import uuid
 from venv import logger
 from sqlalchemy.dialects import postgresql  # or mysql, sqlite depending on your DB
 from sqlalchemy import and_, desc, or_, select
 from sqlalchemy.orm import noload
+from transformers import pipeline, AutoTokenizer
+from pinecone import SearchRerank
+from database.models.article import Article
+from domains.news.service import ArticleService
 from database.models.property import Property
 from domains.properties.service import PropertyService
 from domains.chat_session.service import ChatSessionService
 from sqlalchemy.ext.asyncio import AsyncSession
 from google.genai import types
 from configs.gemai import client
+from configs.pinecone import article_index, pc
 import re
 from litestar.exceptions import ValidationException, InternalServerException
 import requests
                             "model_id": message.id,
                         }
                         for image in data.image_list
+                    ],
                 )
             return message
         except Exception as e:
     async def summarize_session(self, session_id: uuid.UUID) -> str:
         """
+        Summarize the entire chat session by chunking the transcript to respect the model's token limit.
         """
         query = (
             select(ChatMessage)
             .where(ChatMessage.session_id == session_id)
         )
         result = await self.repository.session.execute(query)
         messages: List[ChatMessage] = result.scalars().all()
+        chunks: List[str] = []
+        current_chunk = []
+        current_tokens = 0
+        tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-vi")
+        summarizer = pipeline(
+            "summarization",
+            model="Helsinki-NLP/opus-mt-en-vi",
+            tokenizer=tokenizer,
+            device=-1,
         )
+        for msg in messages:
+            speaker = "User" if msg.sender_id else "Assistant"
+            line = f"{speaker}: {msg.content}"
+            tokens = len(tokenizer(line, add_special_tokens=False))
+            if current_tokens + tokens > tokenizer.model_max_length:
+                chunks.append("\n".join(current_chunk))
+                current_chunk = [line]
+                current_tokens = tokens
+            else:
+                current_chunk.append(line)
+                current_tokens += tokens
+        if current_chunk:
+            chunks.append("\n".join(current_chunk))
+        partial_summaries = []
+        for chunk in chunks:
+            summary_out = summarizer(
+                chunk, max_length=150, min_length=10, do_sample=False, truncation=True
+            )
+            partial_summaries.append(summary_out[0]["summary_text"])
+        combined = "\n".join(partial_summaries)
+        final_out = summarizer(combined, max_length=200, min_length=50, do_sample=False)
+        return final_out[0]["summary_text"]
     async def build_chat_context(
         self, session_id: uuid.UUID, window_size: int = 10
                 context = await self.build_chat_context(data.session_id, window_size)
             else:
                 context = []
+            articles = await self.rag_article(data.content)
             context.append(UserContent(data.content))
+            system_instruction = """
+            You are a real estate assistant that help user choose and find the best match properties. Detect if the user wants property suggestions in any language.
             Always respond helpfully. If suggestions are requested, at the very end append exactly one line with
             #PROPERTY_CRITERIA:<json>
             where <json> exactly matches the PropertySchema fields:
                 "average_rating": number,
                 "status": boolean,
             }
+            If not, do not append the tag.
+            You will be provided with a list of relative articles that might help you answer user.
+            Each article is separated by the mark: ======== Article <number> =======.
+            If there are conflicts in information of articles, use the newer information.
+            Here is the list of relative articles that you can based on to response to user: """
+            for i, article in enumerate(articles):
+                system_instruction += f"\n ======== Article {i + 1} ============ \nTitle: {article.title} \nContent: {article.content} \nPublished date: {article.publish_date.isoformat()}"
+            system_instruction += f" If you use information from any provided article. Reference that article with the link. Also, here is there summary of the conversation between you and this customer {summary}"
             try:
                 response = client.models.generate_content(
                     model="gemini-2.0-flash",
                     contents=context,
                     config=GenerateContentConfig(
                         system_instruction=system_instruction,
                     ),
                 )
                         pass
                 raise
             assistant_text = response.text
             message = await self.create(
                 {
                     "content": assistant_text,
+                    "session_id": data.session_id,
                     "sender_id": None,
                 }
             )
         finally:
             await self.repository.session.commit()
+    async def rag_article(self, query: str) -> list[Article]:
+        summarized_query = self.summarize_query_for_rag(
+            query, max_length=len(query) // 2
+        )
+        reranked_articles = self.get_relevant_articles(summarized_query, 20, 10)
+        article_service = ArticleService(session=self.repository.session)
+        full_articles = await article_service.list(
+            Article.id.in_([article["_id"] for article in reranked_articles])
+        )
+        return full_articles
+    def summarize_query_for_rag(
+        self,
+        text: str,
+        max_length: int = 100,
+        min_length: int = 5,
+        device: Union[str, int] = -1,
+    ) -> str:
+        """
+        Summarizes a user query in any language for use in a RAG retriever.
+        Args:
+            text (str): The input text/query in any supported language.
+            max_length (int): Maximum length of the summary/query.
+            min_length (int): Minimum length of the summary/query.
+            device (Union[str, int]): Device for inference (-1 for CPU, 0 or 1 for GPU).
+        Returns:
+            str: Summarized query text.
+        """
+        summarizer = pipeline(
+            "summarization",
+            model="Helsinki-NLP/opus-mt-en-vi",
+            tokenizer="Helsinki-NLP/opus-mt-en-vi",
+            device=device,
+        )
+        summary = summarizer(
+            text, max_length=max_length, min_length=min_length, do_sample=False
+        )
+        return summary[0]["summary_text"]
+    def get_relevant_articles(
+        self, query: str, retrieval_n: int = 10, rerank_n: int = 3
+    ) -> Dict:
+        result = article_index.search(
+            "__default__",
+            query={"top_k": retrieval_n, "inputs": {"text": query}},
+            rerank=SearchRerank(
+                model="bge-reranker-v2-m3",
+                rank_fields=["summary"],
+                top_n=rerank_n,
+                parameters={"truncate": "END"},
+            ),
+        )
+        return result.to_dict()["result"]["hits"]
 async def provide_chat_message_service(
     db_session: AsyncSession,

app/domains/properties/service.py CHANGED Viewed

@@ -199,17 +199,13 @@ class PropertyService(SQLAlchemyAsyncRepositoryService[Property]):
         pagination: LimitOffset,
         user_id: uuid.UUID,
     ) -> CursorPagination[str, Property]:
-        # 1) Build Pinecone metadata filter
         meta_filter = self._build_pinecone_filter(search_param)
-        # 2) Generate user embedding from past interactions
         user_embedding = await self._compute_user_embedding(user_id)
-        # 3) Query Pinecone
         pine_res = property_index.query(
             vector=user_embedding,
             filter=meta_filter,
             top_k=pagination.limit,
-            include_metadata=True,
-            # next_page_token=search_param.next_page_token,
         )
         ids = [m["id"] for m in pine_res["matches"]]
         props = await self._fetch_properties_from_ids(ids)
@@ -261,19 +257,21 @@ class PropertyService(SQLAlchemyAsyncRepositoryService[Property]):
         if search_param.lat is not None and search_param.lng is not None:
             query = query.join(Property.address)
             radius_meters = search_param.radius * 1000
-            radius_degrees = radius_meters / 111320.0
             lat = search_param.lat
             lng = search_param.lng
             min_lat = lat - radius_degrees
             max_lat = lat + radius_degrees
             min_lng = lng - radius_degrees
             max_lng = lng + radius_degrees
-            query = query.where(and_(
-                Address.latitude >= min_lat,
-                Address.latitude <= max_lat,
-                Address.longitude >= min_lng,
-                Address.longitude <= max_lng,
-            ))
         # price filters
         if search_param.min_price is not None:
             query = query.where(Property.price >= search_param.min_price)
@@ -281,10 +279,7 @@ class PropertyService(SQLAlchemyAsyncRepositoryService[Property]):
             query = query.where(Property.price <= search_param.max_price)
         # Have review
         if search_param.has_review:
-            subquery = (
-                select(Review.id)
-                .where(Review.property_id == Property.id)
-            )
             query = query.where(exists(subquery))
         # categorical
         if search_param.property_category:
@@ -392,16 +387,12 @@ class PropertyService(SQLAlchemyAsyncRepositoryService[Property]):
     async def _compute_user_embedding(self, user_id: uuid.UUID) -> list[float]:
         user_action_repository = UserActionRepository(session=self.repository.session)
-        properties_action = await user_action_repository.get_relevant_properties(
             user_id=user_id
         )
-        if len(properties_action) < 5:
-            return next(
-                iter(property_index.fetch(["0"], namespace="Mean").vectors.values())
-            ).values
-        result = await self.fetch_pinecone_document_by_id(
-            [UUID(id) for id in properties_action.keys()]
-        )
         vectors = [value.values for value in result.values()]
         mean_vector = np.mean(vectors, axis=0).tolist()
         return mean_vector

         pagination: LimitOffset,
         user_id: uuid.UUID,
     ) -> CursorPagination[str, Property]:
         meta_filter = self._build_pinecone_filter(search_param)
         user_embedding = await self._compute_user_embedding(user_id)
         pine_res = property_index.query(
             vector=user_embedding,
             filter=meta_filter,
             top_k=pagination.limit,
+            include_metadata=False,
         )
         ids = [m["id"] for m in pine_res["matches"]]
         props = await self._fetch_properties_from_ids(ids)
         if search_param.lat is not None and search_param.lng is not None:
             query = query.join(Property.address)
             radius_meters = search_param.radius * 1000
+            radius_degrees = radius_meters / 111320.0
             lat = search_param.lat
             lng = search_param.lng
             min_lat = lat - radius_degrees
             max_lat = lat + radius_degrees
             min_lng = lng - radius_degrees
             max_lng = lng + radius_degrees
+            query = query.where(
+                and_(
+                    Address.latitude >= min_lat,
+                    Address.latitude <= max_lat,
+                    Address.longitude >= min_lng,
+                    Address.longitude <= max_lng,
+                )
+            )
         # price filters
         if search_param.min_price is not None:
             query = query.where(Property.price >= search_param.min_price)
             query = query.where(Property.price <= search_param.max_price)
         # Have review
         if search_param.has_review:
+            subquery = select(Review.id).where(Review.property_id == Property.id)
             query = query.where(exists(subquery))
         # categorical
         if search_param.property_category:
     async def _compute_user_embedding(self, user_id: uuid.UUID) -> list[float]:
         user_action_repository = UserActionRepository(session=self.repository.session)
+        property_id_list = await user_action_repository.get_relevant_properties(
             user_id=user_id
         )
+        if len(property_id_list) == 0:
+            return next(iter(property_index.fetch(["0"]).vectors.values())).values
+        result = await self.fetch_pinecone_document_by_id(property_id_list)
         vectors = [value.values for value in result.values()]
         mean_vector = np.mean(vectors, axis=0).tolist()
         return mean_vector

app/domains/user_action/service.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from collections import defaultdict
 from collections.abc import AsyncGenerator
 import uuid
 from sqlalchemy import select
@@ -11,29 +12,28 @@ from sqlalchemy.ext.asyncio import AsyncSession
 class UserActionRepository(SQLAlchemyAsyncRepository[UserAction]):
     model_type = UserAction
-    async def get_relevant_properties(self, user_id: uuid.UUID) -> dict:
         prop_ids_subq = (
             select(UserAction.property_id)
             .where(UserAction.user_id == user_id)
             .distinct()
             .limit(10)
         ).subquery()
-        # Step 2: fetch all actions for those properties
         result = await self.session.execute(
             select(UserAction)
             .where(
                 UserAction.user_id == user_id,
-                UserAction.property_id.in_(select(prop_ids_subq))
             )
             .order_by(UserAction.property_id, UserAction.created_at)
         )
         actions = result.scalars().all()
-        grouped: dict = {}
-        for act in actions:
-            grouped[str(act.property_id)].append(act)
-        return grouped
 class UserActionService(SQLAlchemyAsyncRepositoryService[UserAction]):
     repository_type = UserActionRepository

 from collections import defaultdict
 from collections.abc import AsyncGenerator
+from typing import List
 import uuid
 from sqlalchemy import select
 class UserActionRepository(SQLAlchemyAsyncRepository[UserAction]):
     model_type = UserAction
+    async def get_relevant_properties(self, user_id: uuid.UUID) -> List[uuid.UUID]:
         prop_ids_subq = (
             select(UserAction.property_id)
             .where(UserAction.user_id == user_id)
+            .where(UserAction.action == "view")
             .distinct()
             .limit(10)
         ).subquery()
         result = await self.session.execute(
             select(UserAction)
             .where(
                 UserAction.user_id == user_id,
+                UserAction.property_id.in_(select(prop_ids_subq)),
             )
             .order_by(UserAction.property_id, UserAction.created_at)
         )
         actions = result.scalars().all()
+        return [action.property_id for action in actions]
 class UserActionService(SQLAlchemyAsyncRepositoryService[UserAction]):
     repository_type = UserActionRepository

app/seed/factories/article.py CHANGED Viewed

@@ -13,6 +13,8 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from configs.gemai import client
 from google.genai.types import GenerateContentConfig
 from advanced_alchemy.utils.text import slugify
 safety_settings = [
     {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
@@ -26,86 +28,56 @@ safety_settings = [
         "threshold": "BLOCK_MEDIUM_AND_ABOVE",
     },
 ]
 async def generate_tags_and_summary(article_html_content: str) -> dict:
     """
-    Generates tags and a short description for an article using Gemini.
-    Args:
-        article_html_content: The HTML content of the article.
-    Returns:
-        A dictionary with "tags" (list of strings) and "short_description" (string).
-        Returns empty values if generation fails.
-    """
-    prompt = f"""
-    Analyze the following Vietnamese news article content (provided in HTML format) and perform two tasks:
-    1. Generate a concise short description (summary) of the article in Vietnamese. This description should be no more than 80 words and capture the main points.
-    2. Extract 3 to 7 relevant keywords (tags) for this article in Vietnamese. These tags should be single words or short phrases.
-    Article Content:
-    ```html
-    {article_html_content[:15000]}
-    ```
-    Provide your response strictly as a JSON object with two keys: "short_description" and "tags".
-    The "tags" value should be a list of strings.
-    Example JSON output:
-    {{
-        "short_description": "Một bản tóm tắt ngắn gọn của bài báo bằng tiếng Việt...,
-        "tags": ["bất động sản", "thị trường", "dự án mới", "Việt Nam"]
-    }}
     """
     try:
-        print(
-            f"Sending content to Gemini (first 100 chars): {article_html_content[:100]}..."
-        )
-        response = client.models.generate_content(
-            model="gemini-1.0-flash",
-            contents=[
-                prompt,
-            ],
-            config=GenerateContentConfig(
-                safety_settings=safety_settings,
-                top_p=1,
-                temperature=0.7,
-                max_output_tokens=2048,
-                response_modalities=["TEXT"],
-            ),
         )
-        cleaned_response_text = response.text.strip()
-        if cleaned_response_text.startswith("```json"):
-            cleaned_response_text = cleaned_response_text[7:]
-        if cleaned_response_text.endswith("```"):
-            cleaned_response_text = cleaned_response_text[:-3]
-        cleaned_response_text = cleaned_response_text.strip()
-        data = json.loads(cleaned_response_text)
-        tags = data.get("tags", [])
-        short_desc = data.get("short_description", "")
-        if not isinstance(tags, list):
-            print(
-                f"Warning: Gemini returned tags not as a list: {tags}. Using empty list."
-            )
-            tags = []
-        if not isinstance(short_desc, str):
-            print(
-                f"Warning: Gemini returned short_description not as a string: {short_desc}. Using empty string."
-            )
-            short_desc = ""
-        return {"tags": tags, "short_description": short_desc}
     except Exception as e:
-        print(f"Error generating tags/summary with Gemini: {e}")
-        print(
-            f"Failed prompt was based on content (first 100 chars): {article_html_content[:100]}..."
-        )
-        if hasattr(response, "prompt_feedback") and response.prompt_feedback:
-            print(f"Gemini Prompt Feedback: {response.prompt_feedback}")
-        return {"tags": [], "short_description": "Không thể tạo tóm tắt."}
 class ArticleFactory(BaseFactory):
@@ -150,7 +122,6 @@ class ArticleFactory(BaseFactory):
                 max_tokens=10000,
             )
             text = response.choices[0].message.content.strip()
-            print(text)
             articles = json.loads(text)
             if not isinstance(articles, list):
                 raise ValueError("Expected a JSON list of articles.")
@@ -182,35 +153,36 @@ class ArticleFactory(BaseFactory):
                     await import_articles_from_json(
                         os.path.join(fixture_path, "articles.json"), session
                     )
-                articles_data = self.fetch_articles_from_openai(count)
-                for article_data in articles_data:
-                    result = await session.execute(
-                        select(Article).filter_by(title=article_data.get("title"))
-                    )
-                    if result.scalars().first():
-                        continue
-                    publish_date_str = article_data.get("publish_date")
-                    try:
-                        publish_date = datetime.fromisoformat(publish_date_str)
-                    except Exception:
-                        publish_date = datetime.now(timezone.utc)
-                    tag_names = article_data.get("tags", [])
-                    tags = await self.get_or_create_tags(session, tag_names)
-                    article = Article(
-                        id=uuid.uuid4(),
-                        title=article_data.get("title"),
-                        publish_date=publish_date,
-                        content=article_data.get("content"),
-                        short_description=article_data.get("short_description"),
-                        author=article_data.get("author"),
-                        tags=tags,
-                        created_at=datetime.now(timezone.utc),
-                        updated_at=datetime.now(timezone.utc),
-                    )
-                    await self.repository(session=session).add(article)
             except Exception as e:
                 await session.rollback()
                 print(f"Error during ArticleFactory seeding: {e}")
@@ -223,14 +195,15 @@ class ArticleFactory(BaseFactory):
             await self.repository(session=session).delete_where(Article.id.is_not(None))
             await session.commit()
 def parse_vietnamese_datetime(date_str: str) -> datetime | None:
     """
-    Tries to parse common Vietnamese datetime string formats.
     Returns a timezone-aware datetime object (UTC) or None if parsing fails.
     """
     if not date_str or not isinstance(date_str, str):
         return None
     if "T" in date_str and ("Z" in date_str or "+" in date_str or "-" in date_str[10:]):
         try:
             dt = datetime.fromisoformat(date_str)
@@ -240,6 +213,7 @@ def parse_vietnamese_datetime(date_str: str) -> datetime | None:
         except ValueError:
             pass
     formats_to_try = [
         "%d/%m/%Y %H:%M:%S",
         "%d/%m/%Y %H:%M",
@@ -247,7 +221,9 @@ def parse_vietnamese_datetime(date_str: str) -> datetime | None:
         "%d-%m-%Y %H:%M",
         "%Y-%m-%d %H:%M:%S",
         "%Y/%m/%d %H:%M:%S",
     ]
     for fmt in formats_to_try:
         try:
             dt_naive = datetime.strptime(date_str.strip(), fmt)
@@ -255,10 +231,10 @@ def parse_vietnamese_datetime(date_str: str) -> datetime | None:
             return dt_aware
         except ValueError:
             continue
     print(f"Warning: Could not parse date string: {date_str}")
     return None
 async def get_or_create_tags(session: AsyncSession, tag_names: List[str]) -> List[Tag]:
     """
     Retrieves existing Tag objects or creates new ones for each tag name.
@@ -296,10 +272,9 @@ async def process_article_data(session: AsyncSession, article_data: Dict[str, An
         return None
     gemini_data = await generate_tags_and_summary(html_content)
     tag_names = gemini_data.get("tags", [])
-    # short_description = gemini_data.get("short_description")
-    # if not short_description:
-    short_description = "Tóm tắt không có sẵn."
     if not tag_names:
         print(f"No tags generated for article: {title}")
     publish_date = parse_vietnamese_datetime(published_date_str)
@@ -308,14 +283,14 @@ async def process_article_data(session: AsyncSession, article_data: Dict[str, An
             f"Using current time for article '{title}' due to unparseable date: {published_date_str}"
         )
         publish_date = datetime.now(timezone.utc)
-    db_tags = await get_or_create_tags(session, tag_names)
     new_article = Article(
         title=title,
         publish_date=publish_date,
         content=html_content,
         short_description=short_description[:499],
         author=source_name,
-        tags=db_tags,
     )
     return new_article
@@ -347,6 +322,8 @@ async def import_articles_from_json(json_filepath: str, session: AsyncSession):
     articles_to_add = []
     for i, item_data in enumerate(data_from_json):
         print(f"\n--- Processing item {i+1}/{len(data_from_json)} ---")
         article_obj = await process_article_data(session, item_data)
         if article_obj:

 from configs.gemai import client
 from google.genai.types import GenerateContentConfig
 from advanced_alchemy.utils.text import slugify
+from transformers import pipeline
+import re
 safety_settings = [
     {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
         "threshold": "BLOCK_MEDIUM_AND_ABOVE",
     },
 ]
+_SUMMARY_PIPELINE = pipeline(  # module-level: constructed once when this module is imported
+    "summarization",
+    model="google/long-t5-tglobal-base",
+    tokenizer="google/long-t5-tglobal-base",
+    device=-1,  # presumably selects CPU (transformers convention) — TODO confirm
+)
+_KEYPHRASE_PIPELINE = pipeline(  # second pipeline over the same checkpoint name, used for tag generation
+    "text2text-generation",
+    model="google/long-t5-tglobal-base",  # NOTE(review): same checkpoint as the summarizer; confirm it is tuned for the "extract keyphrases:" prompt
+    tokenizer="google/long-t5-tglobal-base",
+    framework="pt",  # presumably forces the PyTorch backend — TODO confirm
+    device=-1,
+)
 async def generate_tags_and_summary(article_html_content: str) -> dict:
     """
+    Summarize and extract tags using small transformer models.
+
+    HTML tags are stripped with a regex and whitespace is collapsed before
+    the text is handed to the module-level pipelines.
+
+    Args:
+        article_html_content: Article body; anything matching ``<...>`` is
+            replaced with a space before processing.
+
+    Returns:
+        A dict with ``"short_description"`` and ``"tags"`` (a list of at
+        most 7 lower-cased, de-duplicated strings). When the cleaned text is
+        shorter than 50 characters it is returned unchanged as the
+        description, with no tags, and no model is called.
+
+    Model failures are not raised: a summarization error falls back to the
+    first 300 characters of the cleaned text, a keyphrase error to ``[]``;
+    both are reported with ``print``.
     """
+    text = re.sub(r"<[^>]+>", " ", article_html_content)  # replace every <...> tag with a space
+    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs to single spaces
+    if len(text) < 50:  # too short to summarize: skip both models
+        return {"tags": [], "short_description": text}
     try:
+        summary_out = _SUMMARY_PIPELINE(  # called without await, so it runs synchronously inside this coroutine
+            text,
+            max_length=200,
+            min_length=30,
+            do_sample=False,
         )
+        short_description = summary_out[0]["summary_text"].strip()
     except Exception as e:  # best-effort: any failure falls back to the truncation below
+        print(f"Summarization error: {e}")
+        short_description = text[:300] + ("…" if len(text) > 300 else "")  # ellipsis only when something was cut
+    try:
+        prompt = "extract keyphrases: " + text[:1000]  # only the first 1000 characters are sent to the keyphrase model
+        kpop = _KEYPHRASE_PIPELINE(prompt, max_length=64, do_sample=False)
+        raw = kpop[0]["generated_text"]
+        tags = re.split(r"[;,]\s*", raw)  # split the generated text on ';' or ','
+        tags = list(dict.fromkeys([t.strip().lower() for t in tags if t.strip()]))  # lower-case, drop empties, de-duplicate keeping first-seen order
+        tags = tags[:7]  # keep at most 7 tags
+    except Exception as e:
+        print(f"Keyphrase extraction error: {e}")
+        tags = []
+    return {
+        "short_description": short_description,
+        "tags": tags,
+    }
 class ArticleFactory(BaseFactory):
                 max_tokens=10000,
             )
             text = response.choices[0].message.content.strip()
             articles = json.loads(text)
             if not isinstance(articles, list):
                 raise ValueError("Expected a JSON list of articles.")
                     await import_articles_from_json(
                         os.path.join(fixture_path, "articles.json"), session
                     )
+                else:
+                    articles_data = self.fetch_articles_from_openai(count)
+                    for article_data in articles_data:
+                        result = await session.execute(
+                            select(Article).filter_by(title=article_data.get("title"))
+                        )
+                        if result.scalars().first():
+                            continue
+                        publish_date_str = article_data.get("publish_date")
+                        try:
+                            publish_date = datetime.fromisoformat(publish_date_str)
+                        except Exception:
+                            publish_date = datetime.now(timezone.utc)
+                        tag_names = article_data.get("tags", [])
+                        tags = await self.get_or_create_tags(session, tag_names)
+                        article = Article(
+                            id=uuid.uuid4(),
+                            title=article_data.get("title"),
+                            publish_date=publish_date,
+                            content=article_data.get("content"),
+                            short_description=article_data.get("short_description"),
+                            author=article_data.get("author"),
+                            tags=tags,
+                            created_at=datetime.now(timezone.utc),
+                            updated_at=datetime.now(timezone.utc),
+                        )
+                        await self.repository(session=session).add(article)
             except Exception as e:
                 await session.rollback()
                 print(f"Error during ArticleFactory seeding: {e}")
             await self.repository(session=session).delete_where(Article.id.is_not(None))
             await session.commit()
 def parse_vietnamese_datetime(date_str: str) -> datetime | None:
     """
+    Tries to parse common Vietnamese datetime string formats, including RFC 1123.
     Returns a timezone-aware datetime object (UTC) or None if parsing fails.
     """
     if not date_str or not isinstance(date_str, str):
         return None
+    # First: handle ISO8601 with 'T' and timezone info
     if "T" in date_str and ("Z" in date_str or "+" in date_str or "-" in date_str[10:]):
         try:
             dt = datetime.fromisoformat(date_str)
         except ValueError:
             pass
+    # Try known formats, including RFC 1123
     formats_to_try = [
         "%d/%m/%Y %H:%M:%S",
         "%d/%m/%Y %H:%M",
         "%d-%m-%Y %H:%M",
         "%Y-%m-%d %H:%M:%S",
         "%Y/%m/%d %H:%M:%S",
+        "%a, %d %b %Y %H:%M:%S GMT",  # RFC 1123 (e.g., "Sun, 01 Jun 2025 01:16:00 GMT")
     ]
     for fmt in formats_to_try:
         try:
             dt_naive = datetime.strptime(date_str.strip(), fmt)
             return dt_aware
         except ValueError:
             continue
     print(f"Warning: Could not parse date string: {date_str}")
     return None
 async def get_or_create_tags(session: AsyncSession, tag_names: List[str]) -> List[Tag]:
     """
     Retrieves existing Tag objects or creates new ones for each tag name.
         return None
     gemini_data = await generate_tags_and_summary(html_content)
     tag_names = gemini_data.get("tags", [])
+    short_description = gemini_data.get("short_description")
+    if not short_description:
+        short_description = "Tóm tắt không có sẵn."
     if not tag_names:
         print(f"No tags generated for article: {title}")
     publish_date = parse_vietnamese_datetime(published_date_str)
             f"Using current time for article '{title}' due to unparseable date: {published_date_str}"
         )
         publish_date = datetime.now(timezone.utc)
+    # db_tags = await get_or_create_tags(session, tag_names)
     new_article = Article(
         title=title,
         publish_date=publish_date,
         content=html_content,
         short_description=short_description[:499],
         author=source_name,
+        tags=[],
     )
     return new_article
     articles_to_add = []
     for i, item_data in enumerate(data_from_json):
+        if i > 2:
+            break
         print(f"\n--- Processing item {i+1}/{len(data_from_json)} ---")
         article_obj = await process_article_data(session, item_data)
         if article_obj: