sofhiaazzhr (Claude Sonnet 4.6) committed
Commit 8daf9b5 · Parent: b59ef76

feat: add sheet-level chunk on CSV/XLSX ingestion


Each tabular file now produces a sheet-level chunk (chunk_level=sheet)
alongside per-column chunks, enabling schema-aware retrieval without
loading the full Parquet at search time.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
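Downstream, a retrieval layer can route on the new chunk_level metadata to answer schema questions ("what columns does this file have?") from the sheet-level chunk alone. A minimal sketch of that routing, not part of this commit; the chunk texts, document_id, and helper are illustrative only:

from langchain_core.documents import Document

# Illustrative chunks for one ingested CSV: a sheet-level chunk plus a
# per-column chunk, shaped like the metadata written in this commit.
# The page_content strings and document_id are made up for the example.
chunks = [
    Document(
        page_content=(
            "Source: orders.csv (3 rows)\n"
            "Columns (3): order_id (int64), amount (float64), city (object)"
        ),
        metadata={"chunk_level": "sheet", "data": {"document_id": "doc-1"}},
    ),
    Document(
        page_content="Column 'amount' of orders.csv: float64, 3 non-null values",
        metadata={"chunk_level": "column", "data": {"document_id": "doc-1"}},
    ),
]

def schema_chunks(docs: list[Document]) -> list[Document]:
    # Keep only sheet-level chunks, so schema questions can be answered
    # without opening the stored Parquet.
    return [d for d in docs if d.metadata.get("chunk_level") == "sheet"]

for doc in schema_chunks(chunks):
    print(doc.page_content)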

Files changed (1)
  1. src/knowledge/processing_service.py +32 -1
src/knowledge/processing_service.py CHANGED
@@ -156,6 +156,7 @@ class KnowledgeProcessingService:
                 metadata={
                     "user_id": db_doc.user_id,
                     "source_type": "document",
+                    "chunk_level": "column",
                     "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
                     "data": {
                         "document_id": db_doc.id,
@@ -169,12 +170,40 @@ class KnowledgeProcessingService:
             ))
         return documents
 
+    def _to_sheet_document(
+        self, df: pd.DataFrame, db_doc: DBDocument, sheet_name: str | None, source_name: str
+    ) -> LangChainDocument:
+        col_summary = ", ".join(f"{c} ({df[c].dtype})" for c in df.columns)
+        text = (
+            f"Source: {source_name} ({len(df)} rows)\n"
+            f"Columns ({len(df.columns)}): {col_summary}"
+        )
+        return LangChainDocument(
+            page_content=text,
+            metadata={
+                "user_id": db_doc.user_id,
+                "source_type": "document",
+                "chunk_level": "sheet",
+                "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
+                "data": {
+                    "document_id": db_doc.id,
+                    "filename": db_doc.filename,
+                    "file_type": db_doc.file_type,
+                    "sheet_name": sheet_name,
+                    "column_names": list(df.columns),
+                    "row_count": len(df),
+                },
+            },
+        )
+
     async def _build_csv_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
         """Profile each column of a CSV file and upload Parquet to Azure Blob."""
         df = pd.read_csv(BytesIO(content))
         await upload_parquet(df, db_doc.user_id, db_doc.id)
         logger.info(f"Uploaded Parquet for CSV {db_doc.id}")
-        return self._profile_dataframe(df, db_doc.filename, db_doc)
+        docs = self._profile_dataframe(df, db_doc.filename, db_doc)
+        docs.append(self._to_sheet_document(df, db_doc, sheet_name=None, source_name=db_doc.filename))
+        return docs
 
     async def _build_excel_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
         """Profile each column of every sheet in an Excel file and upload one Parquet per sheet."""
@@ -185,7 +214,9 @@ class KnowledgeProcessingService:
             docs = self._profile_dataframe(df, source_name, db_doc)
             for doc in docs:
                 doc.metadata["data"]["sheet_name"] = sheet_name
+                doc.metadata["chunk_level"] = "column"
             documents.extend(docs)
+            documents.append(self._to_sheet_document(df, db_doc, sheet_name, source_name))
             await upload_parquet(df, db_doc.user_id, db_doc.id, sheet_name)
             logger.info(f"Uploaded Parquet for sheet '{sheet_name}' of {db_doc.id}")
         return documents
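
For reference, the sheet-level chunk text produced by _to_sheet_document can be reproduced standalone; the DataFrame contents and file name below are hypothetical:

import pandas as pd

# Hypothetical data standing in for a parsed CSV.
df = pd.DataFrame({
    "order_id": [1, 2, 3],
    "amount": [9.5, 12.0, 3.25],
    "city": ["Jakarta", "Bandung", "Medan"],
})

# Same text construction as _to_sheet_document in the diff above.
source_name = "orders.csv"
col_summary = ", ".join(f"{c} ({df[c].dtype})" for c in df.columns)
text = (
    f"Source: {source_name} ({len(df)} rows)\n"
    f"Columns ({len(df.columns)}): {col_summary}"
)
print(text)
# Source: orders.csv (3 rows)
# Columns (3): order_id (int64), amount (float64), city (object)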