Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Sleeping

App Files Files Community

sofhiaazzhr committed 9 days ago

Commit

959b1b0

1 Parent(s): 5f86993

[NOTICKET][doc] remove column filter and fallback cap for full-schema approach

Browse files

Files changed (1) hide show

src/query/executors/tabular.py +10 -18

src/query/executors/tabular.py CHANGED Viewed

@@ -1,9 +1,13 @@
 """Executor for tabular document sources (source_type="document", file_type csv/xlsx).
 Flow:
   1. Group RetrievalResult chunks by (document_id, sheet_name).
   2. Per group: download Parquet from Azure Blob → pandas DataFrame.
-  3. Build schema context from DataFrame columns + sample values.
   4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
   5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
   6. Fallback to raw rows if all retries fail.
@@ -28,7 +32,6 @@ logger = get_logger("tabular_executor")
 class _GroupInfo(TypedDict):
-    columns: list[str]
     filename: str
     file_type: str
@@ -225,7 +228,7 @@ class TabularExecutor(BaseExecutor):
         if not tabular:
             return []
-        # Group by (document_id, sheet_name) → collect relevant column names
         groups: dict[tuple[str, str | None], _GroupInfo] = {}
         for r in tabular:
             data = r.metadata.get("data", {})
@@ -233,29 +236,18 @@ class TabularExecutor(BaseExecutor):
             if not doc_id:
                 continue
             sheet_name = data.get("sheet_name")  # None for CSV
-            col_name = data.get("column_name")
-            filename = data.get("filename", "")
-            file_type = data.get("file_type", "")
             key = (doc_id, sheet_name)
             if key not in groups:
                 groups[key] = {
-                    "columns": [],
-                    "filename": filename,
-                    "file_type": file_type,
                 }
-            if col_name and col_name not in groups[key]["columns"]:
-                groups[key]["columns"].append(col_name)
         async def _process_group(
             doc_id: str, sheet_name: str | None, info: _GroupInfo
         ) -> QueryResult | None:
             try:
                 df = await download_parquet(user_id, doc_id, sheet_name)
-                if info["columns"]:
-                    valid_cols = [c for c in info["columns"] if c in df.columns]
-                    if valid_cols:
-                        df = df[valid_cols]
                 df_result = await self._query_with_agent(df, question, limit)
                 table_label = info["filename"]
@@ -321,9 +313,9 @@ class TabularExecutor(BaseExecutor):
                 prev_error = str(e)
                 logger.warning("tabular agent error", attempt=attempt + 1, error=prev_error)
-        # Fallback: return raw rows
         logger.warning("tabular agent failed after retries, returning raw rows")
-        return df.head(limit)[df.columns[:20]]
 tabular_executor = TabularExecutor()

 """Executor for tabular document sources (source_type="document", file_type csv/xlsx).
+Receives sheet-level RetrievalResults from SchemaRetriever (each result
+represents a relevant sheet, with its full column list available via
+data.column_names in metadata).
 Flow:
   1. Group RetrievalResult chunks by (document_id, sheet_name).
   2. Per group: download Parquet from Azure Blob → pandas DataFrame.
+  3. Build schema context from full DataFrame columns + sample values.
   4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
   5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
   6. Fallback to raw rows if all retries fail.
 class _GroupInfo(TypedDict):
     filename: str
     file_type: str
         if not tabular:
             return []
+        # Group by (document_id, sheet_name) — one parquet download per group
         groups: dict[tuple[str, str | None], _GroupInfo] = {}
         for r in tabular:
             data = r.metadata.get("data", {})
             if not doc_id:
                 continue
             sheet_name = data.get("sheet_name")  # None for CSV
             key = (doc_id, sheet_name)
             if key not in groups:
                 groups[key] = {
+                    "filename": data.get("filename", ""),
+                    "file_type": data.get("file_type", ""),
                 }
         async def _process_group(
             doc_id: str, sheet_name: str | None, info: _GroupInfo
         ) -> QueryResult | None:
             try:
                 df = await download_parquet(user_id, doc_id, sheet_name)
                 df_result = await self._query_with_agent(df, question, limit)
                 table_label = info["filename"]
                 prev_error = str(e)
                 logger.warning("tabular agent error", attempt=attempt + 1, error=prev_error)
+        # Fallback: return raw rows (all columns — chat.py caps rows at 20 before LLM)
         logger.warning("tabular agent failed after retries, returning raw rows")
+        return df.head(limit)
 tabular_executor = TabularExecutor()