sofhiaazzhr committed on
Commit
73b7fe3
·
1 Parent(s): 36ffff4

make executors self-contained, remove redundant pre-filter

Browse files
src/query/executors/tabular.py CHANGED
@@ -1,13 +1,9 @@
1
  """Executor for tabular document sources (source_type="document", file_type csv/xlsx).
2
 
3
- Receives sheet-level RetrievalResults from SchemaRetriever (each result
4
- represents a relevant sheet, with its full column list available via
5
- data.column_names in metadata).
6
-
7
  Flow:
8
  1. Group RetrievalResult chunks by (document_id, sheet_name).
9
  2. Per group: download Parquet from Azure Blob → pandas DataFrame.
10
- 3. Build schema context from full DataFrame columns + sample values.
11
  4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
12
  5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
13
  6. Fallback to raw rows if all retries fail.
@@ -50,12 +46,8 @@ IMPORTANT rules:
50
  - For filter with comparison (>, <, >=, <=, !=): set filter_operator accordingly (gt, lt, gte, lte, ne). Default is eq (==).
51
  - For multi-condition filters (AND logic), use the filters field as a list of {{"col", "value", "op"}} dicts instead of filter_col/filter_value.
52
  Example: status=SUCCESS AND amount_paid>200000 → filters=[{{"col":"status","value":"SUCCESS","op":"eq"}},{{"col":"amount_paid","value":"200000","op":"gt"}}]
53
- - IMPORTANT: When the question uses "or" / "atau" between values of the same column, you MUST use or_filters (NOT filters).
54
- or_filters applies OR logic: rows matching ANY condition are kept.
55
- filters applies AND logic: rows must match ALL conditions.
56
- Example: "(status FAILED or REVERSED) AND payment_channel=Tokopedia" →
57
- or_filters=[{{"col":"status","value":"FAILED","op":"eq"}},{{"col":"status","value":"REVERSED","op":"eq"}}]
58
- filters=[{{"col":"payment_channel","value":"Tokopedia","op":"eq"}}]
59
  - For groupby with a pre-filter (e.g. count SUCCESS per channel): use filters or or_filters to narrow rows first, then use groupby_count/groupby_sum/groupby_avg on the filtered data by setting both filters and group_col.
60
 
61
  Schema:
@@ -85,9 +77,6 @@ class TabularOperation(BaseModel):
85
 
86
  def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
87
  numeric = pd.to_numeric(df[col], errors="coerce")
88
- coerced_nulls = numeric.isnull() & df[col].notna()
89
- if coerced_nulls.any():
90
- logger.warning("numeric coercion introduced NaN", col=col, count=int(coerced_nulls.sum()))
91
  if operator == "eq":
92
  return df[col].astype(str) == str(value)
93
  elif operator == "ne":
@@ -104,23 +93,7 @@ def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> p
104
 
105
 
106
  def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
107
- numeric = pd.to_numeric(df[col], errors="coerce")
108
- coerced_nulls = numeric.isnull() & df[col].notna()
109
- if coerced_nulls.any():
110
- logger.warning("numeric coercion introduced NaN", col=col, count=int(coerced_nulls.sum()))
111
- if operator == "eq":
112
- return df[df[col].astype(str) == str(value)]
113
- elif operator == "ne":
114
- return df[df[col].astype(str) != str(value)]
115
- elif operator == "gt":
116
- return df[numeric > float(value)]
117
- elif operator == "gte":
118
- return df[numeric >= float(value)]
119
- elif operator == "lt":
120
- return df[numeric < float(value)]
121
- elif operator == "lte":
122
- return df[numeric <= float(value)]
123
- raise ValueError(f"Unknown operator: {operator}")
124
 
125
 
126
  def _build_schema_context(df: pd.DataFrame) -> str:
@@ -181,15 +154,9 @@ def _apply_operation(df: pd.DataFrame, op: TabularOperation, limit: int) -> pd.D
181
  raise ValueError(f"sort requires sort_col, got {op}")
182
  return df.sort_values(op.sort_col, ascending=op.ascending).head(limit)
183
  elif op.operation == "aggregate":
184
- if not op.agg_func:
185
- raise ValueError(f"aggregate requires agg_func, got {op}")
186
- if op.agg_func == "count":
187
- if not op.value_col:
188
- return pd.DataFrame([{"column_name": c, "dtype": str(df[c].dtype)} for c in df.columns])
189
- return pd.DataFrame([{"count": int(df[op.value_col].count()), "operation": "count"}])
190
- if not op.value_col:
191
- raise ValueError(f"aggregate requires value_col for {op.agg_func}, got {op}")
192
- funcs = {"sum": "sum", "avg": "mean", "min": "min", "max": "max"}
193
  value = getattr(df[op.value_col], funcs[op.agg_func])()
194
  return pd.DataFrame([{op.value_col: value, "operation": op.agg_func}])
195
  else: # "raw"
@@ -279,7 +246,6 @@ class TabularExecutor(BaseExecutor):
279
  )
280
  return None
281
 
282
- # Each group runs independently — cross-file JOIN is out of scope for v1.
283
  gathered = await asyncio.gather(*[
284
  _process_group(doc_id, sheet_name, info)
285
  for (doc_id, sheet_name), info in groups.items()
@@ -313,7 +279,7 @@ class TabularExecutor(BaseExecutor):
313
  prev_error = str(e)
314
  logger.warning("tabular agent error", attempt=attempt + 1, error=prev_error)
315
 
316
- # Fallback: return raw rows (all columns — chat.py caps rows at 20 before LLM)
317
  logger.warning("tabular agent failed after retries, returning raw rows")
318
  return df.head(limit)
319
 
 
1
  """Executor for tabular document sources (source_type="document", file_type csv/xlsx).
2
 
 
 
 
 
3
  Flow:
4
  1. Group RetrievalResult chunks by (document_id, sheet_name).
5
  2. Per group: download Parquet from Azure Blob → pandas DataFrame.
6
+ 3. Build schema context from DataFrame columns + sample values.
7
  4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
8
  5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
9
  6. Fallback to raw rows if all retries fail.
 
46
  - For filter with comparison (>, <, >=, <=, !=): set filter_operator accordingly (gt, lt, gte, lte, ne). Default is eq (==).
47
  - For multi-condition filters (AND logic), use the filters field as a list of {{"col", "value", "op"}} dicts instead of filter_col/filter_value.
48
  Example: status=SUCCESS AND amount_paid>200000 → filters=[{{"col":"status","value":"SUCCESS","op":"eq"}},{{"col":"amount_paid","value":"200000","op":"gt"}}]
49
+ - For OR conditions on a column (e.g. value is A or B), use or_filters. Combine with filters for mixed AND+OR logic.
50
+ Example: (status=FAILED OR status=REVERSED) AND payment_channel=X → or_filters=[{{"col":"status","value":"FAILED","op":"eq"}},{{"col":"status","value":"REVERSED","op":"eq"}}], filters=[{{"col":"payment_channel","value":"X","op":"eq"}}]
 
 
 
 
51
  - For groupby with a pre-filter (e.g. count SUCCESS per channel): use filters or or_filters to narrow rows first, then use groupby_count/groupby_sum/groupby_avg on the filtered data by setting both filters and group_col.
52
 
53
  Schema:
 
77
 
78
  def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
79
  numeric = pd.to_numeric(df[col], errors="coerce")
 
 
 
80
  if operator == "eq":
81
  return df[col].astype(str) == str(value)
82
  elif operator == "ne":
 
93
 
94
 
95
  def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
96
+ return df[_get_filter_mask(df, col, value, operator)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
 
99
  def _build_schema_context(df: pd.DataFrame) -> str:
 
154
  raise ValueError(f"sort requires sort_col, got {op}")
155
  return df.sort_values(op.sort_col, ascending=op.ascending).head(limit)
156
  elif op.operation == "aggregate":
157
+ if not op.value_col or not op.agg_func:
158
+ raise ValueError(f"aggregate requires value_col and agg_func, got {op}")
159
+ funcs = {"sum": "sum", "avg": "mean", "min": "min", "max": "max", "count": "count"}
 
 
 
 
 
 
160
  value = getattr(df[op.value_col], funcs[op.agg_func])()
161
  return pd.DataFrame([{op.value_col: value, "operation": op.agg_func}])
162
  else: # "raw"
 
246
  )
247
  return None
248
 
 
249
  gathered = await asyncio.gather(*[
250
  _process_group(doc_id, sheet_name, info)
251
  for (doc_id, sheet_name), info in groups.items()
 
279
  prev_error = str(e)
280
  logger.warning("tabular agent error", attempt=attempt + 1, error=prev_error)
281
 
282
+ # Fallback: return raw rows
283
  logger.warning("tabular agent failed after retries, returning raw rows")
284
  return df.head(limit)
285
 
src/query/query_executor.py CHANGED
@@ -22,19 +22,9 @@ class QueryExecutor:
22
  question: str,
23
  limit: int = 100,
24
  ) -> list[QueryResult]:
25
- db_results = [r for r in results if r.source_type == "database"]
26
- tabular_results = [
27
- r for r in results
28
- if r.source_type == "document"
29
- and r.metadata.get("data", {}).get("file_type") in ("csv", "xlsx")
30
- ]
31
-
32
- async def _empty() -> list[QueryResult]:
33
- return []
34
-
35
  batches = await asyncio.gather(
36
- db_executor.execute(db_results, user_id, db, question, limit) if db_results else _empty(),
37
- tabular_executor.execute(tabular_results, user_id, db, question, limit) if tabular_results else _empty(),
38
  return_exceptions=True,
39
  )
40
 
 
22
  question: str,
23
  limit: int = 100,
24
  ) -> list[QueryResult]:
 
 
 
 
 
 
 
 
 
 
25
  batches = await asyncio.gather(
26
+ db_executor.execute(results, user_id, db, question, limit),
27
+ tabular_executor.execute(results, user_id, db, question, limit),
28
  return_exceptions=True,
29
  )
30