File size: 5,693 Bytes
b107fad
 
 
 
 
e5c9b01
e70a1f7
b107fad
 
 
 
 
 
e70a1f7
b107fad
e70a1f7
b107fad
e70a1f7
b107fad
 
 
e70a1f7
b107fad
 
 
 
 
e70a1f7
b107fad
e70a1f7
b107fad
 
e70a1f7
b107fad
 
 
e70a1f7
b107fad
e5c9b01
 
 
b107fad
e5c9b01
 
 
 
c752db9
 
 
 
 
 
 
 
 
 
 
 
 
 
e5c9b01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Excel analysis tool for the AI agent project.
"""

import os
import pandas as pd
from typing import Optional
from .base_tool import EnhancedTool

class ExcelAnalysisTool(EnhancedTool):
    """Tool for analyzing Excel spreadsheets."""
    
    name = "ExcelAnalysisTool"
    description = "Analyze a downloaded Excel or CSV file associated with a task ID."
    inputs = {
        "task_id": {
            "type": "string",
            "description": "Task ID for which the Excel/CSV file has been downloaded"
        },
        "query": {
            "type": "string",
            "description": "Query describing what to analyze in the file",
            "nullable": True
        }
    }
    output_type = "string"
    
    def forward(self, task_id: str, query: Optional[str] = None) -> str:
        """
        Analyze an Excel/CSV file.
        
        Args:
            task_id: Task ID for which the file has been downloaded
            query: Query describing what to analyze
            
        Returns:
            Analysis results
        """
        try:
            # Construct filename based on task_id
            filename = f"{task_id}_downloaded_file"
            
            # Check if file exists
            if not os.path.exists(filename):
                return f"Error: File for task {task_id} does not exist. Please download it first."
            
            # Try to determine file type and read accordingly
            try:
                # First try Excel format
                df = pd.read_excel(filename, engine="openpyxl")
            except Exception as excel_error:
                # If Excel reading fails, try CSV
                try:
                    df = pd.read_csv(filename)
                except Exception as csv_error:
                    # If CSV reading fails, try TSV
                    try:
                        df = pd.read_csv(filename, sep='\t')
                    except Exception as tsv_error:
                        return f"Error: Unable to read file as Excel, CSV, or TSV. Original error: {str(excel_error)}"
            
            # Basic analysis if no specific query
            if not query:
                # Create a basic overview of the data
                info = []
                info.append(f"Number of rows: {df.shape[0]}")
                info.append(f"Number of columns: {df.shape[1]}")
                info.append(f"Column names: {', '.join(df.columns.tolist())}")
                
                # Data types
                info.append("\nData types:")
                for column, dtype in df.dtypes.items():
                    info.append(f"- {column}: {dtype}")
                
                # Basic statistics for numeric columns
                if df.select_dtypes(include=['number']).shape[1] > 0:
                    info.append("\nBasic statistics for numeric columns:")
                    desc = df.describe().to_string()
                    info.append(desc)
                
                # Sample data
                info.append("\nFirst 5 rows:")
                info.append(df.head().to_string())
                
                return "\n".join(info)
            
            # Handle specific query
            query_lower = query.lower()
            
            if "sum" in query_lower or "total" in query_lower:
                # Extract column name if specified
                for col in df.columns:
                    if col.lower() in query_lower and col in df.select_dtypes(include=['number']).columns:
                        return f"Sum of values in column {col}: {df[col].sum()}"
                # Otherwise sum all numeric columns
                return df.select_dtypes(include=['number']).sum().to_string()
                
            elif "average" in query_lower or "mean" in query_lower:
                # Extract column name if specified
                for col in df.columns:
                    if col.lower() in query_lower and col in df.select_dtypes(include=['number']).columns:
                        return f"Average value in column {col}: {df[col].mean()}"
                # Otherwise calculate means for all numeric columns
                return df.select_dtypes(include=['number']).mean().to_string()
                
            elif "maximum" in query_lower or "max" in query_lower:
                # Extract column name if specified
                for col in df.columns:
                    if col.lower() in query_lower and col in df.select_dtypes(include=['number']).columns:
                        return f"Maximum value in column {col}: {df[col].max()}"
                # Otherwise find maxima for all numeric columns
                return df.select_dtypes(include=['number']).max().to_string()
                
            elif "minimum" in query_lower or "min" in query_lower:
                # Extract column name if specified
                for col in df.columns:
                    if col.lower() in query_lower and col in df.select_dtypes(include=['number']).columns:
                        return f"Minimum value in column {col}: {df[col].min()}"
                # Otherwise find minima for all numeric columns
                return df.select_dtypes(include=['number']).min().to_string()
                
            elif "count" in query_lower:
                # Count non-empty values
                return df.count().to_string()
                
            else:
                # Default: return basic info and first few rows
                return f"Analysis for query '{query}':\n\n{df.head().to_string()}"
                
        except Exception as e:
            return f"Error analyzing Excel file: {str(e)}"