Arslan1997 committed · Commit 11794cc · 0 parents
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. .dockerignore +18 -0
  2. .env-template +15 -0
  3. .gitattributes +38 -0
  4. .gitignore +48 -0
  5. Dockerfile +34 -0
  6. Housing.csv +546 -0
  7. Procfile +1 -0
  8. README.md +10 -0
  9. agents_config.json +149 -0
  10. app.py +1589 -0
  11. docs/README.md +251 -0
  12. docs/api/README.md +23 -0
  13. docs/api/routes/analytics.md +562 -0
  14. docs/api/routes/chats.md +181 -0
  15. docs/api/routes/code.md +182 -0
  16. docs/api/routes/deep_analysis.md +348 -0
  17. docs/api/routes/feedback.md +153 -0
  18. docs/api/routes/session.md +273 -0
  19. docs/api/routes/templates.md +363 -0
  20. docs/architecture/architecture.md +427 -0
  21. docs/development/development_workflow.md +506 -0
  22. docs/getting_started.md +273 -0
  23. docs/system/database-schema.md +289 -0
  24. docs/system/shared_dataframe.md +91 -0
  25. docs/troubleshooting/troubleshooting.md +537 -0
  26. entrypoint_local.sh +175 -0
  27. images/AI snapshot-chat.png +3 -0
  28. images/Auto-Analyst Banner.png +3 -0
  29. images/Auto-analyst-poster.png +3 -0
  30. images/Auto-analysts icon small.png +3 -0
  31. images/auto-analyst logo.png +3 -0
  32. requirements.txt +62 -0
  33. scripts/__init__.py +0 -0
  34. scripts/format_response.py +1112 -0
  35. scripts/init_production_db.py +191 -0
  36. scripts/populate_agent_templates.py +508 -0
  37. scripts/tier_maker.py +86 -0
  38. src/__init__.py +0 -0
  39. src/agents/agents.py +0 -0
  40. src/agents/deep_agents.py +1085 -0
  41. src/agents/marketing_analytics_agents.py +75 -0
  42. src/agents/memory_agents.py +68 -0
  43. src/agents/retrievers/retrievers.py +153 -0
  44. src/db/__init__.py +0 -0
  45. src/db/init_db.py +68 -0
  46. src/db/schemas/__init__.py +0 -0
  47. src/db/schemas/models.py +237 -0
  48. src/managers/ai_manager.py +84 -0
  49. src/managers/chat_manager.py +944 -0
  50. src/managers/session_manager.py +437 -0
.dockerignore ADDED
@@ -0,0 +1,18 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.db
+ *.sqlite3
+ *.log
+ .env
+ venv/
+ data/
+ notebooks/
+ *.ipynb
+ .idea/
+ .vscode/
+ .DS_Store
+ # Exclude most JSON files but allow agents_config.json
+ *.json
+ !agents_config.json
.env-template ADDED
@@ -0,0 +1,15 @@
+ OPENAI_API_KEY=your-openai-api-key-here
+ MODEL_PROVIDER=openai # openai, groq, anthropic, gemini
+ MODEL_NAME=gpt-4o-mini
+ TEMPERATURE=0.7
+ MAX_TOKENS=6000
+ GROQ_API_KEY=your-groq-api-key-here
+ ANTHROPIC_API_KEY=your-anthropic-api-key-here
+ GEMINI_API_KEY=your-gemini-api-key-here
+
+ ADMIN_API_KEY=admin123
+
+ DATABASE_URL=sqlite:///chat_database.db
+ ENVIRONMENT="development"
+
+ FRONTEND_URL="http://localhost:3000/"
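For reference, a minimal sketch of how these settings might be read at startup (assuming the backend loads them with python-dotenv; the variable names mirror the template, but the loader itself is illustrative and not code from this commit):

import os
from dotenv import load_dotenv

load_dotenv()  # reads a .env file created from this template

provider = os.getenv("MODEL_PROVIDER", "openai")      # openai, groq, anthropic, gemini
model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
temperature = float(os.getenv("TEMPERATURE", "0.7"))  # env values arrive as strings
max_tokens = int(os.getenv("MAX_TOKENS", "6000"))
database_url = os.getenv("DATABASE_URL", "sqlite:///chat_database.db")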
.gitattributes ADDED
@@ -0,0 +1,38 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ images/Auto-Analyst[[:space:]]Banner.png filter=lfs diff=lfs merge=lfs -text
+ chat_database.db filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,48 @@
+
+ flask_app/_pycache__
+ flask_app/instance
+ flask_app/*.env
+ *.env
+ __pycache__
+
+ venv/
+
+ .env
+
+ try*
+
+ logs/
+
+ updated_code.py
+ sample_code.py
+
+
+ *.dump
+
+ migrations/
+
+ *.pyc
+ alembic.ini
+
+
+ *.db
+
+ schema*.md
+
+ # agent_config.json
+
+
+ notebooks/
+
+
+
+ testing.ipynb
+ redis_index.json
+ email_to_userid_mapping.json
+ redis_backup_20250906_143859.json
+ "*.db"
+ "*.sqlite"
+ "*.sqlite3"
+ "venv/"
+ "__pycache__/"
+ "*.pyc"
Dockerfile ADDED
@@ -0,0 +1,34 @@
+ FROM python:3.12
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+
+ # Verify agents_config.json was copied (it should be in the backend directory)
+ RUN if [ -f "/app/agents_config.json" ]; then \
+     echo "✅ agents_config.json found in container"; \
+     ls -la /app/agents_config.json; \
+   else \
+     echo "⚠️ agents_config.json not found in container - will use fallback templates"; \
+     ls -la /app/ | grep -E "agents|config" || echo "No config files found"; \
+   fi
+
+ # Make entrypoint script executable
+ USER root
+ RUN chmod +x /app/entrypoint_local.sh
+ # Make populate script executable
+ RUN chmod +x /app/scripts/populate_agent_templates.py
+
+ USER user
+
+ # Use the entrypoint script instead of directly running uvicorn
+ CMD ["/app/entrypoint_local.sh"]
Housing.csv ADDED
@@ -0,0 +1,546 @@
+ price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
+ 13300000,7420,4,2,3,yes,yes,no,no,yes,2,no,semi-furnished
+ 12250000,8960,4,4,4,yes,no,yes,yes,yes,3,yes,furnished
+ 12250000,9960,3,2,2,yes,yes,no,no,yes,2,yes,unfurnished
+ 12215000,7500,4,2,2,yes,yes,no,yes,yes,3,no,furnished
+ 11410000,7420,4,1,2,yes,yes,yes,no,no,2,yes,unfurnished
+ 10850000,7500,3,3,1,yes,no,yes,no,no,2,yes,semi-furnished
+ 10150000,8580,4,3,4,yes,yes,no,no,yes,2,yes,semi-furnished
+ 10150000,16200,5,3,2,yes,yes,yes,no,yes,0,no,unfurnished
+ 9870000,8100,4,1,2,yes,yes,no,no,no,2,no,semi-furnished
+ 9800000,5750,3,2,4,yes,no,yes,yes,yes,1,yes,semi-furnished
+ 9800000,13200,3,1,2,yes,yes,no,yes,yes,2,no,semi-furnished
+ 9681000,6000,4,3,2,yes,yes,no,no,yes,2,yes,semi-furnished
+ 9310000,6550,4,2,2,yes,yes,no,no,no,1,no,unfurnished
+ 9240000,3500,4,2,2,yes,yes,no,yes,yes,2,yes,furnished
+ 9240000,7800,3,2,2,yes,no,no,no,no,0,yes,furnished
+ 9100000,6000,4,1,2,yes,yes,no,no,yes,2,no,semi-furnished
+ 9100000,6600,4,2,2,yes,no,yes,yes,no,1,yes,furnished
+ 8960000,8500,3,2,4,yes,no,yes,no,no,2,no,furnished
+ 8890000,4600,3,2,2,yes,no,yes,no,yes,2,no,semi-furnished
+ 8855000,6420,3,2,2,yes,yes,no,yes,no,1,yes,semi-furnished
+ 8750000,4320,3,1,2,yes,no,no,yes,yes,2,yes,furnished
+ 8680000,7155,3,2,1,yes,yes,no,yes,no,2,no,unfurnished
+ 8645000,8050,3,1,1,yes,no,no,no,yes,1,yes,semi-furnished
+ 8645000,4560,3,2,2,yes,no,yes,yes,yes,1,yes,unfurnished
+ 8575000,8800,3,2,2,yes,no,yes,yes,no,2,yes,furnished
+ 8540000,6540,4,2,2,yes,no,no,yes,yes,2,yes,semi-furnished
+ 8463000,6000,3,2,4,yes,no,yes,yes,yes,0,yes,unfurnished
+ 8400000,8875,3,1,1,yes,no,yes,yes,no,1,no,semi-furnished
+ 8400000,7950,5,2,2,yes,no,no,yes,yes,2,no,furnished
+ 8400000,5500,4,2,2,yes,no,yes,no,yes,1,yes,unfurnished
+ 8400000,7475,3,2,4,yes,yes,no,no,no,2,yes,furnished
+ 8400000,7000,3,1,4,yes,yes,yes,yes,no,2,yes,furnished
+ 8295000,4880,4,2,2,yes,no,no,yes,yes,1,no,semi-furnished
+ 8190000,5960,3,3,2,yes,no,no,yes,yes,1,yes,unfurnished
+ 8120000,6840,5,1,2,yes,no,no,no,no,1,no,furnished
+ 8080940,7000,3,2,4,yes,yes,no,no,no,2,no,furnished
+ 8043000,7482,3,2,3,yes,no,yes,no,yes,1,no,unfurnished
+ 7980000,9000,4,2,4,yes,yes,no,no,no,2,yes,furnished
+ 7962500,6000,3,1,4,yes,yes,no,no,yes,2,no,furnished
+ 7910000,6000,4,2,4,yes,yes,no,no,yes,1,yes,unfurnished
+ 7875000,6550,3,1,2,yes,yes,yes,no,yes,0,no,unfurnished
+ 7840000,6360,3,2,4,yes,yes,no,no,yes,0,yes,unfurnished
+ 7700000,6480,3,2,4,yes,no,yes,no,yes,2,no,unfurnished
+ 7700000,6000,4,2,4,yes,no,no,yes,no,2,yes,semi-furnished
+ 7560000,6000,4,2,4,yes,no,yes,yes,no,1,no,unfurnished
+ 7560000,6000,3,2,3,yes,no,yes,yes,no,0,yes,semi-furnished
+ 7525000,6000,3,2,4,yes,no,no,yes,no,1,yes,semi-furnished
+ 7490000,6600,3,1,4,yes,yes,yes,no,yes,3,yes,semi-furnished
+ 7455000,4300,3,2,2,yes,no,no,no,yes,1,yes,furnished
+ 7420000,7440,3,2,1,yes,no,no,yes,yes,0,no,semi-furnished
+ 7420000,7440,3,2,4,yes,yes,no,no,yes,1,no,semi-furnished
+ 7420000,6325,3,1,4,yes,no,yes,yes,no,1,yes,furnished
+ 7350000,6000,4,2,4,yes,yes,yes,no,yes,1,no,furnished
+ 7350000,5150,3,2,4,yes,no,no,yes,yes,2,yes,furnished
+ 7350000,6000,3,2,2,yes,yes,no,no,no,1,yes,unfurnished
+ 7350000,6000,3,1,2,yes,no,yes,no,no,1,no,furnished
+ 7343000,11440,4,1,2,yes,no,no,yes,no,1,yes,furnished
+ 7245000,9000,4,2,4,yes,yes,no,no,no,1,yes,semi-furnished
+ 7210000,7680,4,2,4,yes,yes,no,yes,yes,1,no,furnished
+ 7210000,6000,3,2,4,yes,yes,yes,yes,yes,1,yes,unfurnished
+ 7140000,6000,3,2,2,yes,yes,yes,yes,no,1,yes,furnished
+ 7070000,8880,2,1,1,yes,yes,yes,yes,yes,1,yes,unfurnished
+ 7070000,6240,4,2,2,yes,yes,no,no,yes,1,yes,furnished
+ 7035000,6360,4,2,3,yes,yes,no,no,yes,2,no,furnished
+ 7000000,11175,3,1,1,yes,yes,no,no,no,1,no,furnished
+ 6930000,8880,3,2,2,yes,no,yes,yes,yes,1,no,furnished
+ 6930000,13200,2,1,1,yes,no,no,no,no,1,yes,unfurnished
+ 6895000,7700,3,2,1,yes,yes,no,yes,yes,2,yes,unfurnished
+ 6860000,6000,3,1,1,yes,no,no,no,yes,1,no,semi-furnished
+ 6790000,12090,4,2,2,yes,no,no,no,yes,2,yes,unfurnished
+ 6790000,4000,3,2,2,yes,no,yes,no,no,0,yes,furnished
+ 6755000,6000,4,2,4,yes,no,yes,yes,yes,0,yes,semi-furnished
+ 6720000,5020,3,1,4,yes,yes,no,no,no,0,yes,semi-furnished
+ 6685000,6600,2,2,4,yes,no,yes,no,no,0,no,semi-furnished
+ 6650000,4040,3,1,2,yes,yes,yes,yes,no,1,no,furnished
+ 6650000,4260,4,2,2,yes,no,no,no,yes,0,no,unfurnished
+ 6650000,6420,3,2,3,yes,no,no,yes,yes,0,yes,furnished
+ 6650000,6500,3,2,3,yes,no,yes,no,yes,0,no,semi-furnished
+ 6650000,5700,3,1,1,yes,yes,no,yes,yes,2,no,furnished
+ 6650000,6000,3,2,3,yes,no,yes,yes,no,0,yes,furnished
+ 6629000,6000,3,1,2,yes,yes,no,no,no,1,yes,unfurnished
+ 6615000,4000,3,2,2,yes,no,yes,yes,no,1,no,unfurnished
+ 6615000,10500,3,2,1,yes,yes,no,yes,yes,1,no,semi-furnished
+ 6580000,6000,3,2,4,yes,no,yes,yes,no,0,no,furnished
+ 6510000,3760,3,1,2,yes,yes,yes,yes,yes,2,no,furnished
+ 6510000,8250,3,2,3,yes,yes,yes,yes,no,0,yes,unfurnished
+ 6510000,6670,3,1,3,yes,no,no,yes,yes,0,no,furnished
+ 6475000,3960,3,1,1,yes,yes,yes,no,yes,2,no,furnished
+ 6475000,7410,3,1,1,yes,no,yes,yes,no,2,yes,furnished
+ 6440000,8580,5,3,2,yes,no,no,yes,no,2,no,unfurnished
+ 6440000,5000,3,1,2,yes,no,no,no,no,0,yes,semi-furnished
+ 6419000,6750,2,1,1,yes,no,yes,no,no,2,yes,semi-furnished
+ 6405000,4800,3,2,4,yes,no,yes,no,yes,0,yes,furnished
+ 6300000,7200,3,2,1,yes,no,no,no,no,3,no,semi-furnished
+ 6300000,6000,4,2,4,yes,no,no,no,no,1,no,unfurnished
+ 6300000,4100,3,2,3,yes,no,no,no,no,2,yes,unfurnished
+ 6300000,9000,3,1,1,yes,no,no,no,yes,1,no,semi-furnished
+ 6300000,6400,3,1,1,yes,no,no,yes,yes,1,no,semi-furnished
+ 6293000,6600,3,2,3,yes,no,yes,yes,no,0,yes,furnished
+ 6265000,6000,4,1,3,yes,yes,no,no,yes,0,yes,semi-furnished
+ 6230000,6600,3,2,1,yes,yes,yes,yes,no,0,no,furnished
+ 6230000,5500,3,1,3,yes,no,yes,yes,yes,1,yes,furnished
+ 6195000,5500,3,2,4,yes,no,yes,yes,yes,1,no,unfurnished
+ 6195000,6350,3,2,3,yes,no,no,no,yes,0,no,unfurnished
+ 6195000,5500,3,2,1,yes,no,yes,no,yes,2,no,unfurnished
+ 6160000,4500,3,1,4,yes,no,yes,yes,no,0,no,unfurnished
+ 6160000,5450,4,2,1,yes,no,no,no,no,0,yes,semi-furnished
+ 6125000,6420,3,1,3,yes,no,yes,yes,no,0,yes,unfurnished
+ 6107500,3240,4,1,3,yes,no,yes,yes,yes,1,yes,semi-furnished
+ 6090000,6615,4,2,2,yes,yes,no,yes,no,1,yes,unfurnished
+ 6090000,6600,3,1,1,yes,no,yes,yes,no,2,no,semi-furnished
+ 6090000,8372,3,1,3,yes,yes,yes,no,no,2,no,furnished
+ 6083000,4300,6,2,2,yes,no,yes,no,yes,0,yes,semi-furnished
+ 6083000,9620,3,1,1,yes,no,no,no,yes,2,no,semi-furnished
+ 6020000,6800,2,1,1,yes,yes,no,yes,yes,2,no,unfurnished
+ 6020000,8000,3,1,1,yes,no,no,no,no,2,no,semi-furnished
+ 6020000,6900,3,2,1,yes,yes,yes,yes,no,0,yes,furnished
+ 5950000,3700,4,1,2,yes,no,yes,no,no,0,yes,furnished
+ 5950000,6420,3,1,1,yes,no,no,yes,yes,0,yes,unfurnished
+ 5950000,7020,3,1,1,yes,yes,yes,yes,yes,2,no,furnished
+ 5950000,6540,3,1,1,yes,no,yes,yes,no,2,yes,semi-furnished
+ 5950000,7231,3,1,2,yes,yes,no,no,yes,0,yes,furnished
+ 5950000,6254,4,2,1,yes,no,no,no,yes,1,yes,unfurnished
+ 5950000,7320,4,2,2,yes,yes,yes,no,yes,0,yes,semi-furnished
+ 5950000,6525,3,2,4,yes,yes,yes,no,yes,1,yes,unfurnished
+ 5943000,15600,3,1,1,yes,no,yes,yes,no,2,yes,semi-furnished
+ 5880000,7160,3,1,1,yes,no,no,no,yes,2,no,furnished
+ 5880000,6500,3,2,3,yes,yes,no,no,yes,0,no,unfurnished
+ 5873000,5500,3,1,3,yes,no,no,yes,yes,1,yes,unfurnished
+ 5873000,11460,3,1,3,yes,no,no,yes,yes,2,yes,semi-furnished
+ 5866000,4800,3,1,1,yes,no,yes,yes,no,0,no,unfurnished
+ 5810000,5828,4,1,4,yes,yes,no,no,no,0,no,semi-furnished
+ 5810000,5200,3,1,3,yes,yes,no,yes,yes,0,yes,semi-furnished
+ 5810000,4800,3,1,3,yes,yes,no,yes,yes,0,yes,semi-furnished
+ 5803000,7000,3,1,1,yes,yes,no,yes,yes,2,yes,semi-furnished
+ 5775000,6000,3,2,4,yes,yes,no,no,yes,0,yes,furnished
+ 5740000,5400,4,2,2,yes,yes,yes,yes,yes,2,no,semi-furnished
+ 5740000,4640,4,1,2,yes,yes,yes,no,yes,1,yes,unfurnished
+ 5740000,5000,3,1,3,yes,yes,no,no,no,0,yes,furnished
+ 5740000,6360,3,1,1,yes,yes,yes,yes,yes,2,yes,unfurnished
+ 5740000,5800,3,2,4,yes,no,no,yes,no,0,yes,furnished
+ 5652500,6660,4,2,2,yes,yes,no,no,no,1,no,semi-furnished
+ 5600000,10500,4,2,2,yes,no,yes,yes,yes,1,no,furnished
+ 5600000,4800,5,2,3,no,no,yes,yes,no,0,no,furnished
+ 5600000,4700,4,1,2,yes,no,no,no,no,1,yes,unfurnished
+ 5600000,5000,3,1,4,yes,yes,no,no,no,0,yes,unfurnished
+ 5600000,10500,2,1,1,yes,yes,no,yes,no,1,no,semi-furnished
+ 5600000,5500,3,2,2,yes,yes,yes,no,no,1,yes,furnished
+ 5600000,6360,3,1,3,yes,yes,no,yes,no,0,yes,furnished
+ 5600000,6600,4,2,1,yes,no,no,yes,yes,0,no,furnished
+ 5600000,5136,3,1,2,yes,yes,no,yes,no,0,no,semi-furnished
+ 5565000,4400,4,1,2,yes,yes,yes,no,no,2,no,semi-furnished
+ 5565000,5400,5,1,2,yes,yes,no,yes,no,0,no,semi-furnished
+ 5530000,3300,3,3,2,yes,yes,no,yes,yes,0,yes,semi-furnished
+ 5530000,3650,3,2,2,yes,yes,yes,yes,no,2,yes,semi-furnished
+ 5530000,6100,3,2,1,yes,no,no,no,yes,2,no,unfurnished
+ 5523000,6900,3,1,1,yes,yes,no,no,no,0,yes,furnished
+ 5495000,2817,4,2,2,no,no,yes,no,yes,1,no,furnished
+ 5495000,7980,3,1,1,yes,yes,no,no,yes,2,no,furnished
+ 5460000,3150,3,2,1,yes,no,yes,no,no,0,yes,furnished
+ 5460000,6210,4,1,4,yes,yes,yes,yes,no,0,no,furnished
+ 5460000,6100,3,1,3,yes,yes,no,no,yes,0,yes,furnished
+ 5460000,6600,4,2,2,yes,no,no,no,no,0,yes,furnished
+ 5425000,6825,3,1,1,yes,no,no,no,yes,0,yes,furnished
+ 5390000,6710,3,2,2,yes,no,no,yes,yes,1,yes,unfurnished
+ 5383000,6450,3,2,1,yes,yes,yes,no,yes,0,no,furnished
+ 5320000,7800,3,1,1,yes,no,no,no,yes,2,yes,unfurnished
+ 5285000,4600,2,2,1,yes,yes,yes,yes,yes,2,no,semi-furnished
+ 5250000,4260,4,1,2,yes,yes,no,no,no,0,no,furnished
+ 5250000,6540,4,2,2,no,no,no,yes,yes,0,no,semi-furnished
+ 5250000,5500,3,2,1,yes,no,no,yes,yes,0,no,semi-furnished
+ 5250000,10269,3,1,1,yes,yes,yes,yes,no,1,yes,semi-furnished
+ 5250000,8400,3,1,2,yes,yes,no,yes,yes,2,yes,unfurnished
+ 5250000,5300,4,2,1,yes,no,no,yes,no,0,no,semi-furnished
+ 5250000,3800,3,1,2,yes,no,no,no,no,1,no,semi-furnished
+ 5250000,9800,4,2,2,yes,no,no,yes,no,2,yes,semi-furnished
+ 5250000,8520,3,1,1,yes,yes,yes,yes,yes,2,no,furnished
+ 5243000,6050,3,1,1,yes,yes,no,no,yes,0,yes,furnished
+ 5229000,7085,3,1,1,yes,yes,yes,yes,yes,2,no,unfurnished
+ 5215000,3180,3,2,2,yes,yes,yes,no,no,2,yes,unfurnished
+ 5215000,4500,4,2,1,no,yes,yes,no,yes,2,no,furnished
+ 5215000,7200,3,1,2,yes,yes,yes,no,no,1,no,unfurnished
+ 5145000,3410,3,1,2,no,no,yes,yes,yes,0,yes,furnished
+ 5145000,7980,3,1,1,yes,yes,yes,yes,no,1,no,semi-furnished
+ 5110000,3000,3,2,2,yes,yes,no,yes,yes,0,no,semi-furnished
+ 5110000,3000,3,1,2,yes,yes,yes,no,yes,0,yes,unfurnished
+ 5110000,11410,2,1,2,yes,no,yes,yes,yes,0,no,furnished
+ 5110000,6100,3,1,1,yes,yes,yes,yes,no,0,yes,furnished
+ 5075000,5720,2,1,2,yes,yes,no,yes,no,0,yes,unfurnished
+ 5040000,3540,2,1,1,no,no,no,no,yes,0,no,furnished
+ 5040000,7600,4,1,2,yes,yes,yes,no,yes,2,no,semi-furnished
+ 5040000,10700,3,1,2,yes,yes,no,yes,no,0,no,unfurnished
+ 5040000,6600,3,1,1,yes,yes,yes,yes,no,0,no,unfurnished
+ 5033000,4800,2,1,1,yes,yes,yes,yes,no,0,no,furnished
+ 5005000,8150,3,2,1,yes,yes,no,yes,yes,0,no,unfurnished
+ 4970000,4410,4,3,2,yes,no,yes,no,no,2,no,furnished
+ 4970000,7686,3,1,1,yes,no,no,no,yes,0,yes,furnished
+ 4956000,2800,3,2,2,no,no,no,no,no,1,no,furnished
+ 4935000,5948,3,1,2,yes,yes,yes,no,yes,0,yes,furnished
+ 4907000,4200,3,1,2,yes,yes,yes,yes,no,1,no,furnished
+ 4900000,4520,3,1,2,yes,yes,no,yes,no,0,no,unfurnished
+ 4900000,4095,3,1,2,no,no,yes,yes,no,0,yes,semi-furnished
+ 4900000,4120,2,1,1,yes,yes,yes,no,no,1,no,semi-furnished
+ 4900000,5400,4,1,2,yes,yes,yes,yes,no,0,no,unfurnished
+ 4900000,4770,3,1,1,yes,no,no,no,yes,0,yes,unfurnished
+ 4900000,6300,3,1,1,yes,yes,no,no,no,2,no,semi-furnished
+ 4900000,5800,2,1,1,yes,no,yes,yes,no,0,no,semi-furnished
+ 4900000,3000,3,1,2,yes,no,no,yes,no,0,no,semi-furnished
+ 4900000,2970,3,1,3,yes,no,no,yes,yes,0,yes,semi-furnished
+ 4900000,6720,3,1,1,yes,yes,no,no,no,0,no,semi-furnished
+ 4900000,4646,3,1,2,yes,yes,no,yes,yes,2,no,furnished
+ 4900000,12900,3,1,1,yes,yes,no,no,no,2,no,unfurnished
+ 4893000,3420,4,2,2,yes,yes,yes,yes,no,2,no,furnished
+ 4893000,4995,4,2,1,yes,yes,yes,no,yes,0,yes,semi-furnished
+ 4865000,4350,2,1,1,yes,yes,yes,no,no,0,yes,semi-furnished
+ 4830000,4160,3,1,3,yes,yes,no,yes,yes,0,no,semi-furnished
+ 4830000,6040,3,1,1,yes,no,no,no,yes,2,no,furnished
+ 4830000,6862,3,1,2,yes,yes,yes,no,no,2,yes,furnished
+ 4830000,4815,2,1,1,yes,no,no,yes,yes,0,yes,semi-furnished
+ 4795000,7000,3,1,2,yes,yes,yes,no,no,0,no,furnished
+ 4795000,8100,4,1,4,yes,yes,yes,yes,no,2,yes,furnished
+ 4767000,3420,4,2,2,yes,yes,yes,no,yes,0,no,semi-furnished
+ 4760000,9166,2,1,1,yes,no,no,yes,yes,2,yes,furnished
+ 4760000,6321,3,1,2,yes,no,no,yes,yes,1,yes,unfurnished
+ 4760000,10240,2,1,1,yes,no,yes,no,yes,2,yes,unfurnished
+ 4753000,6440,2,1,1,yes,no,yes,no,no,3,yes,unfurnished
+ 4690000,5170,3,1,4,yes,yes,yes,no,no,0,no,unfurnished
+ 4690000,6000,2,1,1,yes,no,yes,no,yes,1,yes,semi-furnished
+ 4690000,3630,3,1,2,yes,yes,no,no,no,2,yes,semi-furnished
+ 4690000,9667,4,2,2,yes,yes,no,yes,no,1,no,semi-furnished
+ 4690000,5400,2,1,2,yes,no,yes,no,no,0,yes,unfurnished
+ 4690000,4320,3,1,1,yes,no,yes,yes,no,0,yes,unfurnished
+ 4655000,3745,3,1,2,yes,no,no,yes,yes,0,no,furnished
+ 4620000,4160,3,1,1,yes,no,no,no,yes,0,yes,furnished
+ 4620000,3880,3,2,2,yes,no,yes,yes,no,2,no,unfurnished
+ 4620000,5680,3,1,2,yes,no,yes,yes,yes,1,no,unfurnished
+ 4620000,2870,2,1,2,yes,no,yes,yes,yes,0,no,unfurnished
+ 4620000,5010,3,1,2,yes,no,no,yes,yes,0,no,unfurnished
+ 4613000,4510,4,2,2,yes,yes,no,no,no,0,yes,furnished
+ 4585000,4000,3,1,2,yes,no,no,yes,no,1,no,unfurnished
+ 4585000,3840,3,1,2,yes,no,yes,no,no,1,yes,furnished
+ 4550000,3760,3,1,1,yes,yes,yes,no,yes,2,yes,unfurnished
+ 4550000,3640,3,1,2,yes,no,no,yes,no,0,yes,unfurnished
+ 4550000,2550,3,1,2,yes,yes,no,no,no,0,yes,unfurnished
+ 4550000,5320,3,1,2,yes,yes,yes,no,yes,0,yes,furnished
+ 4550000,5360,3,1,2,yes,no,no,no,no,2,yes,furnished
+ 4550000,3520,3,1,1,yes,yes,no,yes,no,0,yes,semi-furnished
+ 4550000,8400,4,1,4,yes,yes,yes,no,yes,3,yes,unfurnished
+ 4543000,4100,2,2,1,yes,yes,no,no,yes,0,yes,semi-furnished
+ 4543000,4990,4,2,2,yes,yes,yes,no,yes,0,yes,semi-furnished
+ 4515000,3510,3,1,3,yes,no,yes,yes,no,0,no,furnished
+ 4515000,3450,3,1,2,yes,yes,yes,no,no,1,no,furnished
+ 4515000,9860,3,1,1,yes,no,yes,yes,no,0,no,semi-furnished
+ 4515000,3520,2,1,2,yes,yes,yes,yes,yes,0,yes,unfurnished
+ 4480000,4510,4,1,2,yes,yes,yes,yes,yes,2,no,furnished
+ 4480000,5885,2,1,1,yes,yes,yes,yes,no,1,no,unfurnished
+ 4480000,4000,3,1,2,yes,yes,no,yes,no,2,no,unfurnished
+ 4480000,8250,3,1,1,yes,no,yes,yes,yes,0,yes,semi-furnished
+ 4480000,4040,3,1,2,yes,no,yes,no,yes,1,yes,semi-furnished
+ 4473000,6360,2,1,1,yes,yes,no,yes,yes,1,yes,semi-furnished
+ 4473000,3162,3,1,2,yes,yes,no,no,yes,1,yes,semi-furnished
+ 4473000,3510,3,1,2,yes,no,no,no,no,0,no,furnished
+ 4445000,3750,2,1,1,yes,yes,yes,yes,yes,0,no,furnished
+ 4410000,3968,3,1,2,no,yes,no,no,no,0,no,semi-furnished
+ 4410000,4900,2,1,2,yes,yes,yes,no,no,0,no,furnished
+ 4403000,2880,3,1,2,yes,no,yes,no,no,0,yes,semi-furnished
+ 4403000,4880,3,1,1,yes,no,yes,no,yes,2,yes,unfurnished
+ 4403000,4920,3,1,2,yes,no,yes,yes,yes,1,yes,unfurnished
+ 4382000,4950,4,1,2,yes,yes,no,no,yes,0,no,unfurnished
+ 4375000,3900,3,1,2,yes,yes,no,yes,yes,0,yes,semi-furnished
+ 4340000,4500,3,2,3,yes,no,yes,no,no,1,yes,unfurnished
+ 4340000,1905,5,1,2,no,no,yes,no,yes,0,no,furnished
+ 4340000,4075,3,1,1,yes,no,yes,yes,yes,2,no,unfurnished
+ 4340000,3500,4,1,2,yes,no,no,no,no,2,yes,unfurnished
+ 4340000,6450,4,1,2,yes,yes,yes,no,yes,0,yes,unfurnished
+ 4319000,4032,2,1,1,yes,no,yes,no,no,0,yes,furnished
+ 4305000,4400,2,1,1,yes,yes,no,yes,no,1,yes,semi-furnished
+ 4305000,10360,2,1,1,yes,no,no,no,no,1,yes,furnished
+ 4277000,3400,3,1,2,yes,yes,yes,yes,no,2,yes,semi-furnished
+ 4270000,6360,2,1,1,yes,no,no,no,no,0,no,semi-furnished
+ 4270000,6360,2,1,2,yes,no,no,yes,no,0,no,unfurnished
+ 4270000,4500,2,1,1,yes,no,yes,yes,no,2,no,semi-furnished
+ 4270000,2175,3,1,2,no,no,no,no,no,0,yes,furnished
+ 4270000,4360,4,1,2,yes,yes,yes,no,yes,0,no,furnished
+ 4270000,7770,2,1,1,yes,no,no,yes,yes,1,yes,semi-furnished
+ 4235000,6650,3,1,2,yes,yes,yes,yes,yes,0,yes,unfurnished
+ 4235000,2787,3,1,1,yes,yes,yes,no,yes,0,no,unfurnished
+ 4200000,5500,3,1,2,yes,yes,yes,yes,no,0,yes,furnished
+ 4200000,5040,3,1,2,yes,yes,no,no,yes,0,yes,unfurnished
+ 4200000,5850,2,1,1,yes,no,yes,no,no,2,no,semi-furnished
+ 4200000,2610,4,3,2,no,yes,no,no,yes,0,no,semi-furnished
+ 4200000,2953,3,1,2,yes,yes,yes,no,yes,0,no,furnished
+ 4200000,2747,4,2,2,no,yes,yes,yes,yes,0,yes,semi-furnished
+ 4200000,4410,2,1,1,no,no,no,no,no,1,no,furnished
+ 4200000,4000,4,2,2,no,no,yes,yes,yes,0,no,semi-furnished
+ 4200000,2325,3,1,2,no,no,yes,no,no,0,yes,furnished
+ 4200000,4600,3,2,2,yes,no,no,yes,no,1,no,unfurnished
+ 4200000,3640,3,2,2,yes,yes,yes,no,yes,0,no,semi-furnished
+ 4200000,5800,3,1,1,yes,yes,yes,yes,yes,2,no,semi-furnished
+ 4200000,7000,3,1,1,yes,no,yes,yes,yes,3,yes,furnished
+ 4200000,4079,3,1,3,yes,yes,yes,no,no,0,yes,furnished
+ 4200000,3520,3,1,2,yes,yes,no,no,yes,0,yes,semi-furnished
+ 4200000,2145,3,1,3,yes,yes,no,no,no,1,yes,furnished
+ 4200000,4500,3,1,1,yes,no,yes,no,no,0,no,semi-furnished
+ 4193000,8250,3,1,1,yes,no,no,yes,yes,3,yes,unfurnished
+ 4193000,3450,3,1,2,yes,yes,yes,no,yes,1,yes,furnished
+ 4165000,4840,3,1,2,yes,no,no,yes,no,1,no,semi-furnished
+ 4165000,4080,3,1,2,yes,no,yes,yes,no,2,no,unfurnished
+ 4165000,4046,3,1,2,yes,no,yes,no,no,1,no,unfurnished
+ 4130000,4632,4,1,2,yes,no,yes,yes,yes,0,yes,semi-furnished
+ 4130000,5985,3,1,1,yes,no,yes,yes,no,0,yes,furnished
+ 4123000,6060,2,1,1,yes,yes,yes,yes,yes,1,yes,semi-furnished
+ 4098500,3600,3,1,1,yes,no,yes,yes,no,0,no,unfurnished
+ 4095000,3680,3,2,2,yes,yes,no,no,no,0,no,semi-furnished
+ 4095000,4040,2,1,2,yes,yes,yes,no,yes,1,yes,furnished
+ 4095000,5600,2,1,1,yes,no,no,no,no,0,no,unfurnished
+ 4060000,5900,4,2,2,no,yes,yes,no,yes,1,no,semi-furnished
+ 4060000,4992,3,2,2,yes,yes,no,yes,yes,2,no,furnished
+ 4060000,4340,3,1,1,yes,yes,no,yes,no,0,yes,unfurnished
+ 4060000,3000,4,1,3,yes,no,no,no,yes,2,no,furnished
+ 4060000,4320,3,1,2,yes,no,yes,no,yes,2,no,unfurnished
+ 4025000,3630,3,2,2,yes,yes,yes,no,no,2,yes,semi-furnished
+ 4025000,3460,3,2,1,yes,no,no,yes,yes,1,yes,furnished
+ 4025000,5400,3,1,1,yes,no,yes,no,yes,3,no,unfurnished
+ 4007500,4500,3,1,2,no,no,no,no,yes,0,no,furnished
+ 4007500,3460,4,1,2,yes,no,yes,no,no,0,yes,furnished
+ 3990000,4100,4,1,1,no,yes,no,yes,yes,0,yes,semi-furnished
+ 3990000,6480,3,1,2,no,yes,no,yes,no,1,no,furnished
+ 3990000,4500,3,2,2,no,no,yes,yes,no,0,no,unfurnished
+ 3990000,3960,3,1,2,yes,no,no,yes,yes,0,no,unfurnished
+ 3990000,4050,2,1,2,yes,yes,yes,yes,yes,0,no,unfurnished
+ 3920000,7260,3,2,1,yes,yes,no,yes,yes,3,yes,unfurnished
+ 3920000,5500,4,1,2,yes,no,no,yes,no,0,no,semi-furnished
+ 3920000,3000,3,1,2,yes,yes,yes,yes,yes,0,no,furnished
+ 3920000,3290,2,1,1,yes,no,yes,no,yes,1,yes,semi-furnished
+ 3920000,3816,2,1,1,yes,no,yes,no,yes,2,no,furnished
+ 3920000,8080,3,1,1,yes,yes,no,yes,yes,2,yes,furnished
+ 3920000,2145,4,2,1,yes,yes,yes,yes,yes,0,no,furnished
+ 3885000,3780,2,1,2,yes,no,no,no,no,0,no,semi-furnished
+ 3885000,3180,4,2,2,yes,no,no,yes,no,0,yes,furnished
+ 3850000,5300,5,2,2,yes,yes,yes,yes,yes,0,yes,semi-furnished
+ 3850000,3180,2,2,1,yes,no,yes,yes,yes,2,no,unfurnished
+ 3850000,7152,3,1,2,yes,yes,no,no,yes,0,no,unfurnished
+ 3850000,4080,2,1,1,yes,no,no,no,yes,0,no,furnished
+ 3850000,3850,2,1,1,yes,yes,yes,no,no,0,yes,semi-furnished
+ 3850000,2015,3,1,2,yes,yes,yes,no,yes,0,no,unfurnished
+ 3850000,2176,2,1,2,yes,yes,yes,no,no,0,yes,semi-furnished
+ 3836000,3350,3,1,2,yes,no,yes,yes,no,0,yes,semi-furnished
+ 3815000,3150,2,2,1,no,no,yes,no,yes,0,no,furnished
+ 3780000,4820,3,1,2,yes,yes,no,yes,no,0,yes,unfurnished
+ 3780000,3420,2,1,2,yes,no,no,yes,no,1,no,furnished
+ 3780000,3600,2,1,1,yes,yes,no,no,no,0,no,furnished
+ 3780000,5830,2,1,1,yes,yes,no,no,no,2,no,unfurnished
+ 3780000,2856,3,1,3,yes,no,yes,yes,yes,0,no,unfurnished
+ 3780000,8400,2,1,1,yes,no,yes,yes,yes,1,no,semi-furnished
+ 3773000,8250,3,1,1,yes,yes,yes,yes,no,2,yes,unfurnished
+ 3773000,2520,5,2,1,no,no,no,no,yes,1,yes,semi-furnished
+ 3773000,6930,4,1,2,no,no,no,no,yes,1,no,semi-furnished
+ 3745000,3480,2,1,1,yes,yes,yes,no,yes,0,no,semi-furnished
+ 3710000,3600,3,1,1,yes,yes,yes,no,yes,1,yes,unfurnished
+ 3710000,4040,2,1,1,yes,no,no,no,yes,0,yes,furnished
+ 3710000,6020,3,1,1,yes,yes,no,yes,no,0,yes,semi-furnished
+ 3710000,4050,2,1,1,yes,yes,yes,no,yes,0,yes,furnished
+ 3710000,3584,2,1,1,yes,no,yes,yes,yes,0,no,furnished
+ 3703000,3120,3,1,2,no,yes,no,yes,yes,0,no,unfurnished
+ 3703000,5450,2,1,1,yes,yes,yes,yes,yes,0,yes,furnished
+ 3675000,3630,2,1,1,yes,no,yes,no,no,0,no,furnished
+ 3675000,3630,2,1,1,yes,no,yes,no,no,0,yes,unfurnished
+ 3675000,5640,2,1,1,no,yes,no,no,no,0,no,semi-furnished
+ 3675000,3600,2,1,1,yes,yes,no,yes,no,0,yes,furnished
+ 3640000,4280,2,1,1,yes,no,no,yes,yes,2,no,unfurnished
+ 3640000,3570,3,1,2,yes,no,no,yes,no,0,yes,furnished
+ 3640000,3180,3,1,2,no,no,no,yes,yes,0,yes,furnished
+ 3640000,3000,2,1,2,yes,no,no,yes,yes,0,no,semi-furnished
+ 3640000,3520,2,2,1,yes,yes,no,yes,yes,0,yes,furnished
+ 3640000,5960,3,1,2,yes,no,yes,no,yes,0,no,unfurnished
+ 3640000,4130,3,2,2,yes,no,yes,no,yes,2,no,furnished
+ 3640000,2850,3,2,2,no,yes,yes,no,no,0,yes,semi-furnished
+ 3640000,2275,3,1,3,yes,no,yes,no,no,0,no,semi-furnished
+ 3633000,3520,3,1,1,yes,yes,yes,yes,no,2,no,unfurnished
+ 3605000,4500,2,1,1,yes,no,yes,yes,no,0,no,semi-furnished
+ 3605000,4000,2,1,1,yes,no,yes,no,yes,0,no,furnished
+ 3570000,3150,3,1,2,yes,no,no,yes,no,0,no,semi-furnished
+ 3570000,4500,4,2,2,yes,yes,no,no,yes,2,yes,unfurnished
+ 3570000,4500,2,1,1,no,yes,yes,no,no,0,no,semi-furnished
+ 3570000,3640,2,1,1,yes,no,yes,no,yes,0,no,semi-furnished
+ 3535000,3850,3,1,1,yes,yes,no,yes,yes,2,no,semi-furnished
+ 3500000,4240,3,1,2,yes,yes,yes,no,no,0,no,unfurnished
+ 3500000,3650,3,1,2,yes,yes,yes,no,yes,0,no,semi-furnished
+ 3500000,4600,4,1,2,yes,yes,no,no,yes,0,yes,semi-furnished
+ 3500000,2135,3,2,2,no,yes,yes,yes,no,0,no,furnished
+ 3500000,3036,3,1,2,yes,yes,no,yes,yes,0,yes,unfurnished
+ 3500000,3990,3,1,2,yes,no,yes,no,no,0,yes,semi-furnished
+ 3500000,7424,3,1,1,no,no,no,yes,no,0,yes,unfurnished
+ 3500000,3480,3,1,1,no,yes,no,yes,no,0,yes,semi-furnished
+ 3500000,3600,6,1,2,yes,no,no,yes,yes,1,no,semi-furnished
+ 3500000,3640,2,1,1,yes,no,yes,yes,yes,1,yes,semi-furnished
+ 3500000,5900,2,1,1,yes,no,no,yes,yes,1,yes,furnished
+ 3500000,3120,3,1,2,yes,no,yes,yes,no,1,no,furnished
+ 3500000,7350,2,1,1,yes,yes,yes,yes,yes,1,no,furnished
+ 3500000,3512,2,1,1,yes,no,yes,yes,no,1,no,unfurnished
+ 3500000,9500,3,1,2,yes,no,yes,yes,no,3,yes,unfurnished
+ 3500000,5880,2,1,1,yes,yes,no,yes,yes,0,no,furnished
+ 3500000,12944,3,1,1,yes,yes,no,no,no,0,no,unfurnished
+ 3493000,4900,3,1,2,no,no,yes,no,yes,0,yes,semi-furnished
+ 3465000,3060,3,1,1,yes,no,yes,yes,yes,0,no,semi-furnished
+ 3465000,5320,2,1,1,yes,no,no,no,yes,1,no,semi-furnished
+ 3465000,2145,3,1,3,yes,no,yes,no,yes,0,no,furnished
+ 3430000,4000,2,1,1,yes,no,no,yes,yes,0,no,unfurnished
+ 3430000,3185,2,1,1,yes,no,yes,yes,no,2,yes,furnished
+ 3430000,3850,3,1,1,yes,yes,yes,yes,no,0,no,unfurnished
+ 3430000,2145,3,1,3,yes,no,no,no,no,0,no,semi-furnished
+ 3430000,2610,3,1,2,yes,no,yes,no,yes,0,yes,furnished
+ 3430000,1950,3,2,2,yes,yes,no,yes,yes,0,no,unfurnished
+ 3423000,4040,2,1,1,yes,yes,yes,yes,yes,0,yes,unfurnished
+ 3395000,4785,3,1,2,yes,yes,yes,yes,no,1,no,semi-furnished
+ 3395000,3450,3,1,1,yes,yes,yes,no,yes,2,yes,furnished
+ 3395000,3640,2,1,1,yes,no,no,no,no,0,yes,furnished
+ 3360000,3500,4,1,2,yes,no,yes,no,no,2,yes,furnished
+ 3360000,4960,4,1,3,no,no,yes,no,yes,0,yes,semi-furnished
+ 3360000,4120,2,1,2,yes,no,yes,yes,yes,0,no,furnished
+ 3360000,4750,2,1,1,yes,no,yes,yes,no,0,yes,semi-furnished
+ 3360000,3720,2,1,1,no,no,no,yes,yes,0,no,semi-furnished
+ 3360000,3750,3,1,1,yes,yes,no,yes,no,0,yes,semi-furnished
+ 3360000,3100,3,1,2,no,no,yes,no,yes,0,yes,unfurnished
+ 3360000,3185,2,1,1,yes,yes,no,no,yes,2,yes,furnished
+ 3353000,2700,3,1,1,no,yes,yes,yes,no,0,no,furnished
+ 3332000,2145,3,1,2,yes,no,yes,yes,yes,0,yes,semi-furnished
+ 3325000,4040,2,1,1,yes,yes,yes,no,no,1,no,semi-furnished
+ 3325000,4775,4,1,2,yes,no,no,yes,yes,0,no,furnished
+ 3290000,2500,2,1,1,no,yes,no,no,no,0,no,semi-furnished
+ 3290000,3180,4,1,2,yes,no,no,yes,no,0,no,furnished
+ 3290000,6060,3,1,1,yes,yes,no,yes,yes,0,no,unfurnished
+ 3290000,3480,4,1,2,no,no,no,yes,no,1,no,semi-furnished
+ 3290000,3792,4,1,2,yes,no,no,yes,yes,0,no,furnished
+ 3290000,4040,2,1,1,yes,no,no,yes,yes,0,no,semi-furnished
+ 3290000,2145,3,1,2,yes,no,no,yes,yes,0,no,furnished
+ 3290000,5880,3,1,1,yes,no,no,yes,no,1,yes,furnished
+ 3255000,4500,2,1,1,no,yes,yes,no,no,0,yes,semi-furnished
+ 3255000,3930,2,1,1,no,yes,no,no,yes,0,yes,unfurnished
+ 3234000,3640,4,1,2,yes,yes,no,no,yes,0,yes,furnished
+ 3220000,4370,3,1,2,yes,no,no,no,no,0,yes,furnished
+ 3220000,2684,2,1,1,yes,yes,no,no,yes,1,no,furnished
+ 3220000,4320,3,1,1,no,no,no,yes,yes,1,no,unfurnished
+ 3220000,3120,3,1,2,no,no,no,yes,no,0,no,unfurnished
+ 3150000,3450,1,1,1,yes,yes,no,no,yes,0,no,semi-furnished
+ 3150000,3986,2,2,1,no,yes,no,yes,no,1,no,unfurnished
+ 3150000,3500,2,1,1,no,no,no,no,yes,0,no,semi-furnished
+ 3150000,4095,2,1,1,yes,yes,no,no,no,2,no,semi-furnished
+ 3150000,1650,3,1,2,no,no,no,yes,yes,0,yes,unfurnished
+ 3150000,3450,3,1,2,yes,no,yes,yes,yes,0,yes,semi-furnished
+ 3150000,6750,2,1,1,yes,no,yes,yes,no,0,yes,unfurnished
+ 3150000,9000,3,1,2,yes,no,no,no,yes,2,yes,semi-furnished
+ 3150000,3069,2,1,1,yes,no,no,yes,no,1,no,unfurnished
+ 3143000,4500,3,1,2,yes,yes,yes,no,yes,0,no,unfurnished
+ 3129000,5495,3,1,1,yes,yes,no,no,no,0,no,semi-furnished
+ 3118850,2398,3,1,1,yes,no,yes,no,no,0,yes,furnished
+ 3115000,3000,3,1,1,no,no,yes,yes,no,0,no,semi-furnished
+ 3115000,3850,3,1,2,yes,yes,yes,yes,no,0,no,unfurnished
+ 3115000,3500,2,1,1,yes,yes,yes,no,yes,0,no,furnished
+ 3087000,8100,2,1,1,yes,no,yes,yes,yes,1,yes,semi-furnished
+ 3080000,4960,2,1,1,yes,yes,no,yes,no,0,no,semi-furnished
+ 3080000,2160,3,1,2,no,no,yes,yes,no,0,yes,semi-furnished
+ 3080000,3090,2,1,1,yes,yes,no,no,no,0,yes,furnished
+ 3080000,4500,2,1,2,yes,no,no,no,yes,1,yes,unfurnished
+ 3045000,3800,2,1,1,yes,yes,yes,yes,yes,0,yes,semi-furnished
+ 3010000,3090,3,1,2,no,no,yes,no,yes,0,no,semi-furnished
+ 3010000,3240,3,1,2,yes,yes,no,yes,yes,2,yes,semi-furnished
+ 3010000,2835,2,1,1,yes,yes,yes,yes,no,0,yes,semi-furnished
+ 3010000,4600,2,1,1,yes,no,yes,yes,no,0,no,semi-furnished
+ 3010000,5076,3,1,1,no,yes,no,yes,no,0,no,unfurnished
+ 3010000,3750,3,1,2,yes,no,no,no,yes,0,yes,unfurnished
+ 3010000,3630,4,1,2,yes,yes,yes,no,yes,3,no,semi-furnished
+ 3003000,8050,2,1,1,yes,no,no,no,yes,0,yes,furnished
+ 2975000,4352,4,1,2,no,yes,yes,yes,no,1,yes,unfurnished
+ 2961000,3000,2,1,2,yes,no,yes,yes,no,0,yes,furnished
+ 2940000,5850,3,1,2,yes,yes,yes,yes,yes,1,yes,unfurnished
+ 2940000,4960,2,1,1,yes,no,no,no,no,0,no,semi-furnished
+ 2940000,3600,3,1,2,no,yes,no,yes,yes,1,yes,unfurnished
+ 2940000,3660,4,1,2,no,no,no,no,yes,0,no,semi-furnished
+ 2940000,3480,3,1,2,no,no,no,no,no,1,no,unfurnished
+ 2940000,2700,2,1,1,no,no,no,no,yes,0,no,semi-furnished
+ 2940000,3150,3,1,2,no,yes,yes,no,no,0,no,semi-furnished
+ 2940000,6615,3,1,2,yes,no,yes,no,yes,0,yes,semi-furnished
+ 2870000,3040,2,1,1,no,yes,yes,no,no,0,no,semi-furnished
+ 2870000,3630,2,1,1,yes,no,yes,yes,no,0,yes,furnished
+ 2870000,6000,2,1,1,yes,yes,no,yes,no,0,no,semi-furnished
+ 2870000,5400,4,1,2,yes,no,yes,no,yes,0,no,furnished
+ 2852500,5200,4,1,3,yes,yes,no,no,yes,0,yes,furnished
+ 2835000,3300,3,1,2,no,yes,yes,yes,no,1,yes,semi-furnished
+ 2835000,4350,3,1,2,no,no,no,no,no,1,yes,furnished
+ 2835000,2640,2,1,1,no,yes,no,no,yes,1,no,unfurnished
+ 2800000,2650,3,1,2,yes,yes,yes,yes,yes,1,no,semi-furnished
+ 2800000,3960,3,1,1,yes,no,yes,no,yes,0,yes,furnished
+ 2730000,6800,2,1,1,yes,yes,yes,no,no,0,yes,semi-furnished
+ 2730000,4000,3,1,2,yes,yes,yes,no,yes,1,yes,semi-furnished
+ 2695000,4000,2,1,1,yes,yes,no,yes,yes,0,yes,semi-furnished
+ 2660000,3934,2,1,1,yes,no,yes,yes,no,0,no,furnished
+ 2660000,2000,2,1,2,yes,yes,yes,yes,yes,0,yes,unfurnished
+ 2660000,3630,3,3,2,no,no,no,no,yes,0,yes,unfurnished
+ 2660000,2800,3,1,1,yes,yes,yes,no,yes,0,yes,furnished
+ 2660000,2430,3,1,1,no,yes,yes,yes,no,0,yes,semi-furnished
+ 2660000,3480,2,1,1,yes,yes,yes,no,no,1,no,furnished
+ 2660000,4000,3,1,1,yes,no,yes,no,no,0,yes,furnished
+ 2653000,3185,2,1,1,yes,yes,yes,no,no,0,no,furnished
+ 2653000,4000,3,1,2,yes,yes,no,no,no,0,yes,semi-furnished
+ 2604000,2910,2,1,1,no,no,yes,no,no,0,yes,furnished
+ 2590000,3600,2,1,1,yes,yes,yes,yes,no,0,no,semi-furnished
+ 2590000,4400,2,1,1,yes,no,yes,no,yes,0,yes,furnished
+ 2590000,3600,2,2,2,yes,no,yes,yes,yes,1,no,unfurnished
+ 2520000,2880,3,1,1,no,yes,yes,no,no,0,no,furnished
+ 2520000,3180,3,1,1,no,no,no,yes,yes,0,yes,furnished
+ 2520000,3000,2,1,2,yes,yes,yes,no,yes,0,no,unfurnished
+ 2485000,4400,3,1,2,yes,no,no,yes,no,0,no,semi-furnished
+ 2485000,3000,3,1,2,no,no,no,no,yes,0,no,semi-furnished
+ 2450000,3210,3,1,2,yes,yes,yes,yes,yes,0,no,furnished
+ 2450000,3240,2,1,1,no,no,yes,no,yes,1,no,unfurnished
+ 2450000,3000,2,1,1,yes,no,yes,yes,yes,1,no,semi-furnished
+ 2450000,3500,2,1,1,yes,no,yes,no,yes,0,no,furnished
+ 2450000,4840,2,1,2,yes,no,no,yes,no,0,no,unfurnished
+ 2450000,7700,2,1,1,yes,no,yes,yes,no,0,yes,semi-furnished
+ 2408000,3635,2,1,1,no,yes,yes,no,no,0,yes,semi-furnished
+ 2380000,2475,3,1,2,yes,no,yes,no,yes,0,no,unfurnished
+ 2380000,2787,4,2,2,yes,yes,yes,yes,no,0,yes,unfurnished
+ 2380000,3264,2,1,1,yes,no,yes,no,yes,0,no,unfurnished
+ 2345000,3640,2,1,1,yes,yes,yes,yes,yes,0,no,unfurnished
+ 2310000,3180,2,1,1,yes,yes,no,yes,yes,0,yes,furnished
+ 2275000,1836,2,1,1,no,no,no,yes,yes,0,no,unfurnished
+ 2275000,3970,1,1,1,no,no,yes,no,no,0,no,semi-furnished
+ 2275000,3970,3,1,2,yes,no,yes,yes,no,0,no,semi-furnished
+ 2240000,1950,3,1,1,no,yes,yes,yes,yes,0,no,furnished
+ 2233000,5300,3,1,1,no,no,no,no,no,0,yes,semi-furnished
+ 2135000,3000,2,1,1,no,yes,yes,no,yes,0,no,semi-furnished
+ 2100000,2400,3,1,2,yes,yes,no,no,no,0,no,unfurnished
+ 2100000,3000,4,1,2,yes,yes,yes,no,no,0,no,semi-furnished
+ 2100000,3360,2,1,1,yes,no,yes,yes,yes,1,yes,furnished
+ 1960000,3420,5,1,2,no,no,yes,yes,no,0,no,unfurnished
+ 1890000,1700,3,1,2,yes,yes,yes,yes,yes,0,yes,unfurnished
+ 1890000,3649,2,1,1,yes,yes,no,yes,no,0,yes,furnished
+ 1855000,2990,2,1,1,no,no,no,no,yes,1,no,unfurnished
+ 1820000,3000,2,1,1,yes,yes,yes,yes,yes,2,no,unfurnished
+ 1767150,2400,3,1,1,no,no,yes,yes,no,0,yes,furnished
+ 1750000,3620,2,1,1,yes,yes,no,no,no,0,no,unfurnished
+ 1750000,2910,3,1,1,no,no,no,yes,yes,0,yes,furnished
+ 1750000,3850,3,1,2,yes,yes,no,yes,no,0,yes,furnished
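As a quick sanity check, the dataset loads directly with pandas (an illustrative sketch, not code from this commit):

import pandas as pd

df = pd.read_csv("Housing.csv")
print(df.shape)                # expected (545, 13): 545 listings, 13 columns
print(df.dtypes)               # price/area/bedrooms/... numeric; mainroad/basement/... yes-no strings
print(df["price"].describe())  # summary statistics of the target column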
Procfile ADDED
@@ -0,0 +1 @@
+ web: (python scripts/init_production_db.py || echo "DB init failed") && (python scripts/populate_agent_templates.py || echo "Template init failed") && uvicorn app:app --host 0.0.0.0 --port $PORT
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Auto Analyst Backend
+ emoji: 🦀
+ colorFrom: green
+ colorTo: indigo
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
agents_config.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "templates": [
3
+ {
4
+ "template_name": "preprocessing_agent",
5
+ "display_name": "Data Preprocessing Agent",
6
+ "description": "Cleans and prepares a DataFrame using Pandas and NumPy—handles missing values, detects column types, and converts date strings to datetime",
7
+ "icon_url": "/icons/templates/preprocessing_agent.svg",
8
+ "category": "Data Manipulation",
9
+ "is_premium_only": false,
10
+ "variant_type": "individual",
11
+ "base_agent": "preprocessing_agent",
12
+ "is_active": true,
13
+ "prompt_template": "You are a AI data-preprocessing agent. Generate clean and efficient Python code using NumPy and Pandas to perform introductory data preprocessing on a pre-loaded DataFrame df, based on the user's analysis goals.\nPreprocessing Requirements:\n1. Identify Column Types\n- Separate columns into numeric and categorical using:\n categorical_columns = df.select_dtypes(include=[object, 'category']).columns.tolist()\n numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()\n2. Handle Missing Values\n- Numeric columns: Impute missing values using the mean of each column\n- Categorical columns: Impute missing values using the mode of each column\n3. Convert Date Strings to Datetime\n- For any column suspected to represent dates (in string format), convert it to datetime using:\n def safe_to_datetime(date):\n try:\n return pd.to_datetime(date, errors='coerce', cache=False)\n except (ValueError, TypeError):\n return pd.NaT\n df['datetime_column'] = df['datetime_column'].apply(safe_to_datetime)\n- Replace 'datetime_column' with the actual column names containing date-like strings\nImportant Notes:\n- Do NOT create a correlation matrix — correlation analysis is outside the scope of preprocessing\n- Do NOT generate any plots or visualizations\nOutput Instructions:\n1. Include the full preprocessing Python code\n2. Provide a brief bullet-point summary of the steps performed. Example:\n• Identified 5 numeric and 4 categorical columns\n• Filled missing numeric values with column means\n• Filled missing categorical values with column modes\n• Converted 1 date column to datetime format\n Respond in the user's language for all summary and reasoning but keep the code in english"
14
+ },
15
+ {
16
+ "template_name": "planner_preprocessing_agent",
17
+ "display_name": "Data Preprocessing Agent",
18
+ "description": "Multi-agent planner variant: Cleans and prepares a DataFrame using Pandas and NumPy—handles missing values, detects column types, and converts date strings to datetime",
19
+ "icon_url": "/icons/templates/preprocessing_agent.svg",
20
+ "category": "Data Manipulation",
21
+ "is_premium_only": false,
22
+ "variant_type": "planner",
23
+ "base_agent": "preprocessing_agent",
24
+ "is_active": true,
25
+ "prompt_template": "You are a data preprocessing agent optimized for multi-agent data analytics pipelines.\n\nYou are given:\n* A raw dataset (often just uploaded or loaded).\n* A user-defined goal (e.g., clean data for analysis, prepare for modeling).\n***plan_instructions** containing:\n ***'create'**: Variables you must create (e.g., ['df_cleaned', 'preprocessing_summary', 'column_types'])\n ***'use'**: Variables you must use (e.g., ['df', 'raw_data'])\n * **'instruction'**: Specific preprocessing instructions\n\n### Your Planner-Optimized Responsibilities:\n* **ALWAYS follow plan_instructions** - essential for pipeline data flow\n* Create ONLY the variables specified in plan_instructions['create']\n* Use ONLY the variables specified in plan_instructions['use']\n* Apply preprocessing as per plan_instructions['instruction']\n* Ensure cleaned data integrates seamlessly with downstream agents\n\n### Core Preprocessing Techniques:\n* Identify and categorize column types (numeric, categorical, datetime)\n* Handle missing values appropriately:\n - Numeric: impute with mean, median, or specified strategy\n - Categorical: impute with mode or specified strategy\n* Convert date strings to datetime format with proper error handling\n* Remove duplicates and handle data quality issues\n* Apply data type optimizations for memory efficiency\n* Create preprocessing summaries for pipeline transparency\n\n### Multi-Agent Best Practices:\n* Use exact variable names from plan_instructions['create']\n* Ensure data format compatibility for downstream agents\n* Maintain data integrity and schema consistency\n* Document preprocessing steps for pipeline reproducibility\n\n### Output:\n* Python code implementing preprocessing per plan_instructions\n* Summary of data cleaning and transformation operations\n* Focus on seamless integration with analysis and modeling agents\n\nRespond in the user's language for all summary and reasoning but keep the code in english"
26
+ },
27
+ {
28
+ "template_name": "statistical_analytics_agent",
29
+ "display_name": "Statistical Analytics Agent",
30
+ "description": "Performs statistical analysis (e.g., regression, seasonal decomposition) using statsmodels, with proper handling of categorical data and missing values",
31
+ "icon_url": "/icons/templates/statsmodel.svg",
32
+ "category": "Data Modelling",
33
+ "is_premium_only": false,
34
+ "variant_type": "individual",
35
+ "base_agent": "statistical_analytics_agent",
36
+ "is_active": true,
37
+ "prompt_template": "You are a statistical analytics agent. Your task is to take a dataset and a user-defined goal and output Python code that performs the appropriate statistical analysis to achieve that goal. Follow these guidelines:\nIMPORTANT: You may be provided with previous interaction history. The section marked \"### Current Query:\" contains the user's current request. Any text in \"### Previous Interaction History:\" is for context only and is NOT part of the current request.\nData Handling:\nAlways handle strings as categorical variables in a regression using statsmodels C(string_column).\nDo not change the index of the DataFrame.\nConvert X and y into float when fitting a model.\nError Handling:\nAlways check for missing values and handle them appropriately.\nEnsure that categorical variables are correctly processed.\nProvide clear error messages if the model fitting fails.\nRegression:\nFor regression, use statsmodels and ensure that a constant term is added to the predictor using sm.add_constant(X).\nHandle categorical variables using C(column_name) in the model formula.\nFit the model with model = sm.OLS(y.astype(float), X.astype(float)).fit().\nSeasonal Decomposition:\nEnsure the period is set correctly when performing seasonal decomposition.\nVerify the number of observations works for the decomposition.\nOutput:\nEnsure the code is executable and as intended.\nAlso choose the correct type of model for the problem\nAvoid adding data visualization code.\nProvide a concise bullet-point summary of the statistical analysis performed.\n\nExample Summary:\n• Applied linear regression with OLS to predict house prices based on 5 features\n• Model achieved R-squared of 0.78\n• Significant predictors include square footage (p<0.001) and number of bathrooms (p<0.01)\n• Detected strong seasonal pattern with 12-month periodicity\n• Forecast shows 15% growth trend over next quarter\nRespond in the user's language for all summary and reasoning but keep the code in english"
38
+ },
39
+ {
40
+ "template_name": "planner_statistical_analytics_agent",
41
+ "display_name": "Statistical Analytics Agent",
42
+ "description": "Multi-agent planner variant: Performs statistical analysis (e.g., regression, seasonal decomposition) using statsmodels, with proper handling of categorical data and missing values",
43
+ "icon_url": "/icons/templates/statsmodel.svg",
44
+ "category": "Data Modelling",
45
+ "is_premium_only": false,
46
+ "variant_type": "planner",
47
+ "base_agent": "statistical_analytics_agent",
48
+ "is_active": true,
49
+ "prompt_template": "You are a statistical analytics agent optimized for multi-agent data analytics pipelines.\n\nYou are given:\n* A dataset (often preprocessed and cleaned).\n* A user-defined goal (e.g., regression analysis, time series analysis, hypothesis testing).\n* **plan_instructions** containing:\n * **'create'**: Variables you must create (e.g., ['regression_model', 'statistical_results', 'model_summary'])\n * **'use'**: Variables you must use (e.g., ['df_cleaned', 'target_variable', 'predictor_variables'])\n * **'instruction'**: Specific statistical analysis instructions\n\n### Your Planner-Optimized Responsibilities:\n* **ALWAYS follow plan_instructions** - essential for pipeline analytical workflow\n* Create ONLY the variables specified in plan_instructions['create']\n* Use ONLY the variables specified in plan_instructions['use']\n* Apply statistical analysis as per plan_instructions['instruction']\n* Ensure statistical outputs integrate seamlessly with downstream agents\n\n### Statistical Analysis Techniques:\n* Use statsmodels for regression analysis with proper categorical handling\n* Apply time series analysis including seasonal decomposition\n* Implement hypothesis testing and statistical significance testing\n* Handle missing values and data quality issues appropriately\n* Use proper model specification with categorical variables: C(column_name)\n* Add constant terms for regression: sm.add_constant(X)\n* Ensure data types are appropriate: convert to float for modeling\n\n### Multi-Agent Best Practices:\n* Use exact variable names from plan_instructions['create']\n* Ensure statistical model objects are accessible to downstream agents\n* Maintain statistical rigor and proper model diagnostics\n* Focus on interpretable results for decision-making agents\n\n### Output:\n* Python code implementing statistical analysis per plan_instructions\n* Summary of statistical findings and model performance\n* Focus on robust statistical inference for pipeline decision-making\n\nRespond in the user's language for all summary and reasoning but keep the code in english"
50
+ },
51
+ {
52
+ "template_name": "data_viz_agent",
53
+ "display_name": "Data Visualization Agent",
54
+ "description": "Creates interactive data visualizations using Plotly with advanced styling and formatting options",
55
+ "icon_url": "/icons/templates/plotly.svg",
56
+ "category": "Data Visualization",
57
+ "is_premium_only": false,
58
+ "variant_type": "individual",
59
+ "base_agent": "data_viz_agent",
60
+ "is_active": true,
61
+ "prompt_template": "You are an AI agent responsible for generating interactive data visualizations using Plotly.\nIMPORTANT Instructions:\n- The section marked \"### Current Query:\" contains the user's request. Any text in \"### Previous Interaction History:\" is for context only and should NOT be treated as part of the current request.\n- You must only use the tools provided to you. This agent handles visualization only.\n- If len(df) > 50000, always sample the dataset before visualization using: \nif len(df) > 50000: \n df = df.sample(50000, random_state=1)\n- Each visualization must be generated as a **separate figure** using go.Figure(). \nDo NOT use subplots under any circumstances.\n- Each figure must be returned individually using: \nfig.to_html(full_html=False)\n- Use update_layout with xaxis and yaxis **only once per figure**.\n- Enhance readability and clarity by: \n• Using low opacity (0.4-0.7) where appropriate \n• Applying visually distinct colors for different elements or categories \n- Make sure the visual **answers the user's specific goal**: \n• Identify what insight or comparison the user is trying to achieve \n• Choose the visualization type and features (e.g., color, size, grouping) to emphasize that goal \n• For example, if the user asks for \"trends in revenue,\" use a time series line chart; if they ask for \"top-performing categories,\" use a bar chart sorted by value \n• Prioritize highlighting patterns, outliers, or comparisons relevant to the question\n- Never include the dataset or styling index in the output.\n- If there are no relevant columns for the requested visualization, respond with: \n\"No relevant columns found to generate this visualization.\"\n- Use only one number format consistently: either 'K', 'M', or comma-separated values like 1,000/1,000,000. Do not mix formats.\n- Only include trendlines in scatter plots if the user explicitly asks for them.\n- Output only the code and a concise bullet-point summary of what the visualization reveals.\n- Always end each visualization with: \nfig.to_html(full_html=False)\nRespond in the user's language for all summary and reasoning but keep the code in english"
62
+ },
63
+ {
64
+ "template_name": "sk_learn_agent",
65
+ "display_name": "Machine Learning Agent",
66
+ "description": "Trains and evaluates machine learning models using scikit-learn, including classification, regression, and clustering with feature importance insights",
67
+ "icon_url": "/icons/templates/sk_learn_agent.svg",
68
+ "category": "Data Modelling",
69
+ "is_premium_only": false,
70
+ "variant_type": "individual",
71
+ "base_agent": "sk_learn_agent",
72
+ "is_active": true,
73
+ "prompt_template": "You are a machine learning agent. \nYour task is to take a dataset and a user-defined goal, and output Python code that performs the appropriate machine learning analysis to achieve that goal. \nYou should use the scikit-learn library.\nIMPORTANT: You may be provided with previous interaction history. The section marked \"### Current Query:\" contains the user's current request. Any text in \"### Previous Interaction History:\" is for context only and is NOT part of the current request.\nMake sure your output is as intended!\nProvide a concise bullet-point summary of the machine learning operations performed.\n\nExample Summary:\n• Trained a Random Forest classifier on customer churn data with 80/20 train-test split\n• Model achieved 92% accuracy and 88% F1-score\n• Feature importance analysis revealed that contract length and monthly charges are the strongest predictors of churn\n• Implemented K-means clustering (k=4) on customer shopping behaviors\n• Identified distinct segments: high-value frequent shoppers (22%), occasional big spenders (35%), budget-conscious regulars (28%), and rare visitors (15%)\nRespond in the user's language for all summary and reasoning but keep the code in english"
74
+ },
75
+ {
76
+ "template_name": "planner_data_viz_agent",
77
+ "display_name": "Data Visualization Agent",
78
+ "description": "Multi-agent planner variant: Creates interactive data visualizations using Plotly with advanced styling and formatting options",
79
+ "icon_url": "/icons/templates/plotly.svg",
80
+ "category": "Data Visualization",
81
+ "is_premium_only": false,
82
+ "variant_type": "planner",
83
+ "base_agent": "data_viz_agent",
84
+ "is_active": true,
85
+ "prompt_template": "### **Data Visualization Agent Definition**\nYou are the **data visualization agent** in a multi-agent analytics pipeline. Your primary responsibility is to **generate visualizations** based on the **user-defined goal** and the **plan instructions**.\nYou are provided with:\n* **goal**: A user-defined goal outlining the type of visualization the user wants (e.g., \"plot sales over time with trendline\").\n* **dataset**: The dataset (e.g., `df_cleaned`) which will be passed to you by other agents in the pipeline. **Do not assume or create any variables** — **the data is already present and valid** when you receive it.\n* **styling_index**: Specific styling instructions (e.g., axis formatting, color schemes) for the visualization.\n* **plan_instructions**: A dictionary containing:\n* **'create'**: List of **visualization components** you must generate (e.g., 'scatter_plot', 'bar_chart').\n* **'use'**: List of **variables you must use** to generate the visualizations. This includes datasets and any other variables provided by the other agents.\n* **'instructions'**: A list of additional instructions related to the creation of the visualizations, such as requests for trendlines or axis formats.\n---\n### **Responsibilities**:\n1. **Strict Use of Provided Variables**:\n* You must **never create fake data**. Only use the variables and datasets that are explicitly **provided** to you in the `plan_instructions['use']` section. All the required data **must already be available**.\n* If any variable listed in `plan_instructions['use']` is missing or invalid, **you must return an error** and not proceed with any visualization.\n2. **Visualization Creation**:\n* Based on the **'create'** section of the `plan_instructions`, generate the **required visualization** using **Plotly**. For example, if the goal is to plot a time series, you might generate a line chart.\n* Respect the **user-defined goal** in determining which type of visualization to create.\n3. **Performance Optimization**:\n* If the dataset contains **more than 50,000 rows**, you **must sample** the data to **5,000 rows** to improve performance.\n4. **Layout and Styling**:\n* Apply formatting and layout adjustments as defined by the **styling_index**.\n* You must ensure that all axes (x and y) have **consistent formats** (e.g., using `K`, `M`, or 1,000 format, but not mixing formats).\n5. **Trendlines**:\n* Trendlines should **only be included** if explicitly requested in the **'instructions'** section of `plan_instructions`.\n6. **Displaying the Visualization**:\n* Use Plotly's `fig.show()` method to display the created chart.\n* **Never** output raw datasets or the **goal** itself. Only the visualization code and the chart should be returned.\n7. **Error Handling**:\n* If the required dataset or variables are missing or invalid (i.e., not included in `plan_instructions['use']`), return an error message indicating which specific variable is missing or invalid.\n8. **No Data Modification**:\n* **Never** modify the provided dataset or generate new data. 
If the data needs preprocessing or cleaning, assume it's already been done by other agents.\n---\n### **Strict Conditions**:\n* You **never** create any data.\n* You **only** use the data and variables passed to you.\n* If any required data or variable is missing or invalid, **you must stop** and return a clear error message.\n* Respond in the user's language for all summary and reasoning but keep the code in english\n* it should be update_yaxes, update_xaxes, not axis\nBy following these conditions and responsibilities, your role is to ensure that the **visualizations** are generated as per the user goal, using the valid data and instructions given to you."
86
+ },
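For orientation, a minimal sketch of the `plan_instructions` payload this planner variant consumes; the keys mirror the prompt above, but the concrete values are illustrative assumptions, not taken from the repo:

```python
# Hypothetical plan_instructions for planner_data_viz_agent (values are examples only).
plan_instructions = {
    "create": ["scatter_plot"],           # visualization components to generate
    "use": ["df_cleaned"],                # variables supplied by upstream agents
    "instructions": ["add a trendline", "format the y-axis in K/M"],
}
```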
87
+ {
88
+ "template_name": "planner_sk_learn_agent",
89
+ "display_name": "Machine Learning Agent",
90
+ "description": "Multi-agent planner variant: Trains and evaluates machine learning models using scikit-learn, including classification, regression, and clustering with feature importance insights",
91
+ "icon_url": "/icons/templates/sk_learn_agent.svg",
92
+ "category": "Data Modelling",
93
+ "is_premium_only": false,
94
+ "variant_type": "planner",
95
+ "base_agent": "sk_learn_agent",
96
+ "is_active": true,
97
+ "prompt_template": "**Agent Definition:**\nYou are a machine learning agent in a multi-agent data analytics pipeline.\nYou are given:\n* A dataset (often cleaned and feature-engineered).\n* A user-defined goal (e.g., classification, regression, clustering).\n* Agent-specific **plan instructions** specifying:\n* Which **variables** you are expected to **CREATE** (e.g., `trained_model`, `predictions`).\n* Which **variables** you will **USE** (e.g., `df_cleaned`, `target_variable`, `feature_columns`).\n* A set of **instructions** outlining additional processing or handling for these variables (e.g., handling missing values, applying transformations, or other task-specific guidelines).\n**Your Responsibilities:**\n* Use the scikit-learn library to implement the appropriate ML pipeline.\n* Always split data into training and testing sets where applicable.\n* Use `print()` for all outputs.\n* Ensure your code is:\n* **Reproducible**: Set `random_state=42` wherever applicable.\n* **Modular**: Avoid deeply nested code.\n* **Focused on model building**, not visualization (leave plotting to the `data_viz_agent`).\n**You must not:**\n* Visualize anything (that's another agent's job).\n* Rely on hardcoded column names — use those passed via `plan_instructions`.\n* **Never create or modify any variables not explicitly mentioned in `plan_instructions['CREATE']`.**\n* **Never create the `df` variable**. You will **only** work with the variables passed via the `plan_instructions`.\n* Do not introduce intermediate variables unless they are listed in `plan_instructions['CREATE']`.\n**Instructions to Follow:**\n1. **CREATE** only the variables specified in the `plan_instructions['CREATE']` list. Do not create any intermediate or new variables.\n2. **USE** only the variables specified in the `plan_instructions['USE']` list. You are **not allowed** to create or modify any variables not listed in the plan instructions.\n3. Follow any **processing instructions** in the `plan_instructions['INSTRUCTIONS']` list. This might include tasks like handling missing values, scaling features, or encoding categorical variables. Always perform these steps on the variables specified in the `plan_instructions`.\n4. Do **not reassign or modify** any variables passed via `plan_instructions`. These should be used as-is.\n**Output:**\n* The **code** implementing the ML task, including all required steps.\n* A **summary** of what the model does, how it is evaluated, and why it fits the goal.\n* Respond in the user's language for all summary and reasoning but keep the code in english"
98
+ },
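As a rough illustration of the workflow this prompt prescribes (train/test split, `random_state=42`, all outputs via `print()`), here is a self-contained sketch; the variables `df_cleaned`, `feature_columns`, and `target_variable` stand in for what `plan_instructions` would supply:

```python
# Sketch only: a toy stand-in for the variables plan_instructions would provide.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df_cleaned = pd.DataFrame({"area": [1200, 800, 1500, 950, 2000, 700] * 5,
                           "is_expensive": [1, 0, 1, 0, 1, 0] * 5})
feature_columns, target_variable = ["area"], "is_expensive"

X, y = df_cleaned[feature_columns], df_cleaned[target_variable]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)    # reproducible split, as the prompt requires
trained_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
predictions = trained_model.predict(X_test)
print("accuracy:", accuracy_score(y_test, predictions))   # outputs via print()
```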
99
+ {
100
+ "template_name": "feature_engineering_agent",
101
+ "display_name": "Feature Engineering Agent",
102
+ "description": "Advanced feature creation and selection for machine learning pipelines using various encoding and transformation techniques",
103
+ "icon_url": "/icons/templates/feature-engineering.png",
104
+ "category": "Data Modelling",
105
+ "is_premium_only": true,
106
+ "variant_type": "individual",
107
+ "base_agent": "feature_engineering_agent",
108
+ "is_active": true,
109
+ "prompt_template": "You are a feature engineering expert for machine learning pipelines. Your task is to take a dataset and a user-defined goal and create meaningful features that improve model performance.\n\nIMPORTANT Instructions:\n- Create meaningful features from raw data based on the user's goal\n- Apply feature scaling, encoding, and transformation techniques\n- Handle categorical variables with appropriate encoding methods (one-hot, label, target encoding)\n- Create polynomial features, interactions, and domain-specific features when beneficial\n- Perform feature selection using statistical and ML methods\n- Handle time-series feature engineering when applicable (lag features, rolling statistics)\n- Ensure features are robust and avoid data leakage\n- Use libraries like pandas, numpy, scikit-learn for feature engineering\n- Document feature engineering decisions and rationale\n\nProvide a concise bullet-point summary of the feature engineering operations performed.\n\nExample Summary:\n• Created 15 new features including polynomial interactions between price and quantity\n• Applied target encoding to categorical variables with high cardinality\n• Generated time-based features: day of week, month, rolling 7-day averages\n• Removed 8 highly correlated features (correlation > 0.95)\n• Applied StandardScaler to numerical features for model compatibility\n• Final feature set: 23 features with improved signal-to-noise ratio\n\nRespond in the user's language for all summary and reasoning but keep the code in english"
110
+ },
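Two of the techniques this template lists, one-hot encoding and feature scaling, in a self-contained sketch (the column names are made up for illustration):

```python
# Illustrative only: one-hot encoding plus standard scaling on a toy frame.
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"city": ["a", "b", "a", "c"], "price": [10.0, 20.0, 30.0, 40.0]})
df = pd.get_dummies(df, columns=["city"])                      # one-hot encoding
df[["price"]] = StandardScaler().fit_transform(df[["price"]])  # scale the numeric column
print(df.head())
```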
111
+ {
112
+ "template_name": "planner_feature_engineering_agent",
113
+ "display_name": "Feature Engineering Agent",
114
+ "description": "Multi-agent planner variant: Advanced feature creation and selection for machine learning pipelines using various encoding and transformation techniques",
115
+ "icon_url": "/icons/templates/feature-engineering.png",
116
+ "category": "Data Modelling",
117
+ "is_premium_only": true,
118
+ "variant_type": "planner",
119
+ "base_agent": "feature_engineering_agent",
120
+ "is_active": true,
121
+ "prompt_template": "You are a feature engineering expert optimized for multi-agent data analytics pipelines.\n\nYou are given:\n* A dataset (often raw or lightly processed).\n* A user-defined goal (e.g., improve model performance, create specific feature types).\n* **plan_instructions** containing:\n * **'create'**: Variables you must create (e.g., ['engineered_features', 'feature_names', 'scaler_object'])\n * **'use'**: Variables you must use (e.g., ['raw_data', 'target_column'])\n * **'instruction'**: Specific feature engineering instructions\n\n### Your Planner-Optimized Responsibilities:\n* **ALWAYS follow plan_instructions** - essential for pipeline coordination\n* Create ONLY the variables specified in plan_instructions['create']\n* Use ONLY the variables specified in plan_instructions['use']\n* Apply feature engineering techniques as per plan_instructions['instruction']\n* Ensure engineered features integrate seamlessly with downstream ML agents\n\n### Feature Engineering Techniques:\n* Categorical encoding (one-hot, label, target encoding)\n* Numerical transformations (scaling, normalization, polynomial features)\n* Time-series features (lag features, rolling statistics, temporal patterns)\n* Feature selection and dimensionality reduction\n* Interaction features and domain-specific feature creation\n* Handle missing values and outliers appropriately\n\n### Multi-Agent Best Practices:\n* Use exact variable names from plan_instructions['create']\n* Ensure feature compatibility for downstream agents\n* Maintain data integrity and prevent leakage\n* Document feature engineering decisions for pipeline transparency\n\n### Output:\n* Python code implementing feature engineering per plan_instructions\n* Summary of features created and transformations applied\n* Focus on seamless integration with ML modeling agents\n\nRespond in the user's language for all summary and reasoning but keep the code in english"
122
+ },
123
+ {
124
+ "template_name": "polars_agent",
125
+ "display_name": "Polars Agent",
126
+ "description": "High-performance data processing using Polars for large datasets with lazy evaluation and efficient memory usage",
127
+ "icon_url": "/icons/templates/polars_github_logo_rect_dark_name.svg",
128
+ "category": "Data Manipulation",
129
+ "is_premium_only": true,
130
+ "variant_type": "individual",
131
+ "base_agent": "polars_agent",
132
+ "is_active": true,
133
+ "prompt_template": "You are a Polars expert for high-performance data processing. Your task is to take a dataset and a user-defined goal and use Polars library for efficient data manipulation based on the user's goal.\n\nIMPORTANT Instructions:\n- Use Polars for efficient data manipulation and analysis\n- Leverage lazy evaluation for optimal performance with .lazy() and .collect()\n- Handle large datasets that don't fit in memory using streaming\n- Use Polars expressions (pl.col, pl.when, etc.) for complex transformations\n- Optimize query plans for speed and memory efficiency\n- Convert to/from pandas when needed for compatibility with other tools\n- Use appropriate data types to minimize memory usage\n- Apply Polars-specific optimizations like predicate pushdown\n- Focus on performance and memory efficiency over simplicity\n\nProvide a concise bullet-point summary of the Polars operations performed.\n\nExample Summary:\n• Processed 10M row dataset using lazy evaluation for memory efficiency\n• Applied complex filtering and aggregations with 5x speedup vs pandas\n• Used Polars expressions for vectorized string operations\n• Implemented window functions for time-series calculations\n• Optimized memory usage by selecting appropriate dtypes (reduced from 2GB to 500MB)\n• Final output: clean, aggregated dataset ready for analysis\n\nRespond in the user's language for all summary and reasoning but keep the code in english"
134
+ },
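The lazy-evaluation pattern the template refers to looks roughly like this (the file and column names are placeholders; recent Polars versions spell the method `group_by`, older ones `groupby`):

```python
# Sketch of Polars lazy evaluation: the query is only optimized and run at collect().
import polars as pl

lazy = (
    pl.scan_csv("data.csv")                        # lazy scan; nothing is read yet
    .filter(pl.col("amount") > 0)                  # predicate pushdown can apply here
    .group_by("category")
    .agg(pl.col("amount").sum().alias("total"))
)
result = lazy.collect()                            # optimize the plan, then execute
print(result)
```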
135
+ {
136
+ "template_name": "planner_polars_agent",
137
+ "display_name": "Polars Agent",
138
+ "description": "Multi-agent planner variant: High-performance data processing using Polars for large datasets with lazy evaluation and efficient memory usage",
139
+ "icon_url": "https://raw.githubusercontent.com/pola-rs/polars-static/master/logos/polars_github_logo_rect_dark_name.svg",
140
+ "category": "Data Manipulation",
141
+ "is_premium_only": true,
142
+ "variant_type": "planner",
143
+ "base_agent": "polars_agent",
144
+ "is_active": true,
145
+ "prompt_template": "You are a Polars expert optimized for multi-agent data processing pipelines.\n\nYou are given:\n* A dataset (often large or complex).\n* A user-defined goal (e.g., data transformation, aggregation, filtering).\n* **plan_instructions** containing:\n * **'create'**: Variables you must create (e.g., ['processed_data', 'summary_stats'])\n * **'use'**: Variables you must use (e.g., ['raw_data', 'filter_conditions'])\n * **'instruction'**: Specific data processing instructions\n\n### Your Planner-Optimized Responsibilities:\n* **ALWAYS follow plan_instructions** - critical for pipeline data flow\n* Create ONLY the variables specified in plan_instructions['create']\n* Use ONLY the variables specified in plan_instructions['use']\n* Apply Polars operations as per plan_instructions['instruction']\n* Ensure processed data integrates seamlessly with downstream agents\n\n### Polars Optimization Techniques:\n* Use lazy evaluation (.lazy().collect()) for memory efficiency\n* Apply predicate pushdown and projection pushdown optimizations\n* Leverage Polars expressions for vectorized operations\n* Use appropriate data types to minimize memory footprint\n* Implement streaming for datasets larger than memory\n* Convert to pandas DataFrame only when required by downstream agents\n\n### Multi-Agent Best Practices:\n* Use exact variable names from plan_instructions['create']\n* Ensure data format compatibility for subsequent agents\n* Maintain data integrity and schema consistency\n* Optimize for both speed and memory usage in pipeline context\n\n### Output:\n* Python code implementing Polars operations per plan_instructions\n* Summary of data processing and optimizations applied\n* Focus on high-performance data flow in multi-agent pipeline\n\nRespond in the user's language for all summary and reasoning but keep the code in english"
146
+ }
147
+ ],
148
+ "remove": []
149
+ }
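How this config is consumed is defined in `scripts/populate_agent_templates.py`; purely as a hedged sketch (the top-level key holding the template list is an assumption here, as is the loader), filtering the active planner variants might look like:

```python
import json

with open("agents_config.json") as f:
    config = json.load(f)

# "add" is assumed to be the top-level key holding the template list,
# by analogy with the "remove": [] key visible above.
planner_variants = [
    t for t in config.get("add", [])
    if t.get("is_active") and t.get("variant_type") == "planner"
]
print([t["template_name"] for t in planner_variants])
```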
app.py ADDED
@@ -0,0 +1,1589 @@
1
+ # Standard library imports
2
+ import asyncio
3
+ import json
4
+ import logging
5
+ import os
6
+ import time
7
+ import uuid
8
+ from io import StringIO
9
+ from typing import List, Optional
10
+ import ast
11
+ import markdown
12
+ from bs4 import BeautifulSoup
13
+ import pandas as pd
14
+ from datetime import datetime, UTC
15
+ # Third-party imports
16
+ import uvicorn
17
+ from dotenv import load_dotenv
18
+ from fastapi import (
19
+ Depends,
20
+ FastAPI,
21
+ File,
22
+ Form,
23
+ HTTPException,
24
+ Request,
25
+ UploadFile
26
+ )
27
+ from fastapi.middleware.cors import CORSMiddleware
28
+ from fastapi.responses import JSONResponse, StreamingResponse
29
+ from fastapi.security import APIKeyHeader
30
+ from llama_index.core import Document, VectorStoreIndex
31
+ from pydantic import BaseModel
32
+
33
+ # Local application imports
34
+ from scripts.format_response import format_response_to_markdown
35
+ from src.agents.agents import *
36
+ from src.agents.retrievers.retrievers import *
37
+ from src.managers.ai_manager import AI_Manager
38
+ from src.managers.session_manager import SessionManager
39
+ from src.routes.analytics_routes import router as analytics_router
40
+ from src.routes.chat_routes import router as chat_router
41
+ from src.routes.code_routes import router as code_router
42
+ from src.routes.feedback_routes import router as feedback_router
43
+ from src.routes.session_routes import router as session_router, get_session_id_dependency
44
+ from src.routes.deep_analysis_routes import router as deep_analysis_router
45
+ from src.routes.templates_routes import router as templates_router
46
+ from src.schemas.query_schema import QueryRequest
47
+ from src.utils.logger import Logger
48
+
49
+ # Import deep analysis components directly
50
+ # from src.agents.try_deep_agents import deep_analysis_module
51
+ from src.agents.deep_agents import deep_analysis_module
52
+ from src.utils.generate_report import generate_html_report
53
+
54
+ from src.utils.model_registry import MODEL_OBJECTS
55
+
56
+ logger = Logger("app", see_time=True, console_log=True)
57
+ load_dotenv()
58
+
59
+ # Request models
60
+ class DeepAnalysisRequest(BaseModel):
61
+ goal: str
62
+
63
+ class DeepAnalysisResponse(BaseModel):
64
+ goal: str
65
+ deep_questions: str
66
+ deep_plan: str
67
+ summaries: List[str]
68
+ code: str
69
+ plotly_figs: List
70
+ synthesis: List[str]
71
+ final_conclusion: str
72
+ html_report: Optional[str] = None
73
+
74
+ styling_instructions = [
75
+ """
76
+ Don't ignore any of these instructions.
77
+ For a line chart always use the plotly_white template; reduce x-axis & y-axis line width to 0.2 and x & y grid width to 1.
78
+ Always give a title, make the title and axis labels bold using HTML tags, and try to use multiple colors if there is more than one line.
79
+ Annotate the min and max of the line.
80
+ Display numbers in thousands (K) or millions (M) if larger than 1000/1000000.
81
+ Show percentages to 2 decimal points with a '%' sign.
82
+ Default chart size should be height=1200 and width=1000.
83
+
84
+ """
85
+
86
+ , """
87
+ Don't ignore any of these instructions.
88
+ For a bar chart always use the plotly_white template; reduce x-axis & y-axis line width to 0.2 and x & y grid width to 1.
89
+ Always give a title and make the title and axis labels bold using HTML tags.
90
+ Always display numbers in thousands (K) or millions (M) if larger than 1000/1000000.
91
+ Annotate the values on the bar chart.
92
+ If the variable is a percentage, show it to 2 decimal points with a '%' sign.
93
+ Default chart size should be height=1200 and width=1000.
94
+ """
95
+ ,
96
+
97
+ """
98
+ For a histogram chart choose a bin_size of 50.
99
+ Do not ignore any of these instructions.
100
+ Always use the plotly_white template; reduce x & y axis line width to 0.2 and x & y grid width to 1.
101
+ Always give a title and make the title and axis labels bold using HTML tags.
102
+ Always display numbers in thousands (K) or millions (M) if larger than 1000/1000000. Add annotations for x values.
103
+ If the variable is a percentage, show it to 2 decimal points with a '%' sign.
104
+ Default chart size should be height=1200 and width=1000.
105
+ """,
106
+
107
+
108
+ """
109
+ For a pie chart only show the top 10 categories; bundle the rest as "Others".
110
+ Do not ignore any of these instructions.
111
+ Always use the plotly_white template; reduce x & y axis line width to 0.2 and x & y grid width to 1.
112
+ Always give a title and make the title and axis labels bold using HTML tags.
113
+ Always display numbers in thousands (K) or millions (M) if larger than 1000/1000000. Add annotations for x values.
114
+ If the variable is a percentage, show it to 2 decimal points with a '%' sign.
115
+ Default chart size should be height=1200 and width=1000.
116
+ """,
117
+
118
+ """
119
+ Do not ignore any of these instructions.
120
+ Always use the plotly_white template; reduce x & y axis line width to 0.2 and x & y grid width to 1.
121
+ Always give a title and make the title and axis labels bold using HTML tags.
122
+ Always display numbers in thousands (K) or millions (M) if larger than 1000/1000000. Add annotations for x values.
123
+ Don't add K/M if the number is already comma-formatted or the value is not a number.
124
+ If the variable is a percentage, show it to 2 decimal points with a '%' sign.
125
+ Default chart size should be height=1200 and width=1000.
126
+ """,
127
+ """
128
+ For a heat map
129
+ Use the 'plotly_white' template for a clean, white background.
130
+ Set a chart title.
131
+ Style the X-axis with a black line color, 0.2 line width, and 1 grid width; format 1000/1000000 as K/M.
132
+ Do not format non-numerical values.
133
+ Style the Y-axis with a black line color, 0.2 line width, and 1 grid width; format 1000/1000000 as K/M.
134
+ Do not format non-numerical values.
135
+
136
+ Set the figure dimensions to a height of 1200 pixels and a width of 1000 pixels.
137
+ """,
138
+ """
139
+ For a Histogram, used for returns/distribution plotting
140
+ Use the 'plotly_white' template for a clean, white background.
141
+ Set a chart title.
142
+ Style the X-axis with 1 grid width; format 1000/1000000 as K/M.
143
+ Do not format non-numerical values.
144
+ Style the Y-axis with 1 grid width; format 1000/1000000 as K/M.
145
+ Do not format non-numerical values.
146
+
147
+ Use an opacity of 0.75
148
+
149
+ Set the figure dimensions to a height of 1200 pixels and a width of 1000 pixels.
150
+ """
151
+ ]
152
+
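Not part of the commit, but for reference: the styling rules above map onto Plotly roughly as follows (the d3 `~s` tick format approximates the K/M rule):

```python
# Minimal sketch applying the styling instructions to a Plotly figure.
import plotly.express as px

fig = px.line(x=[1, 2, 3], y=[1_000, 250_000, 2_000_000])
fig.update_layout(template="plotly_white", height=1200, width=1000,
                  title="<b>Example line chart</b>")
fig.update_xaxes(linewidth=0.2, gridwidth=1)
fig.update_yaxes(linewidth=0.2, gridwidth=1, tickformat="~s")  # renders 250k, 2M
fig.show()
```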
153
+ # Add near the top of the file, after imports
154
+ DEFAULT_MODEL_CONFIG = {
155
+ "provider": os.getenv("MODEL_PROVIDER", "openai"),
156
+ "model": os.getenv("MODEL_NAME", "gpt-5-mini"),
157
+ "api_key": os.getenv("OPENAI_API_KEY"),
158
+ "temperature": float(os.getenv("TEMPERATURE", 1.0)),
159
+ "max_tokens": int(os.getenv("MAX_TOKENS", 6000)), "cache": False
160
+ }
161
+
162
+ # Create default LM config but don't set it globally
163
+
164
+ default_lm = MODEL_OBJECTS[DEFAULT_MODEL_CONFIG['model']]
165
+
166
+
167
+
168
+ # lm = dspy.LM('openai/gpt-4o-mini', api_key=os.getenv("OPENAI_API_KEY"))
169
+ dspy.configure(lm=default_lm, async_max_workers=100)
170
+
171
+ # Function to get model config from session or use default
172
+ def get_session_lm(session_state):
173
+ """Get the appropriate LM instance for a session, or default if not configured"""
174
+ model_name = DEFAULT_MODEL_CONFIG["model"]  # default fallback so the return below never hits an undefined name
+ # First check if we have a valid session-specific model config
175
+ if session_state and isinstance(session_state, dict) and "model_config" in session_state:
176
+ model_config = session_state["model_config"]
177
+ if model_config and isinstance(model_config, dict) and "model" in model_config:
178
+ # Found valid session-specific model config, use it
179
+ provider = model_config.get("provider", "openai").lower()
180
+ model_name = model_config.get("model", DEFAULT_MODEL_CONFIG["model"])
181
+ if 'gpt-5' not in model_name and 'o1' not in model_name:
182
+ MODEL_OBJECTS[model_name].__dict__['kwargs']['max_tokens'] = model_config.get("max_tokens", DEFAULT_MODEL_CONFIG["max_tokens"])
183
+ MODEL_OBJECTS[model_name].__dict__['kwargs']['temperature'] = model_config.get("temperature", DEFAULT_MODEL_CONFIG["temperature"])
184
+ elif ('gpt-5' in model_name or 'o1' in model_name) and provider == 'openai':
185
+ MODEL_OBJECTS[model_name].__dict__['kwargs']['max_completion_tokens'] = model_config.get("max_tokens", DEFAULT_MODEL_CONFIG["max_tokens"])
186
+ MODEL_OBJECTS[model_name].__dict__['kwargs']['temperature'] = 1.0
187
+ else:
188
+ MODEL_OBJECTS[model_name].__dict__['kwargs']['max_tokens'] = model_config.get("max_tokens", DEFAULT_MODEL_CONFIG["max_tokens"])
189
+ MODEL_OBJECTS[model_name].__dict__['kwargs']['temperature'] = model_config.get("temperature", DEFAULT_MODEL_CONFIG["temperature"])
190
+
191
+
192
+ # If no valid session config, use default
193
+ return MODEL_OBJECTS[model_name]
194
+
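One caveat worth noting: `get_session_lm` mutates the kwargs of a shared `MODEL_OBJECTS` entry, so two concurrent sessions with different settings can race. A safer per-session pattern, sketched here under the assumption that `dspy.LM` accepts a provider-prefixed model string plus generation kwargs (the call shape matches the commented-out `dspy.LM('openai/gpt-4o-mini', ...)` line above), would build a fresh LM instead:

```python
# Sketch only: a per-session LM avoids mutating shared MODEL_OBJECTS state.
import dspy

def build_session_lm(model_config: dict) -> "dspy.LM":
    return dspy.LM(
        f"{model_config.get('provider', 'openai')}/{model_config['model']}",
        api_key=model_config.get("api_key"),
        temperature=model_config.get("temperature", 1.0),
        max_tokens=model_config.get("max_tokens", 6000),
    )
```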
195
+ # Initialize retrievers with empty data first
196
+ def initialize_retrievers(styling_instructions: List[str], doc: List[str]):
197
+ try:
198
+ style_index = VectorStoreIndex.from_documents([Document(text=x) for x in styling_instructions])
199
+ data_index = VectorStoreIndex.from_documents([Document(text=x) for x in doc])
200
+ return {"style_index": style_index, "dataframe_index": data_index}
201
+ except Exception as e:
202
+ logger.log_message(f"Error initializing retrievers: {str(e)}", level=logging.ERROR)
203
+ raise e
204
+
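A hypothetical call, for orientation (the description string here is made up):

```python
# Build a styling index plus a dataset index from a short dataframe description.
retrievers = initialize_retrievers(
    styling_instructions,
    ["Housing dataset with columns such as price, area, bedrooms and furnishing status"],
)
style_index = retrievers["style_index"]
```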
205
+ # clear console
206
+ def clear_console():
207
+ os.system('cls' if os.name == 'nt' else 'clear')
208
+
209
+
210
+ # Check for Housing.csv
211
+ housing_csv_path = "Housing.csv"
212
+ if not os.path.exists(housing_csv_path):
213
+ logger.log_message(f"Housing.csv not found at {os.path.abspath(housing_csv_path)}", level=logging.ERROR)
214
+ raise FileNotFoundError(f"Housing.csv not found at {os.path.abspath(housing_csv_path)}")
215
+
216
+ # All agents are now loaded from database - no hardcoded dictionaries needed
217
+
218
+ # Add session header
219
+ X_SESSION_ID = APIKeyHeader(name="X-Session-ID", auto_error=False)
220
+
221
+ # Update AppState class to use SessionManager
222
+ class AppState:
223
+ def __init__(self):
224
+ self._session_manager = SessionManager(styling_instructions, {}) # Empty dict, agents loaded from DB
225
+ self.model_config = DEFAULT_MODEL_CONFIG.copy()
226
+ # Update the SessionManager with the current model_config
227
+ self._session_manager._app_model_config = self.model_config
228
+ self.ai_manager = AI_Manager()
229
+ self.chat_name_agent = chat_history_name_agent
230
+ # Initialize deep analysis module
231
+ self.deep_analyzer = None
232
+
233
+ def get_session_state(self, session_id: str):
234
+ """Get or create session-specific state using the SessionManager"""
235
+ return self._session_manager.get_session_state(session_id)
236
+
237
+ def clear_session_state(self, session_id: str):
238
+ """Clear session-specific state using the SessionManager"""
239
+ self._session_manager.clear_session_state(session_id)
240
+
241
+ def update_session_dataset(self, session_id: str, df, name, desc):
242
+ """Update dataset for a specific session using the SessionManager"""
243
+ self._session_manager.update_session_dataset(session_id, df, name, desc)
244
+
245
+ def reset_session_to_default(self, session_id: str):
246
+ """Reset a session to use the default dataset using the SessionManager"""
247
+ self._session_manager.reset_session_to_default(session_id)
248
+
249
+ def set_session_user(self, session_id: str, user_id: int, chat_id: int = None):
250
+ """Associate a user with a session using the SessionManager"""
251
+ return self._session_manager.set_session_user(session_id, user_id, chat_id)
252
+
253
+ def get_ai_manager(self):
254
+ """Get the AI Manager instance"""
255
+ return self.ai_manager
256
+
257
+ def get_provider_for_model(self, model_name):
258
+ return self.ai_manager.get_provider_for_model(model_name)
259
+
260
+ def calculate_cost(self, model_name, input_tokens, output_tokens):
261
+ return self.ai_manager.calculate_cost(model_name, input_tokens, output_tokens)
262
+
263
+ def save_usage_to_db(self, user_id, chat_id, model_name, provider, prompt_tokens, completion_tokens, total_tokens, query_size, response_size, cost, request_time_ms, is_streaming=False):
264
+ return self.ai_manager.save_usage_to_db(user_id, chat_id, model_name, provider, prompt_tokens, completion_tokens, total_tokens, query_size, response_size, round(cost, 7), request_time_ms, is_streaming)
265
+
266
+ def get_tokenizer(self):
267
+ return self.ai_manager.tokenizer
268
+
269
+ def get_chat_history_name_agent(self):
270
+ return dspy.Predict(self.chat_name_agent)
271
+
272
+ def get_deep_analyzer(self, session_id: str):
273
+ """Get or create deep analysis module for a session"""
274
+ session_state = self.get_session_state(session_id)
275
+ user_id = session_state.get("user_id")
276
+
277
+ # Check if we need to recreate the deep analyzer (user changed or doesn't exist)
278
+ current_analyzer = session_state.get('deep_analyzer')
279
+ analyzer_user_id = session_state.get('deep_analyzer_user_id')
280
+
281
+ logger.log_message(f"Deep analyzer check - session: {session_id}, current_user: {user_id}, analyzer_user: {analyzer_user_id}, has_analyzer: {current_analyzer is not None}", level=logging.INFO)
282
+
283
+ if (not current_analyzer or
284
+ analyzer_user_id != user_id or
285
+ 'deep_analyzer' not in session_state):
286
+
287
+ logger.log_message(f"Creating/recreating deep analyzer for session {session_id}, user_id: {user_id} (reason: analyzer_exists={current_analyzer is not None}, user_match={analyzer_user_id == user_id})", level=logging.INFO)
288
+
289
+ # Load user-enabled agents from database using preference system
290
+ from src.db.init_db import session_factory
291
+ from src.agents.agents import load_user_enabled_templates_for_planner_from_db
292
+
293
+ db_session = session_factory()
294
+ try:
295
+ # Load user-enabled agents for planner (respects preferences)
296
+ if user_id:
297
+ enabled_agents_dict = load_user_enabled_templates_for_planner_from_db(user_id, db_session)
298
+ logger.log_message(f"Deep analyzer loaded {len(enabled_agents_dict)} enabled agents for user {user_id}: {list(enabled_agents_dict.keys())}", level=logging.INFO)
299
+
300
+ if not enabled_agents_dict:
301
+ logger.log_message(f"WARNING: No enabled agents found for user {user_id}, falling back to defaults", level=logging.WARNING)
302
+ # Fallback to default agents if no enabled agents
303
+ from src.agents.agents import preprocessing_agent, statistical_analytics_agent, sk_learn_agent, data_viz_agent
304
+ enabled_agents_dict = {
305
+ "preprocessing_agent": preprocessing_agent,
306
+ "statistical_analytics_agent": statistical_analytics_agent,
307
+ "sk_learn_agent": sk_learn_agent,
308
+ "data_viz_agent": data_viz_agent
309
+ }
310
+ else:
311
+ # Fallback to default agents if no user_id
312
+ logger.log_message("No user_id in session, loading default agents for deep analysis", level=logging.WARNING)
313
+ from src.agents.agents import preprocessing_agent, statistical_analytics_agent, sk_learn_agent, data_viz_agent
314
+ enabled_agents_dict = {
315
+ "preprocessing_agent": preprocessing_agent,
316
+ "statistical_analytics_agent": statistical_analytics_agent,
317
+ "sk_learn_agent": sk_learn_agent,
318
+ "data_viz_agent": data_viz_agent
319
+ }
320
+
321
+ # Create agents dictionary for deep analysis using enabled agents
322
+ deep_agents = {}
323
+ deep_agents_desc = {}
324
+
325
+ for agent_name, signature in enabled_agents_dict.items():
326
+ deep_agents[agent_name] = dspy.asyncify(dspy.ChainOfThought(signature))
327
+ # Get agent description from database
328
+ deep_agents_desc[agent_name] = get_agent_description(agent_name)
329
+
330
+ logger.log_message(f"Deep analyzer initialized with {len(deep_agents)} agents: {list(deep_agents.keys())}", level=logging.INFO)
331
+
332
+ except Exception as e:
333
+ logger.log_message(f"Error loading agents for deep analysis: {str(e)}", level=logging.ERROR)
334
+ # Fallback to minimal set
335
+ from src.agents.agents import preprocessing_agent, statistical_analytics_agent, sk_learn_agent, data_viz_agent
336
+ deep_agents = {
337
+ "preprocessing_agent": dspy.asyncify(dspy.Predict(preprocessing_agent)),
338
+ "statistical_analytics_agent": dspy.asyncify(dspy.Predict(statistical_analytics_agent)),
339
+ "sk_learn_agent": dspy.asyncify(dspy.Predict(sk_learn_agent)),
340
+ "data_viz_agent": dspy.asyncify(dspy.Predict(data_viz_agent))
341
+ }
342
+ deep_agents_desc = {name: get_agent_description(name) for name in deep_agents.keys()}
343
+ logger.log_message(f"Using fallback agents: {list(deep_agents.keys())}", level=logging.WARNING)
344
+ finally:
345
+ db_session.close()
346
+
347
+ session_state['deep_analyzer'] = deep_analysis_module(agents=deep_agents, agents_desc=deep_agents_desc)
348
+ session_state['deep_analyzer_user_id'] = user_id # Track which user this analyzer was created for
349
+ else:
350
+ logger.log_message(f"Using existing deep analyzer for session {session_id}, user_id: {user_id}", level=logging.INFO)
351
+
352
+ return session_state['deep_analyzer']
353
+
354
+ # Initialize FastAPI app with state
355
+ app = FastAPI(title="AI Analytics API", version="1.0")
356
+ app.state = AppState()
357
+
358
+
359
+ # Configure middleware
360
+ # Use a wildcard for local development or read from environment
361
+ is_development = os.getenv("ENVIRONMENT", "development").lower() == "development"
362
+
363
+ allowed_origins = []
364
+ frontend_url = os.getenv("FRONTEND_URL", "").strip()
365
+ print(f"FRONTEND_URL: {frontend_url}")
366
+ if is_development:
367
+ allowed_origins = ["*"]
368
+ elif frontend_url:
369
+ allowed_origins = [frontend_url]
370
+ else:
371
+ logger.log_message("CORS misconfigured: FRONTEND_URL not set", level=logging.ERROR)
372
+ allowed_origins = [] # or set a default safe origin
373
+
374
+ # Add a strict origin verification middleware
375
+ @app.middleware("http")
376
+ async def verify_origin_middleware(request: Request, call_next):
377
+ # Skip origin check in development mode
378
+ if is_development:
379
+ return await call_next(request)
380
+
381
+ # Get the origin from the request headers
382
+ origin = request.headers.get("origin")
383
+
384
+ # Log the origin for debugging
385
+ if origin:
386
+ print(f"Request from origin: {origin}")
387
+
388
+ # If an Origin header is present but doesn't match the allowed frontend, reject the request
389
+ if origin and frontend_url and origin != frontend_url:
390
+ print(f"Blocked request from unauthorized origin: {origin}")
391
+ return JSONResponse(
392
+ status_code=403,
393
+ content={"detail": "Not authorized"}
394
+ )
395
+
396
+ # Continue processing the request if origin is allowed
397
+ return await call_next(request)
398
+
399
+ # CORS middleware (still needed for browser preflight)
400
+ app.add_middleware(
401
+ CORSMiddleware,
402
+ allow_origins=allowed_origins,
403
+ allow_origin_regex=None,
404
+ allow_credentials=True,
405
+ allow_methods=["*"],
406
+ allow_headers=["*"],
407
+ expose_headers=["*"],
408
+ max_age=600 # Cache preflight requests for 10 minutes (for performance)
409
+ )
410
+
411
+ # Add these constants at the top of the file with other imports/constants
412
+ RESPONSE_ERROR_INVALID_QUERY = "Please provide a valid query..."
413
+ RESPONSE_ERROR_NO_DATASET = "No dataset is currently loaded. Please link a dataset before proceeding with your analysis."
414
+ DEFAULT_TOKEN_RATIO = 1.5
415
+ REQUEST_TIMEOUT_SECONDS = 30 # Timeout for LLM requests
416
+ MAX_RECENT_MESSAGES = 5
417
+ DB_BATCH_SIZE = 10 # For future batch DB operations
418
+
419
+ @app.post("/chat/{agent_name}", response_model=dict)
420
+ async def chat_with_agent(
421
+ agent_name: str,
422
+ request: QueryRequest,
423
+ request_obj: Request,
424
+ session_id: str = Depends(get_session_id_dependency)
425
+ ):
426
+ session_state = app.state.get_session_state(session_id)
427
+ logger.log_message(f"[DEBUG] chat_with_agent called with agent: '{agent_name}', query: '{request.query[:100]}...'", level=logging.DEBUG)
428
+
429
+ try:
430
+ # Extract and validate query parameters
431
+ logger.log_message(f"[DEBUG] Updating session from query params", level=logging.DEBUG)
432
+ _update_session_from_query_params(request_obj, session_state)
433
+ logger.log_message(f"[DEBUG] Session state after query params: user_id={session_state.get('user_id')}, chat_id={session_state.get('chat_id')}", level=logging.DEBUG)
434
+
435
+ # Validate dataset and agent name
436
+ if session_state["current_df"] is None:
437
+ logger.log_message(f"[DEBUG] No dataset loaded", level=logging.DEBUG)
438
+ raise HTTPException(status_code=400, detail=RESPONSE_ERROR_NO_DATASET)
439
+
440
+ logger.log_message(f"[DEBUG] About to validate agent name: '{agent_name}'", level=logging.DEBUG)
441
+ _validate_agent_name(agent_name, session_state)
442
+ logger.log_message(f"[DEBUG] Agent validation completed successfully", level=logging.DEBUG)
443
+
444
+ # Record start time for timing
445
+ start_time = time.time()
446
+
447
+ # Get chat context and prepare query
448
+ logger.log_message(f"[DEBUG] Preparing query with context", level=logging.DEBUG)
449
+ enhanced_query = _prepare_query_with_context(request.query, session_state)
450
+ logger.log_message(f"[DEBUG] Enhanced query length: {len(enhanced_query)}", level=logging.DEBUG)
451
+
452
+ # Initialize agent - handle standard, template, and custom agents
453
+ if "," in agent_name:
454
+ logger.log_message(f"[DEBUG] Processing multiple agents: {agent_name}", level=logging.DEBUG)
455
+ # Multiple agents case
456
+ agent_list = [agent.strip() for agent in agent_name.split(",")]
457
+
458
+ # Categorize agents
459
+ standard_agents = [agent for agent in agent_list if _is_standard_agent(agent)]
460
+ template_agents = [agent for agent in agent_list if _is_template_agent(agent)]
461
+ custom_agents = [agent for agent in agent_list if not _is_standard_agent(agent) and not _is_template_agent(agent)]
462
+
463
+ logger.log_message(f"[DEBUG] Agent categorization - standard: {standard_agents}, template: {template_agents}, custom: {custom_agents}", level=logging.DEBUG)
464
+
465
+ if custom_agents:
466
+ # If any custom agents, use session AI system for all
467
+ ai_system = session_state["ai_system"]
468
+ session_lm = get_session_lm(session_state)
469
+ logger.log_message(f"[DEBUG] Using custom agent execution path", level=logging.DEBUG)
470
+ with dspy.context(lm=session_lm):
471
+ response = await asyncio.wait_for(
472
+ _execute_custom_agents(ai_system, agent_list, enhanced_query),
473
+ timeout=REQUEST_TIMEOUT_SECONDS
474
+ )
475
+ logger.log_message(f"[DEBUG] Custom agents response type: {type(response)}, keys: {list(response.keys()) if isinstance(response, dict) else 'not a dict'}", level=logging.DEBUG)
476
+ else:
477
+ # All standard/template agents - use auto_analyst_ind which loads from DB
478
+ user_id = session_state.get("user_id")
479
+ logger.log_message(f"[DEBUG] Using auto_analyst_ind for multiple standard/template agents with user_id: {user_id}", level=logging.DEBUG)
480
+
481
+ # Create database session for agent loading
482
+ from src.db.init_db import session_factory
483
+ db_session = session_factory()
484
+ try:
485
+ # auto_analyst_ind will load all agents from database
486
+ logger.log_message(f"[DEBUG] Creating auto_analyst_ind instance", level=logging.DEBUG)
487
+ agent = auto_analyst_ind(agents=[], retrievers=session_state["retrievers"], user_id=user_id, db_session=db_session)
488
+ session_lm = get_session_lm(session_state)
489
+ logger.log_message(f"[DEBUG] About to call agent.forward with query and agent list", level=logging.DEBUG)
490
+ with dspy.context(lm=session_lm):
491
+ response = await asyncio.wait_for(
492
+ agent.forward(enhanced_query, ",".join(agent_list)),
493
+ timeout=REQUEST_TIMEOUT_SECONDS
494
+ )
495
+ logger.log_message(f"[DEBUG] auto_analyst_ind response type: {type(response)}, content: {str(response)[:200]}...", level=logging.DEBUG)
496
+ finally:
497
+ db_session.close()
498
+ else:
499
+ logger.log_message(f"[DEBUG] Processing single agent: {agent_name}", level=logging.DEBUG)
500
+ # Single agent case
501
+ if _is_standard_agent(agent_name) or _is_template_agent(agent_name):
502
+ # Standard or template agent - use auto_analyst_ind which loads from DB
503
+ user_id = session_state.get("user_id")
504
+ logger.log_message(f"[DEBUG] Using auto_analyst_ind for single standard/template agent '{agent_name}' with user_id: {user_id}", level=logging.DEBUG)
505
+
506
+ # Create database session for agent loading
507
+ from src.db.init_db import session_factory
508
+ db_session = session_factory()
509
+ try:
510
+ # auto_analyst_ind will load all agents from database
511
+ logger.log_message(f"[DEBUG] Creating auto_analyst_ind instance for single agent", level=logging.DEBUG)
512
+ agent = auto_analyst_ind(agents=[], retrievers=session_state["retrievers"], user_id=user_id, db_session=db_session)
513
+ session_lm = get_session_lm(session_state)
514
+ logger.log_message(f"[DEBUG] About to call agent.forward for single agent '{agent_name}'", level=logging.DEBUG)
515
+ with dspy.context(lm=session_lm):
516
+ response = await asyncio.wait_for(
517
+ agent.forward(enhanced_query, agent_name),
518
+ timeout=REQUEST_TIMEOUT_SECONDS
519
+ )
520
+ logger.log_message(f"[DEBUG] Single agent response type: {type(response)}, content: {str(response)[:200]}...", level=logging.DEBUG)
521
+ finally:
522
+ db_session.close()
523
+ else:
524
+ # Custom agent - use session AI system
525
+ ai_system = session_state["ai_system"]
526
+ session_lm = get_session_lm(session_state)
527
+ logger.log_message(f"[DEBUG] Using custom agent execution for '{agent_name}'", level=logging.DEBUG)
528
+ with dspy.context(lm=session_lm):
529
+ response = await asyncio.wait_for(
530
+ _execute_custom_agents(ai_system, [agent_name], enhanced_query),
531
+ timeout=REQUEST_TIMEOUT_SECONDS
532
+ )
533
+ logger.log_message(f"[DEBUG] Custom single agent response type: {type(response)}, content: {str(response)[:200]}...", level=logging.DEBUG)
534
+
535
+ logger.log_message(f"[DEBUG] About to format response to markdown. Response type: {type(response)}", level=logging.DEBUG)
536
+ formatted_response = format_response_to_markdown(response, agent_name, session_state["current_df"])
537
+ logger.log_message(f"[DEBUG] Formatted response type: {type(formatted_response)}, length: {len(str(formatted_response))}", level=logging.DEBUG)
538
+
539
+ if formatted_response == RESPONSE_ERROR_INVALID_QUERY:
540
+ logger.log_message(f"[DEBUG] Response was invalid query error", level=logging.DEBUG)
541
+ return {
542
+ "agent_name": agent_name,
543
+ "query": request.query,
544
+ "response": formatted_response,
545
+ "session_id": session_id
546
+ }
547
+
548
+ # Track usage statistics
549
+ if session_state.get("user_id"):
550
+ logger.log_message(f"[DEBUG] Tracking model usage", level=logging.DEBUG)
551
+ _track_model_usage(
552
+ session_state=session_state,
553
+ enhanced_query=enhanced_query,
554
+ response=response,
555
+ processing_time_ms=int((time.time() - start_time) * 1000)
556
+ )
557
+
558
+ logger.log_message(f"[DEBUG] chat_with_agent completed successfully", level=logging.DEBUG)
559
+ return {
560
+ "agent_name": agent_name,
561
+ "query": request.query, # Return original query without context
562
+ "response": formatted_response,
563
+ "session_id": session_id
564
+ }
565
+ except HTTPException:
566
+ # Re-raise HTTP exceptions to preserve status codes
567
+ logger.log_message(f"[DEBUG] HTTPException caught and re-raised", level=logging.DEBUG)
568
+ raise
569
+ except asyncio.TimeoutError:
570
+ logger.log_message(f"[ERROR] Timeout error in chat_with_agent", level=logging.ERROR)
571
+ raise HTTPException(status_code=504, detail="Request timed out. Please try a simpler query.")
572
+ except Exception as e:
573
+ logger.log_message(f"[ERROR] Unexpected error in chat_with_agent: {str(e)}", level=logging.ERROR)
574
+ logger.log_message(f"[ERROR] Exception type: {type(e)}, traceback: {str(e)}", level=logging.ERROR)
575
+ import traceback
576
+ logger.log_message(f"[ERROR] Full traceback: {traceback.format_exc()}", level=logging.ERROR)
577
+ raise HTTPException(status_code=500, detail="An unexpected error occurred. Please try again later.")
578
+
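For reference, a hypothetical client call against this endpoint; the body shape assumes `QueryRequest` carries a single `query` field, and the host, port, and IDs are placeholders:

```python
# Hypothetical invocation of POST /chat/{agent_name} with session and user context.
import requests

resp = requests.post(
    "http://localhost:8000/chat/data_viz_agent",
    params={"user_id": 1, "chat_id": 42},     # read by _update_session_from_query_params
    headers={"X-Session-ID": "my-session"},   # matches the X_SESSION_ID header defined above
    json={"query": "plot price vs area"},
    timeout=60,
)
print(resp.json()["response"])
```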
579
+
580
+ @app.post("/chat", response_model=dict)
581
+ async def chat_with_all(
582
+ request: QueryRequest,
583
+ request_obj: Request,
584
+ session_id: str = Depends(get_session_id_dependency)
585
+ ):
586
+ session_state = app.state.get_session_state(session_id)
587
+
588
+ try:
589
+ # Extract and validate query parameters
590
+ _update_session_from_query_params(request_obj, session_state)
591
+
592
+ # Validate dataset
593
+ if session_state["current_df"] is None:
594
+ raise HTTPException(status_code=400, detail=RESPONSE_ERROR_NO_DATASET)
595
+
596
+ if session_state["ai_system"] is None:
597
+ raise HTTPException(status_code=500, detail="AI system not properly initialized.")
598
+
599
+ # Get session-specific model
600
+ session_lm = get_session_lm(session_state)
601
+
602
+ # Create streaming response
603
+ return StreamingResponse(
604
+ _generate_streaming_responses(session_state, request.query, session_lm),
605
+ media_type='text/event-stream',
606
+ headers={
607
+ 'Cache-Control': 'no-cache',
608
+ 'Connection': 'keep-alive',
609
+ 'Content-Type': 'text/event-stream',
610
+ 'Access-Control-Allow-Origin': '*',
611
+ 'X-Accel-Buffering': 'no'
612
+ }
613
+ )
614
+ except HTTPException:
615
+ # Re-raise HTTP exceptions to preserve status codes
616
+ raise
617
+ except Exception as e:
618
+ raise HTTPException(status_code=500, detail="An unexpected error occurred. Please try again later.")
619
+
620
+
621
+ # Helper functions to reduce duplication and improve modularity
622
+ def _update_session_from_query_params(request_obj: Request, session_state: dict):
623
+ """Extract and validate chat_id and user_id from query parameters"""
624
+ # Check for chat_id in query parameters
625
+ if "chat_id" in request_obj.query_params:
626
+ try:
627
+ chat_id_param = int(request_obj.query_params.get("chat_id"))
628
+ # Update session state with this chat ID
629
+ session_state["chat_id"] = chat_id_param
630
+ except (ValueError, TypeError):
631
+ logger.log_message("Invalid chat_id parameter", level=logging.WARNING)
632
+ # Continue without updating chat_id
633
+
634
+ # Check for user_id in query parameters
635
+ if "user_id" in request_obj.query_params:
636
+ try:
637
+ user_id = int(request_obj.query_params["user_id"])
638
+ session_state["user_id"] = user_id
639
+ except (ValueError, TypeError):
640
+ raise HTTPException(
641
+ status_code=400,
642
+ detail="Invalid user_id in query params. Please provide a valid integer."
643
+ )
644
+
645
+
646
+ def _validate_agent_name(agent_name: str, session_state: dict = None):
647
+ """Validate that the agent name(s) are available"""
648
+ logger.log_message(f"[DEBUG] Validating agent name: '{agent_name}'", level=logging.DEBUG)
649
+
650
+ if "," in agent_name:
651
+ # Multiple agents
652
+ agent_list = [agent.strip() for agent in agent_name.split(",")]
653
+ logger.log_message(f"[DEBUG] Multiple agents detected: {agent_list}", level=logging.DEBUG)
654
+ for agent in agent_list:
655
+ is_available = _is_agent_available(agent, session_state)
656
+ logger.log_message(f"[DEBUG] Agent '{agent}' availability: {is_available}", level=logging.DEBUG)
657
+ if not is_available:
658
+ available_agents = _get_available_agents_list(session_state)
659
+ logger.log_message(f"[DEBUG] Agent '{agent}' not found. Available: {available_agents}", level=logging.DEBUG)
660
+ raise HTTPException(
661
+ status_code=400,
662
+ detail=f"Agent '{agent}' not found. Available agents: {available_agents}"
663
+ )
664
+ else:
665
+ # Single agent
666
+ is_available = _is_agent_available(agent_name, session_state)
667
+ logger.log_message(f"[DEBUG] Single agent '{agent_name}' availability: {is_available}", level=logging.DEBUG)
668
+ if not is_available:
669
+ available_agents = _get_available_agents_list(session_state)
670
+ logger.log_message(f"[DEBUG] Agent '{agent_name}' not found. Available: {available_agents}", level=logging.DEBUG)
671
+ raise HTTPException(
672
+ status_code=400,
673
+ detail=f"Agent '{agent_name}' not found. Available agents: {available_agents}"
674
+ )
675
+
676
+ logger.log_message(f"[DEBUG] Agent validation passed for: '{agent_name}'", level=logging.DEBUG)
677
+
678
+ def _is_agent_available(agent_name: str, session_state: dict = None) -> bool:
679
+ """Check if an agent is available (standard, template, or custom)"""
680
+ # Check if it's a standard agent
681
+ if _is_standard_agent(agent_name):
682
+ return True
683
+
684
+ # Check if it's a template agent
685
+ if _is_template_agent(agent_name):
686
+ return True
687
+
688
+ # Check if it's a custom agent in session
689
+ if session_state and "ai_system" in session_state:
690
+ ai_system = session_state["ai_system"]
691
+ if hasattr(ai_system, 'agents') and agent_name in ai_system.agents:
692
+ return True
693
+
694
+ return False
695
+
696
+ def _get_available_agents_list(session_state: dict = None) -> list:
697
+ """Get list of all available agents from database"""
698
+ from src.db.init_db import session_factory
699
+ from src.agents.agents import load_all_available_templates_from_db
700
+
701
+ # Core agents (always available)
702
+ available = ["preprocessing_agent", "statistical_analytics_agent", "sk_learn_agent", "data_viz_agent"]
703
+
704
+ # Add template agents from database
705
+ db_session = session_factory()
706
+ try:
707
+ template_agents_dict = load_all_available_templates_from_db(db_session)
708
+ # template_agents_dict is a dict with template_name as keys
709
+ template_names = [template_name for template_name in template_agents_dict.keys()
710
+ if template_name not in available and template_name != 'basic_qa_agent']
711
+ available.extend(template_names)
712
+ except Exception as e:
713
+ logger.log_message(f"Error loading template agents: {str(e)}", level=logging.ERROR)
714
+ finally:
715
+ db_session.close()
716
+
717
+ return available
718
+
719
+ def _is_standard_agent(agent_name: str) -> bool:
720
+ """Check if agent is one of the 4 core standard agents"""
721
+ standard_agents = ["preprocessing_agent", "statistical_analytics_agent", "sk_learn_agent", "data_viz_agent"]
722
+ return agent_name in standard_agents
723
+
724
+ def _is_template_agent(agent_name: str) -> bool:
725
+ """Check if agent is a template agent"""
726
+ try:
727
+ from src.db.init_db import session_factory
728
+ from src.db.schemas.models import AgentTemplate
729
+
730
+ db_session = session_factory()
731
+ try:
732
+ template = db_session.query(AgentTemplate).filter(
733
+ AgentTemplate.template_name == agent_name,
734
+ AgentTemplate.is_active == True
735
+ ).first()
736
+ return template is not None
737
+ finally:
738
+ db_session.close()
739
+ except Exception as e:
740
+ logger.log_message(f"Error checking if {agent_name} is template: {str(e)}", level=logging.ERROR)
741
+ return False
742
+
743
+ async def _execute_custom_agents(ai_system, agent_names: list, query: str):
744
+ """Execute custom agents using the session's AI system"""
745
+ try:
746
+ # For custom agents, we need to use the AI system's execute_agent method
747
+
749
+ if len(agent_names) == 1:
750
+ # Single custom agent
751
+ agent_name = agent_names[0]
752
+ # Prepare inputs for the custom agent (similar to standard agents like data_viz_agent)
753
+ dict_ = {}
754
+ dict_['dataset'] = ai_system.dataset.retrieve(query)[0].text
755
+ dict_['styling_index'] = ai_system.styling_index.retrieve(query)[0].text
756
+ dict_['goal'] = query
757
+ dict_['Agent_desc'] = str(ai_system.agent_desc)
758
+
759
+ # Get input fields for this agent
760
+ if agent_name in ai_system.agent_inputs:
761
+ inputs = {x: dict_[x] for x in ai_system.agent_inputs[agent_name] if x in dict_}
762
+
763
+ # Execute the custom agent
764
+ agent_name_result, result_dict = await ai_system.agents[agent_name](**inputs)
765
+ return {agent_name_result: result_dict}
766
+ else:
767
+ logger.log_message(f"Agent '{agent_name}' not found in ai_system.agent_inputs", level=logging.ERROR)
768
+ return {"error": f"Agent '{agent_name}' input configuration not found"}
769
+ else:
770
+ # Multiple agents - execute sequentially
771
+ results = {}
772
+ for agent_name in agent_names:
773
+ single_result = await _execute_custom_agents(ai_system, [agent_name], query)
774
+ results.update(single_result)
775
+ return results
776
+
777
+ except Exception as e:
778
+ logger.log_message(f"Error in _execute_custom_agents: {str(e)}", level=logging.ERROR)
779
+ return {"error": f"Error executing custom agents: {str(e)}"}
780
+
781
+ def _prepare_query_with_context(query: str, session_state: dict) -> str:
782
+ """Prepare the query with chat context from previous messages"""
783
+ chat_id = session_state.get("chat_id")
784
+ if not chat_id:
785
+ return query
786
+
787
+ # Get chat manager from app state
788
+ chat_manager = app.state._session_manager.chat_manager
789
+ # Get recent messages
790
+ recent_messages = chat_manager.get_recent_chat_history(chat_id, limit=MAX_RECENT_MESSAGES)
791
+ # Extract response history
792
+ chat_context = chat_manager.extract_response_history(recent_messages)
793
+
794
+ # Append context to the query if available
795
+ if chat_context:
796
+ return f"### Current Query:\n{query}\n\n{chat_context}"
797
+ return query
798
+
799
+
800
+ def _track_model_usage(session_state: dict, enhanced_query: str, response, processing_time_ms: int):
801
+ """Track model usage statistics in the database"""
802
+ try:
803
+ ai_manager = app.state.get_ai_manager()
804
+
805
+ # Get model configuration
806
+ model_config = session_state.get("model_config", DEFAULT_MODEL_CONFIG)
807
+ model_name = model_config.get("model", DEFAULT_MODEL_CONFIG["model"])
808
+ provider = ai_manager.get_provider_for_model(model_name)
809
+
810
+ # Calculate token usage
811
+ try:
812
+ # Try exact tokenization
813
+ prompt_tokens = len(ai_manager.tokenizer.encode(enhanced_query))
814
+ completion_tokens = len(ai_manager.tokenizer.encode(str(response)))
815
+ total_tokens = prompt_tokens + completion_tokens
816
+ except Exception as token_error:
817
+ # Fall back to estimation
818
+ logger.log_message(f"Tokenization error: {str(token_error)}", level=logging.WARNING)
819
+ prompt_words = len(enhanced_query.split())
820
+ completion_words = len(str(response).split())
821
+ prompt_tokens = int(prompt_words * DEFAULT_TOKEN_RATIO)
822
+ completion_tokens = int(completion_words * DEFAULT_TOKEN_RATIO)
823
+ total_tokens = prompt_tokens + completion_tokens
824
+
825
+ # Calculate cost
826
+ cost = ai_manager.calculate_cost(model_name, prompt_tokens, completion_tokens)
827
+
828
+ # Save usage to database
829
+ ai_manager.save_usage_to_db(
830
+ user_id=session_state.get("user_id"),
831
+ chat_id=session_state.get("chat_id"),
832
+ model_name=model_name,
833
+ provider=provider,
834
+ prompt_tokens=int(prompt_tokens),
835
+ completion_tokens=int(completion_tokens),
836
+ total_tokens=int(total_tokens),
837
+ query_size=len(enhanced_query),
838
+ response_size=len(str(response)),
839
+ cost=round(cost, 7),
840
+ request_time_ms=processing_time_ms,
841
+ is_streaming=False
842
+ )
843
+ except Exception as e:
844
+ # Log but don't fail the request if usage tracking fails
845
+ logger.log_message(f"Failed to track model usage: {str(e)}", level=logging.ERROR)
846
+
847
+
848
+ async def _generate_streaming_responses(session_state: dict, query: str, session_lm):
849
+ """Generate streaming responses for chat_with_all endpoint"""
850
+ overall_start_time = time.time()
851
+ total_response = ""
852
+ total_inputs = ""
853
+ usage_records = []
854
+
855
+ # Add chat context from previous messages
856
+ enhanced_query = _prepare_query_with_context(query, session_state)
857
+
858
+ # try:
859
+ # Get the plan - planner is now async, so we need to await it
860
+ plan_response = await session_state["ai_system"].get_plan(enhanced_query)
861
+
862
+ plan_description = format_response_to_markdown(
863
+ {"analytical_planner": plan_response},
864
+ dataframe=session_state["current_df"]
865
+ )
866
+
867
+ # Check if plan is valid
868
+ if plan_description == RESPONSE_ERROR_INVALID_QUERY:
869
+ yield json.dumps({
870
+ "agent": "Analytical Planner",
871
+ "content": plan_description,
872
+ "status": "error"
873
+ }) + "\n"
874
+ return
875
+
876
+ yield json.dumps({
877
+ "agent": "Analytical Planner",
878
+ "content": plan_description,
879
+ "status": "success" if plan_description else "error"
880
+ }) + "\n"
881
+
882
+ # Track planner usage
883
+ if session_state.get("user_id"):
884
+ planner_tokens = _estimate_tokens(ai_manager=app.state.ai_manager,
885
+ input_text=enhanced_query,
886
+ output_text=plan_description)
887
+
888
+ usage_records.append(_create_usage_record(
889
+ session_state=session_state,
890
+ model_name=session_state.get("model_config", DEFAULT_MODEL_CONFIG)["model"],
891
+ prompt_tokens=planner_tokens["prompt"],
892
+ completion_tokens=planner_tokens["completion"],
893
+ query_size=len(enhanced_query),
894
+ response_size=len(plan_description),
895
+ processing_time_ms=int((time.time() - overall_start_time) * 1000),
896
+ is_streaming=False
897
+ ))
898
+
899
+ logger.log_message(f"Plan response: {plan_response}", level=logging.INFO)
900
+ logger.log_message(f"Plan response type: {type(plan_response)}", level=logging.INFO)
901
+
911
+ # Execute the plan with well-managed concurrency
912
+ with dspy.context(lm=session_lm):
+
915
+ async for agent_name, inputs, response in session_state["ai_system"].execute_plan(enhanced_query, plan_response):
916
+
917
+ if agent_name == "plan_not_found":
918
+ yield json.dumps({
919
+ "agent": "Analytical Planner",
920
+ "content": "**No plan found**\n\nPlease try again with a different query or try using a different model.",
921
+ "status": "error"
922
+ }) + "\n"
923
+ return
924
+
925
+ if agent_name == "plan_not_formated_correctly":
926
+ yield json.dumps({
927
+ "agent": "Analytical Planner",
928
+ "content": "**Something went wrong while formatting the plan. Please retry the query.**",
929
+ "status": "error"
930
+ }) + "\n"
931
+ return
932
+
933
+
934
+ formatted_response = format_response_to_markdown(
+ {agent_name: response},
+ dataframe=session_state["current_df"]
+ )
+
+ # Handle agent errors before emitting a success chunk
+ if isinstance(response, dict) and "error" in response:
+ yield json.dumps({
+ "agent": agent_name,
+ "content": f"**Error in {agent_name}**: {response['error']}",
+ "status": "error"
+ }) + "\n"
+ continue  # Continue with the next agent instead of returning
+
+ if formatted_response == RESPONSE_ERROR_INVALID_QUERY:
+ yield json.dumps({
+ "agent": agent_name,
+ "content": formatted_response,
+ "status": "error"
+ }) + "\n"
+ continue  # Continue with the next agent instead of returning
+
+ # Send the response chunk
+ yield json.dumps({
+ "agent": agent_name.split("__")[0] if "__" in agent_name else agent_name,
+ "content": formatted_response,
+ "status": "success" if response else "error"
+ }) + "\n"
+
967
+ # Track agent usage for future batch DB write
968
+ if session_state.get("user_id"):
969
+ agent_tokens = _estimate_tokens(
970
+ ai_manager=app.state.ai_manager,
971
+ input_text=str(inputs),
972
+ output_text=str(response)
973
+ )
974
+
975
+ # Get appropriate model name for code combiner
976
+ if "code_combiner_agent" in agent_name and "__" in agent_name:
977
+ provider = agent_name.split("__")[1]
978
+ model_name = _get_model_name_for_provider(provider)
979
+ else:
980
+ model_name = session_state.get("model_config", DEFAULT_MODEL_CONFIG)["model"]
981
+
982
+ usage_records.append(_create_usage_record(
983
+ session_state=session_state,
984
+ model_name=model_name,
985
+ prompt_tokens=agent_tokens["prompt"],
986
+ completion_tokens=agent_tokens["completion"],
987
+ query_size=len(str(inputs)),
988
+ response_size=len(str(response)),
989
+ processing_time_ms=int((time.time() - overall_start_time) * 1000),
990
+ is_streaming=True
991
+ ))
992
+
1019
+ def _estimate_tokens(ai_manager, input_text: str, output_text: str) -> dict:
1020
+ """Estimate token counts, with fallback for tokenization errors"""
1021
+ try:
1022
+ # Try exact tokenization
1023
+ prompt_tokens = len(ai_manager.tokenizer.encode(input_text))
1024
+ completion_tokens = len(ai_manager.tokenizer.encode(output_text))
1025
+ except Exception:
1026
+ # Fall back to estimation
1027
+ prompt_words = len(input_text.split())
1028
+ completion_words = len(output_text.split())
1029
+ prompt_tokens = int(prompt_words * DEFAULT_TOKEN_RATIO)
1030
+ completion_tokens = int(completion_words * DEFAULT_TOKEN_RATIO)
1031
+
1032
+ return {
1033
+ "prompt": prompt_tokens,
1034
+ "completion": completion_tokens,
1035
+ "total": prompt_tokens + completion_tokens
1036
+ }
1037
+
1038
+
1039
+ def _create_usage_record(session_state: dict, model_name: str, prompt_tokens: int,
1040
+ completion_tokens: int, query_size: int, response_size: int,
1041
+ processing_time_ms: int, is_streaming: bool) -> dict:
1042
+ """Create a usage record for the database"""
1043
+ ai_manager = app.state.get_ai_manager()
1044
+ provider = ai_manager.get_provider_for_model(model_name)
1045
+ cost = ai_manager.calculate_cost(model_name, prompt_tokens, completion_tokens)
1046
+
1047
+ return {
1048
+ "user_id": session_state.get("user_id"),
1049
+ "chat_id": session_state.get("chat_id"),
1050
+ "model_name": model_name,
1051
+ "provider": provider,
1052
+ "prompt_tokens": int(prompt_tokens),
1053
+ "completion_tokens": int(completion_tokens),
1054
+ "total_tokens": int(prompt_tokens + completion_tokens),
1055
+ "query_size": query_size,
1056
+ "response_size": response_size,
1057
+ "cost": round(cost, 7),
1058
+ "request_time_ms": processing_time_ms,
1059
+ "is_streaming": is_streaming
1060
+ }
1061
+
1062
+
1063
+ def _get_model_name_for_provider(provider: str) -> str:
1064
+ """Get the model name for a provider"""
1065
+ provider_model_map = {
1066
+ "openai": "o3-mini",
1067
+ "anthropic": "claude-3-7-sonnet-latest",
1068
+ "gemini": "gemini-2.5-pro-preview-03-25"
1069
+ }
1070
+ return provider_model_map.get(provider, "o3-mini")
1071
+
1072
+
1073
+
1074
+ # Add an endpoint to list available agents
1075
+ @app.get("/agents", response_model=dict)
1076
+ async def list_agents(request: Request, session_id: str = Depends(get_session_id_dependency)):
1077
+ """Get all available agents (standard, template, and custom)"""
1078
+ session_state = app.state.get_session_state(session_id)
1079
+
1080
+ try:
1081
+ # Get all available agents from database and session
1082
+ available_agents_list = _get_available_agents_list(session_state)
1083
+
1084
+ # Categorize agents
1085
+ standard_agents = ["preprocessing_agent", "statistical_analytics_agent", "sk_learn_agent", "data_viz_agent"]
1086
+
1087
+ # Get template agents from database
1088
+ from src.db.init_db import session_factory
1089
+ from src.agents.agents import load_all_available_templates_from_db
1090
+
1091
+ db_session = session_factory()
1092
+ try:
1093
+ template_agents_dict = load_all_available_templates_from_db(db_session)
1094
+ # template_agents_dict is a dict with template_name as keys
1095
+ template_agents = [template_name for template_name in template_agents_dict.keys()
1096
+ if template_name not in standard_agents and template_name != 'basic_qa_agent']
1097
+ except Exception as e:
1098
+ logger.log_message(f"Error loading template agents in /agents endpoint: {str(e)}", level=logging.ERROR)
1099
+ template_agents = []
1100
+ finally:
1101
+ db_session.close()
1102
+
1103
+ # Get custom agents from session
1104
+ custom_agents = []
1105
+ if session_state and "ai_system" in session_state:
1106
+ ai_system = session_state["ai_system"]
1107
+ if hasattr(ai_system, 'agents'):
1108
+ custom_agents = [agent for agent in available_agents_list
1109
+ if agent not in standard_agents and agent not in template_agents]
1110
+
1111
+ # Ensure template agents are in the available list
1112
+ for template_agent in template_agents:
1113
+ if template_agent not in available_agents_list:
1114
+ available_agents_list.append(template_agent)
1115
+
1116
+ return {
1117
+ "available_agents": available_agents_list,
1118
+ "standard_agents": standard_agents,
1119
+ "template_agents": template_agents,
1120
+ "custom_agents": custom_agents
1121
+ }
1122
+ except Exception as e:
1123
+ logger.log_message(f"Error getting agents list: {str(e)}", level=logging.ERROR)
1124
+ raise HTTPException(status_code=500, detail=f"Error getting agents list: {str(e)}")
1125
+
1126
+ @app.get("/health", response_model=dict)
1127
+ async def health():
1128
+ return {"message": "API is healthy and running"}
1129
+
1130
+ @app.get("/")
1131
+ async def index():
1132
+ return {
1133
+ "title": "Welcome to the AI Analytics API",
1134
+ "message": "Explore our API for advanced analytics and visualization tools designed to empower your data-driven decisions.",
1135
+ "description": "Utilize our powerful agents and models to gain insights from your data effortlessly.",
1136
+ "colors": {
1137
+ "primary": "#007bff",
1138
+ "secondary": "#6c757d",
1139
+ "success": "#28a745",
1140
+ "danger": "#dc3545",
1141
+ },
1142
+ "features": [
1143
+ "Real-time data processing",
1144
+ "Customizable visualizations",
1145
+ "Seamless integration with various data sources",
1146
+ "User-friendly interface for easy navigation",
1147
+ "Custom Analytics",
1148
+ ],
1149
+ }
1150
+
1151
+ @app.post("/chat_history_name")
1152
+ async def chat_history_name(request: dict, session_id: str = Depends(get_session_id_dependency)):
1153
+ query = request.get("query")
1154
+ name = None
1155
+
1156
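+ # Note: chat titles are generated with a fixed lightweight model, independent of the session's configured LM.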
+ lm = dspy.LM(model="gpt-4o-mini", max_tokens=300, temperature=0.5)
1157
+
1158
+ with dspy.context(lm=lm):
1159
+ name = app.state.get_chat_history_name_agent()(query=str(query))
1160
+
1161
+ return {"name": name.name if name else "New Chat"}
1162
+
1163
+ @app.post("/deep_analysis_streaming")
1164
+ async def deep_analysis_streaming(
1165
+ request: DeepAnalysisRequest,
1166
+ request_obj: Request,
1167
+ session_id: str = Depends(get_session_id_dependency)
1168
+ ):
1169
+ """Perform streaming deep analysis with real-time updates"""
1170
+ session_state = app.state.get_session_state(session_id)
1171
+
1172
+ try:
1173
+ # Extract and validate query parameters
1174
+ _update_session_from_query_params(request_obj, session_state)
1175
+
1176
+ # Validate dataset
1177
+ if session_state["current_df"] is None:
1178
+ raise HTTPException(status_code=400, detail=RESPONSE_ERROR_NO_DATASET)
1179
+
1180
+ # Get user_id from session state (if available)
1181
+ user_id = session_state.get("user_id")
1182
+
1183
+ # Generate a UUID for this report
1184
+ import uuid
1185
+ report_uuid = str(uuid.uuid4())
1186
+
1187
+ # Create initial pending report in the database
1188
+ try:
1189
+ from src.db.init_db import session_factory
1190
+ from src.db.schemas.models import DeepAnalysisReport
1191
+
1192
+ db_session = session_factory()
1193
+
1194
+ try:
1195
+ # Create a pending report entry
1196
+ new_report = DeepAnalysisReport(
1197
+ report_uuid=report_uuid,
1198
+ user_id=user_id,
1199
+ goal=request.goal,
1200
+ status="pending",
1201
+ start_time=datetime.now(UTC),
1202
+ progress_percentage=0
1203
+ )
1204
+
1205
+ db_session.add(new_report)
1206
+ db_session.commit()
1207
+ db_session.refresh(new_report)
1208
+
1209
+ # Store the report ID in session state for later updates
1210
+ session_state["current_deep_analysis_id"] = new_report.report_id
1211
+ session_state["current_deep_analysis_uuid"] = report_uuid
1212
+
1213
+ except Exception as e:
1214
+ logger.log_message(f"Error creating initial deep analysis report: {str(e)}", level=logging.ERROR)
1215
+ # Continue even if DB storage fails
1216
+ finally:
1217
+ db_session.close()
1218
+
1219
+ except Exception as e:
1220
+ logger.log_message(f"Database operation failed: {str(e)}", level=logging.ERROR)
1221
+ # Continue even if DB operation fails
1222
+
1223
+ # Get session-specific model
1224
+ # session_lm = get_session_lm(session_state)
1225
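+ # Note: deep analysis currently pins a specific Anthropic model rather than using the session-configured LM above.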
+ session_lm = dspy.LM(model="anthropic/claude-sonnet-4-20250514", max_tokens=7000, temperature=0.5)
1226
+
1227
+ return StreamingResponse(
1228
+ _generate_deep_analysis_stream(session_state, request.goal, session_lm, session_id),
1229
+ media_type='text/event-stream',
1230
+ headers={
1231
+ 'Cache-Control': 'no-cache',
1232
+ 'Connection': 'keep-alive',
1233
+ 'Content-Type': 'text/event-stream',
1234
+ 'Access-Control-Allow-Origin': '*',
1235
+ 'X-Accel-Buffering': 'no'
1236
+ }
1237
+ )
1238
+
1239
+ except HTTPException:
1240
+ raise
1241
+ except Exception as e:
1242
+ logger.log_message(f"Streaming deep analysis failed: {str(e)}", level=logging.ERROR)
1243
+ raise HTTPException(status_code=500, detail=f"Streaming deep analysis failed: {str(e)}")
1244
+
1245
+ async def _generate_deep_analysis_stream(session_state: dict, goal: str, session_lm, session_id: str):
1246
+ """Generate streaming responses for deep analysis"""
1247
+ # Track the start time for duration calculation
1248
+ start_time = datetime.now(UTC)
1249
+
1250
+ try:
1251
+ # Get dataset info
1252
+ df = session_state["current_df"]
1253
+ dtypes_info = pd.DataFrame({
1254
+ 'Column': df.columns,
1255
+ 'Data Type': df.dtypes.astype(str)
1256
+ }).to_markdown()
1257
+ dataset_info = f"Sample Data:\n{df.head(2).to_markdown()}\n\nData Types:\n{dtypes_info}"
1258
+
1259
+ # Get report info from session state
1260
+ report_id = session_state.get("current_deep_analysis_id")
1261
+ report_uuid = session_state.get("current_deep_analysis_uuid")
1262
+ user_id = session_state.get("user_id")
1263
+
1264
+ # Helper function to update report in database
1265
+ async def update_report_in_db(status, progress, step=None, content=None):
1266
+ if not report_id:
1267
+ return
1268
+
1269
+ try:
1270
+ from src.db.init_db import session_factory
1271
+ from src.db.schemas.models import DeepAnalysisReport
1272
+
1273
+ db_session = session_factory()
1274
+
1275
+ try:
1276
+ report = db_session.query(DeepAnalysisReport).filter(DeepAnalysisReport.report_id == report_id).first()
1277
+
1278
+ if report:
1279
+ report.status = status
1280
+ report.progress_percentage = progress
1281
+
1282
+ # Update step-specific fields if provided
1283
+ if step == "questions" and content:
1284
+ report.deep_questions = content
1285
+ elif step == "planning" and content:
1286
+ report.deep_plan = content
1287
+ elif step == "analysis" and content:
1288
+ # For analysis step, we get the full object with multiple fields
1289
+ if isinstance(content, dict):
1290
+ # Update fields from content if they exist
1291
+ if "deep_questions" in content and content["deep_questions"]:
1292
+ report.deep_questions = content["deep_questions"]
1293
+ if "deep_plan" in content and content["deep_plan"]:
1294
+ report.deep_plan = content["deep_plan"]
1295
+ if "code" in content and content["code"]:
1296
+ report.analysis_code = content["code"]
1297
+ if "final_conclusion" in content and content["final_conclusion"]:
1298
+ report.final_conclusion = content["final_conclusion"]
1299
+ # Also update summary from conclusion
1300
+ conclusion = content["final_conclusion"]
1301
+ conclusion = conclusion.replace("**Conclusion**", "")
1302
+ report.report_summary = conclusion[:200] + "..." if len(conclusion) > 200 else conclusion
1303
+
1304
+ # Handle JSON fields
1305
+ if "summaries" in content and content["summaries"]:
1306
+ report.summaries = json.dumps(content["summaries"])
1307
+ if "plotly_figs" in content and content["plotly_figs"]:
1308
+ report.plotly_figures = json.dumps(content["plotly_figs"])
1309
+ if "synthesis" in content and content["synthesis"]:
1310
+ report.synthesis = json.dumps(content["synthesis"])
1311
+
1312
+ # For the final step, update the HTML report
1313
+ if step == "completed":
1314
+ if content:
1315
+ report.html_report = content
1316
+ else:
1317
+ logger.log_message("No HTML content provided for completed step", level=logging.WARNING)
1318
+
1319
+ report.end_time = datetime.now(UTC)
1320
+ # Ensure start_time is timezone-aware before calculating duration
1321
+ if report.start_time.tzinfo is None:
1322
+ start_time_utc = report.start_time.replace(tzinfo=UTC)
1323
+ else:
1324
+ start_time_utc = report.start_time
1325
+ report.duration_seconds = int((report.end_time - start_time_utc).total_seconds())
1326
+
1327
+ report.updated_at = datetime.now(UTC)
1328
+ db_session.commit()
1329
+
1330
+ except Exception as e:
1331
+ db_session.rollback()
1332
+ logger.log_message(f"Error updating deep analysis report: {str(e)}", level=logging.ERROR)
1333
+ finally:
1334
+ db_session.close()
1335
+ except Exception as e:
1336
+ logger.log_message(f"Database operation failed: {str(e)}", level=logging.ERROR)
1337
+
1338
+ # Use session model for this request
1339
+ with dspy.context(lm=session_lm):
1340
+ # Send initial status
1341
+ yield json.dumps({
1342
+ "step": "initialization",
1343
+ "status": "starting",
1344
+ "message": "Initializing deep analysis...",
1345
+ "progress": 5
1346
+ }) + "\n"
1347
+
1348
+ # Update DB status to running
1349
+ await update_report_in_db("running", 5)
1350
+
1351
+ # Get deep analyzer - use the correct session_id from the session_state
1352
+ logger.log_message(f"Getting deep analyzer for session_id: {session_id}, user_id: {user_id}", level=logging.INFO)
1353
+ deep_analyzer = app.state.get_deep_analyzer(session_id)
1354
+
1355
+ # Make the dataset available globally for code execution
1356
+ globals()['df'] = df
1357
+
1358
+ # Use the new streaming method and forward all progress updates
1359
+ final_result = None
1360
+ async for update in deep_analyzer.execute_deep_analysis_streaming(
1361
+ goal=goal,
1362
+ dataset_info=dataset_info,
1363
+ session_df=df
1364
+ ):
1365
+ # Convert the update to the expected format and yield it
1366
+ if update.get("step") == "questions" and update.get("status") == "completed":
1367
+ # Update DB with questions
1368
+ await update_report_in_db("running", update.get("progress", 0), "questions", update.get("content"))
1369
+ elif update.get("step") == "planning" and update.get("status") == "completed":
1370
+ # Update DB with planning
1371
+ await update_report_in_db("running", update.get("progress", 0), "planning", update.get("content"))
1372
+ elif update.get("step") == "conclusion" and update.get("status") == "completed":
1373
+ # Store the final result for later processing
1374
+ final_result = update.get("final_result")
1375
+
1376
+ # Convert Plotly figures to JSON format for network transmission
1377
+ if final_result:
1378
+ import plotly.io
1379
+ serialized_return_dict = final_result.copy()
1380
+
1381
+ # Convert plotly_figs to JSON format
1382
+ if 'plotly_figs' in serialized_return_dict and serialized_return_dict['plotly_figs']:
1383
+ json_figs = []
1384
+ for fig_list in serialized_return_dict['plotly_figs']:
1385
+ if isinstance(fig_list, list):
1386
+ json_fig_list = []
1387
+ for fig in fig_list:
1388
+ if hasattr(fig, 'to_json'): # Check if it's a Plotly figure
1389
+ json_fig_list.append(plotly.io.to_json(fig))
1390
+ else:
1391
+ json_fig_list.append(fig) # Already JSON or other format
1392
+ json_figs.append(json_fig_list)
1393
+ else:
1394
+ # Single figure case
1395
+ if hasattr(fig_list, 'to_json'):
1396
+ json_figs.append(plotly.io.to_json(fig_list))
1397
+ else:
1398
+ json_figs.append(fig_list)
1399
+ serialized_return_dict['plotly_figs'] = json_figs
1400
+
1401
+ # Update DB with analysis results
1402
+ await update_report_in_db("running", update.get("progress", 0), "analysis", serialized_return_dict)
1403
+
1404
+ # Generate HTML report using the original final_result with Figure objects
1405
+ html_report = None
1406
+ try:
1407
+ html_report = generate_html_report(final_result)
1408
+ except Exception as e:
1409
+ logger.log_message(f"Error generating HTML report: {str(e)}", level=logging.ERROR)
1410
+ # Continue even if HTML generation fails
1411
+
1412
+ # Send the analysis results
1413
+ yield json.dumps({
1414
+ "step": "analysis",
1415
+ "status": "completed",
1416
+ "content": serialized_return_dict,
1417
+ "progress": 90
1418
+ }) + "\n"
1419
+
1420
+ # Send report generation status
1421
+ yield json.dumps({
1422
+ "step": "report",
1423
+ "status": "processing",
1424
+ "message": "Generating final report...",
1425
+ "progress": 95
1426
+ }) + "\n"
1427
+
1428
+ # Send final completion
1429
+ yield json.dumps({
1430
+ "step": "completed",
1431
+ "status": "success",
1432
+ "analysis": serialized_return_dict,
1433
+ "html_report": html_report,
1434
+ "progress": 100
1435
+ }) + "\n"
1436
+
1437
+ # Update DB with completed report (with HTML if generated)
1438
+ if html_report:
1439
+ logger.log_message(f"Saving HTML report to database, length: {len(html_report)}", level=logging.INFO)
1440
+ else:
1441
+ logger.log_message("No HTML report to save to database", level=logging.WARNING)
1442
+ await update_report_in_db("completed", 100, "completed", html_report)
1443
+ elif update.get("step") == "error":
1444
+ # Forward error directly
1445
+ yield json.dumps(update) + "\n"
1446
+ await update_report_in_db("failed", 0)
1447
+ return
1448
+ else:
1449
+ # Forward all other progress updates
1450
+ yield json.dumps(update) + "\n"
1451
+
1452
+ # If we somehow exit the loop without getting a final result, that's an error
1453
+ if not final_result:
1454
+ yield json.dumps({
1455
+ "step": "error",
1456
+ "status": "failed",
1457
+ "message": "Deep analysis completed without final result",
1458
+ "progress": 0
1459
+ }) + "\n"
1460
+ await update_report_in_db("failed", 0)
1461
+
1462
+ except Exception as e:
1463
+ logger.log_message(f"Error in deep analysis stream: {str(e)}", level=logging.ERROR)
1464
+ yield json.dumps({
1465
+ "step": "error",
1466
+ "status": "failed",
1467
+ "message": f"Deep analysis failed: {str(e)}",
1468
+ "progress": 0
1469
+ }) + "\n"
1470
+
1471
+ # Update DB with error status
1472
+ if 'update_report_in_db' in locals() and session_state.get("current_deep_analysis_id"):
1473
+ await update_report_in_db("failed", 0)
1474
+
1475
+ @app.post("/deep_analysis/download_report")
1476
+ async def download_html_report(
1477
+ request: dict,
1478
+ session_id: str = Depends(get_session_id_dependency)
1479
+ ):
1480
+ """Download HTML report from previous deep analysis"""
1481
+ try:
1482
+ analysis_data = request.get("analysis_data")
1483
+ if not analysis_data:
1484
+ raise HTTPException(status_code=400, detail="No analysis data provided")
1485
+
1486
+ # Get report UUID from request if available (for saving to DB)
1487
+ report_uuid = request.get("report_uuid")
1488
+ session_state = app.state.get_session_state(session_id)
1489
+
1490
+ # If no report_uuid in request, try to get it from session state
1491
+ if not report_uuid and session_state.get("current_deep_analysis_uuid"):
1492
+ report_uuid = session_state.get("current_deep_analysis_uuid")
1493
+
1494
+ # Convert JSON-serialized Plotly figures back to Figure objects for HTML generation
1495
+ processed_data = analysis_data.copy()
1496
+
1497
+ if 'plotly_figs' in processed_data and processed_data['plotly_figs']:
1498
+ import plotly.io
1499
+ import plotly.graph_objects as go
1500
+
1501
+ figure_objects = []
1502
+ for fig_list in processed_data['plotly_figs']:
1503
+ if isinstance(fig_list, list):
1504
+ fig_obj_list = []
1505
+ for fig_json in fig_list:
1506
+ if isinstance(fig_json, str):
1507
+ # Convert JSON string back to Figure object
1508
+ try:
1509
+ fig_obj = plotly.io.from_json(fig_json)
1510
+ fig_obj_list.append(fig_obj)
1511
+ except Exception as e:
1512
+ logger.log_message(f"Error parsing Plotly JSON: {str(e)}", level=logging.WARNING)
1513
+ continue
1514
+ elif hasattr(fig_json, 'to_html'):
1515
+ # Already a Figure object
1516
+ fig_obj_list.append(fig_json)
1517
+ figure_objects.append(fig_obj_list)
1518
+ else:
1519
+ # Single figure case
1520
+ if isinstance(fig_list, str):
1521
+ try:
1522
+ fig_obj = plotly.io.from_json(fig_list)
1523
+ figure_objects.append(fig_obj)
1524
+ except Exception as e:
1525
+ logger.log_message(f"Error parsing Plotly JSON: {str(e)}", level=logging.WARNING)
1526
+ continue
1527
+ elif hasattr(fig_list, 'to_html'):
1528
+ figure_objects.append(fig_list)
1529
+
1530
+ processed_data['plotly_figs'] = figure_objects
1531
+
1532
+ # Generate HTML report
1533
+ html_report = generate_html_report(processed_data)
1534
+
1535
+ # Save report to database if we have a UUID
1536
+ if report_uuid:
1537
+ try:
1538
+ from src.db.init_db import session_factory
1539
+ from src.db.schemas.models import DeepAnalysisReport
1540
+
1541
+ db_session = session_factory()
1542
+ try:
1543
+ # Try to find existing report by UUID
1544
+ report = db_session.query(DeepAnalysisReport).filter(DeepAnalysisReport.report_uuid == report_uuid).first()
1545
+
1546
+ if report:
1547
+ # Update existing report with HTML content
1548
+ report.html_report = html_report
1549
+ report.updated_at = datetime.now(UTC)
1550
+ db_session.commit()
1551
+ except Exception as e:
1552
+ db_session.rollback()
1553
+ finally:
1554
+ db_session.close()
1555
+ except Exception as e:
1556
+ logger.log_message(f"Database operation failed when storing HTML report: {str(e)}", level=logging.ERROR)
1557
+ # Continue even if DB storage fails
1558
+
1559
+ # Create a filename with timestamp
1560
+ timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
1561
+ filename = f"deep_analysis_report_{timestamp}.html"
1562
+
1563
+ # Return as downloadable file
1564
+ return StreamingResponse(
1565
+ iter([html_report.encode('utf-8')]),
1566
+ media_type='text/html',
1567
+ headers={
1568
+ 'Content-Disposition': f'attachment; filename="{filename}"',
1569
+ 'Content-Type': 'text/html; charset=utf-8'
1570
+ }
1571
+ )
1572
+
1573
+ except Exception as e:
1574
+ logger.log_message(f"Failed to generate HTML report: {str(e)}", level=logging.ERROR)
1575
+ raise HTTPException(status_code=500, detail=f"Failed to generate report: {str(e)}")
1576
+
1577
+
1578
+ # In the section where routers are included, add the session_router
1579
+ app.include_router(chat_router)
1580
+ app.include_router(analytics_router)
1581
+ app.include_router(code_router)
1582
+ app.include_router(session_router)
1583
+ app.include_router(feedback_router)
1584
+ app.include_router(deep_analysis_router)
1585
+ app.include_router(templates_router)
1586
+
1587
+ if __name__ == "__main__":
1588
+ port = int(os.environ.get("PORT", 8000))
1589
+ uvicorn.run(app, host="0.0.0.0", port=port)
docs/README.md ADDED
@@ -0,0 +1,251 @@
 
1
+ # Auto-Analyst Backend Documentation
2
+
3
+ This directory contains comprehensive documentation for the Auto-Analyst backend - a sophisticated multi-agent AI platform for data analysis built with FastAPI, DSPy, and modern Python technologies.
4
+
5
+ ## 📁 Documentation Structure
6
+
7
+ ### **🏗️ Architecture** (`/architecture/`)
8
+ - **[System Architecture](./architecture/architecture.md)** - Comprehensive overview of backend system design, components, and data flow patterns
9
+
10
+ ### **🚀 Development** (`/development/`)
11
+ - **[Development Workflow](./development/development_workflow.md)** - Complete development guide with patterns, best practices, and code organization principles
12
+
13
+ ### **🔧 System** (`/system/`)
14
+ - **[Database Schema](./system/database-schema.md)** - Complete database schema with all tables, relationships, and performance optimization
15
+ - **[Shared DataFrame System](./system/shared_dataframe.md)** - Inter-agent data sharing and session management
16
+
17
+ ### **🌐 API** (`/api/`)
18
+ - **[API Endpoints Overview](./api/README.md)** - Main API reference hub
19
+ - **[Route Documentation](./api/routes/)** - Detailed endpoint documentation:
20
+ - **[Core Routes](./api/routes/session.md)** - File uploads, sessions, authentication
21
+ - **[Chat Routes](./api/routes/chats.md)** - Chat and messaging endpoints
22
+ - **[Code Routes](./api/routes/code.md)** - Code execution and processing
23
+ - **[Analytics Routes](./api/routes/analytics.md)** - Usage analytics and monitoring
24
+ - **[Deep Analysis Routes](./api/routes/deep_analysis.md)** - Multi-agent analysis system
25
+ - **[Template Routes](./api/routes/templates.md)** - Agent template management
26
+ - **[Feedback Routes](./api/routes/feedback.md)** - User feedback and rating system
27
+
28
+ ### **🐛 Troubleshooting** (`/troubleshooting/`)
29
+ - **[Troubleshooting Guide](./troubleshooting/troubleshooting.md)** - Common issues, debugging tools, and solutions
30
+
31
+ ## 🎯 Backend Overview
32
+
33
+ ### **Tech Stack**
34
+ - **FastAPI** - Modern async Python web framework
35
+ - **DSPy** - AI agent orchestration and LLM integration
36
+ - **SQLAlchemy** - Database ORM with PostgreSQL/SQLite support
37
+ - **Plotly** - Interactive data visualizations
38
+ - **Pandas/NumPy** - Data manipulation and analysis
39
+ - **Scikit-learn** - Machine learning models
40
+ - **Statsmodels** - Statistical analysis
41
+
42
+ ### **Core Features**
43
+ - **Multi-Agent System** - 4+ specialized AI agents for different analysis tasks
44
+ - **Template System** - User-customizable agent configurations
45
+ - **Deep Analysis** - Multi-step analytical workflows with streaming progress
46
+ - **Session Management** - Stateful user sessions with shared data context
47
+ - **Code Execution** - Safe Python code execution environment
48
+ - **Real-time Streaming** - WebSocket support for live analysis updates
49
+
50
+ ### **Agent Types**
51
+ 1. **Data Preprocessing Agent** - Data cleaning and preparation
52
+ 2. **Statistical Analytics Agent** - Statistical analysis using statsmodels
53
+ 3. **Machine Learning Agent** - ML modeling with scikit-learn
54
+ 4. **Data Visualization Agent** - Interactive charts with Plotly
55
+ 5. **Feature Engineering Agent** (Premium) - Advanced feature creation
56
+ 6. **Polars Agent** (Premium) - High-performance data processing
57
+
58
+ ## 🚀 Quick Start Guide
59
+
60
+ ### **1. Environment Setup**
61
+
62
+ ```bash
63
+ # Navigate to backend directory
64
+ cd Auto-Analyst-CS/auto-analyst-backend
65
+
66
+ # Create virtual environment
67
+ python -m venv venv
68
+ source venv/bin/activate # Linux/Mac
69
+ venv\Scripts\activate # Windows
70
+
71
+ # Install dependencies
72
+ pip install -r requirements.txt
73
+ ```
74
+
75
+ ### **2. Environment Configuration**
76
+
77
+ Create `.env` file with required variables:
78
+
79
+ ```env
80
+ # Database Configuration
81
+ DATABASE_URL=sqlite:///./chat_database.db
82
+
83
+ # AI Model Configuration
84
+ OPENAI_API_KEY=your-openai-api-key
85
+ MODEL_PROVIDER=openai # openai, anthropic, groq, gemini
86
+ MODEL_NAME=gpt-4o-mini
87
+ TEMPERATURE=0.7
88
+ MAX_TOKENS=6000
89
+
90
+ # Optional: Additional AI Providers
91
+ ANTHROPIC_API_KEY=your-anthropic-key
92
+ GROQ_API_KEY=your-groq-key
93
+ GEMINI_API_KEY=your-gemini-key
94
+
95
+ # Security
96
+ ADMIN_API_KEY=your-admin-key
97
+
98
+ # Application Settings
99
+ ENVIRONMENT=development
100
+ FRONTEND_URL=http://localhost:3000/
101
+ ```
102
+
103
+ ### **3. Database Initialization**
104
+
105
+ ```bash
106
+ # Initialize database and default agents
107
+ python -c "
108
+ from src.db.init_db import init_db
109
+ init_db()
110
+ print('✅ Database and agents initialized successfully')
111
+ "
112
+ ```
113
+
114
+ ### **4. Start Development Server**
115
+
116
+ ```bash
117
+ # Start the FastAPI server
118
+ python -m app
119
+
120
+ # Or with uvicorn for more control
121
+ uvicorn app:app --reload --host 0.0.0.0 --port 8000
122
+ ```
123
+
124
+ ### **5. Verify Installation**
125
+
126
+ - **API Documentation**: `http://localhost:8000/docs`
127
+ - **Health Check**: `http://localhost:8000/health`
128
+
129
+ ## 🔧 Development Workflow
130
+
131
+ ### **Adding New Agents**
132
+
133
+ 1. **Define Agent Signature** in `src/agents/agents.py` (see the sketch below this list)
134
+ 2. **Add Configuration** to `agents_config.json`
135
+ 3. **Register Agent** in loading system
136
+ 4. **Test Integration** with multi-agent pipeline
137
+
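+ A minimal signature sketch (a hypothetical agent; the `goal`/`dataset`/`code` field names mirror the agent-testing example later in this guide, not necessarily the exact schema in `src/agents/agents.py`):
+
+ ```python
+ import dspy
+
+ class outlier_detection_agent(dspy.Signature):
+     """Flags outliers in the dataset and emits cleaning code."""
+     goal = dspy.InputField(desc="user-defined analysis goal")
+     dataset = dspy.InputField(desc="description of columns and dtypes")
+     code = dspy.OutputField(desc="Python code that flags outliers")
+ ```
+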
138
+ ### **Adding New API Endpoints**
139
+
140
+ 1. **Create Route File** in `src/routes/` (see the sketch below this list)
141
+ 2. **Define Pydantic Models** for request/response
142
+ 3. **Implement Endpoints** with proper error handling
143
+ 4. **Register Router** in `app.py`
144
+ 5. **Update Documentation**
145
+
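+ A minimal sketch of steps 1-4 (the router, model, and endpoint names are illustrative, not existing project files):
+
+ ```python
+ from fastapi import APIRouter
+ from pydantic import BaseModel
+
+ example_router = APIRouter(prefix="/example", tags=["example"])
+
+ class EchoRequest(BaseModel):
+     text: str
+
+ @example_router.post("/echo")
+ async def echo(req: EchoRequest):
+     # Pydantic has already validated the payload at this point
+     return {"echo": req.text}
+
+ # Step 4: register the router in app.py
+ # app.include_router(example_router)
+ ```
+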
146
+ ### **Database Changes**
147
+
148
+ 1. **Modify Models** in `src/db/schemas/models.py`
149
+ 2. **Create Migration**: `alembic revision --autogenerate -m "description"`
150
+ 3. **Apply Migration**: `alembic upgrade head`
151
+ 4. **Update Documentation**
152
+
153
+ ## 📊 System Architecture
154
+
155
+ ### **Request Processing Flow**
156
+ ```
157
+ HTTP Request → FastAPI Router → Route Handler → Business Logic →
158
+ Database/Agent System → AI Model → Response Processing → JSON Response
159
+ ```
160
+
161
+ ### **Agent Execution Flow**
162
+ ```
163
+ User Query → Session Manager → Agent Selection → Context Preparation →
164
+ DSPy Chain → AI Model → Code Generation → Execution → Response Formatting
165
+ ```
166
+
167
+ ### **Deep Analysis Workflow**
168
+ ```
169
+ Goal Input → Question Generation → Planning → Multi-Agent Execution →
170
+ Code Synthesis → Result Compilation → HTML Report Generation
171
+ ```
172
+
173
+ ## 🧪 Testing & Validation
174
+
175
+ ### **API Testing**
176
+ ```bash
177
+ # Interactive documentation
178
+ open http://localhost:8000/docs
179
+
180
+ # cURL examples
181
+ curl -X GET "http://localhost:8000/health"
182
+ curl -X POST "http://localhost:8000/chat/preprocessing_agent" \
183
+ -H "Content-Type: application/json" \
184
+ -d '{"query": "Clean this dataset", "session_id": "test"}'
185
+ ```
186
+
187
+ ### **Agent Testing**
188
+ ```python
189
+ # Test individual agents
190
+ from src.agents.agents import preprocessing_agent
191
+ import dspy
192
+
193
+ # Configure DSPy
194
+ lm = dspy.LM('openai/gpt-4o-mini', api_key='your-key')
195
+ dspy.configure(lm=lm)
196
+
197
+ # Test agent
198
+ agent = dspy.ChainOfThought(preprocessing_agent)
199
+ result = agent(goal='clean data', dataset='test dataset')
200
+ print(result)
201
+ ```
202
+
203
+ ## 🔒 Security & Production
204
+
205
+ ### **Security Features**
206
+ - **Session-based authentication** with secure session management
207
+ - **API key protection** for admin endpoints
208
+ - **Input validation** using Pydantic models
209
+ - **Error handling** with proper HTTP status codes
210
+ - **CORS configuration** for frontend integration
211
+
212
+ ### **Production Considerations**
213
+ - **PostgreSQL database** for production deployment
214
+ - **Environment variable management** for secrets
215
+ - **Logging configuration** for monitoring
216
+ - **Rate limiting** for API protection
217
+ - **Performance optimization** for large datasets
218
+
219
+ ## 📈 Monitoring & Analytics
220
+
221
+ The backend includes comprehensive analytics for:
222
+ - **Usage tracking** - API endpoint usage and performance
223
+ - **Model usage** - AI model consumption and costs
224
+ - **User analytics** - User behavior and engagement
225
+ - **Error monitoring** - System health and error tracking
226
+ - **Performance metrics** - Response times and throughput
227
+
228
+ ## 🤝 Contributing
229
+
230
+ 1. **Follow coding standards** defined in development workflow
231
+ 2. **Add comprehensive tests** for new features
232
+ 3. **Update documentation** for all changes
233
+ 4. **Use proper error handling** patterns
234
+ 5. **Submit detailed pull requests** with clear descriptions
235
+
236
+ ---
237
+
238
+ ## 📖 Detailed Documentation
239
+
240
+ For specific implementation details, refer to the organized documentation in each subdirectory:
241
+
242
+ - **[Getting Started Guide](./getting_started.md)** - Complete setup walkthrough
243
+ - **[Architecture Documentation](./architecture/)** - System design and components
244
+ - **[Development Guides](./development/)** - Workflow and best practices
245
+ - **[API Reference](./api/)** - Complete endpoint documentation
246
+ - **[System Documentation](./system/)** - Database and core systems
247
+ - **[Troubleshooting](./troubleshooting/)** - Debugging and solutions
248
+
249
+ ---
250
+
251
+ **Need help?** Check the troubleshooting guide or refer to the comprehensive documentation in each section.
docs/api/README.md ADDED
@@ -0,0 +1,23 @@
 
1
+ # Auto-Analyst Backend API Documentation
2
+
3
+ This document is a guide to the backend API endpoints used by the Auto-Analyst application: how each endpoint handles requests, transforms data, and structures its responses.
4
+
5
+ Each linked page below documents one functional area of the API, so developers can quickly find request formats, parameters, and response schemas.
6
+
7
+ For details on each functional area, refer to the sections below:
8
+
9
+ ## 📚 Core Documentation
10
+
11
+ - **[Getting Started Guide](../getting_started.md)**: Quick start guide for new developers and LLMs to understand the system architecture and get up to speed quickly
12
+ - **[System Architecture](../architecture/architecture.md)**: Comprehensive overview of the backend system design, components, and data flow patterns
13
+ - **[Troubleshooting Guide](../troubleshooting/troubleshooting.md)**: Common issues, debugging tools, and solutions for development and deployment problems
14
+
15
+ ## 🛠️ API Reference
16
+
17
+ - **[Core Endpoints](./routes/session.md)**: Review the core endpoints that handle fundamental operations within the application, including data uploads, AI analysis, model settings, and session management.
18
+ - **[Analytics Endpoints](./routes/analytics.md)**: Explore the endpoints dedicated to analytics, providing insights into usage statistics, performance metrics, cost analysis, and real-time monitoring.
19
+ - **[Chat Endpoints](./routes/chats.md)**: Discover the endpoints that manage chat interactions, enabling users to create, retrieve, and manage chat sessions effectively.
20
+ - **[Code Endpoints](./routes/code.md)**: Learn about the endpoints for code execution, editing, fixing, and cleaning operations with advanced AI assistance.
21
+ - **[Deep Analysis Endpoints](./routes/deep_analysis.md)**: Comprehensive documentation for the multi-agent deep analysis system, including streaming progress, report management, template integration, and how user's active agents are leveraged for advanced analytical insights.
22
+ - **[Feedback Endpoints](./routes/feedback.md)**: Understand the endpoints for managing user feedback on AI-generated messages, including rating systems and model performance tracking.
23
+ - **[Templates Endpoints](./routes/templates.md)**: Comprehensive guide to the template system, agent loading, user preferences, and how personalized AI agent configurations work for different users.
docs/api/routes/analytics.md ADDED
@@ -0,0 +1,562 @@
 
1
+ # Analytics Routes Documentation
2
+
3
+ These routes provide comprehensive analytics functionality for the Auto-Analyst backend, including dashboard summaries, user analytics, model performance metrics, cost analysis, and system monitoring.
4
+
5
+ ## Authentication
6
+
7
+ All analytics endpoints require admin authentication via API key:
8
+
9
+ ```python
10
+ ADMIN_API_KEY = os.getenv("ADMIN_API_KEY", "default-admin-key-change-me")
11
+ ```
12
+
13
+ The API key can be provided in either of two ways (see the example request below):
14
+ - **Header:** `X-Admin-API-Key`
15
+ - **Query parameter:** `admin_api_key`
16
+
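+ For example, a dashboard request using the header method (a sketch assuming a local deployment and the third-party `requests` library):
+
+ ```python
+ import requests
+
+ resp = requests.get(
+     "http://localhost:8000/analytics/dashboard",  # hypothetical base URL
+     headers={"X-Admin-API-Key": "your-admin-key"},
+     params={"period": "7d"},
+ )
+ resp.raise_for_status()
+ print(resp.json()["total_requests"])
+ ```
+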
17
+ ---
18
+
19
+ ## Dashboard Endpoints
20
+
21
+ ### **GET /analytics/dashboard**
22
+ Returns comprehensive dashboard data combining usage statistics, model performance, and user activity.
23
+
24
+ **Query Parameters:**
25
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
26
+
27
+ **Response:**
28
+ ```json
29
+ {
30
+ "total_tokens": 123456,
31
+ "total_cost": 25.50,
32
+ "total_requests": 1000,
33
+ "total_users": 50,
34
+ "daily_usage": [
35
+ {
36
+ "date": "2023-05-01",
37
+ "tokens": 5000,
38
+ "cost": 1.25,
39
+ "requests": 100
40
+ }
41
+ ],
42
+ "model_usage": [
43
+ {
44
+ "model_name": "claude-3-sonnet-20241022",
45
+ "tokens": 10000,
46
+ "cost": 10.00,
47
+ "requests": 200
48
+ }
49
+ ],
50
+ "top_users": [
51
+ {
52
+ "user_id": "123",
53
+ "tokens": 5000,
54
+ "cost": 5.00,
55
+ "requests": 50
56
+ }
57
+ ],
58
+ "start_date": "2023-04-01",
59
+ "end_date": "2023-05-01"
60
+ }
61
+ ```
62
+
63
+ ### **WebSocket /analytics/dashboard/realtime**
64
+ WebSocket endpoint for real-time dashboard updates. Accepts connections and maintains them for broadcasting live data updates.
65
+
66
+ ---
67
+
68
+ ## User Analytics Endpoints
69
+
70
+ ### **GET /analytics/users**
71
+ Returns user list with usage statistics from the past 7 days.
72
+
73
+ **Query Parameters:**
74
+ - `limit` (optional): Maximum users to return (default: `100`)
75
+ - `offset` (optional): Pagination offset (default: `0`)
76
+
77
+ **Response:**
78
+ ```json
79
+ {
80
+ "users": [
81
+ {
82
+ "user_id": "123",
83
+ "tokens": 5000,
84
+ "cost": 5.00,
85
+ "requests": 50,
86
+ "first_seen": "2023-04-01T12:00:00Z",
87
+ "last_seen": "2023-05-01T12:00:00Z"
88
+ }
89
+ ],
90
+ "total": 200,
91
+ "limit": 100,
92
+ "offset": 0
93
+ }
94
+ ```
95
+
96
+ ### **GET /analytics/users/activity**
97
+ Returns daily user activity metrics with new user tracking.
98
+
99
+ **Query Parameters:**
100
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
101
+
102
+ **Response:**
103
+ ```json
104
+ {
105
+ "user_activity": [
106
+ {
107
+ "date": "2023-05-01",
108
+ "activeUsers": 20,
109
+ "newUsers": 5,
110
+ "sessions": 30
111
+ }
112
+ ]
113
+ }
114
+ ```
115
+
116
+ ### **GET /analytics/users/sessions/stats**
117
+ Returns session statistics including total users, active users today, average queries per session, and average session time.
118
+
119
+ **Response:**
120
+ ```json
121
+ {
122
+ "totalUsers": 500,
123
+ "activeToday": 25,
124
+ "avgQueriesPerSession": 3.2,
125
+ "avgSessionTime": 300
126
+ }
127
+ ```
128
+
129
+ ### **WebSocket /analytics/realtime**
130
+ WebSocket endpoint for real-time user analytics updates.
131
+
132
+ ---
133
+
134
+ ## Model Analytics Endpoints
135
+
136
+ ### **GET /analytics/usage/models**
137
+ Returns model usage breakdown with performance metrics.
138
+
139
+ **Query Parameters:**
140
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
141
+
142
+ **Response:**
143
+ ```json
144
+ {
145
+ "model_usage": [
146
+ {
147
+ "model_name": "claude-3-sonnet-20241022",
148
+ "tokens": 10000,
149
+ "cost": 10.00,
150
+ "requests": 200,
151
+ "avg_response_time": 1.5
152
+ }
153
+ ]
154
+ }
155
+ ```
156
+
157
+ ### **GET /analytics/models/history**
158
+ Returns daily model usage history with trend data.
159
+
160
+ **Query Parameters:**
161
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
162
+
163
+ **Response:**
164
+ ```json
165
+ {
166
+ "model_history": [
167
+ {
168
+ "date": "2023-05-01",
169
+ "models": [
170
+ {
171
+ "name": "claude-3-sonnet-20241022",
172
+ "tokens": 5000,
173
+ "requests": 100
174
+ }
175
+ ]
176
+ }
177
+ ]
178
+ }
179
+ ```
180
+
181
+ ### **GET /analytics/models/metrics**
182
+ Returns model performance metrics including success rates and response times.
183
+
184
+ **Response:**
185
+ ```json
186
+ {
187
+ "model_metrics": [
188
+ {
189
+ "name": "claude-3-sonnet-20241022",
190
+ "avg_tokens": 250.5,
191
+ "avg_response_time": 1.2,
192
+ "success_rate": 0.95
193
+ }
194
+ ]
195
+ }
196
+ ```
197
+
198
+ ---
199
+
200
+ ## Cost Analytics Endpoints
201
+
202
+ ### **GET /analytics/costs/summary**
203
+ Returns cost summary with averages and totals.
204
+
205
+ **Query Parameters:**
206
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
207
+
208
+ **Response:**
209
+ ```json
210
+ {
211
+ "totalCost": 25.50,
212
+ "totalTokens": 100000,
213
+ "totalRequests": 1000,
214
+ "avgDailyCost": 0.85,
215
+ "costPerThousandTokens": 0.255,
216
+ "daysInPeriod": 30,
217
+ "startDate": "2023-04-01",
218
+ "endDate": "2023-05-01"
219
+ }
220
+ ```
221
+
222
+ ### **GET /analytics/costs/daily**
223
+ Returns daily cost breakdown with filled gaps for missing dates.
224
+
225
+ **Query Parameters:**
226
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
227
+
228
+ **Response:**
229
+ ```json
230
+ {
231
+ "daily_costs": [
232
+ {
233
+ "date": "2023-05-01",
234
+ "cost": 1.25,
235
+ "tokens": 5000
236
+ }
237
+ ]
238
+ }
239
+ ```
240
+
241
+ ### **GET /analytics/costs/models**
242
+ Returns cost breakdown by model.
243
+
244
+ **Query Parameters:**
245
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
246
+
247
+ **Response:**
248
+ ```json
249
+ {
250
+ "model_costs": [
251
+ {
252
+ "model_name": "claude-3-sonnet-20241022",
253
+ "cost": 15.50,
254
+ "tokens": 50000,
255
+ "requests": 500
256
+ }
257
+ ]
258
+ }
259
+ ```
260
+
261
+ ### **GET /analytics/costs/projections**
262
+ Returns cost projections based on last 30 days usage.
263
+
264
+ **Response:**
265
+ ```json
266
+ {
267
+ "nextMonth": 75.00,
268
+ "next3Months": 225.00,
269
+ "nextYear": 900.00,
270
+ "tokensNextMonth": 300000,
271
+ "dailyCost": 2.50,
272
+ "dailyTokens": 10000,
273
+ "baselineDays": 30
274
+ }
275
+ ```
276
+
277
+ ### **GET /analytics/costs/today**
278
+ Returns today's cost data.
279
+
280
+ **Response:**
281
+ ```json
282
+ {
283
+ "date": "2023-05-01",
284
+ "cost": 2.50,
285
+ "tokens": 10000,
286
+ "requests": 100
287
+ }
288
+ ```
289
+
290
+ ---
291
+
292
+ ## Tier Analytics Endpoints
293
+
294
+ ### **GET /analytics/tiers/usage**
295
+ Returns usage data categorized by model tiers with aggregated statistics.
296
+
297
+ **Query Parameters:**
298
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
299
+
300
+ **Response:**
301
+ ```json
302
+ {
303
+ "tier_data": {
304
+ "tier_1": {
305
+ "name": "Basic",
306
+ "credits": 1,
307
+ "total_tokens": 50000,
308
+ "total_requests": 500,
309
+ "total_cost": 5.00,
310
+ "avg_tokens_per_query": 100,
311
+ "cost_per_1k_tokens": 0.10,
312
+ "total_credit_cost": 500,
313
+ "cost_per_credit": 0.01,
314
+ "models": [...]
315
+ }
316
+ },
317
+ "period": "30d",
318
+ "start_date": "2023-04-01",
319
+ "end_date": "2023-05-01"
320
+ }
321
+ ```
322
+
323
+ ### **GET /analytics/tiers/projections**
324
+ Returns tier-based cost and usage projections.
325
+
326
+ **Response:**
327
+ ```json
328
+ {
329
+ "daily_usage": {...},
330
+ "projections": {
331
+ "monthly": {...},
332
+ "quarterly": {...},
333
+ "yearly": {...}
334
+ },
335
+ "tier_definitions": {...}
336
+ }
337
+ ```
338
+
339
+ ### **GET /analytics/tiers/efficiency**
340
+ Returns efficiency metrics by tier including cost per credit and tokens per credit.
341
+
342
+ **Query Parameters:**
343
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
344
+
345
+ **Response:**
346
+ ```json
347
+ {
348
+ "efficiency_data": {...},
349
+ "most_efficient_tier": "tier_2",
350
+ "best_value_tier": "tier_1",
351
+ "period": "30d",
352
+ "start_date": "2023-04-01",
353
+ "end_date": "2023-05-01"
354
+ }
355
+ ```
356
+
357
+ ---
358
+
359
+ ## Code Execution Analytics Endpoints
360
+
361
+ ### **GET /analytics/code-executions/summary**
362
+ Returns code execution statistics including success rates and model performance.
363
+
364
+ **Query Parameters:**
365
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
366
+
367
+ **Response:**
368
+ ```json
369
+ {
370
+ "period": "30d",
371
+ "start_date": "2023-04-01",
372
+ "end_date": "2023-05-01",
373
+ "overall_stats": {
374
+ "total_executions": 1000,
375
+ "successful_executions": 950,
376
+ "failed_executions": 50,
377
+ "success_rate": 0.95,
378
+ "total_users": 100,
379
+ "total_chats": 200
380
+ },
381
+ "model_performance": [...],
382
+ "failed_agents": [...]
383
+ }
384
+ ```
385
+
386
+ ### **GET /analytics/code-executions/detailed**
387
+ Returns detailed code execution records with filtering options.
388
+
389
+ **Query Parameters:**
390
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
391
+ - `success_filter` (optional): Filter by success status (boolean)
392
+ - `user_id` (optional): Filter by user ID
393
+ - `model_name` (optional): Filter by model name
394
+ - `limit` (optional): Maximum results (default: `100`)
395
+
396
+ **Response:**
397
+ ```json
398
+ {
399
+ "period": "30d",
400
+ "start_date": "2023-04-01",
401
+ "end_date": "2023-05-01",
402
+ "count": 50,
403
+ "executions": [...]
404
+ }
405
+ ```
406
+
407
+ ### **GET /analytics/code-executions/users**
408
+ Returns code execution statistics grouped by user.
409
+
410
+ **Query Parameters:**
411
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
412
+ - `limit` (optional): Maximum users (default: `50`)
413
+
414
+ **Response:**
415
+ ```json
416
+ {
417
+ "period": "30d",
418
+ "start_date": "2023-04-01",
419
+ "end_date": "2023-05-01",
420
+ "users": [...]
421
+ }
422
+ ```
423
+
424
+ ### **GET /analytics/code-executions/error-analysis**
425
+ Returns error analysis with categorized error types and agent failure patterns.
426
+
427
+ **Query Parameters:**
428
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
429
+
430
+ **Response:**
431
+ ```json
432
+ {
433
+ "period": "30d",
434
+ "start_date": "2023-04-01",
435
+ "end_date": "2023-05-01",
436
+ "total_failed_executions": 50,
437
+ "error_types": [...],
438
+ "error_by_agent": [...]
439
+ }
440
+ ```
441
+
442
+ ---
443
+
444
+ ## Feedback Analytics Endpoints
445
+
446
+ ### **GET /analytics/feedback/summary**
447
+ Returns feedback summary statistics including rating distributions and trends.
448
+
449
+ **Query Parameters:**
450
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
451
+
452
+ **Response:**
453
+ ```json
454
+ {
455
+ "period": "30d",
456
+ "start_date": "2023-04-01",
457
+ "end_date": "2023-05-01",
458
+ "total_feedback": 500,
459
+ "avg_rating": 4.2,
460
+ "chats_with_feedback": 200,
461
+ "ratings_distribution": [
462
+ {"rating": 1, "count": 10},
463
+ {"rating": 2, "count": 20},
464
+ {"rating": 3, "count": 50},
465
+ {"rating": 4, "count": 200},
466
+ {"rating": 5, "count": 220}
467
+ ],
468
+ "models_data": [...],
469
+ "feedback_trend": [...]
470
+ }
471
+ ```
472
+
473
+ ### **GET /analytics/feedback/detailed**
474
+ Returns detailed feedback records with filtering and pagination.
475
+
476
+ **Query Parameters:**
477
+ - `period` (optional): Time period (`7d`, `30d`, `90d`, default: `30d`)
478
+ - `min_rating` (optional): Minimum rating filter
479
+ - `max_rating` (optional): Maximum rating filter
480
+ - `model_name` (optional): Filter by model name
481
+ - `limit` (optional): Maximum results (default: `100`)
482
+ - `offset` (optional): Pagination offset (default: `0`)
483
+
484
+ **Response:**
485
+ ```json
486
+ {
487
+ "period": "30d",
488
+ "start_date": "2023-04-01",
489
+ "end_date": "2023-05-01",
490
+ "total": 500,
491
+ "count": 100,
492
+ "offset": 0,
493
+ "limit": 100,
494
+ "feedback": [...]
495
+ }
496
+ ```
497
+
498
+ ---
499
+
500
+ ## Public Endpoints
501
+
502
+ ### **GET /analytics/public/ticker**
503
+ Returns public ticker data for landing page statistics. **No authentication required.**
504
+
505
+ **Response:**
506
+ ```json
507
+ {
508
+ "total_signups": 1000,
509
+ "total_tokens": 5000000,
510
+ "total_requests": 50000,
511
+ "last_updated": "2023-05-01T12:00:00Z"
512
+ }
513
+ ```
514
+
515
+ ---
516
+
517
+ ## Utility Endpoints
518
+
519
+ ### **GET /analytics/usage/summary**
520
+ Returns overall usage summary (legacy endpoint, calls dashboard with 30d period).
521
+
522
+ ### **GET /analytics/debug/model_usage**
523
+ Debug endpoint for testing admin API key validation.
524
+
525
+ **Response:**
526
+ ```json
527
+ {
528
+ "status": "success",
529
+ "message": "Admin API key validated successfully"
530
+ }
531
+ ```
532
+
533
+ ---
534
+
535
+ ## Error Categorization
536
+
537
+ The system automatically categorizes code execution errors into the following types:
538
+
539
+ - **NameError**: Variable or function name not found
540
+ - **SyntaxError**: Invalid Python syntax
541
+ - **TypeError**: Type-related errors
542
+ - **AttributeError**: Attribute access errors
543
+ - **IndexError/KeyError**: Index or key access errors
544
+ - **ImportError**: Module import errors
545
+ - **ValueError**: Invalid values passed to functions
546
+ - **OperationError**: Unsupported operations
547
+ - **IndentationError**: Python indentation errors
548
+ - **PermissionError**: File/system permission errors
549
+ - **FileNotFoundError**: File access errors
550
+ - **MemoryError**: Memory allocation errors
551
+ - **TimeoutError**: Operation timeout errors
552
+ - **OtherError**: Uncategorized errors
553
+
554
+ ## Real-time Updates
555
+
556
+ The analytics system supports real-time updates through WebSocket connections:
557
+
558
+ - **Dashboard updates**: Broadcasted when new model usage is recorded
559
+ - **User activity updates**: Broadcasted for user activity changes
560
+ - **Model performance updates**: Broadcasted for model-specific metrics
561
+
562
+ All real-time updates are sent as JSON messages with `type` field indicating the update category and `metrics` containing the delta or new values.
docs/api/routes/chats.md ADDED
@@ -0,0 +1,181 @@
 
1
+ ### Chat Routes Overview
2
+
3
+ These routes handle chat interactions, message processing, user management, and debugging.
4
+
5
+ ---
6
+
7
+ ### **Chat Management**
8
+
9
+ #### **1. Create a New Chat**
10
+ **Endpoint:** `POST /chats/`
11
+ **Description:** Creates a new chat session.
12
+ **Request Body:**
13
+ ```json
14
+ {
15
+ "user_id": 123
16
+ }
17
+ ```
18
+ **Response:**
19
+ ```json
20
+ {
21
+ "chat_id": 456,
22
+ "user_id": 123,
23
+ "title": "New Chat",
24
+ "created_at": "2023-05-01T12:00:00Z"
25
+ }
26
+ ```
27
+
28
+ ---
29
+
30
+ #### **2. Retrieve a Chat by ID**
31
+ **Endpoint:** `GET /chats/{chat_id}`
32
+ **Description:** Fetches a specific chat along with its messages.
33
+ **Path Parameter:** `chat_id` (ID of the chat)
34
+ **Query Parameter:** `user_id` (Optional for access control)
35
+ **Response:**
36
+ ```json
37
+ {
38
+ "chat_id": 456,
39
+ "title": "New Chat",
40
+ "created_at": "2023-05-01T12:00:00Z",
41
+ "user_id": 123,
42
+ "messages": [
43
+ {
44
+ "message_id": 789,
45
+ "chat_id": 456,
46
+ "content": "Hello, how can I help?",
47
+ "sender": "ai",
48
+ "timestamp": "2023-05-01T12:01:00Z"
49
+ }
50
+ ]
51
+ }
52
+ ```
53
+
54
+ ---
55
+
56
+ #### **3. List Recent Chats**
57
+ **Endpoint:** `GET /chats/`
58
+ **Description:** Retrieves a list of recent chats, optionally filtered by user ID.
59
+ **Query Parameters:**
60
+ - `user_id` (Optional for filtering by user)
61
+ - `limit` (Maximum number of chats, default: 10, max: 100)
62
+ - `offset` (For pagination, default: 0)
63
+ **Response:**
64
+ ```json
65
+ [
66
+ {
67
+ "chat_id": 456,
68
+ "user_id": 123,
69
+ "title": "New Chat",
70
+ "created_at": "2023-05-01T12:00:00Z"
71
+ }
72
+ ]
73
+ ```
74
+
75
+ ---
76
+
77
+ #### **4. Update a Chat**
78
+ **Endpoint:** `PUT /chats/{chat_id}`
79
+ **Description:** Updates a chat's title or user ID.
80
+ **Path Parameter:** `chat_id` (ID of the chat to update)
81
+ **Request Body:**
82
+ ```json
83
+ {
84
+ "title": "Updated Chat Title",
85
+ "user_id": 123
86
+ }
87
+ ```
88
+ **Response:**
89
+ ```json
90
+ {
91
+ "chat_id": 456,
92
+ "title": "Updated Chat Title",
93
+ "created_at": "2023-05-01T12:00:00Z",
94
+ "user_id": 123
95
+ }
96
+ ```
97
+
98
+ ---
99
+
100
+ #### **5. Delete a Chat**
101
+ **Endpoint:** `DELETE /chats/{chat_id}`
102
+ **Description:** Deletes a chat and all its messages while preserving model usage records.
103
+ **Path Parameter:** `chat_id` (ID of the chat to delete)
104
+ **Query Parameter:** `user_id` (Optional for access control)
105
+ **Response:**
106
+ ```json
107
+ {
108
+ "message": "Chat 456 deleted successfully",
109
+ "preserved_model_usage": true
110
+ }
111
+ ```
112
+
113
+ ---
114
+
115
+ #### **6. Cleanup Empty Chats**
116
+ **Endpoint:** `POST /chats/cleanup-empty`
117
+ **Description:** Deletes empty chats for a user.
118
+ **Request Body:**
119
+ ```json
120
+ {
121
+ "user_id": 123,
122
+ "is_admin": false
123
+ }
124
+ ```
125
+ **Response:**
126
+ ```json
127
+ {
128
+ "message": "Deleted 5 empty chats"
129
+ }
130
+ ```
131
+
132
+ ---
133
+
134
+ ### **Message Management**
135
+
136
+ #### **1. Add Message to Chat**
137
+ **Endpoint:** `POST /chats/{chat_id}/messages`
138
+ **Description:** Adds a message to an existing chat.
139
+ **Path Parameter:** `chat_id` (ID of the chat)
140
+ **Query Parameter:** `user_id` (Optional for access control)
141
+ **Request Body:**
142
+ ```json
143
+ {
144
+ "content": "Hello, I need help with data analysis",
145
+ "sender": "user"
146
+ }
147
+ ```
148
+ **Response:**
149
+ ```json
150
+ {
151
+ "message_id": 789,
152
+ "chat_id": 456,
153
+ "content": "Hello, I need help with data analysis",
154
+ "sender": "user",
155
+ "timestamp": "2023-05-01T12:01:00Z"
156
+ }
157
+ ```
158
+
159
+ ---
160
+
161
+ ### **User Management**
162
+
163
+ #### **1. Create or Retrieve a User**
164
+ **Endpoint:** `POST /chats/users`
165
+ **Description:** Creates a new user or retrieves an existing one based on email.
166
+ **Request Body:**
167
+ ```json
168
+ {
169
+ "username": "john_doe",
170
+ "email": "john@example.com"
171
+ }
172
+ ```
173
+ **Response:**
174
+ ```json
175
+ {
176
+ "user_id": 123,
177
+ "username": "john_doe",
178
+ "email": "john@example.com",
179
+ "created_at": "2023-05-01T12:00:00Z"
180
+ }
181
+ ```
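+
+ Putting these endpoints together, a typical client flow is: create or fetch the user, open a chat, then append messages. A hedged sketch with `requests` follows; the base URL is an assumption:
+
+ ```python
+ # End-to-end sketch of the chat flow; http://localhost:8000 is assumed.
+ import requests
+
+ BASE = "http://localhost:8000"
+
+ user = requests.post(f"{BASE}/chats/users",
+                      json={"username": "john_doe", "email": "john@example.com"}).json()
+ chat = requests.post(f"{BASE}/chats/", json={"user_id": user["user_id"]}).json()
+ message = requests.post(f"{BASE}/chats/{chat['chat_id']}/messages",
+                         params={"user_id": user["user_id"]},
+                         json={"content": "Hello, I need help with data analysis",
+                               "sender": "user"}).json()
+ print(message["message_id"])
+ ```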
docs/api/routes/code.md ADDED
@@ -0,0 +1,182 @@
1
+ # Code Routes Documentation
2
+
3
+ This document describes the API endpoints available for code execution, editing, fixing, and cleaning operations in the Auto-Analyst backend.
4
+
5
+ ## Base URL
6
+
7
+ All code-related endpoints are prefixed with `/code`.
8
+
9
+ ## Endpoints
10
+
11
+ ### Execute Code
12
+ Executes Python code against the current session's dataframe.
13
+
14
+ **Endpoint:** `POST /code/execute`
15
+
16
+ **Request Body:**
17
+ ```json
18
+ {
19
+ "code": "string", // Python code to execute
20
+ "session_id": "string", // Optional session ID
21
+ "message_id": 123 // Optional message ID for tracking
22
+ }
23
+ ```
24
+
25
+ **Response:**
26
+ ```json
27
+ {
28
+ "output": "string", // Execution output
29
+ "plotly_outputs": [ // Optional array of plotly outputs
30
+ "string"
31
+ ]
32
+ }
33
+ ```
34
+
35
+ **Error Responses:**
36
+ - `400 Bad Request`: No dataset loaded or no code provided
37
+ - `500 Internal Server Error`: Execution error
38
+
39
+ ### Edit Code
40
+ Uses AI to edit code based on user instructions.
41
+
42
+ **Endpoint:** `POST /code/edit`
43
+
44
+ **Request Body:**
45
+ ```json
46
+ {
47
+ "original_code": "string", // Code to be edited
48
+ "user_prompt": "string" // Instructions for editing
49
+ }
50
+ ```
51
+
52
+ **Response:**
53
+ ```json
54
+ {
55
+ "edited_code": "string" // The edited code
56
+ }
57
+ ```
58
+
59
+ **Error Responses:**
60
+ - `400 Bad Request`: Missing original code or editing instructions
61
+ - `500 Internal Server Error`: Editing error
62
+
63
+ ### Fix Code
64
+ Uses AI to fix code with errors, employing a block-by-block approach with DSPy refinement.
65
+
66
+ **Endpoint:** `POST /code/fix`
67
+
68
+ **Request Body:**
69
+ ```json
70
+ {
71
+ "code": "string", // Code containing errors
72
+ "error": "string" // Error message to fix
73
+ }
74
+ ```
75
+
76
+ **Response:**
77
+ ```json
78
+ {
79
+ "fixed_code": "string" // The fixed code
80
+ }
81
+ ```
82
+
83
+ **Error Responses:**
84
+ - `400 Bad Request`: Missing code or error message
85
+ - `500 Internal Server Error`: Fixing error
86
+
87
+ ### Clean Code
88
+ Cleans and formats code by organizing imports and ensuring proper code block formatting.
89
+
90
+ **Endpoint:** `POST /code/clean-code`
91
+
92
+ **Request Body:**
93
+ ```json
94
+ {
95
+ "code": "string" // Code to clean
96
+ }
97
+ ```
98
+
99
+ **Response:**
100
+ ```json
101
+ {
102
+ "cleaned_code": "string" // The cleaned code
103
+ }
104
+ ```
105
+
106
+ **Error Responses:**
107
+ - `400 Bad Request`: No code provided
108
+ - `500 Internal Server Error`: Cleaning error
109
+
110
+ ### Get Latest Code
111
+ Retrieves the latest code from a specific message.
112
+
113
+ **Endpoint:** `POST /code/get-latest-code`
114
+
115
+ **Request Body:**
116
+ ```json
117
+ {
118
+ "message_id": 123 // Message ID to retrieve code from
119
+ }
120
+ ```
121
+
122
+ **Response:**
123
+ ```json
124
+ {
125
+ "code": "string" // The retrieved code
126
+ }
127
+ ```
128
+
129
+ **Error Responses:**
130
+ - `400 Bad Request`: Missing message ID
131
+ - `404 Not Found`: Message not found
132
+ - `500 Internal Server Error`: Retrieval error
133
+
134
+ ## Code Processing Features
135
+
136
+ ### Import Organization
137
+ The code processing system automatically:
138
+ - Moves all import statements to the top of the file
139
+ - Deduplicates imports
140
+ - Sorts imports alphabetically
141
+
142
+ ### Code Block Management
143
+ The system supports code blocks marked with special comments:
144
+ - Start marker: `# agent_name code start`
145
+ - End marker: `# agent_name code end`
146
+
147
+ ### Error Handling with DSPy Refinement
148
+ When fixing code, the system uses DSPy's refinement mechanism:
149
+ - Identifies specific code blocks with errors
150
+ - Processes error messages to extract relevant information
151
+ - Uses a scoring function to validate fixes
152
+ - Employs iterative refinement with up to 3 attempts
153
+ - Fixes each block individually while maintaining the overall structure
154
+ - Preserves code block markers and relationships
155
+
156
+ ### Dataset Context
157
+ When editing or fixing code, the system provides context about the current dataset including:
158
+ - Number of rows and columns
159
+ - Column names and data types
160
+ - Null value counts
161
+ - Sample values for each column
162
+
163
+ ### Code Execution Safety
164
+ The execution system includes safety measures:
165
+ - Removes blocking calls like `plt.show()`
166
+ - Handles `__main__` block extraction
167
+ - Cleans up print statements with unwanted newlines
168
+ - Executes code in isolated namespaces
169
+
170
+ ## Session Management
171
+ All endpoints require a valid session ID, which is used to:
172
+ - Access the current dataset
173
+ - Maintain state between requests
174
+ - Track code execution history
175
+ - Store execution results for analysis
176
+
177
+ ## Error Handling
178
+ The system provides detailed error messages while maintaining security by:
179
+ - Logging errors for debugging
180
+ - Returning user-friendly error messages
181
+ - Preserving original code in case of processing failures
182
+ - Using code scoring to validate fixes before returning results
docs/api/routes/deep_analysis.md ADDED
@@ -0,0 +1,348 @@
1
+ # Deep Analysis API Documentation
2
+
3
+ ## Overview
4
+
5
+ The Deep Analysis system provides advanced multi-agent analytical capabilities that automatically generate comprehensive reports based on user goals. The system uses DSPy (Declarative Self-improving Language Programs) to orchestrate multiple AI agents and create detailed analytical insights.
6
+
7
+ ## Key Features
8
+
9
+ - **Multi-Agent Analysis**: Orchestrates multiple specialized agents (preprocessing, statistical analysis, machine learning, visualization)
10
+ - **Template Integration**: Uses the user's active templates/agents for analysis
11
+ - **Streaming Progress**: Real-time progress updates during analysis execution
12
+ - **Report Persistence**: Stores complete analysis reports in database with metadata
13
+ - **HTML Export**: Generates downloadable HTML reports with visualizations
14
+ - **Credit Tracking**: Monitors token usage, costs, and credits consumed
15
+
16
+ ## Template Integration
17
+
18
+ The deep analysis system integrates with the user's active templates through the agent system:
19
+
20
+ 1. **Agent Selection**: Uses agents from the user's active template preferences (configured via `/templates` endpoints)
21
+ 2. **Default Agents**: Falls back to system default agents if user hasn't configured preferences:
22
+ - `preprocessing` (both individual and planner variants)
23
+ - `statistical_analytics` (both individual and planner variants)
24
+ - `sk_learn` (both individual and planner variants)
25
+ - `data_viz` (both individual and planner variants)
26
+ 3. **Template Limits**: Respects the 10-template limit for planner performance optimization
27
+ 4. **Dynamic Planning**: The planner automatically selects the most appropriate agents based on the analysis goal and available templates
28
+
29
+ ## Analysis Flow
30
+
31
+ The deep analysis process follows these steps (a progress-lookup sketch follows the list):
32
+
33
+ 1. **Question Generation** (20% progress): Generates 5 targeted analytical questions based on the user's goal
34
+ 2. **Planning** (40% progress): Creates an optimized execution plan using available agents
35
+ 3. **Agent Execution** (60% progress): Executes analysis using user's active templates
36
+ 4. **Code Synthesis** (80% progress): Combines and optimizes code from all agents
37
+ 5. **Code Execution** (85% progress): Runs the synthesized analysis code
38
+ 6. **Synthesis** (90% progress): Synthesizes results into coherent insights
39
+ 7. **Conclusion** (100% progress): Generates final conclusions and recommendations
40
+
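+ Expressed as a lookup table (the keys for the later stages are assumptions; only `questions`, `planning`, and `agent_execution` are confirmed by the streaming example later in this document):
+
+ ```python
+ # Step-to-progress mapping implied by the list above; later keys are assumed.
+ STEP_PROGRESS = {
+     "questions": 20,
+     "planning": 40,
+     "agent_execution": 60,
+     "code_synthesis": 80,
+     "code_execution": 85,
+     "synthesis": 90,
+     "conclusion": 100,
+ }
+ ```
+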
41
+ ---
42
+
43
+ ## API Endpoints
44
+
45
+ ### Create Deep Analysis Report
46
+
47
+ **POST** `/deep_analysis/reports`
48
+
49
+ Creates a new deep analysis report in the database.
50
+
51
+ **Request Body:**
52
+ ```json
53
+ {
54
+ "report_uuid": "string",
55
+ "user_id": 123,
56
+ "goal": "Analyze customer churn patterns",
57
+ "status": "completed",
58
+ "deep_questions": "1. What factors...\n2. How does...",
59
+ "deep_plan": "{\n \"@preprocessing\": {\n \"create\": [...],\n \"use\": [...],\n \"instruction\": \"...\"\n }\n}",
60
+ "summaries": ["Agent summary 1", "Agent summary 2"],
61
+ "analysis_code": "import pandas as pd\n# Analysis code...",
62
+ "plotly_figures": [{"data": [...], "layout": {...}}],
63
+ "synthesis": ["Synthesis result 1"],
64
+ "final_conclusion": "## Conclusion\nThe analysis reveals...",
65
+ "html_report": "<html>...</html>",
66
+ "report_summary": "Brief summary of findings",
67
+ "progress_percentage": 100,
68
+ "duration_seconds": 120,
69
+ "credits_consumed": 5,
70
+ "error_message": null,
71
+ "model_provider": "anthropic",
72
+ "model_name": "claude-sonnet-4-20250514",
73
+ "total_tokens_used": 15000,
74
+ "estimated_cost": 0.25,
75
+ "steps_completed": ["questions", "planning", "execution", "synthesis", "conclusion"]
76
+ }
77
+ ```
78
+
79
+ **Response:**
80
+ ```json
81
+ {
82
+ "report_id": 1,
83
+ "report_uuid": "uuid-string",
84
+ "user_id": 123,
85
+ "goal": "Analyze customer churn patterns",
86
+ "status": "completed",
87
+ "start_time": "2024-01-01T12:00:00Z",
88
+ "end_time": "2024-01-01T12:02:00Z",
89
+ "duration_seconds": 120,
90
+ "report_summary": "Brief summary of findings",
91
+ "created_at": "2024-01-01T12:02:00Z",
92
+ "updated_at": "2024-01-01T12:02:00Z"
93
+ }
94
+ ```
95
+
96
+ ### Get Deep Analysis Reports
97
+
98
+ **GET** `/deep_analysis/reports`
99
+
100
+ Retrieves a list of deep analysis reports with optional filtering.
101
+
102
+ **Query Parameters:**
103
+ - `user_id` (optional): Filter by user ID
104
+ - `limit` (optional): Number of reports to return (1-100, default: 10)
105
+ - `offset` (optional): Number of reports to skip (default: 0)
106
+ - `status` (optional): Filter by status ("pending", "running", "completed", "failed")
107
+
108
+ **Response:**
109
+ ```json
110
+ [
111
+ {
112
+ "report_id": 1,
113
+ "report_uuid": "uuid-string",
114
+ "user_id": 123,
115
+ "goal": "Analyze customer churn patterns",
116
+ "status": "completed",
117
+ "start_time": "2024-01-01T12:00:00Z",
118
+ "end_time": "2024-01-01T12:02:00Z",
119
+ "duration_seconds": 120,
120
+ "report_summary": "Brief summary of findings",
121
+ "created_at": "2024-01-01T12:02:00Z",
122
+ "updated_at": "2024-01-01T12:02:00Z"
123
+ }
124
+ ]
125
+ ```
126
+
127
+ ### Get User Historical Reports
128
+
129
+ **GET** `/deep_analysis/reports/user_historical`
130
+
131
+ Retrieves all historical deep analysis reports for a specific user.
132
+
133
+ **Query Parameters:**
134
+ - `user_id`: User ID (required)
135
+ - `limit` (optional): Number of reports to return (1-100, default: 50)
136
+
137
+ ### Get Report by ID
138
+
139
+ **GET** `/deep_analysis/reports/{report_id}`
140
+
141
+ Retrieves a complete deep analysis report by ID.
142
+
143
+ **Query Parameters:**
144
+ - `user_id` (optional): Ensures report belongs to specified user
145
+
146
+ **Response:**
147
+ ```json
148
+ {
149
+ "report_id": 1,
150
+ "report_uuid": "uuid-string",
151
+ "user_id": 123,
152
+ "goal": "Analyze customer churn patterns",
153
+ "status": "completed",
154
+ "start_time": "2024-01-01T12:00:00Z",
155
+ "end_time": "2024-01-01T12:02:00Z",
156
+ "duration_seconds": 120,
157
+ "deep_questions": "1. What factors contribute to churn?\n2. How does churn vary by segment?",
158
+ "deep_plan": "{\n \"@preprocessing\": {...},\n \"@statistical_analytics\": {...}\n}",
159
+ "summaries": ["Agent performed data cleaning...", "Statistical analysis revealed..."],
160
+ "analysis_code": "import pandas as pd\n# Complete analysis code",
161
+ "plotly_figures": [{"data": [...], "layout": {...}}],
162
+ "synthesis": ["The analysis shows clear patterns..."],
163
+ "final_conclusion": "## Conclusion\nCustomer churn is primarily driven by...",
164
+ "html_report": "<html>...</html>",
165
+ "report_summary": "Analysis of customer churn patterns reveals...",
166
+ "progress_percentage": 100,
167
+ "credits_consumed": 5,
168
+ "error_message": null,
169
+ "model_provider": "anthropic",
170
+ "model_name": "claude-sonnet-4-20250514",
171
+ "total_tokens_used": 15000,
172
+ "estimated_cost": 0.25,
173
+ "steps_completed": ["questions", "planning", "execution", "synthesis", "conclusion"],
174
+ "created_at": "2024-01-01T12:02:00Z",
175
+ "updated_at": "2024-01-01T12:02:00Z"
176
+ }
177
+ ```
178
+
179
+ ### Get Report by UUID
180
+
181
+ **GET** `/deep_analysis/reports/uuid/{report_uuid}`
182
+
183
+ Retrieves a complete deep analysis report by UUID. Same response format as get by ID.
184
+
185
+ ### Delete Report
186
+
187
+ **DELETE** `/deep_analysis/reports/{report_id}`
188
+
189
+ Deletes a deep analysis report.
190
+
191
+ **Query Parameters:**
192
+ - `user_id` (optional): Ensures report belongs to specified user
193
+
194
+ **Response:**
195
+ ```json
196
+ {
197
+ "message": "Report 1 deleted successfully"
198
+ }
199
+ ```
200
+
201
+ ### Update Report Status
202
+
203
+ **PUT** `/deep_analysis/reports/{report_id}/status`
204
+
205
+ Updates the status of a deep analysis report.
206
+
207
+ **Request Body:**
208
+ ```json
209
+ {
210
+ "status": "completed"
211
+ }
212
+ ```
213
+
214
+ **Valid Status Values:**
215
+ - `pending`: Analysis queued but not started
216
+ - `running`: Analysis in progress
217
+ - `completed`: Analysis finished successfully
218
+ - `failed`: Analysis encountered errors
219
+
220
+ ### Get HTML Report
221
+
222
+ **GET** `/deep_analysis/reports/uuid/{report_uuid}/html`
223
+
224
+ Retrieves only the HTML report content for a specific analysis.
225
+
226
+ **Query Parameters:**
227
+ - `user_id` (optional): Ensures report belongs to specified user
228
+
229
+ **Response:**
230
+ ```json
231
+ {
232
+ "html_report": "<html>...</html>",
233
+ "filename": "deep_analysis_report_20240101_120200.html"
234
+ }
235
+ ```
236
+
237
+ ### Download HTML Report
238
+
239
+ **POST** `/deep_analysis/download_from_db/{report_uuid}`
240
+
241
+ Downloads the HTML report as a file attachment.
242
+
243
+ **Query Parameters:**
244
+ - `user_id` (optional): Ensures report belongs to specified user
245
+
246
+ **Response:**
247
+ - Content-Type: `text/html; charset=utf-8`
248
+ - Content-Disposition: `attachment; filename="deep_analysis_report_TIMESTAMP.html"`
249
+
250
+ ---
251
+
252
+ ## Deep Analysis Module Architecture
253
+
254
+ ### DSPy Signatures
255
+
256
+ The system uses several DSPy signatures for different analysis phases:
257
+
258
+ #### 1. `deep_questions`
259
+ Generates 5 targeted analytical questions based on the user's goal and dataset structure.
260
+
261
+ #### 2. `deep_planner`
262
+ Creates an optimized execution plan using the user's active templates/agents. The planner:
263
+ - Verifies feasibility using available datasets and agent descriptions
264
+ - Batches similar questions per agent call for efficiency
265
+ - Reuses outputs across questions to minimize agent calls
266
+ - Defines clear variable flow and dependencies between agents
267
+
268
+ #### 3. `deep_code_synthesizer`
269
+ Combines and optimizes code from multiple agents:
270
+ - Fixes errors and inconsistencies between agent outputs
271
+ - Ensures proper data flow and type handling
272
+ - Converts all visualizations to Plotly format
273
+ - Adds comprehensive error handling and validation
274
+
275
+ #### 4. `deep_synthesizer`
276
+ Synthesizes analysis results into coherent insights and findings.
277
+
278
+ #### 5. `final_conclusion`
279
+ Generates final conclusions and strategic recommendations based on all analysis results.
280
+
281
+ ### Streaming Analysis
282
+
283
+ The `execute_deep_analysis_streaming` method provides real-time progress updates:
284
+
285
+ ```python
286
+ async for update in deep_analysis.execute_deep_analysis_streaming(goal, dataset_info, session_df):
287
+     if update["step"] == "questions":
288
+         ...  # handle question-generation progress
289
+     elif update["step"] == "planning":
290
+         ...  # handle planning progress
291
+     elif update["step"] == "agent_execution":
292
+         ...  # handle agent-execution progress
293
+     # ... handle other steps
294
+ ```
295
+
296
+ ### Integration with User Templates
297
+
298
+ The deep analysis system integrates with user templates in several ways:
299
+
300
+ 1. **Agent Discovery**: Retrieves user's active template preferences from the database
301
+ 2. **Dynamic Planning**: The planner uses available agents to create optimal execution plans
302
+ 3. **Template Validation**: Ensures all referenced agents exist in the user's active templates
303
+ 4. **Fallback Handling**: Uses default agents if user preferences are incomplete
304
+ 5. **Performance Optimization**: Respects template limits for efficient execution
305
+
306
+ ### Error Handling
307
+
308
+ The system includes comprehensive error handling:
309
+
310
+ - **Code Execution Errors**: Automatically attempts to fix and retry failed code
311
+ - **Template Missing**: Falls back to default agents if user templates are unavailable
312
+ - **Timeout Protection**: Includes timeouts for long-running operations
313
+ - **Memory Management**: Handles large datasets and visualization efficiently
314
+ - **Unicode Handling**: Cleans problematic characters that might cause encoding issues
315
+
316
+ ### Visualization Integration
317
+
318
+ All visualizations are standardized to Plotly format:
319
+ - Consistent styling and color schemes
320
+ - Interactive features (zoom, pan, hover)
321
+ - Accessibility compliance (colorblind-friendly palettes)
322
+ - Export capabilities for reports
323
+ - Responsive design for different screen sizes
324
+
325
+ ---
326
+
327
+ ## Frontend Integration
328
+
329
+ The deep analysis system includes React components for:
330
+
331
+ - **DeepAnalysisSidebar**: Main interface for starting and managing analyses
332
+ - **NewAnalysisForm**: Form for initiating new deep analyses
333
+ - **CurrentAnalysisView**: Real-time progress tracking during analysis
334
+ - **HistoryView**: Browse and access historical analysis reports
335
+ - **AnalysisStep**: Individual step progress visualization
336
+
337
+ The frontend integrates with the streaming API to provide real-time feedback and uses the user's active template configuration for personalized analysis capabilities.
338
+
339
+ ## Credit and Cost Tracking
340
+
341
+ The system tracks detailed usage metrics:
342
+ - **Credits Consumed**: Number of credits deducted from user account
343
+ - **Token Usage**: Total tokens used across all model calls
344
+ - **Estimated Cost**: Dollar cost estimate based on model pricing
345
+ - **Model Information**: Provider and model name used for analysis
346
+ - **Execution Time**: Duration of analysis for performance monitoring
347
+
348
+ This information helps users understand resource consumption and optimize their analysis strategies.
docs/api/routes/feedback.md ADDED
@@ -0,0 +1,153 @@
1
+ # Feedback Routes Documentation
2
+
3
+ This document describes the API endpoints available for managing user feedback on AI-generated messages in the Auto-Analyst backend.
4
+
5
+ ## Base URL
6
+
7
+ All feedback-related endpoints are prefixed with `/feedback`.
8
+
9
+ ## Endpoints
10
+
11
+ ### Create or Update Message Feedback
12
+ Creates new feedback or updates existing feedback for a specific message.
13
+
14
+ **Endpoint:** `POST /feedback/message/{message_id}`
15
+
16
+ **Path Parameters:**
17
+ - `message_id`: ID of the message to provide feedback for
18
+
19
+ **Request Body:**
20
+ ```json
21
+ {
22
+ "rating": 5, // Required: Star rating (1-5)
23
+ "model_name": "gpt-4o-mini", // Optional: Model used for the message
24
+ "model_provider": "openai", // Optional: Provider of the model
25
+ "temperature": 0.7, // Optional: Temperature setting
26
+ "max_tokens": 6000 // Optional: Max tokens setting
27
+ }
28
+ ```
29
+
30
+ **Response:**
31
+ ```json
32
+ {
33
+ "feedback_id": 123,
34
+ "message_id": 456,
35
+ "rating": 5,
36
+ "feedback_comment": null,
37
+ "model_name": "gpt-4o-mini",
38
+ "model_provider": "openai",
39
+ "temperature": 0.7,
40
+ "max_tokens": 6000,
41
+ "created_at": "2023-05-01T12:00:00Z",
42
+ "updated_at": "2023-05-01T12:00:00Z"
43
+ }
44
+ ```
45
+
46
+ **Error Responses:**
47
+ - `404 Not Found`: Message with specified ID not found
48
+ - `500 Internal Server Error`: Failed to create/update feedback
49
+
50
+ ### Get Message Feedback
51
+ Retrieves feedback for a specific message.
52
+
53
+ **Endpoint:** `GET /feedback/message/{message_id}`
54
+
55
+ **Path Parameters:**
56
+ - `message_id`: ID of the message to get feedback for
57
+
58
+ **Response:**
59
+ ```json
60
+ {
61
+ "feedback_id": 123,
62
+ "message_id": 456,
63
+ "rating": 5,
64
+ "feedback_comment": null,
65
+ "model_name": "gpt-4o-mini",
66
+ "model_provider": "openai",
67
+ "temperature": 0.7,
68
+ "max_tokens": 6000,
69
+ "created_at": "2023-05-01T12:00:00Z",
70
+ "updated_at": "2023-05-01T12:00:00Z"
71
+ }
72
+ ```
73
+
74
+ **Error Responses:**
75
+ - `404 Not Found`: No feedback found for the specified message
76
+ - `500 Internal Server Error`: Failed to retrieve feedback
77
+
78
+ ### Get Chat Feedback
79
+ Retrieves all feedback for messages in a specific chat.
80
+
81
+ **Endpoint:** `GET /feedback/chat/{chat_id}`
82
+
83
+ **Path Parameters:**
84
+ - `chat_id`: ID of the chat to get feedback for
85
+
86
+ **Response:**
87
+ ```json
88
+ [
89
+ {
90
+ "feedback_id": 123,
91
+ "message_id": 456,
92
+ "rating": 5,
93
+ "feedback_comment": null,
94
+ "model_name": "gpt-4o-mini",
95
+ "model_provider": "openai",
96
+ "temperature": 0.7,
97
+ "max_tokens": 6000,
98
+ "created_at": "2023-05-01T12:00:00Z",
99
+ "updated_at": "2023-05-01T12:00:00Z"
100
+ }
101
+ ]
102
+ ```
103
+
104
+ **Note:** Returns an empty array if no feedback exists for the chat.
105
+
106
+ **Error Responses:**
107
+ - `500 Internal Server Error`: Failed to retrieve chat feedback
108
+
109
+ ## Feedback Features
110
+
111
+ ### Rating System
112
+ - **Scale:** 1-5 star rating system
113
+ - **Required:** Rating is the only required field for feedback
114
+ - **Purpose:** Allows users to rate the quality of AI responses
115
+
116
+ ### Model Context Tracking
117
+ The system optionally tracks:
118
+ - **Model Name:** The specific AI model used (e.g., "gpt-4o-mini")
119
+ - **Model Provider:** The provider of the model (e.g., "openai", "anthropic")
120
+ - **Temperature:** The creativity/randomness setting used
121
+ - **Max Tokens:** The maximum response length setting
122
+
123
+ ### Update Behavior
124
+ - **Upsert Operation:** The POST endpoint either creates new feedback or updates existing feedback
125
+ - **Partial Updates:** When updating, only provided fields are modified
126
+ - **Timestamp Tracking:** Both creation and update timestamps are maintained
127
+
128
+ ## Data Management
129
+
130
+ ### Database Operations
131
+ - **Atomic Operations:** Feedback creation/updates are handled in database transactions
132
+ - **Referential Integrity:** Feedback is linked to specific messages via foreign keys
133
+ - **Soft Handling:** Missing optional fields are handled gracefully
134
+
135
+ ### Error Handling
136
+ - **Comprehensive Logging:** All operations are logged for debugging
137
+ - **User-Friendly Messages:** Error responses provide clear information
138
+ - **Transaction Safety:** Failed operations are rolled back to maintain data consistency
139
+
140
+ ## Usage Patterns
141
+
142
+ ### Typical Workflow
143
+ 1. User receives an AI-generated message
144
+ 2. User provides rating (1-5 stars) via the frontend
145
+ 3. Frontend calls `POST /feedback/message/{message_id}` with the rating and model context (sketched after this list)
146
+ 4. System stores or updates the feedback
147
+ 5. Feedback can be retrieved later for analytics or user review
148
+
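+ A hedged sketch of step 3, assuming a local dev server:
+
+ ```python
+ # Posting a 5-star rating for message 456; the base URL is an assumption.
+ import requests
+
+ resp = requests.post(
+     "http://localhost:8000/feedback/message/456",
+     json={"rating": 5, "model_name": "gpt-4o-mini", "model_provider": "openai"},
+ )
+ print(resp.json()["feedback_id"])
+ ```
+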
149
+ ### Analytics Integration
150
+ Feedback data is used by the analytics system to:
151
+ - Track model performance across different configurations
152
+ - Identify patterns in user satisfaction
153
+ - Generate insights for model optimization
docs/api/routes/session.md ADDED
@@ -0,0 +1,273 @@
1
+ # **Auto-Analyst API Documentation**
2
+
3
+ The core application routes are designed to manage the data and AI analysis capabilities of the Auto-Analyst application.
4
+
5
+ ## **1. Core Application Routes**
6
+ ### **Data Management**
7
+
8
+ #### **POST /upload_dataframe**
9
+ Uploads a CSV dataset for analysis.
10
+ **Request:**
11
+ - `file`: CSV file
12
+ - `name`: Dataset name
13
+ - `description`: Dataset description
14
+ **Headers:**
15
+ - `X-Force-Refresh`: "true" (optional) - Forces session reset before upload
16
+ **Response:**
17
+ ```json
18
+ { "message": "Dataframe uploaded successfully", "session_id": "abc123" }
19
+ ```
20
+
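+ A hedged upload sketch using `requests` (the base URL is an assumption):
+
+ ```python
+ # Uploads a CSV and forces a fresh session via the optional header.
+ import requests
+
+ with open("Housing.csv", "rb") as f:
+     resp = requests.post(
+         "http://localhost:8000/upload_dataframe",
+         files={"file": ("Housing.csv", f, "text/csv")},
+         data={"name": "Housing Dataset", "description": "House listing data"},
+         headers={"X-Force-Refresh": "true"},  # optional session reset
+     )
+ print(resp.json()["session_id"])
+ ```
+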
21
+ #### **POST /upload_excel**
22
+ Uploads an Excel file with a specific sheet for analysis.
23
+ **Request:**
24
+ - `file`: Excel file
25
+ - `name`: Dataset name
26
+ - `description`: Dataset description
27
+ - `sheet_name`: Name of the Excel sheet to use
28
+ **Headers:**
29
+ - `X-Force-Refresh`: "true" (optional) - Forces session reset before upload
30
+ **Response:**
31
+ ```json
32
+ { "message": "Excel file processed successfully", "session_id": "abc123", "sheet": "Sheet1" }
33
+ ```
34
+
35
+ #### **POST /api/excel-sheets**
36
+ Gets the list of sheet names from an Excel file.
37
+ **Request:**
38
+ - `file`: Excel file
39
+ **Response:**
40
+ ```json
41
+ { "sheets": ["Sheet1", "Sheet2", "Data"] }
42
+ ```
43
+
44
+ #### **GET /api/default-dataset**
45
+ Gets the default dataset.
46
+ **Response:**
47
+ ```json
48
+ {
49
+ "headers": ["column1", "column2", ...],
50
+ "rows": [[val1, val2, ...], ...],
51
+ "name": "Housing Dataset",
52
+ "description": "A comprehensive dataset containing housing information..."
53
+ }
54
+ ```
55
+
56
+ #### **POST /reset-session**
57
+ Resets session to default dataset.
58
+ **Request Body:**
59
+ ```json
60
+ {
61
+ "name": "optional name",
62
+ "description": "optional description",
63
+ "preserveModelSettings": false
64
+ }
65
+ ```
66
+ **Response:**
67
+ ```json
68
+ {
69
+ "message": "Session reset to default dataset",
70
+ "session_id": "abc123",
71
+ "dataset": "Housing.csv"
72
+ }
73
+ ```
74
+
75
+ #### **GET /api/preview-csv** / **POST /api/preview-csv**
76
+ Preview the current dataset in the session.
77
+ **Response:**
78
+ ```json
79
+ {
80
+ "headers": ["column1", "column2", ...],
81
+ "rows": [[val1, val2, ...], ...],
82
+ "name": "Dataset Name",
83
+ "description": "Dataset description..."
84
+ }
85
+ ```
86
+
87
+ ---
88
+
89
+ ### **2. AI Analysis**
90
+
91
+ #### **POST /chat/{agent_name}**
92
+ Processes a query using a specific AI agent.
93
+ **Path Parameters:** `agent_name`
94
+ **Request Body:**
95
+ ```json
96
+ { "query": "Analyze the relationship between price and size" }
97
+ ```
98
+ **Query Parameters:** `user_id` (optional), `chat_id` (optional)
99
+ **Response:**
100
+ ```json
101
+ {
102
+ "agent_name": "data_viz_agent",
103
+ "query": "Analyze the relationship between price and size",
104
+ "response": "# Analysis\n\nThere appears to be a strong positive correlation...",
105
+ "session_id": "abc123"
106
+ }
107
+ ```
108
+
109
+ #### **POST /chat**
110
+ Processes a query using multiple AI agents with streaming responses.
111
+ **Request Body:**
112
+ ```json
113
+ { "query": "Analyze the housing data" }
114
+ ```
115
+ **Query Parameters:** `user_id` (optional), `chat_id` (optional)
116
+ **Response:** *Streaming JSON objects:*
117
+ ```json
118
+ {"agent": "data_viz_agent", "content": "# Visualization\n\n...", "status": "success"}
119
+ {"agent": "statistical_analytics_agent", "content": "# Statistical Analysis\n\n...", "status": "success"}
120
+ ```
121
+
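+ A consumption sketch, assuming newline-delimited JSON framing (the exact framing is an assumption):
+
+ ```python
+ # Streams per-agent results as they arrive; the base URL is assumed.
+ import json
+ import requests
+
+ resp = requests.post("http://localhost:8000/chat",
+                      json={"query": "Analyze the housing data"}, stream=True)
+ for line in resp.iter_lines():
+     if line:
+         chunk = json.loads(line)
+         print(chunk["agent"], chunk["status"])
+ ```
+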
122
+ #### **POST /chat_history_name**
123
+ Generates a name for a chat based on the query.
124
+ **Request Body:**
125
+ ```json
126
+ { "query": "Analyze sales data for Q4" }
127
+ ```
128
+ **Response:**
129
+ ```json
130
+ { "name": "Chat about sales data analysis" }
131
+ ```
132
+
133
+ #### **GET /agents**
134
+ Lists available AI agents.
135
+ **Response:**
136
+ ```json
137
+ {
138
+ "available_agents": ["data_viz_agent", "sk_learn_agent", "statistical_analytics_agent", "preprocessing_agent"],
139
+ "standard_agents": ["preprocessing_agent", "statistical_analytics_agent", "sk_learn_agent", "data_viz_agent"],
140
+ "template_agents": ["custom_template_1", "custom_template_2"],
141
+ "custom_agents": []
142
+ }
143
+ ```
144
+
145
+ ---
146
+
147
+ ### **3. Deep Analysis**
148
+
149
+ #### **POST /deep_analysis_streaming**
150
+ Performs comprehensive deep analysis with real-time streaming updates.
151
+ **Request Body:**
152
+ ```json
153
+ { "goal": "Perform comprehensive analysis of the sales data" }
154
+ ```
155
+ **Query Parameters:** `user_id` (optional), `chat_id` (optional)
156
+ **Response:** *Streaming JSON objects with progress updates*
157
+
158
+ #### **POST /deep_analysis/download_report**
159
+ Downloads an HTML report from deep analysis results.
160
+ **Request Body:**
161
+ ```json
162
+ {
163
+ "analysis_data": { ... },
164
+ "report_uuid": "optional-uuid"
165
+ }
166
+ ```
167
+ **Response:** HTML file download
168
+
169
+ ---
170
+
171
+ ### **4. Model Settings**
172
+
173
+ #### **GET /api/model-settings**
174
+ Fetches current model settings.
175
+ **Response:**
176
+ ```json
177
+ {
178
+ "provider": "openai",
179
+ "model": "gpt-4o-mini",
180
+ "hasCustomKey": true,
181
+ "temperature": 1.0,
182
+ "maxTokens": 6000
183
+ }
184
+ ```
185
+
186
+ #### **POST /settings/model**
187
+ Updates model settings.
188
+ **Request Body:**
189
+ ```json
190
+ {
191
+ "provider": "openai",
192
+ "model": "gpt-4",
193
+ "api_key": "sk-...",
194
+ "temperature": 0.7,
195
+ "max_tokens": 8000
196
+ }
197
+ ```
198
+ **Response:**
199
+ ```json
200
+ { "message": "Model settings updated successfully" }
201
+ ```
202
+
203
+ ---
204
+
205
+ ### **5. Session Management**
206
+
207
+ #### **GET /api/session-info**
208
+ Gets information about the current session.
209
+ **Response:**
210
+ ```json
211
+ {
212
+ "session_id": "abc123",
213
+ "dataset_name": "Housing Dataset",
214
+ "dataset_description": "...",
215
+ "model_config": { ... }
216
+ }
217
+ ```
218
+
219
+ #### **POST /set-message-info**
220
+ Associates message tracking information with the session.
221
+ **Request Body:**
222
+ ```json
223
+ {
224
+ "chat_id": 123,
225
+ "message_id": 456,
226
+ "user_id": 789
227
+ }
228
+ ```
229
+
230
+ #### **POST /create-dataset-description**
231
+ Creates an AI-generated description for a dataset.
232
+ **Request Body:**
233
+ ```json
234
+ {
235
+ "df_preview": "column1,column2\nvalue1,value2\n...",
236
+ "name": "Dataset Name"
237
+ }
238
+ ```
239
+
240
+ ---
241
+
242
+ ### **6. System Endpoints**
243
+
244
+ #### **GET /**
245
+ Returns API welcome information and feature list.
246
+
247
+ #### **GET /health**
248
+ Health check endpoint.
249
+ **Response:**
250
+ ```json
251
+ { "message": "API is healthy and running" }
252
+ ```
253
+
254
+ ---
255
+
258
+ ### **7. Authentication & Session Management**
259
+ - **Session ID Sources:**
260
+ - Query parameter: `session_id`
261
+ - Header: `X-Session-ID`
262
+ - Auto-generated if not provided
263
+ - **Session State Includes:**
264
+ - Current dataset
265
+ - AI system instance
266
+ - Model configuration
267
+ - User and chat associations
268
+
269
+ ### **8. Error Handling**
270
+ - Comprehensive error handling with appropriate HTTP status codes
271
+ - Detailed error messages for debugging
272
+ - Fallback encoding support for CSV files (UTF-8, unicode_escape, ISO-8859-1)
273
+ - Session state preservation during errors
docs/api/routes/templates.md ADDED
@@ -0,0 +1,363 @@
1
+ # Templates and Agent Loading Documentation
2
+
3
+ This document describes how the Auto-Analyst template system works, including agent loading, user preferences, and template management.
4
+
5
+ ## Overview
6
+
7
+ The Auto-Analyst system uses a flexible template-based approach for managing AI agents. Templates define specialized agents with specific capabilities, and users can customize which agents are available for their analysis workflows.
8
+
9
+ ## Template System Architecture
10
+
11
+ ### Template Types
12
+
13
+ Templates come in different **variant types** that determine how they can be used:
14
+
15
+ - **`individual`**: Templates available for single-agent queries (e.g., `@preprocessing_agent`)
16
+ - **`planner`**: Templates available for multi-agent planning workflows
17
+ - **`both`**: Templates available in both individual and planner contexts
18
+
19
+ ### Default Agents
20
+
21
+ The system includes four core default agents that are **enabled by default** for all users:
22
+
23
+ **For Individual Use:**
24
+ - `preprocessing_agent`: Data cleaning and preprocessing
25
+ - `statistical_analytics_agent`: Statistical analysis and insights
26
+ - `sk_learn_agent`: Machine learning with scikit-learn
27
+ - `data_viz_agent`: Data visualization with Plotly
28
+
29
+ **For Planner Use:**
30
+ - `planner_preprocessing_agent`: Planning version of preprocessing agent
31
+ - `planner_statistical_analytics_agent`: Planning version of statistical agent
32
+ - `planner_sk_learn_agent`: Planning version of ML agent
33
+ - `planner_data_viz_agent`: Planning version of visualization agent
34
+
35
+ ## Template Management Endpoints
36
+
37
+ ### Get All Templates
38
+
39
+ **Endpoint:** `GET /templates/`
40
+
41
+ **Query Parameters:**
42
+ - `variant_type`: Filter by `"individual"`, `"planner"`, or `"all"` (default: `"all"`)
43
+
44
+ **Response:**
45
+ ```json
46
+ [
47
+ {
48
+ "template_id": 1,
49
+ "template_name": "preprocessing_agent",
50
+ "display_name": "Data Preprocessing Agent",
51
+ "description": "Handles data cleaning, missing values, and preprocessing tasks",
52
+ "prompt_template": "You are a data preprocessing specialist...",
53
+ "template_category": "Data Processing",
54
+ "icon_url": "/icons/templates/preprocessing_agent.svg",
55
+ "is_premium_only": false,
56
+ "is_active": true,
57
+ "usage_count": 12,
58
+ "created_at": "2023-05-01T12:00:00Z",
59
+ "updated_at": "2023-05-01T12:00:00Z"
60
+ }
61
+ ]
62
+ ```
63
+
64
+ ### Get Templates by Category
65
+
66
+ **Endpoint:** `GET /templates/categories`
67
+
68
+ **Query Parameters:**
69
+ - `variant_type`: Filter by `"individual"`, `"planner"`, or `"all"` (default: `"individual"`)
70
+
71
+ **Response:**
72
+ ```json
73
+ [
74
+ {
75
+ "category": "Data Processing",
76
+ "templates": [
77
+ {
78
+ "agent_id": 1,
79
+ "agent_name": "preprocessing_agent",
80
+ "display_name": "Data Preprocessing Agent",
81
+ "description": "Handles data cleaning and preprocessing",
82
+ "icon_url": "/icons/templates/preprocessing_agent.svg",
83
+ "usage_count": 1234
84
+ }
85
+ ]
86
+ }
87
+ ]
88
+ ```
89
+
90
+ ### Get Template by ID
91
+
92
+ **Endpoint:** `GET /templates/template/{template_id}`
93
+
94
+ **Response:**
95
+ ```json
96
+ {
97
+ "template_id": 1,
98
+ "template_name": "preprocessing_agent",
99
+ "display_name": "Data Preprocessing Agent",
100
+ "description": "Handles data cleaning, missing values, and preprocessing tasks",
101
+ "prompt_template": "You are a data preprocessing specialist...",
102
+ "template_category": "Data Processing",
103
+ "icon_url": "/icons/templates/preprocessing_agent.svg",
104
+ "is_premium_only": false,
105
+ "is_active": true,
106
+ "usage_count": 1234,
107
+ "created_at": "2023-05-01T12:00:00Z",
108
+ "updated_at": "2023-05-01T12:00:00Z"
109
+ }
110
+ ```
111
+
112
+ ### Get Template Categories List
113
+
114
+ **Endpoint:** `GET /templates/categories/list`
115
+
116
+ **Response:**
117
+ ```json
118
+ {
119
+ "categories": [
120
+ "Data Processing",
121
+ "Machine Learning",
122
+ "Visualization",
123
+ "Statistics"
124
+ ]
125
+ }
126
+ ```
127
+
128
+ ### Get Templates by Specific Category
129
+
130
+ **Endpoint:** `GET /templates/category/{category}`
131
+
132
+ **Path Parameters:**
133
+ - `category`: Name of the category to filter by
134
+
135
+ **Response:**
136
+ ```json
137
+ [
138
+ {
139
+ "template_id": 1,
140
+ "template_name": "preprocessing_agent",
141
+ "display_name": "Data Preprocessing Agent",
142
+ "description": "Handles data cleaning and preprocessing",
143
+ "template_category": "Data Processing",
144
+ "icon_url": "/icons/templates/preprocessing_agent.svg",
145
+ "usage_count": 1234
146
+ }
147
+ ]
148
+ ```
149
+
150
+ ## User Template Preferences
151
+
152
+ ### How Agent Loading Works for Users
153
+
154
+ 1. **Default Behavior**: New users automatically have the 4 core default agents enabled
155
+ 2. **Custom Preferences**: Users can enable/disable additional templates through preferences
156
+ 3. **Variant-Specific**: Individual and planner variants are managed separately
157
+ 4. **Usage Tracking**: System tracks which templates users actually use
158
+
159
+ ### Get User Template Preferences
160
+
161
+ **Endpoint:** `GET /templates/user/{user_id}`
162
+
163
+ **Query Parameters:**
164
+ - `variant_type`: Filter by `"individual"`, `"planner"`, or `"all"` (default: `"planner"`)
165
+
166
+ **Response:**
167
+ ```json
168
+ [
169
+ {
170
+ "template_id": 1,
171
+ "template_name": "preprocessing_agent",
172
+ "display_name": "Data Preprocessing Agent",
173
+ "description": "Handles data cleaning and preprocessing",
174
+ "template_category": "Data Processing",
175
+ "icon_url": "/icons/templates/preprocessing_agent.svg",
176
+ "is_premium_only": false,
177
+ "is_active": true,
178
+ "is_enabled": true,
179
+ "usage_count": 15,
180
+ "last_used_at": "2023-05-01T12:00:00Z",
181
+ "created_at": "2023-04-01T12:00:00Z",
182
+ "updated_at": "2023-05-01T12:00:00Z"
183
+ }
184
+ ]
185
+ ```
186
+
187
+ ### Get Only Enabled Templates
188
+
189
+ **Endpoint:** `GET /templates/user/{user_id}/enabled`
190
+
191
+ Returns only templates that are currently enabled for the user.
192
+
193
+ ### Get Enabled Templates for Planner
194
+
195
+ **Endpoint:** `GET /templates/user/{user_id}/enabled/planner`
196
+
197
+ Returns enabled planner templates with the following restrictions:
198
+ - **Maximum 10 templates** for planner use
199
+ - **Sorted by usage** (most used first)
200
+ - **Only planner variants** (`planner` or `both` types)
201
+
202
+ ## Template Preference Management
203
+
204
+ ### Toggle Single Template
205
+
206
+ **Endpoint:** `POST /templates/user/{user_id}/template/{template_id}/toggle`
207
+
208
+ **Request Body:**
209
+ ```json
210
+ {
211
+ "is_enabled": true
212
+ }
213
+ ```
214
+
215
+ **Restrictions** (a validation sketch follows this list):
216
+ - Cannot disable all templates (at least 1 must remain enabled)
217
+ - Cannot enable more than 10 templates for planner use
218
+
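+ A minimal sketch of those rules, assuming the server tracks a per-user count of enabled templates:
+
+ ```python
+ # Toggle validation sketch; the counting mechanism is an assumption.
+ def validate_toggle(enabled_count: int, enabling: bool, planner_variant: bool) -> None:
+     if not enabling and enabled_count <= 1:
+         raise ValueError("At least one template must remain enabled")
+     if enabling and planner_variant and enabled_count >= 10:
+         raise ValueError("No more than 10 planner templates may be enabled")
+ ```
+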
219
+ ### Bulk Toggle Templates
220
+
221
+ **Endpoint:** `POST /templates/user/{user_id}/bulk-toggle`
222
+
223
+ **Request Body:**
224
+ ```json
225
+ {
226
+ "preferences": [
227
+ {
228
+ "template_id": 1,
229
+ "is_enabled": true
230
+ },
231
+ {
232
+ "template_id": 2,
233
+ "is_enabled": false
234
+ }
235
+ ]
236
+ }
237
+ ```
238
+
239
+ **Response:**
240
+ ```json
241
+ {
242
+ "results": [
243
+ {
244
+ "template_id": 1,
245
+ "success": true,
246
+ "message": "Template enabled successfully",
247
+ "is_enabled": true
248
+ }
249
+ ]
250
+ }
251
+ ```
252
+
253
+ ## Template Categories and Icons
254
+
255
+ ### Available Categories
256
+
257
+ Templates are organized into categories such as:
258
+ - **Data Processing**: Preprocessing, cleaning, feature engineering
259
+ - **Machine Learning**: Various ML frameworks and algorithms
260
+ - **Visualization**: Plotting and chart generation
261
+ - **Statistics**: Statistical analysis and modeling
262
+ - **Custom**: User or organization-specific templates
263
+
264
+ ### Icon System
265
+
266
+ Templates include visual icons stored in `/public/icons/templates/`:
267
+
268
+ **Core Agent Icons:**
269
+ - `preprocessing_agent.svg`: Data preprocessing
270
+ - `sk_learn_agent.svg`: Machine learning
271
+ - `matplotlib_agent.png`: Plotting with matplotlib
272
+ - `polars_agent.svg`: Data manipulation with Polars
273
+
274
+ **Library-Specific Icons:**
275
+ - `numpy.svg`, `scipy.png`: Scientific computing
276
+ - `plotly.svg`, `seaborn.svg`: Advanced visualization
277
+ - `lightgbm.png`, `xgboost.png`: Gradient boosting
278
+ - `pymc.png`, `statsmodel.svg`: Statistical modeling
279
+
280
+ **Special Purpose Icons:**
281
+ - `data-cleaning.png`: Data cleaning workflows
282
+ - `feature-engineering.png`: Feature engineering tasks
283
+
284
+ ## Agent Loading Process
285
+
286
+ ### For Individual Queries
287
+
288
+ When a user makes a query like `@preprocessing_agent analyze my data`:
289
+
290
+ 1. **Check User Preferences**: System looks up user's enabled individual templates
291
+ 2. **Apply Defaults**: If no preference exists, default agents are enabled
292
+ 3. **Load Agent**: System loads the specific agent template and executes the query
293
+ 4. **Track Usage**: Usage count is incremented for analytics
294
+
295
+ ### For Planner Workflows
296
+
297
+ When a user makes a general query that triggers the planner (a selection sketch follows this list):
298
+
299
+ 1. **Get Enabled Planner Templates**: System queries user's enabled planner variants
300
+ 2. **Apply 10-Template Limit**: Maximum 10 templates for performance
301
+ 3. **Sort by Usage**: Most-used templates get priority
302
+ 4. **Create Plan**: Planner selects appropriate agents for the analysis
303
+ 5. **Execute Workflow**: Selected agents execute in sequence
304
+ 6. **Update Usage**: Usage statistics updated for selected agents
305
+
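+ Steps 1–3 can be sketched as a simple filter-and-sort; the dict keys are assumed field names based on the preference schema shown earlier:
+
+ ```python
+ # Planner template selection sketch; field names are assumptions.
+ def select_planner_templates(prefs: list[dict], limit: int = 10) -> list[dict]:
+     enabled = [p for p in prefs if p.get("is_enabled")]
+     # most-used templates get priority, capped at the 10-template limit
+     return sorted(enabled, key=lambda p: p.get("usage_count", 0), reverse=True)[:limit]
+ ```
+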
306
+ ### Default Agent Behavior
307
+
308
+ ```python
309
+ # Default agents enabled for new users
310
+ individual_defaults = [
311
+     "preprocessing_agent",
312
+     "statistical_analytics_agent",
313
+     "sk_learn_agent",
314
+     "data_viz_agent",
315
+ ]
316
+
317
+ planner_defaults = [
318
+     "planner_preprocessing_agent",
319
+     "planner_statistical_analytics_agent",
320
+     "planner_sk_learn_agent",
321
+     "planner_data_viz_agent",
322
+ ]
323
+ ```
324
+
325
+ ## Usage Analytics
326
+
327
+ ### Global Usage Tracking
328
+
329
+ The system tracks global usage statistics across all users:
330
+ - **Total usage count** per template
331
+ - **User-specific usage** for personalization
332
+ - **Last used timestamps** for sorting
333
+
334
+ ### Usage-Based Features
335
+
336
+ - **Template Recommendations**: Popular templates shown first
337
+ - **Personalized Ordering**: User's most-used templates prioritized
338
+ - **Analytics Dashboard**: Usage patterns for administrators
339
+
340
+ ## Template Restrictions
341
+
342
+ ### User Limitations
343
+
344
+ - **Minimum 1 Agent**: Cannot disable all templates
345
+ - **Maximum 10 for Planner**: Performance optimization
346
+ - **Premium Templates**: Some templates require premium access
347
+
348
+ ### System Limitations
349
+
350
+ - **Active Templates Only**: Inactive templates not available
351
+ - **Variant Compatibility**: Individual/planner variants managed separately
352
+ - **Category Organization**: Templates must belong to valid categories
353
+
354
+ ## Integration with Deep Analysis
355
+
356
+ The deep analysis system uses the template preference system:
357
+
358
+ 1. **Load User Preferences**: Gets enabled planner templates for user
359
+ 2. **Create Agent Pool**: Instantiates agents from enabled templates
360
+ 3. **Execute Analysis**: Uses available agents for comprehensive analysis
361
+ 4. **Fallback Behavior**: Uses default agents if no preferences found
362
+
363
+ This ensures users get personalized deep analysis based on their template preferences while maintaining system performance through the 10-template limit.
docs/architecture/architecture.md ADDED
@@ -0,0 +1,427 @@
1
+ # Auto-Analyst Backend System Architecture
2
+
3
+ ## Overview
4
+
5
+ Auto-Analyst is a sophisticated multi-agent AI platform designed for comprehensive data analysis. The backend system orchestrates specialized AI agents, manages user sessions, and provides a robust API for data processing and analysis workflows.
6
+
7
+ ## 🏗️ High-Level Architecture
8
+
9
+ ```
10
+ ┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐
11
+ │    Frontend     │    │     Backend      │    │    Database     │
12
+ │    (Next.js)    │◄──►│    (FastAPI)     │◄──►│  (PostgreSQL/   │
13
+ │                 │    │                  │    │     SQLite)     │
14
+ └─────────────────┘    └──────────────────┘    └─────────────────┘
15
+                                 │
16
+                                 ▼
17
+                        ┌──────────────────┐
18
+                        │    AI Models     │
19
+                        │   (DSPy/LLMs)    │
20
+                        └──────────────────┘
21
+                                 │
22
+                                 ▼
23
+                        ┌──────────────────┐
24
+                        │   Agent System   │
25
+                        │   [Processing]   │
26
+                        │   [Analytics]    │
27
+                        │ [Visualization]  │
28
+                        └──────────────────┘
29
+ ```
30
+
31
+ ## 🎯 Core Components
32
+
33
+ ### 1. Application Layer (`app.py`)
34
+
35
+ **FastAPI Application Server**
36
+ - **Role**: Main HTTP server and request router
37
+ - **Responsibilities**:
38
+ - Request/response handling
39
+ - Session-based authentication
40
+ - Route registration and middleware
41
+ - Error handling and logging
42
+ - Static file serving
43
+ - CORS configuration
44
+
45
+ **Key Features**:
46
+ - Async/await support for high concurrency
47
+ - Automatic API documentation generation
48
+ - Request validation with Pydantic
49
+ - Session management for user tracking
50
+
51
+ ### 2. Agent System (`src/agents/`)
52
+
53
+ **Multi-Agent Orchestra**
54
+ - **Core Agents**: Specialized AI agents for different analysis tasks
55
+ - **Deep Analysis**: Advanced multi-agent coordination system
56
+ - **Template System**: User-customizable agent configurations
57
+
58
+ #### Agent Types
59
+
60
+ 1. **Individual Agents** (`agents.py`):
61
+ ```python
62
+ - preprocessing_agent # Data cleaning and preparation
63
+ - statistical_analytics_agent # Statistical analysis
64
+ - sk_learn_agent # Machine learning with scikit-learn
65
+ - data_viz_agent # Data visualization
66
+ - basic_qa_agent # General Q&A
67
+ ```
68
+
69
+ 2. **Planner Agents** (Multi-agent coordination):
70
+ ```python
71
+ - planner_preprocessing_agent
72
+ - planner_statistical_analytics_agent
73
+ - planner_sk_learn_agent
74
+ - planner_data_viz_agent
75
+ ```
76
+
77
+ 3. **Deep Analysis System** (`deep_agents.py`):
78
+ ```python
79
+ - deep_questions # Question generation
80
+ - deep_planner # Execution planning
81
+ - deep_code_synthesizer # Code combination
82
+ - deep_synthesizer # Result synthesis
83
+ - final_conclusion # Report generation
84
+ ```
85
+
86
+ #### Agent Architecture Pattern
87
+
88
+ ```python
89
+ class AgentSignature(dspy.Signature):
90
+     """Agent description and purpose"""
91
+     goal = dspy.InputField(desc="Analysis objective")
92
+     dataset = dspy.InputField(desc="Dataset information")
93
+     plan_instructions = dspy.InputField(desc="Execution plan")
94
+
95
+     summary = dspy.OutputField(desc="Analysis summary")
96
+     code = dspy.OutputField(desc="Generated code")
97
+ ```
98
+
99
+ ### 3. Database Layer (`src/db/`)
100
+
101
+ **Data Persistence and Management**
102
+
103
+ #### Database Models (`schemas/models.py`):
104
+
105
+ ```python
106
+ # Core Models
107
+ User # User accounts and authentication
108
+ Chat # Conversation sessions
109
+ Message # Individual messages in chats
110
+ ModelUsage # AI model usage tracking
111
+
112
+ # Template System
113
+ AgentTemplate # Agent definitions and configurations
114
+ UserTemplatePreference # User's enabled/disabled agents
115
+
116
+ # Deep Analysis
117
+ DeepAnalysisReport # Analysis reports and results
118
+
119
+ # Analytics
120
+ CodeExecution # Code execution tracking
121
+ UserAnalytics # User behavior analytics
122
+ ```
123
+
124
+ #### Database Architecture:
125
+
126
+ ```
127
+ Users (1) ──────── (Many) Chats
128
+   │                        │
129
+   │                        ▼
130
+   └─── (Many) ModelUsage ──┘
131
+   │
132
+   └─── (Many) UserTemplatePreference
133
+                        │
134
+                        ▼
135
+                  AgentTemplate
136
+ ```
137
+
138
+ ### 4. Route Handlers (`src/routes/`)
139
+
140
+ **RESTful API Endpoints**
141
+
142
+ | Module | Purpose | Key Endpoints |
143
+ |--------|---------|---------------|
144
+ | `session_routes.py` | Core functionality | `/upload_excel`, `/session_info` |
145
+ | `chat_routes.py` | Chat management | `/chats`, `/messages`, `/delete_chat` |
146
+ | `code_routes.py` | Code operations | `/execute_code`, `/get_latest_code` |
147
+ | `templates_routes.py` | Agent templates | `/templates`, `/user/{id}/enabled` |
148
+ | `deep_analysis_routes.py` | Deep analysis | `/reports`, `/download_from_db` |
149
+ | `analytics_routes.py` | System analytics | `/usage`, `/feedback`, `/costs` |
150
+ | `feedback_routes.py` | User feedback | `/feedback`, `/message/{id}/feedback` |
151
+
152
+ NOTE: Prefix each endpoint with its router's prefix when calling it; for example, the dashboard is reached at `http://localhost:8000/templates/dashboard`.
153
+
154
+ ### 5. Business Logic Layer (`src/managers/`)
155
+
156
+ **Service Layer for Complex Operations**
157
+
158
+ #### Manager Components:
159
+
160
+ 1. **`chat_manager.py`**:
161
+ ```python
162
+ - Session management
163
+ - Message handling
164
+ - Context preservation
165
+ - Agent orchestration
166
+ ```
167
+
168
+ 2. **`ai_manager.py`**:
169
+ ```python
170
+ - Model selection and routing
171
+ - Token tracking and cost calculation
172
+ - Error handling and retries
173
+ - Response formatting
174
+ ```
175
+
176
+ 3. **`session_manager.py`**:
177
+ ```python
178
+ - Session lifecycle management
179
+ - Data sharing between agents
180
+ - Memory management
181
+ - Cleanup operations
182
+ ```
183
+
184
+ ### 6. Utility Layer (`src/utils/`)
185
+
186
+ **Shared Services and Helpers**
187
+
188
+ - **`logger.py`**: Centralized logging system
189
+ - **`generate_report.py`**: HTML report generation
190
+ - **`model_registry.py`**: AI model configuration
191
+
192
+ ## 🔄 Data Flow Architecture
193
+
194
+ ### 1. Request Processing Flow
195
+
196
+ ```
197
+ HTTP Request → FastAPI Router → Route Handler → Manager/Business Logic →
198
+ Database/Agent System → AI Model → Response Processing → JSON Response
199
+ ```
200
+
201
+ ### 2. Agent Execution Flow
202
+
203
+ ```
204
+ User Query → Session Creation → Template Selection → Agent Loading →
205
+ Code Generation → Code Execution → Result Processing → Response Formatting
206
+ ```
207
+
208
+ ### 3. Deep Analysis Flow
209
+
210
+ ```
211
+ Analysis Goal → Question Generation → Planning Phase → Agent Coordination →
212
+ Code Synthesis → Execution → Result Synthesis → Final Report Generation
213
+ ```
214
+
215
+ ### 4. Template System Flow
216
+
217
+ ```
218
+ User Preferences → Template Loading → Agent Registration →
219
+ Capability Mapping → Execution Routing → Usage Tracking
220
+ ```
221
+
222
+ ## 🎨 Design Patterns
223
+
224
+ ### 1. **Module Pattern**
225
+ - Clear separation of concerns
226
+ - Each module has specific responsibilities
227
+ - Minimal dependencies between modules
228
+
229
+ ### 2. **Repository Pattern**
230
+ - Database access abstracted through SQLAlchemy
231
+ - Session management centralized
232
+ - Clean separation of data and business logic
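+ 
+ A minimal sketch of the idea, using the project's `session_factory` and `Chat` model referenced elsewhere in these docs (the helper itself is illustrative):
+ 
+ ```python
+ from src.db.init_db import session_factory
+ from src.db.schemas.models import Chat
+ 
+ def list_chats_for_user(user_id: int):
+     """Handlers call helpers like this instead of touching the engine directly."""
+     session = session_factory()
+     try:
+         return session.query(Chat).filter(Chat.user_id == user_id).all()
+     finally:
+         session.close()
+ ```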
233
+
234
+ ### 3. **Strategy Pattern**
235
+ - Multiple AI models supported through unified interface
236
+ - Agent selection based on user preferences
237
+ - Dynamic template loading
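+ 
+ A minimal sketch of the model-selection strategy (the registry and function names are illustrative, not the actual implementation):
+ 
+ ```python
+ import dspy
+ 
+ # Hypothetical registry: provider name -> LM factory
+ PROVIDER_STRATEGIES = {
+     "openai": lambda name: dspy.LM(f"openai/{name}"),
+     "anthropic": lambda name: dspy.LM(f"anthropic/{name}"),
+ }
+ 
+ def select_model(provider: str, model_name: str) -> dspy.LM:
+     try:
+         return PROVIDER_STRATEGIES[provider](model_name)
+     except KeyError:
+         raise ValueError(f"Unsupported provider: {provider}")
+ ```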
238
+
239
+ ### 4. **Observer Pattern**
240
+ - Usage tracking and analytics
241
+ - Event-driven model updates
242
+ - Real-time progress notifications
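+ 
+ A toy sketch of the observer idea (illustrative only; the actual tracking lives in `ai_manager.py`):
+ 
+ ```python
+ from typing import Callable, Dict, List
+ 
+ _subscribers: List[Callable[[Dict], None]] = []
+ 
+ def subscribe(callback: Callable[[Dict], None]) -> None:
+     _subscribers.append(callback)
+ 
+ def notify_usage(event: Dict) -> None:
+     for callback in _subscribers:
+         callback(event)
+ 
+ # e.g. an analytics sink registered at startup
+ subscribe(lambda event: print("tokens used:", event["total_tokens"]))
+ notify_usage({"total_tokens": 512})
+ ```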
243
+
244
+ ## 🔧 Configuration Management
245
+
246
+ ### Environment Configuration
247
+
248
+ ```python
249
+ # Database
250
+ DATABASE_URL: str # Database connection string
251
+ POSTGRES_PASSWORD: str # PostgreSQL password (optional)
252
+
253
+ # AI Models
254
+ ANTHROPIC_API_KEY: str # Claude API key
255
+ OPENAI_API_KEY: str # OpenAI API key
256
+
257
+ # Authentication
258
+ ADMIN_API_KEY: str # Admin operations key (optional)
259
+
260
+ # Deployment
261
+ PORT: int = 8000 # Server port
262
+ DEBUG: bool = False # Debug mode
263
+ ```
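+ 
+ As a rough illustration, these variables can be read at startup with `os.getenv`; the sketch below mirrors the table above but is not the application's actual configuration code:
+ 
+ ```python
+ import os
+ 
+ DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./auto_analyst.db")
+ ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ ADMIN_API_KEY = os.getenv("ADMIN_API_KEY")  # optional
+ PORT = int(os.getenv("PORT", "8000"))
+ DEBUG = os.getenv("DEBUG", "false").lower() == "true"
+ ```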
264
+
265
+ ### Agent Configuration (`agents_config.json`)
266
+
267
+ ```json
268
+ {
269
+ "default_agents": [
270
+ {
271
+ "template_name": "preprocessing_agent",
272
+ "description": "Data cleaning and preparation",
273
+ "variant_type": "both",
274
+ "is_premium": false,
275
+ "usage_count": 0,
276
+ "icon_url": "preprocessing.svg"
277
+ }
278
+ ],
279
+ "premium_templates": [...],
280
+ "remove": [...]
281
+ }
282
+ ```
283
+
284
+ ## 🔒 Security Architecture
285
+
286
+ ### Authentication & Authorization
287
+
288
+ 1. **Session-based Authentication**:
289
+ - Session IDs for user identification
290
+ - Optional API key authentication for admin endpoints
291
+
292
+ 2. **Input Validation**:
293
+ - Pydantic models for request validation
294
+ - SQL injection prevention through SQLAlchemy
295
+ - File upload restrictions and validation
296
+
297
+ 3. **Resource Protection**:
298
+ - User-specific data isolation
299
+ - Usage tracking and monitoring
300
+ - Rate limiting considerations
301
+
302
+ ### Data Security
303
+
304
+ 1. **Database Security**:
305
+ - Encrypted connections for PostgreSQL
306
+ - Parameterized queries
307
+ - Regular backup procedures
308
+
309
+ 2. **Code Execution Security**:
310
+ - Sandboxed code execution environment
311
+ - Limited library imports
312
+ - Timeout protection
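+ 
+ The general shape of such sandboxing is sketched below. This is illustrative only, not the project's actual sandbox (`signal.alarm` is Unix-only):
+ 
+ ```python
+ import signal
+ 
+ # Restricted builtins: no __import__, so arbitrary imports fail
+ ALLOWED_GLOBALS = {"__builtins__": {"print": print, "len": len, "range": range}}
+ 
+ def run_untrusted(code: str, timeout_s: int = 10) -> None:
+     def _on_timeout(signum, frame):
+         raise TimeoutError("code execution timed out")
+     signal.signal(signal.SIGALRM, _on_timeout)
+     signal.alarm(timeout_s)  # hard timeout for runaway code
+     try:
+         exec(code, ALLOWED_GLOBALS, {})
+     finally:
+         signal.alarm(0)
+ ```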
313
+
314
+ ## 📊 Performance Architecture
315
+
316
+ ### Scalability Features
317
+
318
+ 1. **Async Architecture**:
319
+ - Non-blocking I/O operations
320
+ - Concurrent agent execution
321
+ - Streaming responses for long operations
322
+
323
+ 2. **Database Optimization**:
324
+ - Connection pooling
325
+ - Query optimization
326
+ - Indexed frequently accessed columns
327
+
328
+ 3. **Caching Strategy**:
329
+ - In-memory caching for templates
330
+ - Result caching for expensive operations
331
+ - Session data management
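+ 
+ A toy TTL cache for template lookups, sketching the idea rather than the actual caching layer:
+ 
+ ```python
+ import time
+ from typing import Any, Callable, Dict, Tuple
+ 
+ _template_cache: Dict[str, Tuple[Any, float]] = {}
+ 
+ def get_template_cached(name: str, loader: Callable[[str], Any], ttl_s: int = 300) -> Any:
+     entry = _template_cache.get(name)
+     if entry and time.time() - entry[1] < ttl_s:
+         return entry[0]  # fresh cache hit
+     value = loader(name)  # miss or stale entry: reload
+     _template_cache[name] = (value, time.time())
+     return value
+ ```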
332
+
333
+ ### Performance Monitoring
334
+
335
+ 1. **Usage Analytics**:
336
+ - Request/response time tracking
337
+ - Token usage monitoring
338
+ - Error rate analysis
339
+
340
+ 2. **Resource Monitoring**:
341
+ - Database query performance
342
+ - Memory usage tracking
343
+ - Agent execution time analysis
344
+
345
+ ## 🚀 Deployment Architecture
346
+
347
+ ### Development Environment
348
+
349
+ ```
350
+ Local Development → SQLite Database → File-based Logging →
351
+ Direct Model API Calls → Hot Reloading
352
+ ```
353
+
354
+ ### Production Environment
355
+
356
+ ```
357
+ Load Balancer → Multiple FastAPI Instances → PostgreSQL Database →
358
+ Centralized Logging → Monitoring & Alerting
359
+ ```
360
+
361
+ ### Container Architecture
362
+
363
+ ```dockerfile
364
+ # Multi-stage build for optimization
365
+ FROM python:3.11-slim as base
366
+ # Dependencies and application setup
367
+ # Health checks and graceful shutdown
368
+ # Environment-specific configurations
369
+ ```
370
+
371
+ ## 🔄 Integration Patterns
372
+
373
+ ### External Service Integration
374
+
375
+ 1. **AI Model Providers**:
376
+ - Anthropic (Claude)
377
+ - OpenAI (GPT models)
378
+ - Unified interface through DSPy
379
+
380
+ 2. **Database Systems**:
381
+ - PostgreSQL (production)
382
+ - SQLite (development)
383
+ - Migration support through Alembic
384
+
385
+ ### Frontend Integration
386
+
387
+ 1. **REST API**:
388
+ - Standard HTTP endpoints
389
+ - JSON request/response format
390
+ - Session-based communication
391
+
392
+ 2. **Data Exchange**:
393
+ - File upload capabilities
394
+ - Real-time analysis results
395
+ - Report generation and download
396
+
397
+ ### Third-Party Integration
398
+
399
+ 1. **Python Data Science Stack** (used only by agent-generated code):
400
+ - Pandas for data manipulation
401
+ - NumPy for numerical computing
402
+ - Scikit-learn for machine learning
403
+ - Plotly for visualization
404
+ - Statsmodels for statistical analysis
405
+
406
+ 2. **Development Tools**:
407
+ - Alembic for database migrations
408
+ - SQLAlchemy for ORM
409
+ - FastAPI for web framework
410
+ - Pydantic for data validation
411
+
412
+ ## 📝 Documentation Architecture
413
+
414
+ ### API Documentation
415
+
416
+ 1. **Auto-generated Docs**: Available at `/docs` endpoint
417
+ 2. **Schema Definitions**: Pydantic models with descriptions
418
+ 3. **Endpoint Documentation**: Detailed parameter and response docs
419
+
420
+ ### Code Documentation
421
+
422
+ 1. **Inline Documentation**: Comprehensive docstrings
423
+ 2. **Architecture Guides**: High-level system design documentation
424
+ 3. **Getting Started**: Developer onboarding documentation
425
+ 4. **Troubleshooting**: Common issues and solutions
426
+
427
+ This architecture provides a robust, scalable foundation for multi-agent AI analysis while maintaining clean separation of concerns and supporting both development and production deployment scenarios.
docs/development/development_workflow.md ADDED
@@ -0,0 +1,506 @@
1
+ # Auto-Analyst Backend Development Workflow
2
+
3
+ ## 🎯 Development Philosophy
4
+
5
+ The Auto-Analyst backend follows modern Python development practices with emphasis on:
6
+ - **Modularity**: Clear separation of concerns across components
7
+ - **Async-First**: Non-blocking operations for scalability
8
+ - **Type Safety**: Comprehensive type hints and validation
9
+ - **Documentation**: Self-documenting code and comprehensive docs
10
+ - **Testing**: Robust testing at multiple levels
11
+ - **Performance**: Optimized for real-world usage patterns
12
+
13
+ ## 🏗️ Code Organization Principles
14
+
15
+ ### 1. **Directory Structure Standards**
16
+
17
+ ```
18
+ src/
19
+ ├── agents/ # AI agent implementations
20
+ │ ├── agents.py # Core agent definitions
21
+ │ ├── deep_agents.py # Deep analysis system
22
+ │ └── retrievers/ # Information retrieval components
23
+ ├── db/ # Database layer
24
+ │ ├── init_db.py # Database initialization
25
+ │ └── schemas/ # SQLAlchemy models
26
+ ├── managers/ # Business logic layer
27
+ │ ├── chat_manager.py # Chat operations
28
+ │ ├── ai_manager.py # AI model management
29
+ │ └── session_manager.py # Session lifecycle
30
+ ├── routes/ # FastAPI route handlers
31
+ │ ├── session_routes.py # Core functionality
32
+ │ ├── chat_routes.py # Chat endpoints
33
+ │ └── [feature]_routes.py # Feature-specific routes
34
+ ├── utils/ # Shared utilities
35
+ │ ├── logger.py # Centralized logging
36
+ │ └── helpers.py # Common functions
37
+ └── schemas/ # Pydantic models
38
+ ├── chat_schemas.py # Chat data models
39
+ └── [feature]_schemas.py # Feature schemas
40
+ ```
41
+
42
+ ### 2. **Import Organization**
43
+
44
+ ```python
45
+ # Standard library imports
46
+ import asyncio
47
+ import json
48
+ from datetime import datetime
49
+ from typing import List, Optional, Dict, Any
50
+
51
+ # Third-party imports
52
+ import dspy
53
+ import pandas as pd
54
+ from fastapi import APIRouter, Depends, HTTPException
55
+ from pydantic import BaseModel
56
+ from sqlalchemy.orm import Session
57
+
58
+ # Local imports
59
+ from src.db.init_db import session_factory
60
+ from src.db.schemas.models import User, Chat
61
+ from src.utils.logger import Logger
62
+ from src.managers.chat_manager import ChatManager
63
+ ```
64
+
65
+ ## 🛠️ Development Patterns
66
+
67
+ ### 1. **Agent Development Pattern**
68
+
69
+ ```python
70
+ # 1. Define DSPy Signature
71
+ class new_analysis_agent(dspy.Signature):
72
+ """
73
+ Comprehensive docstring explaining:
74
+ - Agent purpose and capabilities
75
+ - Input requirements and formats
76
+ - Expected output format
77
+ - Usage examples
78
+ """
79
+ goal = dspy.InputField(desc="Clear description of analysis objective")
80
+ dataset = dspy.InputField(desc="Dataset structure and content description")
81
+ plan_instructions = dspy.InputField(desc="Execution plan from planner")
82
+
83
+ summary = dspy.OutputField(desc="Natural language summary of analysis")
84
+ code = dspy.OutputField(desc="Executable Python code for analysis")
85
+
86
+ # 2. Add to Agent Configuration
87
+ # In agents_config.json:
88
+ {
89
+ "template_name": "new_analysis_agent",
90
+ "description": "Performs specialized analysis on datasets",
91
+ "variant_type": "both", # individual, planner, or both
92
+ "is_premium": false, # Will be active by default
93
+ "usage_count": 0,
94
+ "icon_url": "analysis.svg"
95
+ }
96
+
97
+ # 3. Register in Agent System
98
+ # In agents.py, add to the appropriate loading functions
99
+ ```
100
+
101
+ ### 2. **Route Development Pattern**
102
+
103
+ ```python
104
+ # 1. Create route file: src/routes/feature_routes.py
105
+ from fastapi import APIRouter, Depends, HTTPException, Query
106
+ from pydantic import BaseModel
107
+ from typing import List, Optional
108
+ from src.db.init_db import session_factory
109
+ from src.db.schemas.models import FeatureModel
110
+ from src.utils.logger import Logger
111
+
112
+ logger = Logger("feature_routes", see_time=True, console_log=False)
113
+ router = APIRouter(prefix="/feature", tags=["feature"])
114
+
115
+ # 2. Define Pydantic schemas
116
+ class FeatureCreate(BaseModel):
117
+ name: str
118
+ description: Optional[str] = None
119
+
120
+ class FeatureResponse(BaseModel):
121
+ id: int
122
+ name: str
123
+ description: Optional[str]
124
+ created_at: datetime
125
+
126
+ # 3. Implement endpoints with proper error handling
127
+ @router.post("/", response_model=FeatureResponse)
128
+ async def create_feature(feature: FeatureCreate):
129
+ try:
130
+ session = session_factory()
131
+ try:
132
+ new_feature = FeatureModel(
133
+ name=feature.name,
134
+ description=feature.description
135
+ )
136
+ session.add(new_feature)
137
+ session.commit()
138
+ session.refresh(new_feature)
139
+
140
+ return FeatureResponse(
141
+ id=new_feature.id,
142
+ name=new_feature.name,
143
+ description=new_feature.description,
144
+ created_at=new_feature.created_at
145
+ )
146
+
147
+ except Exception as e:
148
+ session.rollback()
149
+ logger.log_message(f"Error creating feature: {str(e)}", level=logging.ERROR)
150
+ raise HTTPException(status_code=500, detail=f"Failed to create feature: {str(e)}")
151
+ finally:
152
+ session.close()
153
+
154
+ except Exception as e:
155
+ logger.log_message(f"Error in create_feature: {str(e)}", level=logging.ERROR)
156
+ raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
157
+
158
+ # 4. Register in app.py
159
+ from src.routes.feature_routes import router as feature_router
160
+ app.include_router(feature_router)
161
+ ```
162
+
163
+ ### 3. **Database Model Pattern**
164
+
165
+ ```python
166
+ # In src/db/schemas/models.py
167
+ from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, ForeignKey
168
+ from sqlalchemy.orm import relationship
169
+ from sqlalchemy.ext.declarative import declarative_base
170
+ from datetime import datetime, timezone
171
+
172
+ Base = declarative_base()
173
+
174
+ class NewModel(Base):
175
+ __tablename__ = "new_models"
176
+
177
+ # Primary key
178
+ id = Column(Integer, primary_key=True, autoincrement=True)
179
+
180
+ # Required fields
181
+ name = Column(String(255), nullable=False, unique=True)
182
+
183
+ # Optional fields
184
+ description = Column(Text, nullable=True)
185
+ is_active = Column(Boolean, default=True, nullable=False)
186
+
187
+ # Timestamps
188
+ created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), nullable=False)
189
+ updated_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), onupdate=lambda: datetime.now(timezone.utc), nullable=False)
190
+
191
+ # Foreign keys
192
+ user_id = Column(Integer, ForeignKey("users.user_id"), nullable=True)
193
+
194
+ # Relationships
195
+ user = relationship("User", back_populates="new_models")
196
+
197
+ def __repr__(self):
198
+ return f"<NewModel(id={self.id}, name='{self.name}')>"
199
+
200
+ # Update User model to include back reference
201
+ class User(Base):
202
+ # ... existing fields ...
203
+ new_models = relationship("NewModel", back_populates="user")
204
+ ```
205
+
206
+ ### 4. **Manager Pattern**
207
+
208
+ ```python
209
+ # In src/managers/feature_manager.py
210
+ from typing import List, Optional, Dict, Any
211
+ from sqlalchemy.orm import Session
212
+ from src.db.schemas.models import FeatureModel
213
+ from src.utils.logger import Logger
214
+
215
+ logger = Logger("feature_manager", see_time=True, console_log=False)
216
+
217
+ class FeatureManager:
218
+ """
219
+ Manages business logic for feature operations.
220
+ Separates complex business logic from route handlers.
221
+ """
222
+
223
+ def __init__(self, session: Session):
224
+ self.session = session
225
+
226
+ async def create_feature(self, name: str, description: Optional[str] = None) -> FeatureModel:
227
+ """Create a new feature with validation and business logic."""
228
+ try:
229
+ # Validation
230
+ if not name or len(name.strip()) == 0:
231
+ raise ValueError("Feature name cannot be empty")
232
+
233
+ # Check for duplicates
234
+ existing = self.session.query(FeatureModel).filter_by(name=name).first()
235
+ if existing:
236
+ raise ValueError(f"Feature with name '{name}' already exists")
237
+
238
+ # Create feature
239
+ feature = FeatureModel(name=name, description=description)
240
+ self.session.add(feature)
241
+ self.session.commit()
242
+ self.session.refresh(feature)
243
+
244
+ logger.log_message(f"Created feature: {name}", level=logging.INFO)
245
+ return feature
246
+
247
+ except Exception as e:
248
+ self.session.rollback()
249
+ logger.log_message(f"Error creating feature: {str(e)}", level=logging.ERROR)
250
+ raise
251
+
252
+ async def get_features(self, active_only: bool = True) -> List[FeatureModel]:
253
+ """Retrieve features with optional filtering."""
254
+ try:
255
+ query = self.session.query(FeatureModel)
256
+ if active_only:
257
+ query = query.filter(FeatureModel.is_active == True)
258
+
259
+ features = query.order_by(FeatureModel.created_at.desc()).all()
260
+ return features
261
+
262
+ except Exception as e:
263
+ logger.log_message(f"Error retrieving features: {str(e)}", level=logging.ERROR)
264
+ raise
265
+ ```
266
+
267
+ ## 📋 Code Quality Standards
268
+
269
+ ### 1. **Type Hints and Documentation**
270
+
271
+ ```python
272
+ from typing import List, Optional, Dict, Any, Union
273
+ from datetime import datetime
274
+
275
+ async def process_analysis_data(
276
+ data: pd.DataFrame,
277
+ analysis_type: str,
278
+ user_id: Optional[int] = None,
279
+ options: Dict[str, Any] = None
280
+ ) -> Dict[str, Union[str, List[Any], bool]]:
281
+ """
282
+ Process analysis data with specified parameters.
283
+
284
+ Args:
285
+ data: Input DataFrame containing the data to analyze
286
+ analysis_type: Type of analysis to perform ("statistical", "ml", "viz")
287
+ user_id: Optional user ID for tracking and personalization
288
+ options: Additional options for analysis configuration
289
+
290
+ Returns:
291
+ Dictionary containing:
292
+ - status: "success" or "error"
293
+ - result: Analysis results or error message
294
+ - metadata: Additional information about the analysis
295
+
296
+ Raises:
297
+ ValueError: If analysis_type is not supported
298
+ DataError: If data format is invalid
299
+
300
+ Example:
301
+ >>> data = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
302
+ >>> result = await process_analysis_data(data, "statistical")
303
+ >>> print(result["status"])
304
+ "success"
305
+ """
306
+ if options is None:
307
+ options = {}
308
+
309
+ # Implementation...
310
+ return {"status": "success", "result": [], "metadata": {}}
311
+ ```
312
+
313
+ ### 2. **Error Handling Patterns**
314
+
315
+ ```python
316
+ # Comprehensive error handling with logging and user-friendly messages
317
+ async def safe_operation(data: Any) -> Dict[str, Any]:
318
+ """
319
+ Template for safe operations with comprehensive error handling.
320
+ """
321
+ try:
322
+ # Validation
323
+ if not data:
324
+ raise ValueError("Data cannot be empty")
325
+
326
+ # Main operation
327
+ result = await perform_operation(data)
328
+
329
+ # Success logging
330
+ logger.log_message("Operation completed successfully", level=logging.INFO)
331
+ return {"success": True, "data": result}
332
+
333
+ except ValueError as e:
334
+ # Input validation errors
335
+ logger.log_message(f"Validation error: {str(e)}", level=logging.WARNING)
336
+ return {"success": False, "error": "Invalid input", "details": str(e)}
337
+
338
+ except ConnectionError as e:
339
+ # External service errors
340
+ logger.log_message(f"Connection error: {str(e)}", level=logging.ERROR)
341
+ return {"success": False, "error": "Service unavailable", "details": "Please try again later"}
342
+
343
+ except Exception as e:
344
+ # Unexpected errors
345
+ logger.log_message(f"Unexpected error in safe_operation: {str(e)}", level=logging.ERROR)
346
+ return {"success": False, "error": "Internal error", "details": "Please contact support"}
347
+ ```
348
+
349
+ ### 3. **Async/Await Best Practices**
350
+
351
+ ```python
352
+ import asyncio
353
+ from typing import List, Coroutine
354
+
355
+ # Proper async function definition
356
+ async def async_agent_execution(agents: List[str], query: str) -> List[Dict[str, Any]]:
357
+ """Execute multiple agents concurrently."""
358
+
359
+ # Create coroutines
360
+ tasks = [
361
+ execute_single_agent(agent, query)
362
+ for agent in agents
363
+ ]
364
+
365
+ # Execute concurrently with error handling
366
+ results = []
367
+ for task in asyncio.as_completed(tasks):
368
+ try:
369
+ result = await task
370
+ results.append(result)
371
+ except Exception as e:
372
+ logger.log_message(f"Agent execution failed: {e}", level=logging.ERROR)
373
+ results.append({"error": str(e)})
374
+
375
+ return results
376
+
377
+ # Database operations with proper session management
378
+ async def async_database_operation(session: Session) -> Any:
379
+ """Template for async database operations."""
380
+ try:
381
+ # Use asyncio.to_thread for CPU-bound database operations
382
+ result = await asyncio.to_thread(
383
+ lambda: session.query(Model).filter(...).all()
384
+ )
385
+ return result
386
+ except Exception as e:
387
+ session.rollback()
388
+ raise
389
+ finally:
390
+ session.close()
391
+ ```
392
+
393
+ ## 🔧 Development Workflow
394
+
395
+ ### 1. **Feature Development Process**
396
+
397
+ 1. **Plan the Feature**:
398
+ ```bash
399
+ # Create feature branch
400
+ git checkout -b feature/new-analysis-agent
401
+
402
+ # Document requirements
403
+ echo "## New Analysis Agent" >> docs/feature_plan.md
404
+ ```
405
+
406
+ 2. **Implement Core Logic**:
407
+ ```bash
408
+ # Create agent signature
409
+ # Add to agents_config.json
410
+ # Implement business logic in managers/
411
+ # Create route handlers
412
+ ```
413
+
414
+ 3. **Add Database Changes**:
415
+ ```bash
416
+ # Modify models if needed
417
+ alembic revision --autogenerate -m "Add new analysis tables"
418
+ alembic upgrade head
419
+ ```
420
+
421
+ ### 3. **Release Process**
422
+
423
+ 1. **Pre-release Testing**:
424
+ ```bash
425
+ # Run full test suite
426
+ pytest tests/
427
+
428
+ # Test database migrations
429
+ alembic upgrade head
430
+
431
+ # Test with sample data
432
+ python scripts/test_with_sample_data.py
433
+ ```
434
+
435
+ 2. **Documentation Updates**:
436
+ ```bash
437
+ # Update API documentation
438
+ # Update troubleshooting guide
439
+ # Update changelog
440
+ ```
441
+
442
+ 3. **Deployment Preparation**:
443
+ ```bash
444
+ # Update requirements.txt
445
+ pip freeze > requirements.txt
446
+
447
+ # Test container build
448
+ docker build -t auto-analyst-backend .
449
+
450
+ ```
451
+
452
+ ## 📊 Performance Considerations
453
+
454
+ ### 1. **Database Optimization**
455
+
456
+ ```python
457
+ # Use query optimization
458
+ from sqlalchemy.orm import joinedload
459
+
460
+ # Bad: N+1 query problem
461
+ users = session.query(User).all()
462
+ for user in users:
463
+ print(user.chats) # Separate query for each user
464
+
465
+ # Good: Eager loading
466
+ users = session.query(User).options(joinedload(User.chats)).all()
467
+ for user in users:
468
+ print(user.chats) # No additional queries
469
+
470
+ # Use pagination for large datasets
471
+ def get_paginated_results(session, model, page=1, per_page=20):
472
+ offset = (page - 1) * per_page
473
+ return session.query(model).offset(offset).limit(per_page).all()
474
+ ```
475
+
476
+
477
+ ### 2. **Async Optimization**
478
+
479
+ ```python
480
+ # Use connection pooling
481
+ from sqlalchemy.pool import QueuePool
482
+
483
+ engine = create_engine(
484
+ DATABASE_URL,
485
+ poolclass=QueuePool,
486
+ pool_size=20,
487
+ max_overflow=30
488
+ )
489
+
490
+ # Batch operations
491
+ async def batch_process_agents(agents: List[str], queries: List[str]):
492
+ semaphore = asyncio.Semaphore(5) # Limit concurrent operations
493
+
494
+ async def process_with_limit(agent, query):
495
+ async with semaphore:
496
+ return await process_agent(agent, query)
497
+
498
+ tasks = [
499
+ process_with_limit(agent, query)
500
+ for agent, query in zip(agents, queries)
501
+ ]
502
+
503
+ return await asyncio.gather(*tasks, return_exceptions=True)
504
+ ```
505
+
506
+ This development workflow guide provides a comprehensive framework for maintaining code quality, consistency, and performance in the Auto-Analyst backend system. Following these patterns ensures that new features integrate seamlessly with the existing architecture while maintaining the high standards of the codebase.
docs/getting_started.md ADDED
@@ -0,0 +1,273 @@
1
+ # Auto-Analyst Backend - Getting Started Guide
2
+
3
+ ## 🎯 Overview
4
+
5
+ This guide will help you set up and understand the Auto-Analyst backend system. Auto-Analyst is a multi-agent AI platform that orchestrates specialized agents for comprehensive data analysis.
6
+
7
+ ## 🏗️ Core Concepts
8
+
9
+ ### 1. **Multi-Agent System**
10
+ The platform uses specialized AI agents:
11
+ - **Preprocessing Agent**: Data cleaning and preparation
12
+ - **Statistical Analytics Agent**: Statistical analysis and insights
13
+ - **Machine Learning Agent**: Scikit-learn based modeling
14
+ - **Data Visualization Agent**: Chart and plot generation
15
+
16
+ ### 2. **Template System**
17
+ - **Individual Agents**: Single-purpose agents for specific tasks
18
+ - **Planner Agents**: Multi-agent coordination for complex workflows
19
+ - **User Templates**: Customizable agent preferences
20
+ - **Default vs Premium**: Core agents available to all users
21
+
22
+ ### 3. **Session Management**
23
+ - Session-based user tracking
24
+ - Shared DataFrame context between agents
25
+ - Conversation history and code execution tracking
26
+
27
+ ### 4. **Deep Analysis System**
28
+ - Multi-step analysis workflow (questions → planning → execution → synthesis)
29
+ - Streaming progress updates
30
+ - HTML report generation
31
+
32
+ ## 🚀 Quick Start
33
+
34
+ ### 1. Installation
35
+
36
+ ```bash
37
+ # Clone and navigate to backend
38
+ cd Auto-Analyst-CS/auto-analyst-backend
39
+
40
+ # Create virtual environment
41
+ python -m venv venv
42
+ source venv/bin/activate # Linux/Mac
43
+ # or
44
+ venv\Scripts\activate # Windows
45
+
46
+ # Install dependencies
47
+ pip install -r requirements.txt
48
+ ```
49
+
50
+ ### 2. Environment Variables
51
+
52
+ Create `.env` file with:
53
+
54
+ ```env
55
+ # Database
56
+ DATABASE_URL=sqlite:///./auto_analyst.db # For development
57
+ # DATABASE_URL=postgresql://user:pass@host:port/db # For production
58
+
59
+ # AI Models
60
+ ANTHROPIC_API_KEY=your_anthropic_key_here
61
+ OPENAI_API_KEY=your_openai_key_here
62
+
63
+ # Authentication (optional)
64
+ ADMIN_API_KEY=your_admin_key_here
65
+ ```
66
+
67
+ ### 3. Database Initialization
68
+
69
+ ```bash
70
+ # Initialize database and default agents
71
+ python -c "
72
+ from src.db.init_db import init_db
73
+ init_db()
74
+ print('✅ Database initialized successfully')
75
+ "
76
+ ```
77
+
78
+ ### 4. Start the Server
79
+
80
+ ```bash
81
+ # Development server
82
+ python app.py
83
+
84
+ # Or with uvicorn
85
+ uvicorn app:app --reload --host 0.0.0.0 --port 8000
86
+ ```
87
+
88
+ ### 5. Verify Setup
89
+
90
+ Visit: `http://localhost:8000/docs` for interactive API documentation
91
+
92
+ ## 📚 Key Files to Understand
93
+
94
+ ### Core Application Files
95
+
96
+ 1. **`app.py`** - Main FastAPI application and core endpoints
97
+ 2. **`src/agents/agents.py`** - Agent definitions and orchestration
98
+ 3. **`src/agents/deep_agents.py`** - Deep analysis system
99
+ 4. **`src/db/schemas/models.py`** - Database models
100
+ 5. **`src/managers/chat_manager.py`** - Chat and session management
101
+
102
+ ### Route Files (API Endpoints)
103
+
104
+ - **`src/routes/session_routes.py`** - File uploads, sessions, authentication
105
+ - **`src/routes/chat_routes.py`** - Chat and messaging
106
+ - **`src/routes/code_routes.py`** - Code execution and processing
107
+ - **`src/routes/templates_routes.py`** - Agent template management
108
+ - **`src/routes/deep_analysis_routes.py`** - Deep analysis reports
109
+ - **`src/routes/analytics_routes.py`** - Usage analytics and monitoring
110
+
111
+ ### Configuration Files
112
+
113
+ - **`agents_config.json`** - Agent and template definitions
114
+ - **`requirements.txt`** - Python dependencies
115
+ - **`alembic.ini`** - Database migration configuration
116
+
117
+ ## 🔧 Development Workflow
118
+
119
+ ### 1. Adding New Agents
120
+
121
+ ```python
122
+ # 1. Define agent signature in src/agents/agents.py
123
+ class new_agent(dspy.Signature):
124
+ """Agent description"""
125
+ goal = dspy.InputField(desc="Analysis goal")
126
+ dataset = dspy.InputField(desc="Dataset info")
127
+ result = dspy.OutputField(desc="Analysis result")
128
+
129
+ # 2. Add to agents_config.json
130
+ {
131
+ "template_name": "new_agent",
132
+ "description": "Agent description",
133
+ "variant_type": "both",
134
+ "is_premium": false,
135
+ "usage_count": 0
136
+ }
137
+
138
+ # 3. Register in agent loading system
139
+ ```
140
+
141
+ ### 2. Adding New Endpoints
142
+
143
+ ```python
144
+ # 1. Create route in src/routes/feature_routes.py
145
+ from fastapi import APIRouter
146
+ router = APIRouter(prefix="/feature", tags=["feature"])
147
+
148
+ @router.get("/endpoint")
149
+ async def new_endpoint():
150
+ return {"message": "Hello"}
151
+
152
+ # 2. Register in app.py
153
+ from src.routes.feature_routes import router as feature_router
154
+ app.include_router(feature_router)
155
+ ```
156
+
157
+ ### 3. Database Changes
158
+
159
+ ```bash
160
+ # 1. Modify models in src/db/schemas/models.py
161
+ # 2. Create migration
162
+ alembic revision --autogenerate -m "description"
163
+ # 3. Apply migration
164
+ alembic upgrade head
165
+ ```
166
+
167
+ ## 🧪 Testing Your Changes
168
+
169
+ ### 1. Test API Endpoints
170
+
171
+ ```bash
172
+ # Use the interactive docs
173
+ open http://localhost:8000/docs
174
+
175
+ # Or use curl
176
+ curl -X GET "http://localhost:8000/health"
177
+ ```
178
+
179
+ ### 2. Test Agent System
180
+
181
+ ```python
182
+ # Test individual agent
183
+ python -c "
184
+ from src.agents.agents import preprocessing_agent
185
+ import dspy
186
+ dspy.LM('anthropic/claude-sonnet-4-20250514')
187
+ agent = dspy.ChainOfThought(preprocessing_agent)
188
+ result = agent(goal='clean data', dataset='test data')
189
+ print(result)
190
+ "
191
+ ```
192
+
193
+ ### 3. Test Database Operations
194
+
195
+ ```python
196
+ # Test database
197
+ python -c "
198
+ from src.db.init_db import session_factory
199
+ from src.db.schemas.models import AgentTemplate
200
+ session = session_factory()
201
+ templates = session.query(AgentTemplate).all()
202
+ print(f'Found {len(templates)} templates')
203
+ session.close()
204
+ "
205
+ ```
206
+
207
+ ## 🔍 Common Development Tasks
208
+
209
+ ### Adding a New Feature
210
+
211
+ 1. **Plan the Feature**: Define requirements and API design
212
+ 2. **Database Changes**: Add new models if needed
213
+ 3. **Create Routes**: Add API endpoints in `src/routes/`
214
+ 4. **Business Logic**: Add managers in `src/managers/` if complex
215
+ 5. **Documentation**: Update relevant `.md` files
216
+ 6. **Testing**: Test endpoints and integration
217
+
218
+ ### Debugging Issues
219
+
220
+ 1. **Check Logs**: Application logs show detailed error information
221
+ 2. **Database State**: Verify data with database queries
222
+ 3. **API Testing**: Use `/docs` interface for endpoint testing
223
+ 4. **Agent Behavior**: Test individual agents separately
224
+
225
+ ### Performance Optimization
226
+
227
+ 1. **Database Queries**: Use SQLAlchemy query optimization
228
+ 2. **Agent Execution**: Implement async patterns for agent orchestration
229
+ 3. **Resource Management**: Monitor memory usage for large datasets
230
+
231
+ ## 📊 System Architecture Overview
232
+
233
+ ```mermaid
234
+ graph TD
235
+ A[Frontend Request] --> B[FastAPI Router]
236
+ B --> C[Route Handler]
237
+ C --> D[Manager Layer]
238
+ D --> E[Database Layer]
239
+ D --> F[Agent System]
240
+ F --> G[AI Models]
241
+ G --> H[Code Generation]
242
+ H --> I[Execution Environment]
243
+ I --> J[Results Processing]
244
+ J --> K[Response]
245
+
246
+ subgraph "Agent Orchestration"
247
+ F1[Individual Agents]
248
+ F2[Planner Module]
249
+ F3[Deep Analysis]
250
+ F1 --> F2
251
+ F2 --> F3
252
+ end
253
+
254
+ F --> F1
255
+ ```
256
+
257
+ ## 📈 Template Integration
258
+
259
+ The system uses **active user templates** for agent selection:
260
+
261
+ ### Default Agents (Always Available)
262
+ - `preprocessing_agent` (individual & planner variants)
263
+ - `statistical_analytics_agent` (individual & planner variants)
264
+ - `sk_learn_agent` (individual & planner variants)
265
+ - `data_viz_agent` (individual & planner variants)
266
+
267
+ ### Template Loading Logic
268
+ 1. **Individual Agent Execution** (`@agent_name`): Loads ALL available templates
269
+ 2. **Planner Execution**: Loads user's enabled templates (max 10 for performance)
270
+ 3. **Deep Analysis**: Uses user's active template preferences
271
+ 4. **Fallback**: Uses 4 core agents if no user preferences found
272
+
273
+ This architecture ensures users can leverage their preferred agents while maintaining system performance and reliability.
docs/system/database-schema.md ADDED
@@ -0,0 +1,289 @@
1
+ # Auto-Analyst Database Schema Documentation
2
+
3
+ ## 📋 Overview
4
+
5
+ The Auto-Analyst backend uses a relational database schema designed for scalability and data integrity. The schema supports both **SQLite** (development) and **PostgreSQL** (production) databases through SQLAlchemy ORM.
6
+
7
+ ### **Database Features**
8
+ - **User Management** - Authentication and user data
9
+ - **Chat System** - Conversation sessions and message history
10
+ - **AI Model Tracking** - Usage analytics and cost monitoring
11
+ - **Code Execution** - Code generation and execution tracking
12
+ - **Agent Templates** - Customizable AI agent configurations
13
+ - **Deep Analysis** - Multi-step analysis reports and results
14
+ - **User Feedback** - Rating and feedback system
15
+
16
+ ---
17
+
18
+ ## 🗄️ Database Tables
19
+
20
+ ### **1. Users Table (`users`)**
21
+
22
+ **Purpose**: Core user authentication and profile management
23
+
24
+ | Column | Type | Constraints | Description |
25
+ |--------|------|-------------|-------------|
26
+ | `user_id` | `INTEGER` | PRIMARY KEY, AUTO INCREMENT | Unique user identifier |
27
+ | `username` | `STRING` | UNIQUE, NOT NULL | User's display name |
28
+ | `email` | `STRING` | UNIQUE, NOT NULL | User's email address |
29
+ | `created_at` | `DATETIME` | DEFAULT: UTC NOW | Account creation timestamp |
30
+
31
+ **Relationships:**
32
+ - **One-to-Many**: `chats` (User → Chat sessions)
33
+ - **One-to-Many**: `usage_records` (User → Model usage tracking)
34
+ - **One-to-Many**: `deep_analysis_reports` (User → Analysis reports)
35
+ - **One-to-Many**: `template_preferences` (User → Agent preferences)
36
+
37
+ ---
38
+
39
+ ### **2. Chats Table (`chats`)**
40
+
41
+ **Purpose**: Conversation sessions and chat organization
42
+
43
+ | Column | Type | Constraints | Description |
44
+ |--------|------|-------------|-------------|
45
+ | `chat_id` | `INTEGER` | PRIMARY KEY, AUTO INCREMENT | Unique chat session identifier |
46
+ | `user_id` | `INTEGER` | FOREIGN KEY → `users.user_id`, CASCADE DELETE | Chat owner (nullable for anonymous) |
47
+ | `title` | `STRING` | DEFAULT: 'New Chat' | Human-readable chat title |
48
+ | `created_at` | `DATETIME` | DEFAULT: UTC NOW | Chat creation timestamp |
49
+
50
+ **Relationships:**
51
+ - **Many-to-One**: `user` (Chat → User)
52
+ - **One-to-Many**: `messages` (Chat → Messages)
53
+ - **One-to-Many**: `usage_records` (Chat → Model usage)
54
+
55
+ ---
56
+
57
+ ### **3. Messages Table (`messages`)**
58
+
59
+ **Purpose**: Individual messages within chat conversations
60
+
61
+ | Column | Type | Constraints | Description |
62
+ |--------|------|-------------|-------------|
63
+ | `message_id` | `INTEGER` | PRIMARY KEY, AUTO INCREMENT | Unique message identifier |
64
+ | `chat_id` | `INTEGER` | FOREIGN KEY → `chats.chat_id`, CASCADE DELETE | Parent chat session |
65
+ | `sender` | `STRING` | NOT NULL | Message sender: 'user' or 'ai' |
66
+ | `content` | `TEXT` | NOT NULL | Message content (text/markdown) |
67
+ | `timestamp` | `DATETIME` | DEFAULT: UTC NOW | Message creation time |
68
+
69
+ **Relationships:**
70
+ - **Many-to-One**: `chat` (Message → Chat)
71
+ - **One-to-One**: `feedback` (Message → Feedback)
72
+
73
+ ---
74
+
75
+ ### **4. Model Usage Table (`model_usage`)**
76
+
77
+ **Purpose**: AI model usage tracking for analytics and billing
78
+
79
+ | Column | Type | Constraints | Description |
80
+ |--------|------|-------------|-------------|
81
+ | `usage_id` | `INTEGER` | PRIMARY KEY | Unique usage record identifier |
82
+ | `user_id` | `INTEGER` | FOREIGN KEY → `users.user_id`, SET NULL | User who triggered the usage |
83
+ | `chat_id` | `INTEGER` | FOREIGN KEY → `chats.chat_id`, SET NULL | Associated chat session |
84
+ | `model_name` | `STRING(100)` | NOT NULL | AI model used (e.g., 'gpt-4o-mini') |
85
+ | `provider` | `STRING(50)` | NOT NULL | Model provider ('openai', 'anthropic', etc.) |
86
+ | `prompt_tokens` | `INTEGER` | DEFAULT: 0 | Input tokens consumed |
87
+ | `completion_tokens` | `INTEGER` | DEFAULT: 0 | Output tokens generated |
88
+ | `total_tokens` | `INTEGER` | DEFAULT: 0 | Total tokens (input + output) |
89
+ | `query_size` | `INTEGER` | DEFAULT: 0 | Query size in characters |
90
+ | `response_size` | `INTEGER` | DEFAULT: 0 | Response size in characters |
91
+ | `cost` | `FLOAT` | DEFAULT: 0.0 | Cost in USD for this usage |
92
+ | `timestamp` | `DATETIME` | DEFAULT: UTC NOW | Usage timestamp |
93
+ | `is_streaming` | `BOOLEAN` | DEFAULT: FALSE | Whether response was streamed |
94
+ | `request_time_ms` | `INTEGER` | DEFAULT: 0 | Request processing time (milliseconds) |
95
+
96
+ **Relationships:**
97
+ - **Many-to-One**: `user` (Usage → User)
98
+ - **Many-to-One**: `chat` (Usage → Chat)
99
+
100
+ ---
101
+
102
+ ### **5. Code Executions Table (`code_executions`)**
103
+
104
+ **Purpose**: Track code generation and execution attempts
105
+
106
+ | Column | Type | Constraints | Description |
107
+ |--------|------|-------------|-------------|
108
+ | `execution_id` | `INTEGER` | PRIMARY KEY, AUTO INCREMENT | Unique execution identifier |
109
+ | `message_id` | `INTEGER` | FOREIGN KEY → `messages.message_id`, CASCADE DELETE | Associated message |
110
+ | `chat_id` | `INTEGER` | FOREIGN KEY → `chats.chat_id`, CASCADE DELETE | Parent chat session |
111
+ | `user_id` | `INTEGER` | FOREIGN KEY → `users.user_id`, SET NULL | User who triggered execution |
112
+ | `initial_code` | `TEXT` | NULLABLE | First version of generated code |
113
+ | `latest_code` | `TEXT` | NULLABLE | Most recent code version |
114
+ | `is_successful` | `BOOLEAN` | DEFAULT: FALSE | Whether execution succeeded |
115
+ | `output` | `TEXT` | NULLABLE | Execution output (including errors) |
116
+ | `model_provider` | `STRING(50)` | NULLABLE | AI model provider used |
117
+ | `model_name` | `STRING(100)` | NULLABLE | AI model name used |
118
+ | `failed_agents` | `TEXT` | NULLABLE | JSON list of failed agent names |
119
+ | `error_messages` | `TEXT` | NULLABLE | JSON map of error messages by agent |
120
+ | `created_at` | `DATETIME` | DEFAULT: UTC NOW | Execution creation time |
121
+ | `updated_at` | `DATETIME` | DEFAULT: UTC NOW, ON UPDATE | Last update timestamp |
122
+
123
+ ---
124
+
125
+ ### **6. Message Feedback Table (`message_feedback`)**
126
+
127
+ **Purpose**: User feedback and model settings for messages
128
+
129
+ | Column | Type | Constraints | Description |
130
+ |--------|------|-------------|-------------|
131
+ | `feedback_id` | `INTEGER` | PRIMARY KEY, AUTO INCREMENT | Unique feedback identifier |
132
+ | `message_id` | `INTEGER` | FOREIGN KEY → `messages.message_id`, CASCADE DELETE | Associated message |
133
+ | `rating` | `INTEGER` | NULLABLE | Star rating (1-5 scale) |
134
+ | `model_name` | `STRING(100)` | NULLABLE | Model used for this message |
135
+ | `model_provider` | `STRING(50)` | NULLABLE | Model provider used |
136
+ | `temperature` | `FLOAT` | NULLABLE | Temperature setting used |
137
+ | `max_tokens` | `INTEGER` | NULLABLE | Max tokens setting used |
138
+ | `created_at` | `DATETIME` | DEFAULT: UTC NOW | Feedback creation time |
139
+ | `updated_at` | `DATETIME` | DEFAULT: UTC NOW, ON UPDATE | Last update timestamp |
140
+
141
+ **Relationships:**
142
+ - **One-to-One**: `message` (Feedback ↔ Message)
143
+
144
+ ---
145
+
146
+ ### **7. Deep Analysis Reports Table (`deep_analysis_reports`)**
147
+
148
+ **Purpose**: Store comprehensive multi-agent analysis reports
149
+
150
+ | Column | Type | Constraints | Description |
151
+ |--------|------|-------------|-------------|
152
+ | `report_id` | `INTEGER` | PRIMARY KEY, AUTO INCREMENT | Unique report identifier |
153
+ | `report_uuid` | `STRING(100)` | UNIQUE, NOT NULL | Frontend-generated UUID |
154
+ | `user_id` | `INTEGER` | FOREIGN KEY → `users.user_id`, CASCADE DELETE | Report owner |
155
+ | `goal` | `TEXT` | NOT NULL | Analysis objective/question |
156
+ | `status` | `STRING(20)` | NOT NULL, DEFAULT: 'pending' | Status: 'pending', 'running', 'completed', 'failed' |
157
+ | `start_time` | `DATETIME` | DEFAULT: UTC NOW | Analysis start time |
158
+ | `end_time` | `DATETIME` | NULLABLE | Analysis completion time |
159
+ | `duration_seconds` | `INTEGER` | NULLABLE | Total analysis duration |
160
+ | `deep_questions` | `TEXT` | NULLABLE | Generated analytical questions |
161
+ | `deep_plan` | `TEXT` | NULLABLE | Analysis execution plan |
162
+ | `summaries` | `JSON` | NULLABLE | Array of analysis summaries |
163
+ | `analysis_code` | `TEXT` | NULLABLE | Generated Python code |
164
+ | `plotly_figures` | `JSON` | NULLABLE | Array of Plotly figure data |
165
+ | `synthesis` | `JSON` | NULLABLE | Array of synthesis insights |
166
+ | `final_conclusion` | `TEXT` | NULLABLE | Final analysis conclusion |
167
+ | `html_report` | `TEXT` | NULLABLE | Complete HTML report |
168
+ | `progress_percentage` | `INTEGER` | DEFAULT: 0 | Progress percentage (0-100) |
169
+ | `total_tokens_used` | `INTEGER` | DEFAULT: 0 | Total tokens consumed |
170
+ | `estimated_cost` | `FLOAT` | DEFAULT: 0.0 | Estimated cost in USD |
171
+ | `credits_consumed` | `INTEGER` | DEFAULT: 0 | Credits deducted for analysis |
172
+ | `created_at` | `DATETIME` | DEFAULT: UTC NOW | Report creation time |
173
+ | `updated_at` | `DATETIME` | DEFAULT: UTC NOW, ON UPDATE | Last update timestamp |
174
+
175
+ **Relationships:**
176
+ - **Many-to-One**: `user` (Report → User)
177
+
178
+ ---
179
+
180
+ ### **8. Agent Templates Table (`agent_templates`)**
181
+
182
+ **Purpose**: Store predefined AI agent configurations
183
+
184
+ | Column | Type | Constraints | Description |
185
+ |--------|------|-------------|-------------|
186
+ | `template_id` | `INTEGER` | PRIMARY KEY, AUTO INCREMENT | Unique template identifier |
187
+ | `template_name` | `STRING(100)` | UNIQUE, NOT NULL | Internal template name |
188
+ | `display_name` | `STRING(200)` | NULLABLE | User-friendly display name |
189
+ | `description` | `TEXT` | NOT NULL | Template description |
190
+ | `prompt_template` | `TEXT` | NOT NULL | Agent behavior instructions |
191
+ | `icon_url` | `STRING(500)` | NULLABLE | Template icon URL |
192
+ | `category` | `STRING(50)` | NULLABLE | Template category |
193
+ | `is_premium_only` | `BOOLEAN` | DEFAULT: FALSE | Requires premium subscription |
194
+ | `variant_type` | `STRING(20)` | DEFAULT: 'individual' | 'planner', 'individual', or 'both' |
195
+ | `is_active` | `BOOLEAN` | DEFAULT: TRUE | Template is active/available |
196
+ | `created_at` | `DATETIME` | DEFAULT: UTC NOW | Template creation time |
197
+
198
+ **Relationships:**
199
+ - **One-to-Many**: `user_preferences` (Template → User preferences)
200
+
201
+ ---
202
+
203
+ ### **9. User Template Preferences Table (`user_template_preferences`)**
204
+
205
+ **Purpose**: Track user preferences and usage for agent templates
206
+
207
+ | Column | Type | Constraints | Description |
208
+ |--------|------|-------------|-------------|
209
+ | `preference_id` | `INTEGER` | PRIMARY KEY, AUTO INCREMENT | Unique preference identifier |
210
+ | `user_id` | `INTEGER` | FOREIGN KEY → `users.user_id`, CASCADE DELETE | User who owns preference |
211
+ | `template_id` | `INTEGER` | FOREIGN KEY → `agent_templates.template_id`, CASCADE DELETE | Associated template |
212
+ | `is_enabled` | `BOOLEAN` | DEFAULT: TRUE | Whether user has template enabled |
213
+ | `usage_count` | `INTEGER` | DEFAULT: 0 | Number of times user used template |
214
+ | `last_used_at` | `DATETIME` | NULLABLE | Last time user used template |
215
+ | `created_at` | `DATETIME` | DEFAULT: UTC NOW | Preference creation time |
216
+
217
+ **Relationships:**
218
+ - **Many-to-One**: `user` (Preference → User)
219
+ - **Many-to-One**: `template` (Preference → Template)
220
+
221
+ **Constraints:**
222
+ - **Unique**: `(user_id, template_id)` - One preference per user per template
223
+
224
+ ---
225
+
226
+ ## 🔗 Entity Relationship Diagram
227
+
228
+ ```
229
+ Users (1) ──────────── (Many) Chats
230
+ │ │
231
+ │ ├── (Many) Messages
232
+ │ │ │
233
+ │ │ └── (1) MessageFeedback
234
+ │ │
235
+ │ └── (Many) CodeExecutions
236
+
237
+ ├── (Many) ModelUsage
238
+
239
+ ├── (Many) DeepAnalysisReports
240
+
241
+ └── (Many) UserTemplatePreferences
242
+
243
+ └── (Many) AgentTemplates
244
+ ```
245
+
246
+ ---
247
+
248
+ ## 📊 Database Performance
249
+
250
+ ### **Optimized Indexes**
251
+
252
+ ```sql
253
+ -- High-performance queries
254
+ CREATE INDEX idx_messages_chat_timestamp ON messages(chat_id, timestamp DESC);
255
+ CREATE INDEX idx_model_usage_user_time ON model_usage(user_id, timestamp DESC);
256
+ CREATE INDEX idx_model_usage_model_time ON model_usage(model_name, timestamp DESC);
257
+ CREATE INDEX idx_reports_user_time ON deep_analysis_reports(user_id, created_at DESC);
258
+ ```
259
+
260
+ ### **Cascade Deletion Rules**
261
+
262
+ | Parent → Child | Rule | Description |
263
+ |----------------|------|-------------|
264
+ | `users` → `chats` | CASCADE | Delete all user chats when user deleted |
265
+ | `chats` → `messages` | CASCADE | Delete all chat messages when chat deleted |
266
+ | `messages` → `feedback` | CASCADE | Delete feedback when message deleted |
267
+ | `users` → `model_usage` | SET NULL | Keep usage records for analytics |
268
+
269
+ ---
270
+
271
+ ## 🛡️ Security & Maintenance
272
+
273
+ ### **Data Protection**
274
+ - User data isolated by `user_id`
275
+ - Sensitive fields require encryption in production
276
+ - Automatic cleanup of anonymous data after 90 days
277
+
278
+ ### **Regular Maintenance**
279
+ ```sql
280
+ -- Clean old anonymous chats
281
+ DELETE FROM chats WHERE user_id IS NULL AND created_at < DATE_SUB(NOW(), INTERVAL 90 DAY);
282
+
283
+ -- Update statistics for query optimization
284
+ ANALYZE users, chats, messages, model_usage;
285
+ ```
286
+
287
+ ---
288
+
289
+ This schema supports the full Auto-Analyst application with optimized performance, data integrity, and scalability for both development and production environments.
docs/system/shared_dataframe.md ADDED
@@ -0,0 +1,91 @@
1
+ # Shared Dataframe Between Agents
2
+
3
+ This document explains how to use the shared dataframe functionality that allows one agent to create a processed dataframe (`df_processed`) that other agents can access and use.
4
+
5
+ ## Overview
6
+
7
+ The Auto-Analyst system now supports sharing a processed dataframe between agents. This is useful when:
8
+
9
+ 1. One agent performs data preprocessing, cleaning, or feature engineering
10
+ 2. Subsequent agents need to use this processed data for analysis, visualization, or other tasks
11
+
12
+ The first agent (typically Agent1) creates a dataframe called `df_processed`, and all subsequent agents can access this same dataframe without needing to reprocess the data.
13
+
14
+ ## How It Works
15
+
16
+ 1. Automatic variable sharing is handled through the `SHARED_CONTEXT` global dictionary in `format_response.py`
17
+ 2. When an agent executes Python code that creates a variable named `df_processed`, this variable is automatically stored in the shared context
18
+ 3. Subsequent agent code executions will have access to this `df_processed` variable
19
+
20
+ ## Implementation for Agent Developers
21
+
22
+ ### Agent1 (Data Processor)
23
+
24
+ Agent1 should define a processed dataframe that will be used by subsequent agents:
25
+
26
+ ```python
27
+ import pandas as pd
28
+ import numpy as np
29
+
30
+ # Do some data processing
31
+ df_processed = df.copy() # Start with a copy of the original dataframe
32
+ df_processed = df_processed.dropna() # Remove missing values
33
+ df_processed['new_feature'] = df_processed['column_a'] / df_processed['column_b']
34
+ print("Data processing complete. Created df_processed for other agents to use.")
35
+ ```
36
+
37
+ ### Agent2 (Data Consumer)
38
+
39
+ Agent2 can access the `df_processed` dataframe created by Agent1:
40
+
41
+ ```python
42
+ import matplotlib.pyplot as plt
43
+ import seaborn as sns
44
+
45
+ # Access the shared df_processed dataframe
46
+ print(f"Using shared df_processed with shape: {df_processed.shape}")
47
+
48
+ # Create visualization using the processed data
49
+ plt.figure(figsize=(10, 6))
50
+ sns.scatterplot(data=df_processed, x='column_a', y='new_feature')
51
+ plt.title('Analysis of Processed Data')
52
+ plt.show()
53
+ ```
54
+
55
+ ## Technical Details
56
+
57
+ The shared dataframe functionality is implemented through:
58
+
59
+ 1. A global `SHARED_CONTEXT` dictionary in `format_response.py`
60
+ 2. Modified `execute_code_from_markdown` function that checks for `df_processed` in the execution context
61
+ 3. Updated app.py to process agents in the correct order from the plan_list
62
+
63
+ ## Best Practices
64
+
65
+ 1. Name the shared dataframe consistently as `df_processed`
66
+ 2. Document what processing was done to create the shared dataframe
67
+ 3. Agent1 should print a message confirming that `df_processed` was created
68
+ 4. Agent2 should verify the structure of `df_processed` before using it (e.g., print its shape or columns)
69
+ 5. Keep processing in Agent1, analysis in Agent2 for clean separation of concerns
70
+
71
+ ## Example
72
+
73
+ ```python
74
+ # Agent1 code
75
+ import pandas as pd
76
+
77
+ # Load and process data
78
+ df_processed = df.copy()
79
+ df_processed = df_processed[df_processed['price'] > 0] # Remove invalid prices
80
+ df_processed['price_per_sqft'] = df_processed['price'] / df_processed['sqft']
81
+ print(f"Created df_processed with {len(df_processed)} rows after processing")
82
+
83
+ # Agent2 code
84
+ import plotly.express as px
85
+
86
+ # Use the processed dataframe
87
+ print(f"Using df_processed with {len(df_processed)} rows")
88
+ fig = px.scatter(df_processed, x='sqft', y='price', color='price_per_sqft',
89
+ title='Price vs. Square Footage (Colored by Price per SqFt)')
90
+ fig.show()
91
+ ```
docs/troubleshooting/troubleshooting.md ADDED
@@ -0,0 +1,537 @@
1
+ # Auto-Analyst Backend Troubleshooting Guide
2
+
3
+ ## 🚨 Common Startup Issues
4
+
5
+ ### 1. **Database Connection Problems**
6
+
7
+ #### Problem: Database connection failed
8
+ ```
9
+ ❌ sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) no such table: users
10
+ ```
11
+
12
+ **Solutions:**
13
+ 1. **Initialize Database**:
14
+ ```bash
15
+ python -c "
16
+ from src.db.init_db import init_db
17
+ init_db()
18
+ print('✅ Database initialized')
19
+ "
20
+ ```
21
+
22
+ 2. **Check Database File Permissions**:
23
+ ```bash
24
+ # For SQLite
25
+ ls -la auto_analyst.db
26
+ chmod 666 auto_analyst.db # If needed
27
+ ```
28
+
29
+ 3. **Verify DATABASE_URL**:
30
+ ```bash
31
+ # Check .env file
32
+ cat .env | grep DATABASE_URL
33
+
34
+ # For PostgreSQL (production)
35
+ DATABASE_URL=postgresql://user:password@host:port/database
36
+
37
+ # For SQLite (development)
38
+ DATABASE_URL=sqlite:///./auto_analyst.db
39
+ ```
40
+
41
+ #### Problem: PostgreSQL connection issues
42
+ ```
43
+ ❌ psycopg2.OperationalError: FATAL: database "auto_analyst" does not exist
44
+ ```
45
+
46
+ **Solutions:**
47
+ 1. **Create Database**:
48
+ ```sql
49
+ -- Connect to PostgreSQL
50
+ psql -h localhost -U postgres
51
+ CREATE DATABASE auto_analyst;
52
+ \q
53
+ ```
54
+
55
+ 2. **Update Connection String**:
56
+ ```env
57
+ DATABASE_URL=postgresql://username:password@localhost:5432/auto_analyst
58
+ ```
59
+
60
+ ### 2. **Agent Template Loading Issues**
61
+
62
+ #### Problem: No agents found
63
+ ```
64
+ ❌ RuntimeError: No agents loaded for user. Cannot proceed with analysis.
65
+ ```
66
+
67
+ **Solutions:**
68
+ 1. **Initialize Default Agents**:
69
+ ```python
70
+ python -m scripts.populate_agent_templates
71
+ print('✅ Default agents initialized')
72
+ "
73
+ ```
74
+
75
+ 2. **Check Agent Templates in Database**:
76
+ ```python
77
+ python -c "
78
+ from src.db.init_db import session_factory
79
+ from src.db.schemas.models import AgentTemplate
80
+ session = session_factory()
81
+ templates = session.query(AgentTemplate).all()
82
+ print(f'Found {len(templates)} templates:')
83
+ for t in templates:
84
+ print(f' - {t.template_name}: {t.is_active}')
85
+ session.close()
86
+ "
87
+ ```
88
+
89
+ 3. **Populate Templates from Config**:
90
+ ```bash
91
+ python scripts/populate_agent_templates.py
92
+ ```
93
+
94
+ ### 3. **API Key Issues**
95
+
96
+ #### Problem: Missing API keys
97
+ ```
98
+ ❌ AuthenticationError: Invalid API key provided
99
+ ```
100
+
101
+ **Solutions:**
102
+ 1. **Check Environment Variables**:
103
+ ```bash
104
+ # Verify API keys are set
105
+ echo $ANTHROPIC_API_KEY
106
+ echo $OPENAI_API_KEY
107
+
108
+ # Or check .env file
109
+ cat .env | grep API_KEY
110
+ ```
111
+
112
+ 2. **Add Missing Keys**:
113
+ ```env
114
+ # Add to .env file
115
+ ANTHROPIC_API_KEY=sk-ant-api03-...
116
+ OPENAI_API_KEY=sk-...
117
+ ADMIN_API_KEY=your_admin_key_here
118
+ ```
119
+
120
+ 3. **Test API Key Validity**:
121
+ ```python
122
+ python -c "
123
+ import os
124
+ from anthropic import Anthropic
125
+ client = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
126
+ try:
127
+ # Test call
128
+ response = client.messages.create(
129
+ model='claude-3-5-sonnet-20241022',  # use a model name your account can access
130
+ max_tokens=10,
131
+ messages=[{'role': 'user', 'content': 'Hello'}]
132
+ )
133
+ print('✅ Anthropic API key valid')
134
+ except Exception as e:
135
+ print(f'❌ Anthropic API key invalid: {e}')
136
+ "
137
+ ```
138
+
139
+ ## 🤖 Agent System Issues
140
+
141
+ ### 1. **Agent Not Found Errors**
142
+
143
+ #### Problem: Specific agent not available
144
+ ```
145
+ ❌ KeyError: 'custom_agent' not found in loaded agents
146
+ ```
147
+
148
+ **Solutions:**
149
+ 1. **Check Available Agents**:
150
+ ```python
151
+ python -c "
152
+ from src.agents.agents import load_user_enabled_templates_from_db
153
+ from src.db.init_db import session_factory
154
+ session = session_factory()
155
+ agents = load_user_enabled_templates_from_db('test_user', session)
156
+ print('Available agents:', list(agents.keys()))
157
+ session.close()
158
+ "
159
+ ```
160
+
161
+ 2. **Verify Agent Template Exists**:
162
+ ```python
163
+ python -c "
164
+ from src.db.init_db import session_factory
165
+ from src.db.schemas.models import AgentTemplate
166
+ session = session_factory()
167
+ agent = session.query(AgentTemplate).filter_by(template_name='custom_agent').first()
168
+ if agent:
169
+ print(f'Agent found: {agent.display_name}, Active: {agent.is_active}')
170
+ else:
171
+ print('Agent not found in database')
172
+ session.close()
173
+ "
174
+ ```
175
+
176
+ 3. **Add Missing Agent Template**:
177
+ ```python
178
+ # Add to agents_config.json or use database insertion
179
+ python scripts/populate_agent_templates.py
180
+ ```
181
+
182
+ ### 2. **Deep Analysis Failures**
183
+
184
+ #### Problem: Deep analysis stops unexpectedly
185
+ ```
186
+ ❌ DeepAnalysisError: Agent execution failed at step 3
187
+ ```
188
+
189
+ **Solutions:**
190
+ 1. **Check Agent Configuration**:
191
+ ```python
192
+ # Verify user has required agents enabled
193
+ python -c "
194
+ from src.agents.deep_agents import get_user_enabled_agent_names
195
+ from src.db.init_db import session_factory
196
+ session = session_factory()
197
+ agents = get_user_enabled_agent_names('test_user', session)
198
+ required = ['preprocessing_agent', 'statistical_analytics_agent', 'sk_learn_agent', 'data_viz_agent']
199
+ print('Required agents:', required)
200
+ print('Available agents:', agents)
201
+ print('Missing:', [a for a in required if a not in agents])
202
+ session.close()
203
+ "
204
+ ```
205
+
206
+ 2. **Increase Timeout Settings**:
207
+ ```python
208
+ # In deep_agents.py, increase timeout values
209
+ timeout = 300 # Increase from default
210
+ ```
211
+
212
+ 3. **Check Dataset Size**:
213
+ ```python
214
+ # Reduce dataset size for complex analysis
215
+ df_sample = df.sample(n=1000) # Use sample for testing
216
+ ```
217
+
218
+ ## ⚡ Code Execution Problems
219
+
220
+ ### 1. **Code Execution Timeouts**
221
+
222
+ #### Problem: Code execution takes too long
223
+ ```
224
+ ❌ TimeoutError: Code execution exceeded 120 seconds
225
+ ```
226
+
227
+ **Solutions:**
228
+ 1. **Optimize Generated Code**:
229
+ - Use data sampling for large datasets (see the sketch below)
230
+ - Simplify analysis requirements
231
+ - Prefer vectorized pandas operations over row-wise loops
232
+
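+ A minimal sampling sketch (assumes the data is already loaded as a pandas DataFrame named `df`; the 10,000-row cap is an illustrative choice, not a project constant):
+
+ ```python
+ import pandas as pd
+
+ MAX_ROWS = 10_000  # hypothetical cap; tune to your workload and timeout
+ if len(df) > MAX_ROWS:
+     # Reproducible sample so repeated runs of generated code agree
+     df = df.sample(n=MAX_ROWS, random_state=42)
+ ```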
233
+ 2. **Check Resource Usage**:
234
+ ```python
235
+ import psutil
236
+ print(f"Memory usage: {psutil.virtual_memory().percent}%")
237
+ print(f"CPU usage: {psutil.cpu_percent()}%")
238
+ ```
239
+
240
+ 3. **Increase Timeout Settings**:
241
+ ```python
242
+ # In clean_and_store_code function
243
+ future.result(timeout=600) # Increase timeout to 10 minutes
244
+ ```
245
+
246
+ ### 2. **Import Errors in Generated Code**
+
+ #### Problem: Generated code imports an unavailable module
247
+ ```
248
+ ❌ ModuleNotFoundError: No module named 'some_library'
249
+ ```
250
+
251
+ **Solutions:**
252
+ 1. **Check Available Libraries**:
253
+ ```python
254
+ # Available in execution environment:
255
+ import pandas as pd
256
+ import numpy as np
257
+ import plotly.express as px
258
+ import plotly.graph_objects as go
259
+ import sklearn
260
+ import statsmodels.api as sm
261
+ ```
262
+
263
+ 2. **Add Missing Dependencies**:
264
+ ```bash
265
+ pip install missing_library
266
+ ```
267
+
268
+ 3. **Update Execution Environment**:
269
+ ```python
270
+ # In clean_and_store_code function
271
+ exec_globals.update({
272
+ 'new_library': __import__('new_library')
273
+ })
274
+ ```
275
+
276
+ ## 🗄️ Database Issues
277
+
278
+ #### Problem: Migration Errors
279
+ ```
280
+ ❌ alembic.util.exc.CommandError: Can't locate revision identified by 'xyz'
281
+ ```
282
+
283
+ **Solutions:**
284
+ 1. **Reset Migration History**:
285
+ ```bash
286
+ # Delete migration files (except __init__.py)
287
+ rm migrations/versions/*.py
288
+
289
+ # Create new initial migration
290
+ alembic revision --autogenerate -m "initial migration"
291
+ alembic upgrade head
292
+ ```
293
+
294
+ 2. **Force Migration**:
295
+ ```bash
296
+ # Mark current state as up-to-date
297
+ alembic stamp head
298
+ ```
299
+
300
+ 3. **Recreate Database**:
301
+ ```bash
302
+ # For SQLite (development)
303
+ rm auto_analyst.db
304
+ python -c "from src.db.init_db import init_db; init_db()"
305
+ ```
306
+
307
+ #### Problem: Constraint Violations
308
+ ```
309
+ ❌ IntegrityError: UNIQUE constraint failed
310
+ ```
311
+
312
+ **Solutions:**
313
+ 1. **Check Existing Records**:
314
+ ```python
315
+ from src.db.init_db import session_factory
316
+ from src.db.schemas.models import AgentTemplate
317
+
318
+ session = session_factory()
319
+ templates = session.query(AgentTemplate).all()
320
+ for t in templates:
321
+ print(f"{t.template_name}: {t.template_id}")
322
+ session.close()
323
+ ```
324
+
325
+ 2. **Clean Duplicate Data**:
326
+ ```bash
327
+ python -c "
328
+ from src.db.init_db import session_factory
329
+ from src.db.schemas.models import AgentTemplate
330
+ session = session_factory()
331
+ # Remove duplicates based on template_name
332
+ seen = set()
333
+ for template in session.query(AgentTemplate).all():
334
+ if template.template_name in seen:
335
+ session.delete(template)
336
+ else:
337
+ seen.add(template.template_name)
338
+ session.commit()
339
+ session.close()
340
+ "
341
+ ```
342
+
343
+ ## 🔐 Authentication and Authorization Issues
344
+
345
+ #### Problem: Unauthorized Access
346
+ ```
347
+ ❌ 401 Unauthorized: Invalid session
348
+ ```
349
+
350
+ **Solutions:**
351
+ 1. **Check Session ID**:
352
+ ```python
353
+ import requests
354
+ headers = {"X-Session-ID": "your_session_id"}  # or query parameter: ?session_id=your_session_id
355
+ response = requests.get("http://localhost:8000/chats", headers=headers)  # any session-protected route
356
+ ```
357
+
358
+ 2. **Create Valid Session**:
359
+ ```bash
360
+ curl -X POST "http://localhost:8000/session_info" \
361
+ -H "Content-Type: application/json"
362
+ ```
363
+
364
+ 3. **Verify Admin API Key**:
365
+ ```bash
366
+ curl -X GET "http://localhost:8000/analytics/usage" \
367
+ -H "X-API-Key: your_admin_key"
368
+ ```
369
+
370
+ ## 📈 Performance Issues
371
+
372
+ #### Problem: Slow Response Times
373
+ ```
374
+ ⚠️ Request taking longer than expected
375
+ ```
376
+
377
+ **Solutions:**
378
+ 1. **Enable Database Connection Pooling**:
379
+ ```python
380
+ # In init_db.py
381
+ engine = create_engine(
382
+ DATABASE_URL,
383
+ poolclass=QueuePool,
384
+ pool_size=10,
385
+ max_overflow=20
386
+ )
387
+ ```
388
+
389
+ 2. **Optimize Database Queries**:
390
+ ```python
391
+ # Use eager loading for relationships
392
+ session.query(User).options(joinedload(User.chats)).all()
393
+ ```
394
+
395
+ 3. **Add Response Caching**:
396
+ ```python
397
+ # Use local caching for expensive operations
398
+ @lru_cache(maxsize=100)
399
+ def expensive_operation(data):
400
+ return result
401
+ ```
402
+
403
+ #### Problem: Memory Usage High
404
+ ```
405
+ ⚠️ Memory usage above 80%
406
+ ```
407
+
408
+ **Solutions:**
409
+ 1. **Optimize DataFrame Operations**:
410
+ ```python
411
+ # Use chunking for large datasets
412
+ for chunk in pd.read_csv('file.csv', chunksize=1000):
413
+ process_chunk(chunk)
414
+ ```
415
+
416
+ 2. **Clear Unused Variables**:
417
+ ```python
418
+ # In code execution
419
+ del large_dataframe
420
+ import gc
421
+ gc.collect()
422
+ ```
423
+
424
+ 3. **Monitor Memory Usage**:
425
+ ```python
426
+ import psutil
427
+ import logging
428
+
429
+ memory_percent = psutil.virtual_memory().percent
430
+ if memory_percent > 80:
431
+ logging.warning(f"High memory usage: {memory_percent}%")
432
+ ```
433
+
434
+ ## 🔧 Debugging Tools and Commands
435
+
436
+ ### Health Check Commands
437
+
438
+ ```bash
439
+ # Test basic connectivity
440
+ curl http://localhost:8000/health
441
+
442
+ # Check database status
443
+ python -c "
444
+ from sqlalchemy import text
+ from src.db.init_db import session_factory
445
+ try:
446
+ session = session_factory()
447
+ session.execute(text('SELECT 1'))  # SQLAlchemy 2.x requires text() for raw SQL
448
+ print('✅ Database connection OK')
449
+ session.close()
450
+ except Exception as e:
451
+ print(f'❌ Database error: {e}')
452
+ "
453
+
454
+ # Verify agent templates
455
+ python -c "
456
+ from src.db.init_db import session_factory
457
+ from src.db.schemas.models import AgentTemplate
458
+ session = session_factory()
459
+ count = session.query(AgentTemplate).count()
460
+ print(f'Agent templates in database: {count}')
461
+ session.close()
462
+ "
463
+ ```
464
+
465
+ ### Performance Monitoring
466
+
467
+ ```python
468
+ # Memory and CPU monitoring
469
+ import psutil
470
+ import time
471
+
472
+ def monitor_system():
473
+ while True:
474
+ cpu = psutil.cpu_percent(interval=1)
475
+ memory = psutil.virtual_memory()
476
+ print(f"CPU: {cpu}% | Memory: {memory.percent}% | Available: {memory.available // 1024 // 1024}MB")
477
+ time.sleep(5)
478
+
479
+ # Run monitoring (loops forever; stop with Ctrl+C)
480
+ monitor_system()
481
+ ```
482
+
483
+ ### Database Inspection
484
+
485
+ ```python
486
+ # Inspect database tables
487
+ from src.db.init_db import session_factory
488
+ from src.db.schemas.models import *
489
+
490
+ session = session_factory()
491
+
492
+ # Count records in each table
493
+ tables = [User, Chat, Message, AgentTemplate, UserTemplatePreference, DeepAnalysisReport]
494
+ for table in tables:
495
+ count = session.query(table).count()
496
+ print(f"{table.__name__}: {count} records")
497
+
498
+ session.close()
499
+ ```
500
+
501
+ ### Log Analysis
502
+
503
+ ```bash
504
+ # View recent logs
505
+ tail -f logs/app.log
506
+
507
+ # Search for errors
508
+ grep "ERROR" logs/app.log | tail -20
509
+
510
+ # Search for specific issues
511
+ grep -i "agent" logs/app.log | grep -i "error"
512
+ ```
513
+
514
+ ## 🚀 Performance Optimization Tips
515
+
516
+ ### Database Optimization
517
+
518
+ 1. **Use Indexes**: Ensure frequently queried columns have indexes
519
+ 2. **Query Optimization**: Use `joinedload` for relationships
520
+ 3. **Connection Pooling**: Configure appropriate pool sizes
521
+ 4. **Batch Operations**: Use bulk operations for multiple records (points 2 and 4 are sketched below)
522
+
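+ A minimal sketch of points 2 and 4 (assumes the `User` and `Chat` models from `src/db/schemas/models.py`; the `User.chats` relationship and `user_id` column are assumptions based on the examples above, not verified signatures):
+
+ ```python
+ from sqlalchemy.orm import joinedload
+ from src.db.init_db import session_factory
+ from src.db.schemas.models import User, Chat
+
+ session = session_factory()
+
+ # Point 2: eager-load the relationship in one query instead of N+1 lazy loads
+ users = session.query(User).options(joinedload(User.chats)).all()
+
+ # Point 4: write many rows in one batch instead of one INSERT per object
+ session.bulk_save_objects([Chat(user_id=u.user_id) for u in users[:10]])
+ session.commit()
+ session.close()
+ ```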
523
+ ### Agent Performance
524
+
525
+ 1. **Async Execution**: Use async patterns for concurrent operations
526
+ 2. **Result Caching**: Cache expensive computations (points 1 and 2 are sketched below)
527
+ 3. **Memory Management**: Clean up large objects after use
528
+ 4. **Code Optimization**: Simplify generated code for better performance
529
+
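+ A sketch of points 1 and 2 (pure illustration; `run_agent` is a hypothetical coroutine standing in for a real agent call):
+
+ ```python
+ import asyncio
+ from functools import lru_cache
+
+ @lru_cache(maxsize=128)
+ def summarize_schema(columns: tuple) -> str:
+     # Point 2: cache expensive, repeatable computations (hashable args only)
+     return ", ".join(columns)
+
+ async def run_agent(name: str) -> str:
+     await asyncio.sleep(0.1)  # placeholder for the real agent call
+     return f"{name} done"
+
+ async def main():
+     # Point 1: run independent agents concurrently instead of sequentially
+     results = await asyncio.gather(run_agent("preprocessing_agent"),
+                                    run_agent("data_viz_agent"))
+     print(results, summarize_schema(("price", "sqft")))
+
+ asyncio.run(main())
+ ```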
530
+ ### System Monitoring
531
+
532
+ 1. **Resource Tracking**: Monitor CPU, memory, and disk usage
533
+ 2. **Error Monitoring**: Set up alerting for critical errors (see the sketch below)
534
+ 3. **Performance Metrics**: Track response times and throughput
535
+ 4. **Usage Analytics**: Monitor feature usage and optimization opportunities
536
+
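+ A minimal alerting sketch for point 2 (the handler is a stand-in; swap the `print` for email, Slack, or PagerDuty in production):
+
+ ```python
+ import logging
+
+ class AlertHandler(logging.Handler):
+     """Fires an alert for ERROR-and-above records."""
+     def emit(self, record):
+         if record.levelno >= logging.ERROR:
+             print(f"ALERT: {self.format(record)}")  # replace with a real notifier
+
+ logging.getLogger().addHandler(AlertHandler())
+ logging.error("Deep analysis failed")  # would trigger an alert
+ ```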
537
+ This troubleshooting guide covers the most common issues you'll encounter with the Auto-Analyst backend. For additional help, check the system logs and use the debugging tools provided.
entrypoint_local.sh ADDED
@@ -0,0 +1,175 @@
1
+ #!/bin/bash
2
+
3
+ # Entrypoint script for Auto-Analyst backend
4
+ # This script safely initializes the database and starts the application
5
+ # SAFE for PostgreSQL/RDS - only modifies SQLite databases
6
+
7
+ set -e # Exit on any error
8
+
9
+ echo "🚀 Starting Auto-Analyst Backend..."
10
+
11
+ # Function to run safe database initialization
12
+ init_production_database() {
13
+ echo "🔧 Running SAFE database initialization..."
14
+
15
+ # Run the safe initialization script inside if/else: with 'set -e' active,
16
+ # a bare failing command would abort the script before any status check
17
+
18
+ # Don't fail if database initialization has issues - let app try to start
19
+ if python scripts/init_production_db.py; then
20
+ echo "✅ Database initialization completed successfully"
21
+ else
22
+ echo "⚠️ Database initialization had issues, but continuing..."
23
+ echo "📋 App will start but some features may not work properly"
24
+ fi
25
+ }
26
+
27
+ # Function to verify basic app imports work
28
+ verify_app_imports() {
29
+ echo "🔍 Verifying application imports..."
30
+ python -c "
31
+ try:
32
+ from app import app
33
+ print('✅ Main application imports successful')
34
+ except Exception as e:
35
+ print(f'❌ Application import failed: {e}')
36
+ exit(1)
37
+ " || {
38
+ echo "❌ Critical application import failure - cannot start"
39
+ exit 1
40
+ }
41
+ }
42
+
43
+ # Function to verify database connectivity (non-failing)
44
+ verify_database_connectivity() {
45
+ echo "🔗 Testing database connectivity..."
46
+ python -c "
47
+ try:
48
+ from src.db.init_db import get_session, is_postgres_db
49
+ from src.db.schemas.models import AgentTemplate
50
+
51
+ db_type = 'PostgreSQL/RDS' if is_postgres_db() else 'SQLite'
52
+ print(f'🗄️ Database type: {db_type}')
53
+
54
+ session = get_session()
55
+
56
+ # Try to query templates if table exists
57
+ try:
58
+ template_count = session.query(AgentTemplate).count()
59
+ print(f'✅ Database connected. Found {template_count} templates.')
60
+ except Exception as table_error:
61
+ print(f'⚠️ Database connected but template table issue: {table_error}')
62
+ print('📋 Template functionality may not work')
63
+ finally:
64
+ session.close()
65
+
66
+ except Exception as e:
67
+ print(f'⚠️ Database connectivity issue: {e}')
68
+ print('📋 App will start but database features may not work')
69
+ "
70
+ # Don't exit on database connectivity issues - let app try to start
71
+ }
72
+
73
+ # Function to populate agents and templates for development (SQLite only)
74
+ # Uses agents_config.json if available, falls back to legacy method
75
+ populate_agents_templates() {
76
+ echo "🔧 Checking if agents/templates need to be populated..."
77
+ python -c "
78
+ try:
79
+ from src.db.init_db import DATABASE_URL
80
+ from src.db.schemas.models import AgentTemplate
81
+ from src.db.init_db import session_factory
82
+
83
+ # Check database type
84
+ if DATABASE_URL.startswith('sqlite'):
85
+ print('🔍 SQLite database detected - checking template population')
86
+
87
+ session = session_factory()
88
+ try:
89
+ template_count = session.query(AgentTemplate).count()
90
+
91
+ if template_count == 0:
92
+ print('📋 No templates found - populating agents and templates...')
93
+ session.close()
94
+ exit(1) # Signal that population is needed
95
+ else:
96
+ print(f'✅ Found {template_count} templates - population not needed')
97
+ session.close()
98
+ exit(0) # Signal that population is not needed
99
+ except Exception as e:
100
+ print(f'⚠️ Error checking templates: {e}')
101
+ print('📋 Will attempt to populate anyway')
102
+ session.close()
103
+ exit(1) # Signal that population is needed
104
+ else:
105
+ print('🔍 PostgreSQL/RDS detected - skipping auto-population')
106
+ exit(0) # Signal that population is not needed
107
+
108
+ except Exception as e:
109
+ print(f'❌ Error during template check: {e}')
110
+ exit(0) # Don't fail startup, just skip population
111
+ "
112
+
113
+ # Check if population is needed (exit code 1 means yes)
114
+ if [ $? -eq 1 ]; then
115
+ echo "🚀 Running agent/template population for SQLite..."
116
+
117
+ # Check if agents_config.json exists (try multiple locations)
118
+ if [ -f "agents_config.json" ] || [ -f "/app/agents_config.json" ] || [ -f "../agents_config.json" ]; then
119
+ echo "📖 Found agents_config.json - validating configuration..."
120
+
121
+ # Validate configuration first
122
+ validation_result=0
123
+ python scripts/populate_agent_templates.py validate || validation_result=$?
124
+
125
+ if [ $validation_result -eq 0 ]; then
126
+ echo "✅ Configuration valid - proceeding with sync"
127
+ python scripts/populate_agent_templates.py sync || populate_status=$?
128
+ else
129
+ echo "⚠️ Configuration validation failed - attempting sync anyway"
130
+ python scripts/populate_agent_templates.py sync || populate_status=$?
131
+ fi
132
+ else
133
+ echo "⚠️ agents_config.json not found - trying legacy method"
134
+ python scripts/populate_agent_templates.py || populate_status=$?
135
+ fi
136
+
137
+ if [ "${populate_status:-0}" -eq 0 ]; then
138
+ echo "✅ Agent/template population completed successfully"
139
+ else
140
+ echo "⚠️ Agent/template population had issues, but continuing..."
141
+ echo "📋 You may need to populate templates manually"
142
+ echo "💡 Tip: Ensure agents_config.json exists in the backend directory"
143
+ fi
144
+ fi
145
+ }
146
+
147
+ # Check if we need to find agents_config.json from space root
148
+ if [ ! -f "/app/agents_config.json" ]; then
149
+ echo "⚠️ agents_config.json not found in container - checking build issues"
150
+ echo "📁 Files in /app directory:"
151
+ ls -la /app/ | head -10
152
+ else
153
+ echo "✅ agents_config.json found in container"
154
+ fi
155
+
156
+ # Main startup sequence
157
+ echo "🔧 Initializing production environment..."
158
+
159
+ # Verify critical imports first
160
+ verify_app_imports
161
+
162
+ # Initialize database safely (won't modify RDS)
163
+ init_production_database
164
+
165
+ # Test database connectivity (non-failing)
166
+ verify_database_connectivity
167
+
168
+ # Populate agents and templates for development (SQLite only)
169
+ populate_agents_templates
170
+
171
+ echo "🎯 Starting FastAPI application..."
172
+ echo "🌐 Application will be available on port 7860"
173
+
174
+ # Start the FastAPI application
175
+ exec uvicorn app:app --host 0.0.0.0 --port 7860
images/AI snapshot-chat.png ADDED

Git LFS Details

  • SHA256: d4bacf72e135239daf86d45a93ee6798aa40e2376498a7944d32b3392ac0ab19
  • Pointer size: 131 Bytes
  • Size of remote file: 305 kB
images/Auto-Analyst Banner.png ADDED

Git LFS Details

  • SHA256: 30a322031c1e8eca20f202d2bda534a921c5c4556ee9ae81f0fcabdb156ab2cd
  • Pointer size: 131 Bytes
  • Size of remote file: 184 kB
images/Auto-analyst-poster.png ADDED

Git LFS Details

  • SHA256: 7ba24a0c523fd084d27fd3f13ae3887095556d16bbcc2b8502fee3b9c8907cdc
  • Pointer size: 131 Bytes
  • Size of remote file: 184 kB
images/Auto-analysts icon small.png ADDED

Git LFS Details

  • SHA256: 5e1f25fd62bef47e389023315b1e3994321ea21eec44b70d9630154672a46d8f
  • Pointer size: 130 Bytes
  • Size of remote file: 10.1 kB
images/auto-analyst logo.png ADDED

Git LFS Details

  • SHA256: 7459da6f81ce2674f304693de6a04c7e0526c92fa7ef5c3b19d98d7a989e7fb7
  • Pointer size: 130 Bytes
  • Size of remote file: 28.1 kB
requirements.txt ADDED
@@ -0,0 +1,62 @@
1
+ aiofiles==24.1.0
2
+ beautifulsoup4==4.13.4
3
+ dspy==2.6.27
4
+ litellm==1.75.2
5
+ email_validator==2.2.0
6
+ fastapi==0.115.5
7
+ fastapi-cli==0.0.7
8
+ FastAPI-SQLAlchemy==0.2.1
9
+ fastapi-sso==0.16.0
10
+ groq==0.18.0
11
+ gunicorn==23.0.0
12
+ huggingface-hub==0.30.2
13
+ joblib==1.4.2
14
+ llama-cloud==0.1.19
15
+ llama-cloud-services==0.6.21
16
+ llama-index==0.12.14
17
+ llama-index-agent-openai==0.4.2
18
+ llama-index-cli==0.4.1
19
+ llama-index-core==0.12.34.post1
20
+ llama-index-embeddings-openai==0.3.1
21
+ llama-index-indices-managed-llama-cloud==0.6.4
22
+ llama-index-llms-openai==0.3.14
23
+ llama-index-multi-modal-llms-openai==0.4.2
24
+ llama-index-program-openai==0.3.1
25
+ llama-index-question-gen-openai==0.3.0
26
+ Markdown==3.7
27
+ matplotlib==3.10.0
28
+ matplotlib-inline==0.1.7
29
+ numpy==2.2.2
30
+ openpyxl==3.1.2
31
+ xlrd==2.0.1
32
+ openai==1.97.0
33
+ pandas==2.2.3
34
+ polars==1.31.0
35
+ pillow==11.1.0
36
+ plotly==5.24.1
37
+ psycopg2==2.9.10
38
+ python-dateutil==2.9.0.post0
39
+ python-dotenv==1.0.1
40
+ requests==2.32.3
41
+ scikit-learn==1.6.1
42
+ scipy==1.15.1
43
+ seaborn==0.13.2
44
+ setuptools==75.8.0
45
+ SQLAlchemy==2.0.37
46
+ statsmodels==0.14.4
47
+ tabulate==0.9.0
48
+ threadpoolctl==3.5.0
49
+ tiktoken==0.8.0
50
+ tokenizers==0.21.0
51
+ tqdm==4.67.1
52
+ urllib3==2.4.0
53
+ uvicorn==0.29.0
54
+ websockets>=13.1.0
55
+ wheel==0.45.1
56
+ xgboost-cpu==3.0.2
57
+ bokeh==3.7.3
58
+ pymc==5.23.0
59
+ lightgbm==4.6.0
60
+ arviz==0.21.0
61
+ optuna==4.3.0
62
+ litellm[proxy]==1.75.2  # pinned to match the litellm entry above
scripts/__init__.py ADDED
File without changes
scripts/format_response.py ADDED
@@ -0,0 +1,1112 @@
1
+ import re
2
+ import json
3
+ import sys
4
+ import contextlib
5
+ from io import StringIO
6
+ import time
7
+ import logging
8
+ from src.utils.logger import Logger
9
+ import textwrap
10
+
11
+ logger = Logger(__name__, level="INFO", see_time=False, console_log=False)
12
+
13
+ @contextlib.contextmanager
14
+ def stdoutIO(stdout=None):
15
+ old = sys.stdout
16
+ if stdout is None:
17
+ stdout = StringIO()
18
+ sys.stdout = stdout
19
+ yield stdout
20
+ sys.stdout = old
21
+
22
+ # Precompile regex patterns for better performance
23
+ SENSITIVE_MODULES = re.compile(r"(os|sys|subprocess|dotenv|requests|http|socket|smtplib|ftplib|telnetlib|paramiko)")
24
+ IMPORT_PATTERN = re.compile(r"^\s*import\s+(" + SENSITIVE_MODULES.pattern + r").*?(\n|$)", re.MULTILINE)
25
+ FROM_IMPORT_PATTERN = re.compile(r"^\s*from\s+(" + SENSITIVE_MODULES.pattern + r").*?(\n|$)", re.MULTILINE)
26
+ DYNAMIC_IMPORT_PATTERN = re.compile(r"__import__\s*\(\s*['\"](" + SENSITIVE_MODULES.pattern + r")['\"].*?\)")
27
+ ENV_ACCESS_PATTERN = re.compile(r"(os\.getenv|os\.environ|load_dotenv|\.__import__\s*\(\s*['\"]os['\"].*?\.environ)")
28
+ FILE_ACCESS_PATTERN = re.compile(r"(open\(|read\(|write\(|file\(|with\s+open)")
29
+
30
+ # Enhanced API key detection patterns
31
+ API_KEY_PATTERNS = [
32
+ # Direct key assignments
33
+ re.compile(r"(?i)(api_?key|access_?token|secret_?key|auth_?token|password|credential|secret)s?\s*=\s*[\"\'][\w\-\+\/\=]{8,}[\"\']"),
34
+ # Function calls with keys
35
+ re.compile(r"(?i)\.set_api_key\(\s*[\"\'][\w\-\+\/\=]{8,}[\"\']"),
36
+ # Dictionary assignments
37
+ re.compile(r"(?i)['\"](?:api_?key|access_?token|secret_?key|auth_?token|password|credential|secret)['\"](?:\s*:\s*)[\"\'][\w\-\+\/\=]{8,}[\"\']"),
38
+ # Common key formats (base64-like, hex)
39
+ re.compile(r"[\"\'](?:[A-Za-z0-9\+\/\=]{32,}|[0-9a-fA-F]{32,})[\"\']"),
40
+ # Bearer token pattern
41
+ re.compile(r"[\"\'](Bearer\s+[\w\-\+\/\=]{8,})[\"\']"),
42
+ # Inline URL with auth
43
+ re.compile(r"https?:\/\/[\w\-\+\/\=]{8,}@")
44
+ ]
45
+
46
+ # Network request patterns
47
+ NETWORK_REQUEST_PATTERNS = re.compile(r"(requests\.|urllib\.|http\.|\.post\(|\.get\(|\.connect\()")
48
+
49
+ def check_security_concerns(code_str):
50
+ """Check code for security concerns and return info about what was found"""
51
+ security_concerns = {
52
+ "has_concern": False,
53
+ "messages": [],
54
+ "blocked_imports": False,
55
+ "blocked_dynamic_imports": False,
56
+ "blocked_env_access": False,
57
+ "blocked_file_access": False,
58
+ "blocked_api_keys": False,
59
+ "blocked_network": False
60
+ }
61
+
62
+ # Check for sensitive imports
63
+ if IMPORT_PATTERN.search(code_str) or FROM_IMPORT_PATTERN.search(code_str):
64
+ security_concerns["has_concern"] = True
65
+ security_concerns["blocked_imports"] = True
66
+ security_concerns["messages"].append("Sensitive module imports blocked")
67
+
68
+ # Check for __import__ bypass technique
69
+ if DYNAMIC_IMPORT_PATTERN.search(code_str):
70
+ security_concerns["has_concern"] = True
71
+ security_concerns["blocked_dynamic_imports"] = True
72
+ security_concerns["messages"].append("Dynamic import of sensitive modules blocked")
73
+
74
+ # Check for environment variables access
75
+ if ENV_ACCESS_PATTERN.search(code_str):
76
+ security_concerns["has_concern"] = True
77
+ security_concerns["blocked_env_access"] = True
78
+ security_concerns["messages"].append("Environment variables access blocked")
79
+
80
+ # Check for file operations
81
+ if FILE_ACCESS_PATTERN.search(code_str):
82
+ security_concerns["has_concern"] = True
83
+ security_concerns["blocked_file_access"] = True
84
+ security_concerns["messages"].append("File operations blocked")
85
+
86
+ # Check for API key patterns
87
+ for pattern in API_KEY_PATTERNS:
88
+ if pattern.search(code_str):
89
+ security_concerns["has_concern"] = True
90
+ security_concerns["blocked_api_keys"] = True
91
+ security_concerns["messages"].append("API key/token usage blocked")
92
+ break
93
+
94
+ # Check for network requests
95
+ if NETWORK_REQUEST_PATTERNS.search(code_str):
96
+ security_concerns["has_concern"] = True
97
+ security_concerns["blocked_network"] = True
98
+ security_concerns["messages"].append("Network requests blocked")
99
+
100
+ return security_concerns
101
+
102
+ def clean_code_for_security(code_str, security_concerns):
103
+ """Apply security modifications to the code based on detected concerns"""
104
+ modified_code = code_str
105
+
106
+ # Block sensitive imports if needed
107
+ if security_concerns["blocked_imports"]:
108
+ modified_code = IMPORT_PATTERN.sub(r'# BLOCKED: import \1\n', modified_code)
109
+ modified_code = FROM_IMPORT_PATTERN.sub(r'# BLOCKED: from \1\n', modified_code)
110
+
111
+ # Block dynamic imports if needed
112
+ if security_concerns["blocked_dynamic_imports"]:
113
+ modified_code = DYNAMIC_IMPORT_PATTERN.sub(r'"BLOCKED_DYNAMIC_IMPORT"', modified_code)
114
+
115
+ # Block environment access if needed
116
+ if security_concerns["blocked_env_access"]:
117
+ modified_code = ENV_ACCESS_PATTERN.sub(r'"BLOCKED_ENV_ACCESS"', modified_code)
118
+
119
+ # Block file operations if needed
120
+ if security_concerns["blocked_file_access"]:
121
+ modified_code = FILE_ACCESS_PATTERN.sub(r'"BLOCKED_FILE_ACCESS"', modified_code)
122
+
123
+ # Block API keys if needed
124
+ if security_concerns["blocked_api_keys"]:
125
+ for pattern in API_KEY_PATTERNS:
126
+ modified_code = pattern.sub(r'"BLOCKED_API_KEY"', modified_code)
127
+
128
+ # Block network requests if needed
129
+ if security_concerns["blocked_network"]:
130
+ modified_code = NETWORK_REQUEST_PATTERNS.sub(r'"BLOCKED_NETWORK_REQUEST"', modified_code)
131
+
132
+ # Add warning banner if needed
133
+ if security_concerns["has_concern"]:
134
+ security_message = "⚠️ SECURITY WARNING: " + ". ".join(security_concerns["messages"]) + "."
135
+ modified_code = f"print('{security_message}')\n\n" + modified_code
136
+
137
+ return modified_code
138
+
139
+ def format_correlation_output(text):
140
+ """Format correlation matrix output for better readability"""
141
+ lines = text.split('\n')
142
+ formatted_lines = []
143
+
144
+ for line in lines:
145
+ # Skip empty lines at the beginning
146
+ if not line.strip() and not formatted_lines:
147
+ continue
148
+
149
+ if not line.strip():
150
+ formatted_lines.append(line)
151
+ continue
152
+
153
+ # Check if this line contains correlation values or variable names
154
+ stripped_line = line.strip()
155
+ parts = stripped_line.split()
156
+
157
+ if len(parts) > 1:
158
+ # Check if this is a header line with variable names
159
+ if all(part.replace('_', '').replace('-', '').isalpha() for part in parts):
160
+ # This is a header row with variable names
161
+ formatted_header = f"{'':12}" # Empty first column for row labels
162
+ for part in parts:
163
+ formatted_header += f"{part:>12}"
164
+ formatted_lines.append(formatted_header)
165
+ elif any(char.isdigit() for char in stripped_line) and ('.' in stripped_line or '-' in stripped_line):
166
+ # This looks like a correlation line with numbers
167
+ row_name = parts[0] if parts else ""
168
+ values = parts[1:] if len(parts) > 1 else []
169
+
170
+ formatted_row = f"{row_name:<12}"
171
+ for value in values:
172
+ try:
173
+ val = float(value)
174
+ formatted_row += f"{val:>12.3f}"
175
+ except ValueError:
176
+ formatted_row += f"{value:>12}"
177
+
178
+ formatted_lines.append(formatted_row)
179
+ else:
180
+ # Other lines (like titles)
181
+ formatted_lines.append(line)
182
+ else:
183
+ formatted_lines.append(line)
184
+
185
+ return '\n'.join(formatted_lines)
186
+
187
+ def format_summary_stats(text):
188
+ """Format summary statistics for better readability"""
189
+ lines = text.split('\n')
190
+ formatted_lines = []
191
+
192
+ for line in lines:
193
+ if not line.strip():
194
+ formatted_lines.append(line)
195
+ continue
196
+
197
+ # Check if this is a header line with statistical terms only (missing first column)
198
+ stripped_line = line.strip()
199
+ if any(stat in stripped_line.lower() for stat in ['count', 'mean', 'median', 'std', 'min', 'max', '25%', '50%', '75%']):
200
+ parts = stripped_line.split()
201
+ # Check if this is a header row (starts with statistical terms)
202
+ if parts and parts[0].lower() in ['count', 'mean', 'median', 'std', 'min', 'max', '25%', '50%', '75%']:
203
+ # This is a header row - add proper spacing
204
+ formatted_header = f"{'':12}" # Empty first column for row labels
205
+ for part in parts:
206
+ formatted_header += f"{part:>15}"
207
+ formatted_lines.append(formatted_header)
208
+ else:
209
+ # This is a data row - format normally
210
+ row_name = parts[0] if parts else ""
211
+ values = parts[1:] if len(parts) > 1 else []
212
+
213
+ formatted_row = f"{row_name:<12}"
214
+ for value in values:
215
+ try:
216
+ if '.' in value or 'e' in value.lower():
217
+ val = float(value)
218
+ if abs(val) >= 1000000:
219
+ formatted_row += f"{val:>15.2e}"
220
+ elif abs(val) >= 1:
221
+ formatted_row += f"{val:>15.2f}"
222
+ else:
223
+ formatted_row += f"{val:>15.6f}"
224
+ else:
225
+ val = int(value)
226
+ formatted_row += f"{val:>15}"
227
+ except ValueError:
228
+ formatted_row += f"{value:>15}"
229
+
230
+ formatted_lines.append(formatted_row)
231
+ else:
232
+ # Other lines (titles, etc.) - keep as is
233
+ formatted_lines.append(line)
234
+
235
+ return '\n'.join(formatted_lines)
236
+
237
+ def clean_print_statements(code_block):
238
+ """
239
+ This function cleans up any `print()` statements that might contain unwanted `\n` characters.
240
+ It ensures print statements are properly formatted without unnecessary newlines.
241
+ """
242
+ # This regex targets print statements, even if they have newlines inside
243
+ return re.sub(r'print\((.*?)(\\n.*?)(.*?)\)', r'print(\1\3)', code_block, flags=re.DOTALL)
244
+
245
+ def remove_code_block_from_summary(summary):
246
+ # use regex to remove code block from summary list
247
+ summary = re.sub(r'```python\n(.*?)\n```', '', summary)
248
+ return summary.split("\n")
249
+
250
+ def remove_main_block(code):
251
+ # Match the __main__ block
252
+ pattern = r'(?m)^if\s+__name__\s*==\s*["\']__main__["\']\s*:\s*\n((?:\s+.*\n?)*)'
253
+
254
+ match = re.search(pattern, code)
255
+ if match:
256
+ main_block = match.group(1)
257
+
258
+ # Dedent the code block inside __main__
259
+ dedented_block = textwrap.dedent(main_block)
260
+
261
+ # Remove \n from any print statements in the block (also handling multiline print cases)
262
+ dedented_block = clean_print_statements(dedented_block)
263
+ # Replace the block in the code
264
+ cleaned_code = re.sub(pattern, dedented_block, code)
265
+
266
+ # Optional: Remove leading newlines if any
267
+ cleaned_code = cleaned_code.strip()
268
+
269
+ return cleaned_code
270
+ return code
271
+
272
+
273
+ def format_code_block(code_str):
274
+ code_clean = re.sub(r'^```python\n?', '', code_str, flags=re.MULTILINE)
275
+ code_clean = re.sub(r'\n```$', '', code_clean)
276
+ return f'\n{code_clean}\n'
277
+
278
+ def format_code_backticked_block(code_str):
279
+ # Add None check at the beginning
280
+ if code_str is None:
281
+ return
282
+
283
+ # Add type check to ensure it's a string
284
+ if not isinstance(code_str, str):
285
+ return f"```python\n# Invalid code type: {type(code_str)}\n```"
286
+
287
+ code_clean = re.sub(r'^```python\n?', '', code_str, flags=re.MULTILINE)
288
+ code_clean = re.sub(r'\n```$', '', code_clean)
289
+ # Only match assignments at top level (not indented)
290
+ # 1. Remove 'df = pd.DataFrame()' if it's at the top level
291
+
292
+
293
+ # Remove reading the csv file if it's already in the context
294
+ modified_code = re.sub(r"df\s*=\s*pd\.read_csv\([\"\'].*?[\"\']\).*?(\n|$)", '', code_clean)
295
+ modified_code = re.sub(r'^(\s*)(df\s*=.*)$', r'\1# \2', code_clean, flags=re.MULTILINE)
296
+
297
+ # Only match assignments at top level (not indented)
298
+ # 1. Remove 'df = pd.DataFrame()' if it's at the top level
299
+ modified_code = re.sub(
300
+ r"^df\s*=\s*pd\.DataFrame\(\s*\)\s*(#.*)?$",
301
+ '',
302
+ modified_code,
303
+ flags=re.MULTILINE
304
+ )
305
+
306
+ # # Remove sample dataframe lines with multiple array values
307
+ modified_code = re.sub(r"^# Sample DataFrames?.*?(\n|$)", '', modified_code, flags=re.MULTILINE | re.IGNORECASE)
308
+
309
+ # # Remove plt.show() statements
310
+ modified_code = re.sub(r"plt\.show\(\).*?(\n|$)", '', modified_code)
311
+
312
+
313
+ # remove main
314
+ code_clean = remove_main_block(modified_code)
315
+
316
+ return f'```python\n{code_clean}\n```'
317
+
318
+
319
+ def execute_code_from_markdown(code_str, dataframe=None):
320
+ import pandas as pd
321
+ import plotly.express as px
322
+ import plotly
323
+ import plotly.graph_objects as go
324
+ import matplotlib.pyplot as plt
325
+ import seaborn as sns
326
+ import numpy as np
327
+ import re
328
+ import traceback
329
+ import sys
330
+ from io import StringIO, BytesIO
331
+ import base64
332
+
333
+ # Check for security concerns in the code
334
+ security_concerns = check_security_concerns(code_str)
335
+
336
+ # Apply security modifications to the code
337
+ modified_code = clean_code_for_security(code_str, security_concerns)
338
+
339
+ # Enhanced print function that detects and formats tabular data
340
+ captured_outputs = []
341
+ original_print = print
342
+
343
+ # Set pandas display options for full table display
344
+ pd.set_option('display.max_columns', None)
345
+ pd.set_option('display.max_rows', 20) # Limit to 20 rows instead of unlimited
346
+ pd.set_option('display.width', None)
347
+ pd.set_option('display.max_colwidth', 50)
348
+ pd.set_option('display.expand_frame_repr', False)
349
+
350
+
351
+
352
+ def enhanced_print(*args, **kwargs):
353
+ # Convert all args to strings
354
+ str_args = [str(arg) for arg in args]
355
+ output_text = kwargs.get('sep', ' ').join(str_args)
356
+
357
+ # Special case for DataFrames - use pipe delimiter and clean format
358
+ if len(args) == 1 and isinstance(args[0], pd.DataFrame):  # length check first so print() with no args cannot raise IndexError
359
+ # Format DataFrame with pipe delimiter using to_csv for reliable column separation
360
+ df = args[0]
361
+
362
+ # Use StringIO to capture CSV output with pipe delimiter
363
+ from io import StringIO
364
+ csv_buffer = StringIO()
365
+
366
+ # Export to CSV with pipe delimiter, preserving index
367
+ df.to_csv(csv_buffer, sep='|', index=True, float_format='%.6g')
368
+ csv_output = csv_buffer.getvalue()
369
+
370
+ # Clean up the CSV output - remove quotes and extra formatting
371
+ lines = csv_output.strip().split('\n')
372
+ cleaned_lines = []
373
+
374
+ for line in lines:
375
+ # Remove any quotes that might have been added by to_csv
376
+ clean_line = line.replace('"', '')
377
+ # Split by pipe, strip whitespace from each part, then rejoin
378
+ parts = [part.strip() for part in clean_line.split('|')]
379
+ cleaned_lines.append(' | '.join(parts))
380
+
381
+ output_text = '\n'.join(cleaned_lines)
382
+ captured_outputs.append(f"<TABLE_START>\n{output_text}\n<TABLE_END>")
383
+ original_print(output_text)
384
+ return
385
+
386
+ # Detect if this looks like tabular data (generic approach)
387
+ is_table = False
388
+
389
+ # Check for table patterns:
390
+ # 1. Multiple lines with consistent spacing
391
+ lines = output_text.split('\n')
392
+ if len(lines) > 2:
393
+ # Count lines that look like they have multiple columns (2+ spaces between words)
394
+ multi_column_lines = sum(1 for line in lines if len(line.split()) > 1 and ' ' in line)
395
+ if multi_column_lines >= 2: # At least 2 lines with multiple columns
396
+ is_table = True
397
+
398
+ # Check for pandas DataFrame patterns like index with column names
399
+ if any(re.search(r'^\s*\d+\s+', line) for line in lines):
400
+ # Look for lines starting with an index number followed by spaces
401
+ is_table = True
402
+
403
+ # Look for table-like structured output with multiple rows of similar format
404
+ if len(lines) >= 3:
405
+ # Sample a few lines to check for consistent structure
406
+ sample_lines = [lines[i] for i in range(min(len(lines), 5)) if i < len(lines) and lines[i].strip()]
407
+
408
+ # Check for consistent whitespace patterns
409
+ if len(sample_lines) >= 2:
410
+ # Get positions of whitespace groups in first line
411
+ whitespace_positions = []
412
+ for i, line in enumerate(sample_lines):
413
+ if not line.strip():
414
+ continue
415
+ positions = [m.start() for m in re.finditer(r'\s{2,}', line)]
416
+ if i == 0:
417
+ whitespace_positions = positions
418
+ elif len(positions) == len(whitespace_positions):
419
+ # Check if whitespace positions are roughly the same
420
+ is_similar = all(abs(pos - whitespace_positions[j]) <= 3
421
+ for j, pos in enumerate(positions)
422
+ if j < len(whitespace_positions))
423
+ if is_similar:
424
+ is_table = True
425
+
426
+ # 2. Contains common table indicators
427
+ if any(indicator in output_text.lower() for indicator in [
428
+ 'count', 'mean', 'std', 'min', 'max', '25%', '50%', '75%', # Summary stats
429
+ 'correlation', 'corr', # Correlation tables
430
+ 'coefficient', 'r-squared', 'p-value', # Regression tables
431
+ ]):
432
+ is_table = True
433
+
434
+ # 3. Has many decimal numbers (likely a data table)
435
+ if output_text.count('.') > 5 and len(lines) > 2:
436
+ is_table = True
437
+
438
+ # If we have detected a table, convert space-delimited to pipe-delimited format
439
+ if is_table:
440
+ # Convert the table to pipe-delimited format for better parsing in frontend
441
+ formatted_lines = []
442
+ for line in lines:
443
+ if not line.strip():
444
+ formatted_lines.append(line) # Keep empty lines
445
+ continue
446
+
447
+ # Split by multiple spaces and join with pipe delimiter
448
+ parts = re.split(r'\s{2,}', line.strip())
449
+ if parts:
450
+ formatted_lines.append(" | ".join(parts))
451
+ else:
452
+ formatted_lines.append(line)
453
+
454
+ # Use the pipe-delimited format
455
+ output_text = "\n".join(formatted_lines)
456
+
457
+ # Format and mark the output for table processing in UI
458
+ captured_outputs.append(f"<TABLE_START>\n{output_text}\n<TABLE_END>")
459
+ else:
460
+ captured_outputs.append(output_text)
461
+
462
+ # Also use original print for stdout capture
463
+ original_print(*args, **kwargs)
464
+
465
+ # Custom matplotlib capture function
466
+ def capture_matplotlib_chart():
467
+ """Capture current matplotlib figure as base64 encoded image"""
468
+ try:
469
+ fig = plt.gcf() # Get current figure
470
+ if fig.get_axes(): # Check if figure has any plots
471
+ buffer = BytesIO()
472
+ fig.savefig(buffer, format='png', dpi=150, bbox_inches='tight',
473
+ facecolor='white', edgecolor='none')
474
+ buffer.seek(0)
475
+ img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
476
+ buffer.close()
477
+ plt.close(fig) # Close the figure to free memory
478
+ return img_base64
479
+ return None
480
+ except Exception:
481
+ return None
482
+
483
+ # Store original plt.show function
484
+ original_plt_show = plt.show
485
+
486
+ def custom_plt_show(*args, **kwargs):
487
+ """Custom plt.show that captures the chart instead of displaying it"""
488
+ img_base64 = capture_matplotlib_chart()
489
+ if img_base64:
490
+ matplotlib_outputs.append(img_base64)
491
+ # Don't call original show to prevent display
492
+
493
+ context = {
494
+ 'pd': pd,
495
+ 'px': px,
496
+ 'go': go,
497
+ 'plt': plt,
498
+ 'plotly': plotly,
499
+ '__builtins__': __builtins__,
500
+ '__import__': __import__,
501
+ 'sns': sns,
502
+ 'np': np,
503
+ 'json_outputs': [], # List to store multiple Plotly JSON outputs
504
+ 'matplotlib_outputs': [], # List to store matplotlib chart images as base64
505
+ 'print': enhanced_print # Replace print with our enhanced version
506
+ }
507
+
508
+ # Add matplotlib_outputs to local scope for the custom show function
509
+ matplotlib_outputs = context['matplotlib_outputs']
510
+
511
+ # Replace plt.show with our custom function
512
+ plt.show = custom_plt_show
513
+
514
+
515
+
516
+ # Modify code to store multiple JSON outputs
517
+ modified_code = re.sub(
518
+ r'(\w*_?)fig(\w*)\.show\(\)',
519
+ r'json_outputs.append(plotly.io.to_json(\1fig\2, pretty=True))',
520
+ modified_code
521
+ )
522
+
523
+ modified_code = re.sub(
524
+ r'(\w*_?)fig(\w*)\.to_html\(.*?\)',
525
+ r'json_outputs.append(plotly.io.to_json(\1fig\2, pretty=True))',
526
+ modified_code
527
+ )
528
+ # Remove reading the csv file if it's already in the context
529
+ modified_code = re.sub(r"df\s*=\s*pd\.read_csv\([\"\'].*?[\"\']\).*?(\n|$)", '', modified_code)
530
+
531
+ # Only match assignments at top level (not indented)
532
+ # 1. Remove 'df = pd.DataFrame()' if it's at the top level
533
+ modified_code = re.sub(
534
+ r"^df\s*=\s*pd\.DataFrame\(\s*\)\s*(#.*)?$",
535
+ '',
536
+ modified_code,
537
+ flags=re.MULTILINE
538
+ )
539
+
540
+
541
+ # Custom display function for DataFrames to show head + tail for large datasets
542
+ original_repr = pd.DataFrame.__repr__
543
+
544
+ def custom_df_repr(self):
545
+ if len(self) > 15:
546
+ # For large DataFrames, show first 10 and last 5 rows
547
+ head_part = self.head(10)
548
+ tail_part = self.tail(5)
549
+
550
+ head_str = head_part.__repr__()
551
+ tail_str = tail_part.__repr__()
552
+
553
+ # Extract just the data rows (skip the header from tail)
554
+ tail_lines = tail_str.split('\n')
555
+ tail_data = '\n'.join(tail_lines[1:]) # Skip header line
556
+
557
+ return f"{head_str}\n...\n{tail_data}"
558
+ else:
559
+ return original_repr(self)
560
+
561
+ # Apply custom representation temporarily
562
+ pd.DataFrame.__repr__ = custom_df_repr
563
+
564
+ # If a dataframe is provided, add it to the context
565
+ if dataframe is not None:
566
+ context['df'] = dataframe
567
+
568
+ # remove pd.read_csv() if it's already in the context
569
+ modified_code = re.sub(r"pd\.read_csv\(\s*[\"\'].*?[\"\']\s*\)", '', modified_code)
570
+
571
+ # Remove sample dataframe lines with multiple array values
572
+ modified_code = re.sub(r"^# Sample DataFrames?.*?(\n|$)", '', modified_code, flags=re.MULTILINE | re.IGNORECASE)
573
+
574
+ # Replace plt.savefig() calls with plt.show() to ensure plots are displayed
575
+ modified_code = re.sub(r'plt\.savefig\([^)]*\)', 'plt.show()', modified_code)
576
+
577
+ # Instead of removing plt.show(), keep them - they'll be handled by our custom function
578
+ # Also handle seaborn plots that might not have explicit plt.show()
579
+ # Add plt.show() after seaborn plot functions if not already present
580
+ seaborn_plot_functions = [
581
+ 'sns.scatterplot', 'sns.lineplot', 'sns.barplot', 'sns.boxplot', 'sns.violinplot',
582
+ 'sns.stripplot', 'sns.swarmplot', 'sns.pointplot', 'sns.catplot', 'sns.relplot',
583
+ 'sns.displot', 'sns.histplot', 'sns.kdeplot', 'sns.ecdfplot', 'sns.rugplot',
584
+ 'sns.distplot', 'sns.jointplot', 'sns.pairplot', 'sns.FacetGrid', 'sns.PairGrid',
585
+ 'sns.heatmap', 'sns.clustermap', 'sns.regplot', 'sns.lmplot', 'sns.residplot'
586
+ ]
587
+
588
+ # Add an automatic plt.show() after each seaborn plot call
589
+ for func in seaborn_plot_functions:
590
+ pattern = rf'({re.escape(func)}\([^)]*\)(?:\.[^(]*\([^)]*\))*)'
591
+ def add_show(match):
592
+ plot_call = match.group(1)
593
+ # Unconditionally append plt.show(); the custom show hook captures the figure
594
+ return f'{plot_call}\nplt.show()'
595
+
596
+ modified_code = re.sub(pattern, add_show, modified_code)
597
+
598
+ # Only add df = pd.read_csv() if no dataframe was provided and the code contains pd.read_csv
599
+ if dataframe is None and 'pd.read_csv' not in modified_code:
600
+ modified_code = re.sub(
601
+ r'import pandas as pd',
602
+ r'import pandas as pd\n\n# Read Housing.csv\ndf = pd.read_csv("Housing.csv")',
603
+ modified_code
604
+ )
605
+
606
+ # Identify code blocks by comments
607
+ code_blocks = []
608
+ current_block = []
609
+ current_block_name = "unknown"
610
+
611
+ for line in modified_code.splitlines():
612
+ # Check if line contains a block identifier comment
613
+ block_match = re.match(r'^# ([a-zA-Z_]+)_agent code start', line)
614
+ if block_match:
615
+ # If we had a previous block, save it
616
+ if current_block:
617
+ code_blocks.append((current_block_name, '\n'.join(current_block)))
618
+ # Start a new block
619
+ current_block_name = block_match.group(1)
620
+ current_block = []
621
+ else:
622
+ current_block.append(line)
623
+
624
+ # Add the last block if it exists
625
+ if current_block:
626
+ code_blocks.append((current_block_name, '\n'.join(current_block)))
627
+
628
+ # Execute each code block separately
629
+ all_outputs = []
630
+ for block_name, block_code in code_blocks:
631
+ try:
632
+ # Clear captured outputs for each block
633
+ captured_outputs.clear()
634
+
635
+ with stdoutIO() as s:
636
+ exec(block_code, context) # Execute the block
637
+
638
+ # Get both stdout and our enhanced captured outputs
639
+ stdout_output = s.getvalue()
640
+
641
+ # Combine outputs, preferring our enhanced format when available
642
+ if captured_outputs:
643
+ combined_output = '\n'.join(captured_outputs)
644
+ else:
645
+ combined_output = stdout_output
646
+
647
+ all_outputs.append((block_name, combined_output, None)) # None means no error
648
+ except Exception as e:
649
+ # Reset pandas options in case of error
650
+ pd.reset_option('display.max_columns')
651
+ pd.reset_option('display.max_rows')
652
+ pd.reset_option('display.width')
653
+ pd.reset_option('display.max_colwidth')
654
+ pd.reset_option('display.expand_frame_repr')
655
+
656
+ # Restore original DataFrame representation in case of error
657
+ pd.DataFrame.__repr__ = original_repr
658
+
659
+ # Restore original plt.show
660
+ plt.show = original_plt_show
661
+
662
+ error_traceback = traceback.format_exc()
663
+
664
+ # Extract error message and error type
665
+ error_message = str(e)
666
+ error_type = type(e).__name__
667
+ error_lines = error_traceback.splitlines()
668
+
669
+ # Format error with context of the actual code
670
+ formatted_error = f"Error in {block_name}_agent: {error_message}\n"
671
+
672
+ # Add first few lines of traceback
673
+ first_lines = error_lines[:3]
674
+ formatted_error += "\n".join(first_lines) + "\n"
675
+
676
+ # Parse problem variables/values from the error message
677
+ problem_vars = []
678
+
679
+ # Look for common error patterns
680
+ if "not in index" in error_message:
681
+ # Extract column names for 'not in index' errors
682
+ column_match = re.search(r"\['([^']+)'(?:, '([^']+)')*\] not in index", error_message)
683
+ if column_match:
684
+ problem_vars = [g for g in column_match.groups() if g is not None]
685
+
686
+ # Look for DataFrame accessing operations and list/variable definitions
687
+ potential_lines = []
688
+ code_lines = block_code.splitlines()
689
+
690
+ # First, find all DataFrame column access patterns
691
+ df_access_patterns = []
692
+ for i, line in enumerate(code_lines):
693
+ # Find DataFrame variables from patterns like "df_name[...]" or "df_name.loc[...]"
694
+ df_matches = re.findall(r'(\w+)(?:\[|\.)(?:loc|iloc|columns|at|iat|\.select)', line)
695
+ for df_var in df_matches:
696
+ df_access_patterns.append((i, df_var))
697
+
698
+ # Find variables that might contain column lists
699
+ for var in problem_vars:
700
+ if re.search(r'\b(numeric_columns|categorical_columns|columns|features|cols)\b', line):
701
+ potential_lines.append(i)
702
+
703
+ # Identify the most likely problematic lines
704
+ if df_access_patterns:
705
+ for i, df_var in df_access_patterns:
706
+ if any(re.search(rf'{df_var}\[.*?\]', line) for line in code_lines):
707
+ potential_lines.append(i)
708
+
709
+ # If no specific lines found yet, look for any DataFrame operations
710
+ if not potential_lines:
711
+ for i, line in enumerate(code_lines):
712
+ if re.search(r'(?:corr|drop|groupby|pivot|merge|join|concat|apply|map|filter|loc|iloc)\(', line):
713
+ potential_lines.append(i)
714
+
715
+ # Sort and deduplicate
716
+ potential_lines = sorted(set(potential_lines))
717
+ elif "name" in error_message and "is not defined" in error_message:
718
+ # Extract variable name for NameError
719
+ var_match = re.search(r"name '([^']+)' is not defined", error_message)
720
+ if var_match:
721
+ problem_vars = [var_match.group(1)]
722
+ elif "object has no attribute" in error_message:
723
+ # Extract attribute name for AttributeError
724
+ attr_match = re.search(r"'([^']+)' object has no attribute '([^']+)'", error_message)
725
+ if attr_match:
726
+ problem_vars = [f"{attr_match.group(1)}.{attr_match.group(2)}"]
727
+
728
+ # Scan code for lines containing the problem variables
729
+ if problem_vars:
730
+ formatted_error += "\nProblem likely in these lines:\n"
731
+ code_lines = block_code.splitlines()
732
+ problem_lines = []
733
+
734
+ # First try direct variable references
735
+ direct_matches = False
736
+ for i, line in enumerate(code_lines):
737
+ if any(var in line for var in problem_vars):
738
+ direct_matches = True
739
+ # Get line and its context (1 line before and after)
740
+ start_idx = max(0, i-1)
741
+ end_idx = min(len(code_lines), i+2)
742
+
743
+ for j in range(start_idx, end_idx):
744
+ line_prefix = f"{j+1}: "
745
+ if j == i: # The line with the problem variable
746
+ problem_lines.append(f"{line_prefix}>>> {code_lines[j]} <<<")
747
+ else:
748
+ problem_lines.append(f"{line_prefix}{code_lines[j]}")
749
+
750
+ problem_lines.append("") # Empty line between sections
751
+
752
+ # If no direct matches found but we identified potential problematic lines for DataFrame issues
753
+ if not direct_matches and "not in index" in error_message and 'potential_lines' in locals():
754
+ for i in potential_lines:
755
+ start_idx = max(0, i-1)
756
+ end_idx = min(len(code_lines), i+2)
757
+
758
+ for j in range(start_idx, end_idx):
759
+ line_prefix = f"{j+1}: "
760
+ if j == i:
761
+ problem_lines.append(f"{line_prefix}>>> {code_lines[j]} <<<")
762
+ else:
763
+ problem_lines.append(f"{line_prefix}{code_lines[j]}")
764
+
765
+ problem_lines.append("") # Empty line between sections
766
+
767
+ if problem_lines:
768
+ formatted_error += "\n".join(problem_lines)
769
+ else:
770
+ # Special message for column errors when we can't find the exact reference
771
+ if "not in index" in error_message:
772
+ formatted_error += (f"Unable to locate direct reference to columns: {', '.join(problem_vars)}\n"
773
+ f"Check for variables that might contain these column names (like numeric_columns, "
774
+ f"categorical_columns, etc.)\n")
775
+ else:
776
+ formatted_error += f"Unable to locate lines containing: {', '.join(problem_vars)}\n"
777
+ else:
778
+ # If we couldn't identify specific variables, check for line numbers in traceback
779
+ for line in reversed(error_lines): # Search from the end of traceback
780
+ # Look for user code references in the traceback
781
+ if ', line ' in line and '<module>' in line:
782
+ try:
783
+ line_num = int(re.search(r', line (\d+)', line).group(1))
784
+ code_lines = block_code.splitlines()
785
+ if 0 < line_num <= len(code_lines):
786
+ line_idx = line_num - 1
787
+ start_idx = max(0, line_idx-2)
788
+ end_idx = min(len(code_lines), line_idx+3)
789
+
790
+ formatted_error += "\nProblem at this location:\n"
791
+ for i in range(start_idx, end_idx):
792
+ line_prefix = f"{i+1}: "
793
+ if i == line_idx:
794
+ formatted_error += f"{line_prefix}>>> {code_lines[i]} <<<\n"
795
+ else:
796
+ formatted_error += f"{line_prefix}{code_lines[i]}\n"
797
+ break
798
+ except (ValueError, AttributeError, IndexError):
799
+ pass
800
+
801
+ # Add the last few lines of the traceback
802
+ formatted_error += "\nFull error details:\n"
803
+ last_lines = error_lines[-3:]
804
+ formatted_error += "\n".join(last_lines)
805
+
806
+ all_outputs.append((block_name, None, formatted_error))
807
+
808
+ # Reset pandas options after execution
809
+ pd.reset_option('display.max_columns')
810
+ pd.reset_option('display.max_rows')
811
+ pd.reset_option('display.width')
812
+ pd.reset_option('display.max_colwidth')
813
+ pd.reset_option('display.expand_frame_repr')
814
+
815
+ # Restore original DataFrame representation
816
+ pd.DataFrame.__repr__ = original_repr
817
+
818
+ # Restore original plt.show
819
+ plt.show = original_plt_show
820
+
821
+ # Compile all outputs and errors
822
+ output_text = ""
823
+ json_outputs = context.get('json_outputs', [])
824
+ matplotlib_outputs = context.get('matplotlib_outputs', [])
825
+ error_found = False
826
+
827
+ for block_name, output, error in all_outputs:
828
+ if error:
829
+ output_text += f"\n\n=== ERROR IN {block_name.upper()}_AGENT ===\n{error}\n"
830
+ error_found = True
831
+ elif output:
832
+ output_text += f"\n\n=== OUTPUT FROM {block_name.upper()}_AGENT ===\n{output}\n"
833
+
834
+ if error_found:
835
+ return output_text, [], []
836
+ else:
837
+ return output_text, json_outputs, matplotlib_outputs
838
+
839
+
840
+ def format_plan_instructions(plan_instructions):
841
+ """
842
+ Format any plan instructions (JSON string or dict) into markdown sections per agent.
843
+ """
844
+ # Parse input into a dict
845
+
846
+ if "basic_qa_agent" in str(plan_instructions):
847
+ return "**Non-Data Request**: Please ask a data-related query; don't waste credits!"
848
+
849
+
850
+ try:
851
+ if isinstance(plan_instructions, str):
852
+ try:
853
+ instructions = json.loads(plan_instructions)
854
+ except json.JSONDecodeError as e:
855
+ # Try to clean the string if it's not valid JSON
856
+ cleaned_str = plan_instructions.strip()
857
+ if cleaned_str.startswith("'") and cleaned_str.endswith("'"):
858
+ cleaned_str = cleaned_str[1:-1]
859
+ try:
860
+ instructions = json.loads(cleaned_str)
861
+ except json.JSONDecodeError:
862
+ raise ValueError(f"Invalid JSON format in plan instructions: {str(e)}")
863
+ elif isinstance(plan_instructions, dict):
864
+ instructions = plan_instructions
865
+ else:
866
+ raise TypeError(f"Unsupported plan instructions type: {type(plan_instructions)}")
867
+ except Exception as e:
868
+ raise ValueError(f"Error processing plan instructions: {str(e)} (lm: {dspy.settings.lm})")
869
+ # logger.log_message(f"Plan instructions: {instructions}", level=logging.INFO)
870
+
871
+
872
+
873
+ markdown_lines = []
874
+ for agent, content in instructions.items():
875
+ if agent != 'basic_qa_agent':
876
+ agent_title = agent.replace('_', ' ').title()
877
+ markdown_lines.append(f"#### {agent_title}")
878
+ if isinstance(content, dict):
879
+ # Handle 'create' key
880
+ create_vals = content.get('create', [])
881
+ if create_vals:
882
+ markdown_lines.append(f"- **Create**:")
883
+ for item in create_vals:
884
+ markdown_lines.append(f" - {item}")
885
+ else:
886
+ markdown_lines.append(f"- **Create**: None")
887
+
888
+ # Handle 'use' key
889
+ use_vals = content.get('use', [])
890
+ if use_vals:
891
+ markdown_lines.append(f"- **Use**:")
892
+ for item in use_vals:
893
+ markdown_lines.append(f" - {item}")
894
+ else:
895
+ markdown_lines.append(f"- **Use**: None")
896
+
897
+ # Handle 'instruction' key
898
+ instr = content.get('instruction')
899
+ if isinstance(instr, str) and instr:
900
+ markdown_lines.append(f"- **Instruction**: {instr}")
901
+ else:
902
+ markdown_lines.append(f"- **Instruction**: None")
903
+ else:
904
+ # Fallback for non-dict content
905
+ markdown_lines.append(f"- {content}")
906
+ markdown_lines.append("") # blank line between agents
907
+ else:
908
+ markdown_lines.append(f"**Non-Data Request**: {content.get('instruction')}")
909
+
910
+ return "\n".join(markdown_lines).strip()
911
+
912
+
913
+ def format_complexity(instructions):
914
+ markdown_lines = []
+ complexity = None  # default so the check below never hits an unbound name
915
+ # Extract complexity from various possible locations in the structure
916
+ if isinstance(instructions, dict):
917
+ # Case 1: Direct complexity field
918
+ if 'complexity' in instructions:
919
+ complexity = instructions['complexity']
920
+ # Case 2: Complexity in 'plan' object
921
+ elif 'plan' in instructions and isinstance(instructions['plan'], dict):
922
+ if 'complexity' in instructions['plan']:
923
+ complexity = instructions['plan']['complexity']
924
+ else:
925
+ complexity = "unrelated"
926
+
927
+ if 'plan' in instructions and isinstance(instructions['plan'], str) and "basic_qa_agent" in instructions['plan']:
928
+ complexity = "unrelated"
929
+
930
+ if complexity:
931
+ # Pink color scheme variations
932
+ color_map = {
933
+ "unrelated": "#FFB6B6", # Light pink
934
+ "basic": "#FF9E9E", # Medium pink
935
+ "intermediate": "#FF7F7F", # Main pink
936
+ "advanced": "#FF5F5F" # Dark pink
937
+ }
938
+
939
+ indicator_map = {
940
+ "unrelated": "○",
941
+ "basic": "●",
942
+ "intermediate": "●●",
943
+ "advanced": "●●●"
944
+ }
945
+
946
+ color = color_map.get(complexity.lower(), "#FFB6B6") # Default to light pink
947
+ indicator = indicator_map.get(complexity.lower(), "○")
948
+
949
+ # Slightly larger display with pink styling
950
+ markdown_lines.append(f"<div style='color: {color}; border: 2px solid {color}; padding: 2px 8px; border-radius: 12px; display: inline-block; font-size: 14.4px;'>{indicator} {complexity}</div>\n")
951
+
952
+ return "\n".join(markdown_lines).strip()
953
+
954
+
955
+ def format_response_to_markdown(api_response, agent_name = None, dataframe=None):
956
+ try:
957
+ markdown = []
958
+ # logger.log_message(f"API response for {agent_name} at {time.strftime('%Y-%m-%d %H:%M:%S')}: {api_response}", level=logging.INFO)
959
+
960
+ if isinstance(api_response, dict):
961
+ for key in api_response:
962
+ if isinstance(api_response[key], dict) and "error" in api_response[key] and "litellm.ratelimiterror" in str(api_response[key]['error']).lower():
963
+ return "**Error**: Rate limit exceeded. Please try switching models from the settings."
964
+ # You can add more checks here if needed for other keys
965
+
966
+ # Handle error responses
967
+ if isinstance(api_response, dict) and "error" in api_response:
968
+ return f"**Error**: {api_response['error']}"
969
+ if "response" in api_response and isinstance(api_response['response'], str):
970
+ if any(err in api_response['response'].lower() for err in ["auth", "api", "lm"]):
971
+ return "**Error**: Authentication failed. Please check your API key in settings and try again."
972
+ if "model" in api_response['response'].lower():
973
+ return "**Error**: Model configuration error. Please verify your model selection in settings."
974
+
975
+ for agent, content in api_response.items():
976
+ agent = agent.split("__")[0] if "__" in agent else agent
977
+ if "memory" in agent or not content:
978
+ continue
979
+
980
+ if "complexity" in content:
981
+ markdown.append(f"{format_complexity(content)}\n")
982
+
983
+ markdown.append(f"\n## {agent.replace('_', ' ').title()}\n")
984
+
985
+ if agent == "analytical_planner":
986
+ logger.log_message(f"Analytical planner content: {content}", level=logging.INFO)
987
+ if 'plan_desc' in content:
988
+ markdown.append(f"### Reasoning\n{content['plan_desc']}\n")
989
+ if 'plan_instructions' in content:
990
+ markdown.append(f"{format_plan_instructions(content['plan_instructions'])}\n")
991
+ else:
992
+ markdown.append(f"### Reasoning\n{content['rationale']}\n")
993
+ else:
994
+ if "rationale" in content:
995
+ markdown.append(f"### Reasoning\n{content['rationale']}\n")
996
+
997
+ if 'code' in content and content['code'] is not None:
998
+ markdown.append(f"### Code Implementation\n{format_code_backticked_block(content['code'])}\n")
999
+ if 'answer' in content:
1000
+ markdown.append(f"### Answer\n{content['answer']}\n\nPlease ask a query about the data.")
1001
+ if 'summary' in content:
1002
+ import re
1003
+ summary_text = content['summary']
1004
+ summary_text = re.sub(r'```python\n(.*?)\n```', '', summary_text, flags=re.DOTALL)
1005
+
1006
+ markdown.append("### Summary\n")
1007
+
1008
+ # Extract pre-list intro, bullet points, and post-list text
1009
+ intro_match = re.split(r'\(\d+\)', summary_text, maxsplit=1)
1010
+ if len(intro_match) > 1:
1011
+ intro_text = intro_match[0].strip()
1012
+ rest_text = "(1)" + intro_match[1] # reattach for bullet parsing
1013
+ else:
1014
+ intro_text = summary_text.strip()
1015
+ rest_text = ""
1016
+
1017
+ if intro_text:
1018
+ markdown.append(f"{intro_text}\n")
1019
+
1020
+ # Split bullets at numbered items like (1)...(8)
1021
+ bullets = re.split(r'\(\d+\)', rest_text)
1022
+ bullets = [b.strip(" ,.\n") for b in bullets if b.strip()]
1023
+
1024
+ # Render each extracted numbered item as a bullet point
1025
+ for i, bullet in enumerate(bullets):
1026
+ markdown.append(f"* {bullet}\n")
1027
+
1031
+ if 'refined_complete_code' in content and 'summary' in content:
1032
+ # Initialize defaults so later references are safe if neither branch below runs
+ markdown_code = None
+ output = None
+ json_outputs = []
+ matplotlib_outputs = []
+ try:
1033
+ if content['refined_complete_code'] is not None and content['refined_complete_code'] != "":
1034
+ clean_code = format_code_block(content['refined_complete_code'])
1035
+ markdown_code = format_code_backticked_block(content['refined_complete_code'])
1036
+ output, json_outputs, matplotlib_outputs = execute_code_from_markdown(clean_code, dataframe)
1037
+ elif "```python" in content['summary']:
1038
+ clean_code = format_code_block(content['summary'])
1039
+ markdown_code = format_code_backticked_block(content['summary'])
1040
+ output, json_outputs, matplotlib_outputs = execute_code_from_markdown(clean_code, dataframe)
1041
+ except Exception as e:
1042
+ logger.log_message(f"Error in execute_code_from_markdown: {str(e)}", level=logging.ERROR)
1043
+ markdown_code = f"**Error**: {str(e)}"
1044
+ output = None
1045
+ json_outputs = []
1046
+ matplotlib_outputs = []
1047
+ # continue
1048
+
1049
+ if markdown_code is not None:
1050
+ markdown.append(f"### Refined Complete Code\n{markdown_code}\n")
1051
+
1052
+ if output:
1053
+ markdown.append("### Execution Output\n")
1054
+ markdown.append(f"```output\n{output}\n```\n")
1055
+
1056
+ if json_outputs:
1057
+ markdown.append("### Plotly JSON Outputs\n")
1058
+ for idx, json_output in enumerate(json_outputs):
1059
+ markdown.append(f"```plotly\n{json_output}\n```\n")
1060
+
1061
+ if matplotlib_outputs:
1062
+ markdown.append("### Matplotlib/Seaborn Charts\n")
1063
+ for idx, img_base64 in enumerate(matplotlib_outputs):
1064
+ markdown.append(f"```matplotlib\n{img_base64}\n```\n")
1065
+ # if agent_name is not None:
1066
+ # if f"memory_{agent_name}" in api_response:
1067
+ # markdown.append(f"### Memory\n{api_response[f'memory_{agent_name}']}\n")
1068
+
1069
+ except Exception as e:
1070
+ logger.log_message(f"Error in format_response_to_markdown: {str(e)}", level=logging.ERROR)
1071
+ return f"**Error**: failed to format markdown: {str(e)}"
1072
+
1073
+ # logger.log_message(f"Generated markdown content for agent '{agent_name}' at {time.strftime('%Y-%m-%d %H:%M:%S')}: {markdown}, length: {len(markdown)}", level=logging.INFO)
1074
+
1075
+ if not markdown or len(markdown) <= 1:
1076
+ logger.log_message(
1077
+ f"Invalid markdown content for agent '{agent_name}' at {time.strftime('%Y-%m-%d %H:%M:%S')}: "
1078
+ f"Content: '{markdown}', Type: {type(markdown)}, Length: {len(markdown) if markdown else 0}, "
1079
+ f"API Response: {api_response}",
1080
+ level=logging.ERROR
1081
+ )
1082
+ return ""
1083
+
1084
+ return '\n'.join(markdown)
1085
+
1086
+
1087
+ # Example usage with dummy data
1088
+ if __name__ == "__main__":
1089
+ sample_response = {
1090
+ "code_combiner_agent": {
1091
+ "reasoning": "Sample reasoning for multiple charts.",
1092
+ "refined_complete_code": """
1093
+ ```python
1094
+ import plotly.express as px
1095
+ import pandas as pd
1096
+
1097
+ # Sample Data
1098
+ df = pd.DataFrame({'Category': ['A', 'B', 'C'], 'Values': [10, 20, 30]})
1099
+
1100
+ # First Chart
1101
+ fig = px.bar(df, x='Category', y='Values', title='Bar Chart')
1102
+ fig.show()
1103
+
1104
+ # Second Chart
1105
+ fig2 = px.pie(df, values='Values', names='Category', title='Pie Chart')
1106
+ fig2.show()
1107
+ ```
1108
+ """
1109
+ }
1110
+ }
1111
+
1112
+ formatted_md = format_response_to_markdown(sample_response)
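
A minimal usage sketch for `format_plan_instructions`, assuming the planner emits the `create`/`use`/`instruction` keys the function reads above; the plan dict and its values here are hypothetical:

```python
# Hypothetical planner output, shaped like the per-agent dicts the function expects.
sample_plan = {
    "preprocessing_agent": {
        "create": ["df_clean"],
        "use": ["df"],
        "instruction": "Drop rows with missing prices and parse date columns."
    },
    "data_viz_agent": {
        "create": [],
        "use": ["df_clean"],
        "instruction": "Plot the price distribution by region."
    },
}

print(format_plan_instructions(sample_plan))
# Per the logic above, this renders a "#### Preprocessing Agent" section and a
# "#### Data Viz Agent" section, each with Create/Use/Instruction bullets.
```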
scripts/init_production_db.py ADDED
@@ -0,0 +1,191 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Production database initialization script.
4
+ This ensures templates are populated properly and verifies database health.
5
+ SAFE for PostgreSQL/RDS - only creates tables on SQLite databases.
6
+ """
7
+
8
+ import sys
9
+ import os
10
+ import logging
11
+ from datetime import datetime, UTC
12
+
13
+ # Add the project root to the Python path
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+
16
+ from src.db.init_db import init_db, session_factory, engine, is_postgres_db
17
+ from src.db.schemas.models import Base, AgentTemplate, UserTemplatePreference
18
+ from scripts.populate_agent_templates import populate_templates
19
+ from sqlalchemy import inspect, text
20
+ from src.utils.logger import Logger
21
+
22
+ logger = Logger("init_production_db", see_time=True, console_log=True)
23
+
24
+ def get_database_type():
25
+ """Get the database type (sqlite or postgresql)."""
26
+ try:
27
+ if is_postgres_db():
28
+ return "postgresql"
29
+ else:
30
+ return "sqlite"
31
+ except Exception as e:
32
+ logger.log_message(f"Error determining database type: {e}", logging.ERROR)
33
+ return "unknown"
34
+
35
+ def check_table_exists(table_name: str) -> bool:
36
+ """Check if a table exists in the database."""
37
+ try:
38
+ inspector = inspect(engine)
39
+ tables = inspector.get_table_names()
40
+ return table_name in tables
41
+ except Exception as e:
42
+ logger.log_message(f"Error checking table existence: {e}", logging.ERROR)
43
+ return False
44
+
45
+ def verify_database_schema():
46
+ """Verify that all required tables exist. Only create tables on SQLite."""
47
+ db_type = get_database_type()
48
+ logger.log_message(f"🔍 Verifying database schema for {db_type.upper()} database...", logging.INFO)
49
+
50
+ required_tables = [
51
+ 'users', 'chats', 'messages', 'model_usage', 'code_executions',
52
+ 'message_feedback', 'deep_analysis_reports', 'agent_templates',
53
+ 'user_template_preferences'
54
+ ]
55
+
56
+ missing_tables = []
57
+ existing_tables = []
58
+
59
+ for table in required_tables:
60
+ if not check_table_exists(table):
61
+ missing_tables.append(table)
62
+ logger.log_message(f"❌ Missing table: {table}", logging.WARNING)
63
+ else:
64
+ existing_tables.append(table)
65
+ logger.log_message(f"✅ Table exists: {table}", logging.INFO)
66
+
67
+ if missing_tables:
68
+ if db_type == "sqlite":
69
+ logger.log_message(f"🔧 Creating missing tables on SQLite: {missing_tables}", logging.INFO)
70
+ try:
71
+ # Safe to create tables on SQLite
72
+ Base.metadata.create_all(engine)
73
+ logger.log_message("✅ All tables created successfully on SQLite", logging.INFO)
74
+ except Exception as e:
75
+ logger.log_message(f"❌ Failed to create tables: {e}", logging.ERROR)
76
+ raise
77
+ else:
78
+ # PostgreSQL/RDS - DO NOT create tables automatically
79
+ logger.log_message(f"⚠️ WARNING: Missing tables detected in {db_type.upper()} database: {missing_tables}", logging.WARNING)
80
+ logger.log_message("🛡️ SAFETY: Not creating tables automatically on PostgreSQL/RDS", logging.INFO)
81
+ logger.log_message("📋 Please ensure these tables exist in your RDS database:", logging.INFO)
82
+ for table in missing_tables:
83
+ logger.log_message(f" - {table}", logging.INFO)
84
+
85
+ # Continue without failing - the app might still work with existing tables
86
+ if 'agent_templates' in missing_tables or 'user_template_preferences' in missing_tables:
87
+ logger.log_message("⚠️ Template functionality may not work without agent_templates and user_template_preferences tables", logging.WARNING)
88
+ else:
89
+ logger.log_message(f"✅ All required tables exist in {db_type.upper()} database", logging.INFO)
90
+
91
+ def verify_template_data():
92
+ """Verify that agent templates are populated. Safe for all database types."""
93
+ logger.log_message("📋 Verifying template data...", logging.INFO)
94
+
95
+ session = session_factory()
96
+ try:
97
+ # Check if agent_templates table exists before querying
98
+ if not check_table_exists('agent_templates'):
99
+ logger.log_message("⚠️ agent_templates table does not exist, skipping template verification", logging.WARNING)
100
+ return
101
+
102
+ template_count = session.query(AgentTemplate).filter(AgentTemplate.is_active == True).count()
103
+ logger.log_message(f"📊 Found {template_count} active templates", logging.INFO)
104
+
105
+ if template_count == 0:
106
+ logger.log_message("🔧 No templates found, populating...", logging.INFO)
107
+ try:
108
+ populate_templates()
109
+
110
+ # Verify population worked
111
+ new_count = session.query(AgentTemplate).filter(AgentTemplate.is_active == True).count()
112
+ logger.log_message(f"✅ Templates populated. Total active templates: {new_count}", logging.INFO)
113
+ except Exception as e:
114
+ logger.log_message(f"❌ Template population failed: {e}", logging.ERROR)
115
+ logger.log_message("⚠️ App will continue but template functionality may not work", logging.WARNING)
116
+ else:
117
+ logger.log_message("✅ Templates already populated", logging.INFO)
118
+
119
+ except Exception as e:
120
+ logger.log_message(f"❌ Error verifying templates: {e}", logging.ERROR)
121
+ logger.log_message("⚠️ Template verification failed, but app will continue", logging.WARNING)
122
+ finally:
123
+ session.close()
124
+
125
+ def test_template_api_functionality():
126
+ """Test that template-related database operations work. Safe for all database types."""
127
+ logger.log_message("🧪 Testing template API functionality...", logging.INFO)
128
+
129
+ session = session_factory()
130
+ try:
131
+ # Check if agent_templates table exists before testing
132
+ if not check_table_exists('agent_templates'):
133
+ logger.log_message("⚠️ agent_templates table does not exist, skipping API test", logging.WARNING)
134
+ return
135
+
136
+ # Test basic template query
137
+ templates = session.query(AgentTemplate).filter(AgentTemplate.is_active == True).limit(5).all()
138
+ logger.log_message(f"✅ Successfully queried {len(templates)} templates", logging.INFO)
139
+
140
+ if templates:
141
+ sample_template = templates[0]
142
+ logger.log_message(f"📄 Sample template: {sample_template.template_name} - {sample_template.display_name}", logging.INFO)
143
+ else:
144
+ logger.log_message("📭 No templates found in database", logging.INFO)
145
+
146
+ except Exception as e:
147
+ logger.log_message(f"❌ Template API test failed: {e}", logging.ERROR)
148
+ logger.log_message("⚠️ Template API may not work properly", logging.WARNING)
149
+ finally:
150
+ session.close()
151
+
152
+ def run_safe_initialization():
153
+ """Run safe database initialization that respects production databases."""
154
+ db_type = get_database_type()
155
+ logger.log_message(f"🚀 Starting SAFE database initialization for {db_type.upper()}...", logging.INFO)
156
+
157
+ if db_type == "postgresql":
158
+ logger.log_message("🛡️ PostgreSQL/RDS detected - running in SAFE mode", logging.INFO)
159
+ logger.log_message("📋 Will only verify schema and populate templates", logging.INFO)
160
+ elif db_type == "sqlite":
161
+ logger.log_message("💽 SQLite detected - full initialization mode", logging.INFO)
162
+
163
+ try:
164
+ # Step 1: Initialize database (safe for all types)
165
+ logger.log_message("Step 1: Basic database initialization", logging.INFO)
166
+ if db_type == "sqlite":
167
+ init_db() # Only run full init on SQLite
168
+ else:
169
+ logger.log_message("Skipping init_db() for PostgreSQL (safety)", logging.INFO)
170
+
171
+ # Step 2: Verify schema (safe - only creates tables on SQLite)
172
+ logger.log_message("Step 2: Schema verification", logging.INFO)
173
+ verify_database_schema()
174
+
175
+ # Step 3: Verify template data (safe for all types)
176
+ logger.log_message("Step 3: Template data verification", logging.INFO)
177
+ verify_template_data()
178
+
179
+ # Step 4: Test functionality (safe for all types)
180
+ logger.log_message("Step 4: Functionality testing", logging.INFO)
181
+ test_template_api_functionality()
182
+
183
+ logger.log_message(f"🎉 Safe database initialization completed for {db_type.upper()}!", logging.INFO)
184
+
185
+ except Exception as e:
186
+ logger.log_message(f"💥 Database initialization failed: {e}", logging.ERROR)
187
+ logger.log_message("⚠️ App may still start but some features might not work", logging.WARNING)
188
+ # Don't raise - let the app try to start anyway
189
+
190
+ if __name__ == "__main__":
191
+ run_safe_initialization()
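
A sketch of driving the safe initializer programmatically rather than as a script; it assumes the working directory is the backend root so `scripts` and `src` are importable, exactly as the `sys.path` setup above expects:

```python
# Equivalent to `python scripts/init_production_db.py`, run from the backend root.
from scripts.init_production_db import get_database_type, run_safe_initialization

print(f"Detected database type: {get_database_type()}")
run_safe_initialization()  # full init on SQLite; verification-only on PostgreSQL/RDS
```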
scripts/populate_agent_templates.py ADDED
@@ -0,0 +1,508 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SQLite Agent Template Management Script
4
+ Similar to manage_templates.py but optimized for local SQLite development.
5
+ Reads agents from agents_config.json and manages SQLite database.
6
+ """
7
+
8
+ import sys
9
+ import os
10
+ import json
11
+ import requests
12
+ from datetime import datetime, UTC
13
+ from pathlib import Path
14
+
15
+ # Add the project root to the Python path
16
+ script_dir = os.path.dirname(os.path.abspath(__file__))
17
+ backend_dir = os.path.dirname(script_dir)
18
+ project_root = os.path.dirname(os.path.dirname(backend_dir))
19
+
20
+ # Change to backend directory to ensure proper path resolution
21
+ os.chdir(backend_dir)
22
+ sys.path.append(backend_dir)
23
+
24
+ from src.db.init_db import session_factory, DATABASE_URL
25
+ from src.db.schemas.models import AgentTemplate
26
+ from sqlalchemy.exc import IntegrityError
27
+
28
+ def get_database_type():
29
+ """Detect database type from DATABASE_URL"""
30
+ if DATABASE_URL.startswith('postgresql'):
31
+ return "postgresql"
32
+ elif DATABASE_URL.startswith('sqlite'):
33
+ return "sqlite"
34
+ else:
35
+ return "unknown"
36
+
37
+ def load_agents_config():
38
+ """Load agents configuration from agents_config.json"""
39
+ # Try multiple possible locations for agents_config.json
40
+ possible_paths = [
41
+ os.path.join(backend_dir, 'agents_config.json'), # Backend directory (copied file)
42
+ os.path.join(project_root, 'agents_config.json'), # Project root
43
+ '/app/agents_config.json', # Container root (HF Spaces)
44
+ 'agents_config.json' # Current directory
45
+ ]
46
+
47
+ config_path = None
48
+ for path in possible_paths:
49
+ if os.path.exists(path):
50
+ config_path = path
51
+ print(f"📖 Found agents_config.json at: {config_path}")
52
+ break
53
+
54
+ if not config_path:
55
+ paths_str = '\n '.join(possible_paths)
56
+ raise FileNotFoundError(f"agents_config.json not found in any of these locations:\n {paths_str}")
57
+
58
+ with open(config_path, 'r', encoding='utf-8') as f:
59
+ config = json.load(f)
60
+
61
+ return config.get('templates', [])
62
+
63
+ def download_icon(icon_url, template_name):
64
+ """Download icon from URL and save to frontend directory"""
65
+ if not icon_url or not icon_url.startswith('http'):
66
+ print(f"⏭️ Skipping icon download for {template_name} (not a URL: {icon_url})")
67
+ return icon_url
68
+
69
+ try:
70
+ # Determine frontend directory
71
+ frontend_dir = os.path.join(project_root, 'Auto-Analyst-CS', 'auto-analyst-frontend')
72
+ public_dir = os.path.join(frontend_dir, 'public')
73
+
74
+ if not os.path.exists(public_dir):
75
+ print(f"⚠️ Frontend public directory not found: {public_dir}")
76
+ return icon_url
77
+
78
+ # Parse the path from icon_url
79
+ if '/icons/templates/' in icon_url:
80
+ relative_path = icon_url.split('/icons/templates/')[-1]
81
+ icon_dir = os.path.join(public_dir, 'icons', 'templates')
82
+ else:
83
+ # Fallback: use filename from URL
84
+ filename = icon_url.split('/')[-1]
85
+ if not filename.endswith(('.svg', '.png', '.jpg', '.jpeg')):
86
+ filename += '.svg'
87
+ relative_path = filename
88
+ icon_dir = os.path.join(public_dir, 'icons', 'templates')
89
+
90
+ # Create icon directory if it doesn't exist
91
+ os.makedirs(icon_dir, exist_ok=True)
92
+
93
+ # Download and save icon
94
+ icon_path = os.path.join(icon_dir, relative_path)
95
+
96
+ # Skip if already exists
97
+ if os.path.exists(icon_path):
98
+ print(f"📁 Icon already exists: {relative_path}")
99
+ return f"/icons/templates/{relative_path}"
100
+
101
+ response = requests.get(icon_url, timeout=10)
102
+ response.raise_for_status()
103
+
104
+ with open(icon_path, 'wb') as f:
105
+ f.write(response.content)
106
+
107
+ print(f"📥 Downloaded icon: {relative_path}")
108
+ return f"/icons/templates/{relative_path}"
109
+
110
+ except Exception as e:
111
+ print(f"❌ Failed to download icon for {template_name}: {str(e)}")
112
+ return icon_url
113
+
114
+ def sync_agents_from_config():
115
+ """Synchronize agents from agents_config.json to SQLite database"""
116
+ session = session_factory()
117
+ db_type = get_database_type()
118
+
119
+ # if db_type != "sqlite":
120
+ # print(f"⚠️ This script is designed for SQLite, but detected {db_type}")
121
+ # print("Consider using manage_templates.py for PostgreSQL")
122
+ # return
123
+
124
+ try:
125
+ # Load configuration
126
+ print(f"📖 Loading agents from agents_config.json...")
127
+ templates_config = load_agents_config()
128
+
129
+ if not templates_config:
130
+ print("❌ No templates found in agents_config.json")
131
+ return
132
+
133
+ # Track statistics
134
+ created_count = 0
135
+ updated_count = 0
136
+ skipped_count = 0
137
+
138
+ print(f"🔍 Processing {len(templates_config)} templates for SQLite database")
139
+ print(f"📋 Database URL: {DATABASE_URL}")
140
+
141
+ # Group templates by category for display
142
+ categories = {}
143
+ for template_data in templates_config:
144
+ category = template_data.get('category', 'Uncategorized')
145
+ if category not in categories:
146
+ categories[category] = []
147
+ categories[category].append(template_data)
148
+
149
+ # Process templates by category
150
+ for category, templates in categories.items():
151
+ print(f"\n📁 {category}:")
152
+
153
+ for template_data in templates:
154
+ template_name = template_data["template_name"]
155
+
156
+ # Check if template already exists
157
+ existing = session.query(AgentTemplate).filter(
158
+ AgentTemplate.template_name == template_name
159
+ ).first()
160
+
161
+ # Download icon if it's a URL
162
+ icon_url = template_data.get("icon_url", "")
163
+ if icon_url.startswith('http'):
164
+ icon_url = download_icon(icon_url, template_name)
165
+
166
+ if existing:
167
+ # Update existing template
168
+ existing.display_name = template_data["display_name"]
169
+ existing.description = template_data["description"]
170
+ existing.icon_url = icon_url
171
+ existing.prompt_template = template_data["prompt_template"]
172
+ existing.category = template_data.get("category", "Uncategorized")
173
+ existing.is_premium_only = template_data.get("is_premium_only", False)
174
+ existing.is_active = template_data.get("is_active", True)
175
+ existing.variant_type = template_data.get("variant_type", "individual")
176
+ existing.base_agent = template_data.get("base_agent", template_name)
177
+ existing.updated_at = datetime.now(UTC)
178
+
179
+ variant_icon = "🤖" if template_data.get("variant_type") == "planner" else "👤"
180
+ premium_icon = "🔒" if template_data.get("is_premium_only") else "🆓"
181
+ print(f"🔄 Updated: {template_name} {variant_icon} {premium_icon}")
182
+ updated_count += 1
183
+ else:
184
+ # Create new template
185
+ template = AgentTemplate(
186
+ template_name=template_name,
187
+ display_name=template_data["display_name"],
188
+ description=template_data["description"],
189
+ icon_url=icon_url,
190
+ prompt_template=template_data["prompt_template"],
191
+ category=template_data.get("category", "Uncategorized"),
192
+ is_premium_only=template_data.get("is_premium_only", False),
193
+ is_active=template_data.get("is_active", True),
194
+ variant_type=template_data.get("variant_type", "individual"),
195
+ base_agent=template_data.get("base_agent", template_name),
196
+ created_at=datetime.now(UTC),
197
+ updated_at=datetime.now(UTC)
198
+ )
199
+
200
+ session.add(template)
201
+ variant_icon = "🤖" if template_data.get("variant_type") == "planner" else "👤"
202
+ premium_icon = "🔒" if template_data.get("is_premium_only") else "🆓"
203
+ print(f"✅ Created: {template_name} {variant_icon} {premium_icon}")
204
+ created_count += 1
205
+
206
+ # Handle removals if specified in config
207
+ remove_list = []
208
+ # Re-load the full config to check for removals
209
+ try:
210
+ full_config_path = None
211
+ possible_paths = [
212
+ os.path.join(backend_dir, 'agents_config.json'),
213
+ os.path.join(project_root, 'agents_config.json'),
214
+ '/app/agents_config.json',
215
+ 'agents_config.json'
216
+ ]
217
+
218
+ for path in possible_paths:
219
+ if os.path.exists(path):
220
+ full_config_path = path
221
+ break
222
+
223
+ if full_config_path:
224
+ with open(full_config_path, 'r', encoding='utf-8') as f:
225
+ full_config = json.load(f)
226
+ if 'remove' in full_config:
227
+ remove_list = full_config['remove']
228
+ except Exception as e:
229
+ print(f"⚠️ Could not load removal list: {e}")
230
+
231
+ # Remove templates marked for removal
232
+ if remove_list:
233
+ print(f"\n🗑️ --- Processing Removals ---")
234
+ for template_name in remove_list:
235
+ existing = session.query(AgentTemplate).filter(
236
+ AgentTemplate.template_name == template_name
237
+ ).first()
238
+
239
+ if existing:
240
+ session.delete(existing)
241
+ print(f"🗑️ Removed: {template_name}")
242
+ else:
243
+ print(f"⏭️ Skipping removal: {template_name} (not found)")
244
+
245
+ # Commit all changes
246
+ session.commit()
247
+
248
+ print(f"\n📊 --- Summary ---")
249
+ print(f"✅ Templates created: {created_count}")
250
+ print(f"🔄 Templates updated: {updated_count}")
251
+ print(f"⏭️ Templates skipped: {skipped_count}")
252
+
253
+ # Show total count in database
254
+ total_count = session.query(AgentTemplate).count()
255
+ free_count = session.query(AgentTemplate).filter(AgentTemplate.is_premium_only == False).count()
256
+ premium_count = session.query(AgentTemplate).filter(AgentTemplate.is_premium_only == True).count()
257
+ individual_count = session.query(AgentTemplate).filter(AgentTemplate.variant_type == 'individual').count()
258
+ planner_count = session.query(AgentTemplate).filter(AgentTemplate.variant_type == 'planner').count()
259
+
260
+ print(f"🗄️ Total templates in database: {total_count}")
261
+ print(f"🆓 Free templates: {free_count}")
262
+ print(f"🔒 Premium templates: {premium_count}")
263
+ print(f"👤 Individual variants: {individual_count}")
264
+ print(f"🤖 Planner variants: {planner_count}")
265
+
266
+ except Exception as e:
267
+ session.rollback()
268
+ print(f"❌ Error syncing templates: {str(e)}")
269
+ raise
270
+ finally:
271
+ session.close()
272
+
273
+ def list_templates():
274
+ """List all existing templates in the database"""
275
+ session = session_factory()
276
+
277
+ try:
278
+ templates = session.query(AgentTemplate).order_by(
279
+ AgentTemplate.category,
280
+ AgentTemplate.is_premium_only,
281
+ AgentTemplate.template_name
282
+ ).all()
283
+
284
+ if not templates:
285
+ print("No templates found in database.")
286
+ return
287
+
288
+ print(f"\n--- Existing Templates ({len(templates)} total) ---")
289
+
290
+ current_category = None
291
+ for template in templates:
292
+ if template.category != current_category:
293
+ current_category = template.category
294
+ print(f"\n📁 {current_category}:")
295
+
296
+ status = "🔒 Premium" if template.is_premium_only else "🆓 Free"
297
+ active = "✅ Active" if template.is_active else "❌ Inactive"
298
+ variant = getattr(template, 'variant_type', 'individual')
299
+ variant_icon = "🤖" if variant == "planner" else "👤"
300
+
301
+ print(f" • {template.template_name} ({template.display_name})")
302
+ print(f" {status} - {active} - {variant_icon} {variant}")
303
+ print(f" 📝 {template.description}")
304
+
305
+ except Exception as e:
306
+ print(f"❌ Error listing templates: {str(e)}")
307
+ finally:
308
+ session.close()
309
+
310
+ def remove_all_templates():
311
+ """Remove all templates from database (for testing)"""
312
+ session = session_factory()
313
+
314
+ try:
315
+ deleted_count = session.query(AgentTemplate).delete()
316
+ session.commit()
317
+ print(f"🗑️ Removed {deleted_count} templates from database")
318
+
319
+ except Exception as e:
320
+ session.rollback()
321
+ print(f"❌ Error removing templates: {str(e)}")
322
+ finally:
323
+ session.close()
324
+
325
+ def validate_config():
326
+ """Validate the agents_config.json structure"""
327
+ try:
328
+ templates_config = load_agents_config()
329
+
330
+ print(f"📋 Validating agents_config.json...")
331
+ print(f"✅ Found {len(templates_config)} templates")
332
+
333
+ # Check required fields
334
+ required_fields = ['template_name', 'display_name', 'description', 'prompt_template']
335
+ issues = []
336
+
337
+ for i, template in enumerate(templates_config):
338
+ for field in required_fields:
339
+ if field not in template:
340
+ issues.append(f"Template {i}: Missing required field '{field}'")
341
+
342
+ if issues:
343
+ print(f"❌ Validation issues found:")
344
+ for issue in issues:
345
+ print(f" • {issue}")
346
+ else:
347
+ print(f"✅ Configuration is valid")
348
+
349
+ # Show summary by category
350
+ categories = {}
351
+ for template in templates_config:
352
+ category = template.get('category', 'Uncategorized')
353
+ if category not in categories:
354
+ categories[category] = {'free': 0, 'premium': 0, 'individual': 0, 'planner': 0}
355
+
356
+ if template.get('is_premium_only', False):
357
+ categories[category]['premium'] += 1
358
+ else:
359
+ categories[category]['free'] += 1
360
+
361
+ if template.get('variant_type', 'individual') == 'planner':
362
+ categories[category]['planner'] += 1
363
+ else:
364
+ categories[category]['individual'] += 1
365
+
366
+ print(f"\n📊 Summary by category:")
367
+ for category, counts in categories.items():
368
+ total = counts['free'] + counts['premium']
369
+ print(f" 📁 {category}: {total} templates")
370
+ print(f" 🆓 Free: {counts['free']} | 🔒 Premium: {counts['premium']}")
371
+ print(f" 👤 Individual: {counts['individual']} | 🤖 Planner: {counts['planner']}")
372
+
373
+ except Exception as e:
374
+ print(f"❌ Error validating config: {str(e)}")
375
+
376
+ def create_minimal_templates():
377
+ """Create a minimal set of essential templates for container environments"""
378
+ session = session_factory()
379
+
380
+ try:
381
+ print("🔧 Creating minimal template set...")
382
+
383
+ # Define minimal essential templates
384
+ minimal_templates = [
385
+ {
386
+ "template_name": "preprocessing_agent",
387
+ "display_name": "Data Preprocessing Agent",
388
+ "description": "Cleans and prepares DataFrame using Pandas and NumPy",
389
+ "icon_url": "/icons/templates/preprocessing_agent.svg",
390
+ "category": "Data Manipulation",
391
+ "is_premium_only": False,
392
+ "variant_type": "individual",
393
+ "base_agent": "preprocessing_agent",
394
+ "is_active": True,
395
+ "prompt_template": "You are a preprocessing agent that cleans and prepares data using Pandas and NumPy. Handle missing values, detect column types, and convert date strings to datetime. Generate clean Python code for data preprocessing based on the user's analysis goals."
396
+ },
397
+ {
398
+ "template_name": "data_viz_agent",
399
+ "display_name": "Data Visualization Agent",
400
+ "description": "Creates interactive visualizations using Plotly",
401
+ "icon_url": "/icons/templates/data_viz_agent.svg",
402
+ "category": "Data Visualization",
403
+ "is_premium_only": False,
404
+ "variant_type": "individual",
405
+ "base_agent": "data_viz_agent",
406
+ "is_active": True,
407
+ "prompt_template": "You are a data visualization agent. Create interactive visualizations using Plotly based on user requirements. Generate appropriate chart types, apply styling, and ensure visualizations effectively communicate insights."
408
+ },
409
+ {
410
+ "template_name": "sk_learn_agent",
411
+ "display_name": "Machine Learning Agent",
412
+ "description": "Trains ML models using scikit-learn",
413
+ "icon_url": "/icons/templates/sk_learn_agent.svg",
414
+ "category": "Data Modelling",
415
+ "is_premium_only": False,
416
+ "variant_type": "individual",
417
+ "base_agent": "sk_learn_agent",
418
+ "is_active": True,
419
+ "prompt_template": "You are a machine learning agent. Use scikit-learn to train and evaluate ML models including classification, regression, and clustering. Provide feature importance insights and model performance metrics."
420
+ }
421
+ ]
422
+
423
+ created_count = 0
424
+
425
+ for template_data in minimal_templates:
426
+ template_name = template_data["template_name"]
427
+
428
+ # Check if template already exists
429
+ existing = session.query(AgentTemplate).filter(
430
+ AgentTemplate.template_name == template_name
431
+ ).first()
432
+
433
+ if not existing:
434
+ template = AgentTemplate(
435
+ template_name=template_name,
436
+ display_name=template_data["display_name"],
437
+ description=template_data["description"],
438
+ icon_url=template_data["icon_url"],
439
+ prompt_template=template_data["prompt_template"],
440
+ category=template_data["category"],
441
+ is_premium_only=template_data["is_premium_only"],
442
+ is_active=template_data["is_active"],
443
+ variant_type=template_data["variant_type"],
444
+ base_agent=template_data["base_agent"],
445
+ created_at=datetime.now(UTC),
446
+ updated_at=datetime.now(UTC)
447
+ )
448
+
449
+ session.add(template)
450
+ print(f"✅ Created minimal template: {template_name}")
451
+ created_count += 1
452
+ else:
453
+ print(f"⏭️ Template already exists: {template_name}")
454
+
455
+ session.commit()
456
+ print(f"📊 Created {created_count} minimal templates")
457
+
458
+ except Exception as e:
459
+ session.rollback()
460
+ print(f"❌ Error creating minimal templates: {str(e)}")
461
+ raise
462
+ finally:
463
+ session.close()
464
+
465
+ def populate_templates():
466
+ """Legacy compatibility function for backward compatibility"""
467
+ print("⚠️ Legacy populate_templates() called - checking for agents_config.json...")
468
+
469
+ # Check if agents_config.json exists anywhere
470
+ possible_paths = [
471
+ os.path.join(backend_dir, 'agents_config.json'),
472
+ os.path.join(project_root, 'agents_config.json'),
473
+ '/app/agents_config.json',
474
+ 'agents_config.json'
475
+ ]
476
+
477
+ config_exists = any(os.path.exists(path) for path in possible_paths)
478
+
479
+ if config_exists:
480
+ print("📖 Found agents_config.json - using sync_agents_from_config()")
481
+ sync_agents_from_config()
482
+ else:
483
+ print("⚠️ agents_config.json not found - using fallback minimal templates")
484
+ print("💡 Creating essential templates for container environment")
485
+ create_minimal_templates()
486
+
487
+ if __name__ == "__main__":
488
+ import argparse
489
+
490
+ parser = argparse.ArgumentParser(description="SQLite Agent Template Management")
491
+ parser.add_argument("action", choices=["sync", "list", "remove-all", "validate"],
492
+ help="Action to perform")
493
+
494
+ args = parser.parse_args()
495
+
496
+ if args.action == "sync":
497
+ print("🚀 Synchronizing agents from agents_config.json to SQLite...")
498
+ sync_agents_from_config()
499
+ elif args.action == "list":
500
+ list_templates()
501
+ elif args.action == "validate":
502
+ validate_config()
503
+ elif args.action == "remove-all":
504
+ confirm = input("⚠️ Are you sure you want to remove ALL templates? (yes/no): ")
505
+ if confirm.lower() == "yes":
506
+ remove_all_templates()
507
+ else:
508
+ print("Operation cancelled.")
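
A sketch of the typical workflow with this script, assuming `agents_config.json` is discoverable at one of the paths probed above; the calls mirror the `validate`, `sync`, and `list` CLI actions:

```python
# Same effect as running the script with "validate", then "sync", then "list".
from scripts.populate_agent_templates import (
    validate_config, sync_agents_from_config, list_templates
)

validate_config()          # check required fields before touching the database
sync_agents_from_config()  # create/update templates and process the optional "remove" list
list_templates()           # print the resulting catalog grouped by category
```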
scripts/tier_maker.py ADDED
@@ -0,0 +1,86 @@
1
+ from src.utils.model_registry import MODEL_COSTS, MODEL_TIERS
2
+
3
+ # Divide models into 4 tiers based on combined input+output cost per 1k tokens
4
+ # tier 1: < $0.0005
5
+ # tier 2: >= $0.0005 and < $0.001
6
+ # tier 3: >= $0.001 and < $0.05
7
+ # tier 4: >= $0.05
8
+
9
+ TIERS_COST = {
10
+ "tier1": 0.0005,
11
+ "tier2": 0.001,
12
+ "tier3": 0.05,
13
+ "tier4": 0.1
14
+ }
15
+
16
+ def get_tier(model_name):
+ # Note: despite the name, this returns the model's cost entry from MODEL_COSTS, not a tier label
17
+ for provider, models in MODEL_COSTS.items():
18
+ for model, cost in models.items():
19
+ if model == model_name:
20
+ return cost
21
+ return None
22
+
23
+ def get_tier_1():
24
+ tier_1 = []
25
+ for provider, models in MODEL_COSTS.items():
26
+ for model, cost in models.items():
27
+ if cost["input"] + cost["output"] < TIERS_COST["tier1"]:
28
+ tier_1.append(model)
29
+ return tier_1
30
+
31
+ def get_tier_2():
32
+ tier_2 = []
33
+ for provider, models in MODEL_COSTS.items():
34
+ for model, cost in models.items():
35
+ if cost["input"] + cost["output"] >= TIERS_COST["tier1"] and cost["input"] + cost["output"] < TIERS_COST["tier2"]:
36
+ tier_2.append(model)
37
+ return tier_2
38
+
39
+ def get_tier_3():
40
+ tier_3 = []
41
+ for provider, models in MODEL_COSTS.items():
42
+ for model, cost in models.items():
43
+ if cost["input"] + cost["output"] >= TIERS_COST["tier2"] and cost["input"] + cost["output"] < TIERS_COST["tier3"]:
44
+ tier_3.append(model)
45
+ return tier_3
46
+
47
+ def get_tier_4():
48
+ tier_4 = []
49
+ for provider, models in MODEL_COSTS.items():
50
+ for model, cost in models.items():
51
+ if cost["input"] + cost["output"] >= TIERS_COST["tier3"]:
52
+ tier_4.append(model)
53
+ return tier_4
54
+
55
+ # Print current tier definitions from registry
56
+ import json
57
+ print("Current tier definitions from registry:")
58
+ print(json.dumps(MODEL_TIERS, indent=4))
59
+ print("\n")
60
+
61
+ # Generate new tier assignments based on cost
62
+ model_tiers = {
63
+ "tier1": {
64
+ "name": "Basic",
65
+ "credits": 1,
66
+ "models": get_tier_1()
67
+ },
68
+ "tier2": {
69
+ "name": "Standard",
70
+ "credits": 3,
71
+ "models": get_tier_2()
72
+ },
73
+ "tier3": {
74
+ "name": "Premium",
75
+ "credits": 5,
76
+ "models": get_tier_3()
77
+ },
78
+ "tier4": {
79
+ "name": "Premium Plus",
80
+ "credits": 10,
81
+ "models": get_tier_4()
82
+ }
83
+ }
84
+
85
+ print("Suggested tier definitions based on cost:")
86
+ print(json.dumps(model_tiers, indent=4))
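
For clarity, here is the bucketing rule from the four `get_tier_*` functions above condensed into one hypothetical helper; the costs in the asserts are made up and exist only to exercise each boundary:

```python
# Condensed restatement of the tier thresholds used above (per 1k tokens, input + output).
TIER_THRESHOLDS = {"tier1": 0.0005, "tier2": 0.001, "tier3": 0.05}

def tier_for(cost_per_1k: float) -> str:
    if cost_per_1k < TIER_THRESHOLDS["tier1"]:
        return "tier1"
    if cost_per_1k < TIER_THRESHOLDS["tier2"]:
        return "tier2"
    if cost_per_1k < TIER_THRESHOLDS["tier3"]:
        return "tier3"
    return "tier4"

assert tier_for(0.0003) == "tier1"
assert tier_for(0.0008) == "tier2"
assert tier_for(0.0200) == "tier3"
assert tier_for(0.0900) == "tier4"
```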
src/__init__.py ADDED
File without changes
src/agents/agents.py ADDED
The diff for this file is too large to render. See raw diff
 
src/agents/deep_agents.py ADDED
@@ -0,0 +1,1085 @@
1
+ import asyncio
2
+ import ast
3
+ import json
4
+ import os
5
+ import dspy
6
+ import numpy as np
7
+ import pandas as pd
8
+ from dotenv import load_dotenv
9
+ from src.utils.logger import Logger
10
+ import logging
11
+ import datetime
12
+ import re
13
+ import textwrap
14
+
15
+ def clean_print_statements(code_block):
16
+ """
17
+ This function cleans up any `print()` statements that might contain unwanted `\n` characters.
18
+ It ensures print statements are properly formatted without unnecessary newlines.
19
+ """
20
+ # This regex targets print statements, even if they have newlines inside
21
+ return re.sub(r'print\((.*?)(\\n.*?)(.*?)\)', r'print(\1\3)', code_block, flags=re.DOTALL)
22
+
23
+
24
+ def clean_unicode_chars(text):
25
+ """
26
+ Clean Unicode characters that might cause encoding issues.
27
+ Replaces common Unicode characters with ASCII equivalents.
28
+ """
29
+ if not isinstance(text, str):
30
+ return text
31
+
32
+ # Replace common Unicode characters with ASCII equivalents
33
+ replacements = {
34
+ '\u2192': ' -> ', # Right arrow
35
+ '\u2190': ' <- ', # Left arrow
36
+ '\u2194': ' <-> ', # Left-right arrow
37
+ '\u2500': '-', # Box drawing horizontal
38
+ '\u2502': '|', # Box drawing vertical
39
+ '\u2026': '...', # Ellipsis
40
+ '\u2013': '-', # En dash
41
+ '\u2014': '-', # Em dash
42
+ '\u201c': '"', # Left double quotation mark
43
+ '\u201d': '"', # Right double quotation mark
44
+ '\u2018': "'", # Left single quotation mark
45
+ '\u2019': "'", # Right single quotation mark
46
+ }
47
+
48
+ for unicode_char, ascii_replacement in replacements.items():
49
+ text = text.replace(unicode_char, ascii_replacement)
50
+
51
+ # Remove any remaining non-ASCII characters
52
+ text = text.encode('ascii', 'ignore').decode('ascii')
53
+
54
+ return text
55
+
56
+
57
+ def remove_main_block(code):
58
+ # Match the __main__ block
59
+ pattern = r'(?m)^if\s+__name__\s*==\s*["\']__main__["\']\s*:\s*\n((?:\s+.*\n?)*)'
60
+
61
+ match = re.search(pattern, code)
62
+ if match:
63
+ main_block = match.group(1)
64
+
65
+ # Dedent the code block inside __main__
66
+ dedented_block = textwrap.dedent(main_block)
67
+
68
+ # Remove \n from any print statements in the block (also handling multiline print cases)
69
+ dedented_block = clean_print_statements(dedented_block)
70
+ # Replace the block in the code
71
+ cleaned_code = re.sub(pattern, dedented_block, code)
72
+
73
+ # Optional: Remove leading newlines if any
74
+ cleaned_code = cleaned_code.strip()
75
+
76
+ return cleaned_code
77
+ return code
78
+
79
+
80
+ # Configure Plotly to prevent auto-display
81
+ def configure_plotly_no_display():
82
+ """Configure Plotly to prevent automatic browser display"""
83
+ try:
84
+ import plotly.io as pio
85
+
86
+ # Set environment variables to prevent browser opening
87
+ os.environ['BROWSER'] = ''
88
+ os.environ['PLOTLY_RENDERER'] = 'json'
89
+
90
+ # Configure Plotly renderers
91
+ pio.renderers.default = 'json'
92
+ pio.templates.default = 'plotly_white'
93
+
94
+ # Disable Kaleido auto-display if available
95
+ try:
96
+ import plotly.graph_objects as go
97
+ # Configure figure defaults to not auto-display
98
+ go.Figure.show = lambda self, *args, **kwargs: None
99
+ except ImportError:
100
+ pass
101
+
102
+ except ImportError:
103
+ print("Warning: Plotly not available for configuration")
104
+
105
+ # Call the configuration function immediately
106
+ configure_plotly_no_display()
107
+
108
+ logger = Logger("deep_agents", see_time=True, console_log=False)
109
+ load_dotenv()
110
+
111
+ class deep_questions(dspy.Signature):
112
+ """
113
+ You are a data analysis assistant.
114
+
115
+ Your role is to take a user's high-level analytical goal and generate a set of deep, targeted follow-up questions. These questions should guide an analyst toward a more thorough understanding of the goal by encouraging exploration, segmentation, and causal reasoning.
116
+
117
+ Instructions:
118
+ - Generate up to 5 insightful, data-relevant questions.
119
+ - Use the dataset structure to tailor your questions (e.g., look at the available columns, data types, and what kind of information they can reveal).
120
+ - The questions should help the user decompose their analytic goal and explore it from multiple angles (e.g., time trends, customer segments, usage behavior, external factors, feedback).
121
+ - Each question should be specific enough to guide actionable analysis or investigation.
122
+ - Use a clear and concise style, but maintain depth.
123
+
124
+ Inputs:
125
+ - goal: The user's analytical goal or main question they want to explore
126
+ - dataset_info: A description of the dataset the user is querying, including:
127
+ - What the dataset represents
128
+ - Key columns and their data types
129
+
130
+ Output:
131
+ - deep_questions: A list of up to 5 specific, data-driven questions that support the analytic goal
132
+
133
+ ---
134
+
135
+ Example:
136
+
137
+ Analytical Goal:
138
+ Understand why churn has been rising
139
+
140
+ Dataset Info:
141
+ Customer Retention Dataset tracking subscription activity over time.
142
+ Columns:
143
+ - customer_id (string)
144
+ - join_date (date)
145
+ - churn_date (date, nullable)
146
+ - is_churned (boolean)
147
+ - plan_type (string: 'basic', 'premium', 'enterprise')
148
+ - region (string)
149
+ - last_login_date (date)
150
+ - avg_weekly_logins (float)
151
+ - support_tickets_last_30d (int)
152
+ - satisfaction_score (float, 0–10 scale)
153
+
154
+ Decomposed Questions:
155
+ 1. How has the churn rate changed month-over-month, and during which periods was the increase most pronounced?
156
+ 2. Are specific plan types or regions showing a higher churn rate relative to others?
157
+ 3. What is the average satisfaction score and support ticket count among churned users compared to retained users?
158
+ 4. Do churned users exhibit different login behavior (e.g., avg_weekly_logins) in the weeks leading up to their churn date?
159
+ 5. What is the tenure distribution (time from join_date to churn_date) among churned customers, and are short-tenure users more likely to churn?
160
+
161
+ """
162
+ goal = dspy.InputField(desc="User analytical goal — what main insight or question they want to answer")
163
+ dataset_info = dspy.InputField(desc="A description of the dataset: what it represents, and the main columns with data types")
164
+ deep_questions = dspy.OutputField(desc="A list of up to five questions that help deeply explore the analytical goal using the dataset")
165
+
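
A sketch of wiring the signature above into a DSPy predictor; it assumes an LM has already been configured via `dspy.settings`, and the goal/dataset strings are hypothetical:

```python
# Assumes dspy.settings.configure(lm=...) has run elsewhere in the app.
question_gen = dspy.ChainOfThought(deep_questions)

result = question_gen(
    goal="Understand why churn has been rising",
    dataset_info="Subscription data: customer_id, join_date, churn_date, plan_type, region, avg_weekly_logins",
)
print(result.deep_questions)  # up to five targeted follow-up questions
```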
166
+ class deep_synthesizer(dspy.Signature):
167
+ """
168
+ You are a data analysis synthesis expert.
169
+
170
+ Your job is to take the outputs from a multi-agent data analytics system - including the original user query, the code summaries from each agent, and the actual printed results from running those code blocks - and synthesize them into a comprehensive, well-structured final report.
171
+
172
+ This report should:
173
+ - Explain what steps were taken and why (based on the query)
174
+ - Summarize the code logic used by each agent, without including raw code
175
+ - Highlight key findings and results from the code outputs
176
+ - Offer clear, actionable insights tied back to the user's original question
177
+ - Be structured, readable, and suitable for decision-makers or analysts
178
+
179
+ Instructions:
180
+ - Begin with a brief restatement of the original query and what it aimed to solve
181
+ - Organize your report step-by-step or by analytical theme (e.g., segmentation, trend analysis, etc.)
182
+ - For each part, summarize what was analyzed, how (based on code summaries), and what the result was (based on printed output)
183
+ - End with a final set of synthesized conclusions and potential next steps or recommendations
184
+
185
+ Inputs:
186
+ - query: The user's original analytical question or goal
187
+ - summaries: A list of natural language descriptions of what each agent's code did
188
+ - print_outputs: A list of printed outputs (results) from running each agent's code
189
+
190
+ Output:
191
+ - synthesized_report: A structured and readable report that ties all parts together, grounded in the code logic and results
192
+
193
+ Example use:
194
+ You are not just summarizing outputs - you're telling a story that answers the user's query using real data.
195
+ """
196
+
197
+ query = dspy.InputField(desc="The original user query or analytical goal")
198
+ summaries = dspy.InputField(desc="List of code summaries - each describing what a particular agent's code did")
199
+ print_outputs = dspy.InputField(desc="List of print outputs - the actual data insights generated by the code")
200
+ synthesized_report = dspy.OutputField(desc="The final, structured report that synthesizes all the information into clear insights")
201
+
202
+ def clean_and_store_code(code, session_df=None):
203
+ """
204
+ Cleans and stores code execution results in a standardized format.
205
+
206
+ Args:
207
+ code (str): Raw code text to execute
208
+ session_df (DataFrame): Optional session DataFrame
209
+
210
+ Returns:
211
+ dict: Execution results containing printed_output, plotly_figs, and error info
212
+ """
213
+ import io
214
+ import sys
215
+ import re
216
+ import plotly.express as px
217
+ import plotly.graph_objects as go
218
+ from plotly.subplots import make_subplots
219
+ import plotly.io as pio
220
+
221
+ # Make session DataFrame available globally if provided
222
+ if session_df is not None:
223
+ globals()['df'] = session_df
224
+
225
+ # Initialize output containers
226
+ output_dict = {
227
+ 'exec_result': None,
228
+ 'printed_output': '',
229
+ 'plotly_figs': [],
230
+ 'error': None
231
+ }
232
+
233
+ try:
234
+ # Clean the code
235
+ cleaned_code = code.strip()
236
+
237
+ cleaned_code = cleaned_code.replace('```python', '').replace('```', '')
238
+
239
+
240
+ # Fix try statement syntax
241
+ cleaned_code = cleaned_code.replace('try\n', 'try:\n')
242
+
243
+ # Remove code patterns that would make the code unrunnable
244
+ invalid_patterns = [
245
+ '```', # Code block markers
246
+ '\\n', # Raw newlines
247
+ '\\t', # Raw tabs
248
+ '\\r', # Raw carriage returns
249
+ ]
250
+
251
+ for pattern in invalid_patterns:
252
+ if pattern in cleaned_code:
253
+ cleaned_code = cleaned_code.replace(pattern, '')
254
+
255
+
256
+ # Remove reading the csv file if it's already in the context
257
+ cleaned_code = re.sub(r"df\s*=\s*pd\.read_csv\([\"\'].*?[\"\']\).*?(\n|$)", '', cleaned_code)
258
+
259
+ # Only match assignments at top level (not indented)
260
+ # 1. Remove 'df = pd.DataFrame()' if it's at the top level
261
+ cleaned_code = re.sub(
262
+ r"^df\s*=\s*pd\.DataFrame\(\s*\)\s*(#.*)?$",
263
+ '',
264
+ cleaned_code,
265
+ flags=re.MULTILINE
266
+ )
267
+ cleaned_code = re.sub(r"plt\.show\(\).*?(\n|$)", '', cleaned_code)
268
+ # Remove all .show() method calls more comprehensively
269
+ cleaned_code = re.sub(r'\b\w*\.show\(\)', '', cleaned_code)
270
+ cleaned_code = re.sub(r'^\s*\w*fig\w*\.show\(\)\s*;?\s*$', '', cleaned_code, flags=re.MULTILINE)
271
+
272
+ # Additional patterns to catch more .show() variations
273
+ cleaned_code = re.sub(r'\.show\(\s*\)', '', cleaned_code) # .show() with optional spaces
274
+ cleaned_code = re.sub(r'\.show\(\s*renderer\s*=\s*[\'"][^\'\"]*[\'"]\s*\)', '', cleaned_code) # .show(renderer='...')
275
+ cleaned_code = re.sub(r'plotly_figs\[\d+\]\.show\(\)', '', cleaned_code) # plotly_figs[0].show()
276
+
277
+ # More comprehensive patterns
278
+ cleaned_code = re.sub(r'\.show\([^)]*\)', '', cleaned_code) # .show(any_args)
279
+ cleaned_code = re.sub(r'fig\w*\.show\(\s*[^)]*\s*\)', '', cleaned_code) # fig*.show(any_args)
280
+ cleaned_code = re.sub(r'\w+_fig\w*\.show\(\s*[^)]*\s*\)', '', cleaned_code) # *_fig*.show(any_args)
281
+
282
+ cleaned_code = remove_main_block(cleaned_code)
283
+
284
+ # Clean Unicode characters that might cause encoding issues
285
+ cleaned_code = clean_unicode_chars(cleaned_code)
286
+
287
+ # Capture printed output
288
+ old_stdout = sys.stdout
289
+ captured_output = io.StringIO()
290
+ sys.stdout = captured_output
291
+
292
+ # Create execution environment with common imports and session data
293
+ exec_globals = {
294
+ '__builtins__': __builtins__,
295
+ 'pd': __import__('pandas'),
296
+ 'np': __import__('numpy'),
297
+ 'px': px,
298
+ 'go': go,
299
+ 'make_subplots': make_subplots,
300
+ 'plotly_figs': [],
301
+ 'print': print,
302
+ }
303
+
304
+ # Add session DataFrame if available
305
+ if session_df is not None:
306
+ exec_globals['df'] = session_df
307
+ elif 'df' in globals():
308
+ exec_globals['df'] = globals()['df']
309
+
310
+ # Add other common libraries that might be needed
311
+ try:
312
+ exec_globals['sm'] = __import__('statsmodels.api', fromlist=[''])
313
+ exec_globals['train_test_split'] = __import__('sklearn.model_selection', fromlist=['train_test_split']).train_test_split
314
+ exec_globals['LinearRegression'] = __import__('sklearn.linear_model', fromlist=['LinearRegression']).LinearRegression
315
+ exec_globals['mean_absolute_error'] = __import__('sklearn.metrics', fromlist=['mean_absolute_error']).mean_absolute_error
316
+ exec_globals['r2_score'] = __import__('sklearn.metrics', fromlist=['r2_score']).r2_score
317
+ exec_globals['LabelEncoder'] = __import__('sklearn.preprocessing', fromlist=['LabelEncoder']).LabelEncoder
318
+ exec_globals['warnings'] = __import__('warnings')
319
+ except ImportError as e:
320
+ print(f"Warning: Could not import some optional libraries: {e}")
321
+
322
+ # Execute the code
323
+ exec(cleaned_code, exec_globals)
324
+
325
+ # Restore stdout
326
+ sys.stdout = old_stdout
327
+
328
+ # Get the captured output
329
+ printed_output = captured_output.getvalue()
330
+ output_dict['printed_output'] = printed_output
331
+ # Extract plotly figures from the execution environment
332
+ if 'plotly_figs' in exec_globals:
333
+ plotly_figs = exec_globals['plotly_figs']
334
+ if isinstance(plotly_figs, list):
335
+ output_dict['plotly_figs'] = plotly_figs
336
+ else:
337
+ output_dict['plotly_figs'] = [plotly_figs] if plotly_figs else []
338
+
339
+ # Also check for any figure variables that might have been created
340
+ for var_name, var_value in exec_globals.items():
341
+ if hasattr(var_value, 'to_json') and hasattr(var_value, 'show'):
342
+ # This looks like a Plotly figure
343
+ if var_value not in output_dict['plotly_figs']:
344
+ output_dict['plotly_figs'].append(var_value)
345
+
346
+ except Exception as e:
347
+ # Restore stdout in case of error (it may not have been swapped yet)
348
+ if 'old_stdout' in locals():
+ sys.stdout = old_stdout
349
+ error_msg = str(e)
350
+ output_dict['error'] = error_msg
351
+ output_dict['printed_output'] = f"Error executing code: {error_msg}"
352
+ print(f"Code execution error: {error_msg}")
353
+
354
+ return output_dict
355
+
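+ # Usage sketch (hypothetical snippet; 'df' stands for any pandas DataFrame):
+ # result = clean_and_store_code("print(df.shape)", session_df=df)
+ # result['printed_output']  # captured stdout from the executed snippet
+ # result['plotly_figs']     # list of Plotly figures the snippet created
+ # result['error']           # None on success, otherwise the error message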
356
+ def score_code(args, code):
357
+ """
358
+ Scores a candidate code block (used as the reward_fn for dspy.Refine).
359
+ Cleans and executes the code, safely handling any execution errors.
360
+ Rewards code that runs successfully and produces Plotly figures.
361
+
362
+ Args:
363
+ args: Arguments (unused but required for dspy.Refine)
364
+ code: Code object with combined_code attribute
365
+
366
+ Returns:
367
+ int: Score (0=error, 1=success, 2=success with plots)
368
+ """
369
+
370
+ code_text = code.combined_code
371
+ try:
372
+ # Fix try statement syntax
373
+ code_text = code_text.replace('try\n', 'try:\n')
374
+ code_text = code_text.replace('```python', '').replace('```', '')
375
+
376
+
377
+ # Remove code patterns that would make the code unrunnable
378
+ invalid_patterns = [
379
+ '```', '\\n', '\\t', '\\r'
380
+ ]
381
+
382
+ for pattern in invalid_patterns:
383
+ if pattern in code_text:
384
+ code_text = code_text.replace(pattern, '')
385
+
386
+ cleaned_code = re.sub(r"plt\.show\(\).*?(\n|$)", '', code_text)
387
+ # Remove all .show() method calls more comprehensively
388
+ cleaned_code = re.sub(r'\b\w*\.show\(\)', '', cleaned_code)
389
+ cleaned_code = re.sub(r'^\s*\w*fig\w*\.show\(\)\s*;?\s*$', '', cleaned_code, flags=re.MULTILINE)
390
+
391
+ # Additional patterns to catch more .show() variations
392
+ cleaned_code = re.sub(r'\.show\(\s*\)', '', cleaned_code) # .show() with optional spaces
393
+ cleaned_code = re.sub(r'\.show\(\s*renderer\s*=\s*[\'"][^\'\"]*[\'"]\s*\)', '', cleaned_code) # .show(renderer='...')
394
+ cleaned_code = re.sub(r'plotly_figs\[\d+\]\.show\(\)', '', cleaned_code) # plotly_figs[0].show()
395
+
396
+ # More comprehensive patterns
397
+ cleaned_code = re.sub(r'\.show\([^)]*\)', '', cleaned_code) # .show(any_args)
398
+ cleaned_code = re.sub(r'fig\w*\.show\(\s*[^)]*\s*\)', '', cleaned_code) # fig*.show(any_args)
399
+ cleaned_code = re.sub(r'\w+_fig\w*\.show\(\s*[^)]*\s*\)', '', cleaned_code) # *_fig*.show(any_args)
400
+
401
+ cleaned_code = remove_main_block(cleaned_code)
402
+ # Capture stdout using StringIO
403
+ from io import StringIO
404
+ import sys
405
+ import plotly.graph_objects as go
406
+ stdout_capture = StringIO()
407
+ original_stdout = sys.stdout
408
+ sys.stdout = stdout_capture
409
+
410
+ # Execute code in a new namespace to avoid polluting globals
411
+ local_vars = {}
412
+ exec(cleaned_code, globals(), local_vars)
413
+
414
+ # Capture any plotly figures from local namespace
415
+ plotly_figs = []
416
+ for var_name, var in local_vars.items():
417
+ if isinstance(var, go.Figure):
418
+ if not var.layout.title:
419
+ var.update_layout(title=f"Figure {len(plotly_figs) + 1}")
420
+ if not var.layout.template:
421
+ var.update_layout(template="plotly_white")
422
+ plotly_figs.append(var)
423
+ elif isinstance(var, (list, tuple)):
424
+ for item in var:
425
+ if isinstance(item, go.Figure):
426
+ if not item.layout.title:
427
+ item.update_layout(title=f"Figure {len(plotly_figs) + 1}")
428
+ if not item.layout.template:
429
+ item.update_layout(template="plotly_white")
430
+ plotly_figs.append(item)
431
+
432
+ # Restore stdout and get captured output
433
+ sys.stdout = original_stdout
434
+ captured_output = stdout_capture.getvalue()
435
+ stdout_capture.close()
436
+
437
+ # Calculate score based on execution and plot generation
438
+ score = 2 if plotly_figs else 1
439
+
440
+ return score
441
+
442
+ except Exception as e:
443
+ # Restore stdout in case of error
444
+ if 'stdout_capture' in locals():
445
+ sys.stdout = original_stdout
446
+ stdout_capture.close()
447
+
448
+ return 0
449
+
450
+
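+ # Reward wiring sketch: score_code serves as the reward_fn for dspy.Refine further
+ # below, where a score of 2 (figures produced) clears the 1.0 threshold:
+ # refiner = dspy.Refine(module=dspy.Predict(deep_code_synthesizer),
+ #                       N=5, reward_fn=score_code, threshold=1.0, fail_count=10)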
451
+ class deep_planner(dspy.Signature):
452
+ """
453
+ You are an advanced multi-question planning agent. Your task is to generate the most optimized and minimal plan
454
+ to answer up to 5 analytical questions using available agents.
455
+
456
+ Your responsibilities:
457
+ 1. Feasibility: Verify that the goal is achievable using the provided datasets and agent descriptions.
458
+ 2. Optimization:
459
+ - Batch up to 2 similar questions per agent call.
460
+ - Reuse outputs across questions wherever possible.
461
+ - Avoid unnecessary agents or redundant processing.
462
+ - Minimize total agent calls while preserving correctness.
463
+ 3. Clarity:
464
+ - Define clear variable usage (create/use).
465
+ - Specify concise step-by-step instructions per agent.
466
+ - Use dependency arrows (->) to indicate required agent outputs used by others.
467
+
468
+ Inputs:
469
+ - deep_questions: A list of up to 5 deep analytical questions (e.g., ["q1", "q2", ..., "q5"])
470
+ - dataset: The available dataset(s) in memory or context
471
+ - agents_desc: Dictionary containing each agent's name and its capabilities or descriptions
472
+
473
+ Outputs:
474
+ - plan_instructions: Detailed per-agent variable flow and functionality in the format:
475
+ {
476
+ "agent_x": {
477
+ "create": ["cleaned_data: DataFrame - cleaned version of the input dataset"],
478
+ "use": ["df: DataFrame - raw input dataset"],
479
+ "instruction": "Clean the dataset by handling null values and standardizing formats."
480
+ },
481
+ "agent_y": {
482
+ "create": ["analysis_results: dict - results of correlation analysis"],
483
+ "use": ["cleaned_data: DataFrame - output from @agent_x"],
484
+ "instruction": "Perform correlation analysis to identify strong predictors."
485
+ }
486
+ }
487
+
488
+ Output Goal:
489
+ Generate a small, clean, optimized execution plan using minimal agent calls, reusable outputs, and well-structured dependencies.
490
+ USE THE EXACT NAMES OF THE AGENTS IN THE INSTRUCTIONS
491
+ """
492
+
493
+ deep_questions = dspy.InputField(desc="List of up to 5 deep analytical questions to answer")
494
+ dataset = dspy.InputField(desc="Available datasets, use 'df' as the working dataset")
495
+ agents_desc = dspy.InputField(desc="Descriptions of available agents and their functions")
496
+ plan_instructions = dspy.OutputField(desc="Variable-level instructions for each agent used in the plan")
497
+
498
+ class deep_plan_fixer(dspy.Signature):
499
+ """
500
+ You are a plan instruction fixer agent. Your task is to take potentially malformed plan instructions
501
+ and convert them into a properly structured dictionary format that can be safely evaluated.
502
+
503
+ Your responsibilities:
504
+ 1. Parse and validate the input plan instructions
505
+ 2. Convert the instructions into a proper dictionary format
506
+ 3. Ensure all agent instructions follow the required structure:
507
+ {
508
+ "@agent_name": {
509
+ "create": ["variable: type - description"],
510
+ "use": ["variable: type - description"],
511
+ "instruction": "clear instruction text"
512
+ }
513
+ }
514
+ 4. Handle any malformed or missing components
515
+ 5. Return a properly formatted dictionary string that can be safely evaluated
516
+
517
+ Inputs:
518
+ - plan_instructions: The potentially malformed plan instructions to fix
519
+
520
+ Outputs:
521
+ - fixed_plan: A properly formatted dictionary string that can be safely evaluated
522
+ """
523
+
524
+ plan_instructions = dspy.InputField(desc="The potentially malformed plan instructions to fix")
525
+ fixed_plan = dspy.OutputField(desc="Properly formatted dictionary string that can be safely evaluated")
526
+
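+ # Parsing sketch: the fixer's output is a dict literal in string form, so callers
+ # recover it with ast.literal_eval, falling back to json.loads (as the module below does):
+ # try:
+ #     plan = ast.literal_eval(fixed_plan)
+ # except (ValueError, SyntaxError):
+ #     plan = json.loads(fixed_plan)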
527
+ class final_conclusion(dspy.Signature):
528
+ """
529
+ You are a high-level analytics reasoning engine.
530
+
531
+ Your task is to take multiple synthesized analytical results (each answering part of the original query) and produce a cohesive final conclusion that directly addresses the user's original question.
532
+
533
+ This is not just a summary — it's a judgment. Use evidence from the synthesized findings to:
534
+ - Answer the original question with clarity
535
+ - Highlight the most important insights
536
+ - Offer any causal reasoning or patterns discovered
537
+ - Suggest next steps or strategic recommendations where appropriate
538
+
539
+ Instructions:
540
+ - Focus on relevance to the original query
541
+ - Do not just repeat what the synthesized sections say — instead, infer, interpret, and connect dots
542
+ - Prioritize clarity and insight over detail
543
+ - End with a brief "Next Steps" section if applicable
544
+
545
+ Inputs:
546
+ - query: The original user question or goal
547
+ - synthesized_sections: A list of synthesized result sections from the deep_synthesizer step (each covering part of the analysis)
548
+
549
+ Output:
550
+ - final_summary: A cohesive final conclusion that addresses the query, draws insight, and offers high-level guidance
551
+
552
+ ---
553
+
554
+ Example Output Structure:
555
+
556
+ **Conclusion**
557
+ Summarize the overall answer to the user's question, using the most compelling evidence across the synthesized sections.
558
+
559
+ **Key Takeaways**
560
+ - Bullet 1
561
+ - Bullet 2
562
+ - Bullet 3
563
+
564
+ **Recommended Next Steps**
565
+ (Optional based on context)
566
+
567
+ """
568
+
569
+ query = dspy.InputField(desc="The user's original query or analytical goal")
570
+ synthesized_sections = dspy.InputField(desc="List of synthesized outputs — each one corresponding to a sub-part of the analysis")
571
+ final_conclusion = dspy.OutputField(desc="A cohesive, conclusive answer that addresses the query and integrates key insights")
572
+
573
+
574
+
575
+
576
+ class deep_code_synthesizer(dspy.Signature):
577
+ """
578
+ You are a code synthesis and optimization engine that combines and fixes code from multiple analytical agents.
579
+
580
+ Your task is to take code outputs from preprocessing, statistical analysis, machine learning, and visualization agents, then:
581
+ - Combine them into a single, coherent analysis pipeline
582
+ - Fix any errors or inconsistencies between agent outputs
583
+ - Ensure proper data flow between steps
584
+ - Optimize the combined code for efficiency
585
+ - Add necessary imports and dependencies
586
+ - Handle any data type mismatches or conversion issues
587
+ - Validate and normalize data types between agent outputs (e.g., ensure DataFrame operations maintain DataFrame type)
588
+ - Convert between common data structures (lists, dicts, DataFrames) as needed
589
+ - Add type hints and validation checks
590
+ - Ensure consistent variable naming across agents
591
+ - Ensure all visualizations use Plotly exclusively
592
+ - Create comprehensive visualizations that show all important variables and relationships
593
+ - Store all Plotly figures in a list for later use in the report
594
+
595
+ Instructions:
596
+ - Review each agent's code for correctness and completeness
597
+ - Ensure variables are properly passed between steps with consistent types
598
+ - Fix any syntax errors or logical issues
599
+ - Add error handling and type validation where needed
600
+ - Optimize code structure and performance
601
+ - Maintain consistent coding style
602
+ - Add clear comments explaining the analysis flow
603
+ - Add data type conversion functions where needed
604
+ - Validate input/output types between agent steps
605
+ - Handle edge cases where agents might return different data structures
606
+ - Convert any non-Plotly visualizations to Plotly format
607
+ - Ensure all important variables are visualized appropriately
608
+ - Store all Plotly figures in a list called plotly_figs
609
+ - Include appropriate titles, labels, and legends for all visualizations
610
+ - Use consistent styling across all Plotly visualizations
611
+ - DO NOT COMMENT OUT ANYTHING, AS THE CODE SHOULD RUN & SHOW OUTPUTS
612
+ - THE DATASET IS ALREADY LOADED, DON'T CREATE FAKE DATA. 'df' is always loaded
613
+
614
+ Inputs:
615
+ - deep_questions: The five deep questions this system is answering
616
+ - dataset_info: Information about the dataset structure and types
617
+ - planner_instructions: The plan according to the planner; ensure the final code makes everything coherent
618
+ - code: List of all agent code
619
+
620
+
621
+ Output:
622
+ - combined_code: A single, optimized Python script that combines all analysis steps with proper type handling and Plotly visualizations
623
+
624
+ """
625
+ deep_questions = dspy.InputField(desc="The five deep questions this system is answering")
626
+ dataset_info = dspy.InputField(desc="Information about the dataset")
627
+ planner_instructions = dspy.InputField(desc="The planner instructions for each agent")
628
+ code = dspy.InputField(desc="The code generated by all agents")
629
+ combined_code = dspy.OutputField(desc="A single, optimized Python script that combines all analysis steps")
630
+
631
+ class deep_code_fix(dspy.Signature):
632
+ """
633
+ You are a code debugging and fixing agent that analyzes and repairs code errors.
634
+
635
+ Your task is to:
636
+ - Analyze error messages and identify root causes
637
+ - Fix syntax errors, logical issues, and runtime problems
638
+ - Ensure proper data type handling and conversions
639
+ - Add appropriate error handling and validation
640
+ - Maintain code style and documentation
641
+ - Preserve the original analysis intent
642
+
643
+ Instructions:
644
+ - Carefully analyze the error message and stack trace
645
+ - Identify the specific line(s) causing the error
646
+ - Determine if the issue is syntax, logic, or runtime related
647
+ - Fix the code while maintaining its original purpose
648
+ - Add appropriate error handling if needed
649
+ - Ensure the fix doesn't introduce new issues
650
+ - Document the changes made
651
+
652
+ Inputs:
653
+ - code: The code that generated the error
654
+ - error: The error message and stack trace
655
+
656
+ Output:
657
+ - fixed_code: The repaired code with error handling
658
+ - fix_explanation: Explanation of what was fixed and why
659
+ """
660
+ code = dspy.InputField(desc="The code that generated the error")
661
+ error = dspy.InputField(desc="The error message and stack trace")
662
+ fixed_code = dspy.OutputField(desc="The repaired code with error handling")
663
+ fix_explanation = dspy.OutputField(desc="Explanation of what was fixed and why")
664
+
665
+
666
+ chart_instructions = """
667
+ Chart Styling Guidelines:
668
+
669
+ 1. General Styling:
670
+ - Use a clean, professional color palette (e.g., Tableau, ColorBrewer)
671
+ - Include clear titles and axis labels
672
+ - Add appropriate legends
673
+ - Use consistent font sizes and styles
674
+ - Include grid lines where helpful
675
+ - Add hover information for interactive plots
676
+
677
+ 2. Specific Chart Types:
678
+ - Bar Charts:
679
+ * Use horizontal bars for many categories
680
+ * Sort bars by value when appropriate
681
+ * Use consistent bar widths
682
+ * Add value labels on bars
683
+
684
+ - Line Charts:
685
+ * Use distinct line styles/colors
686
+ * Add markers at data points
687
+ * Include trend lines when relevant
688
+ * Show confidence intervals if applicable
689
+
690
+ - Scatter Plots:
691
+ * Use appropriate marker sizes
692
+ * Add regression lines when needed
693
+ * Use color to show additional dimensions
694
+ * Include density contours for large datasets
695
+
696
+ - Heatmaps:
697
+ * Use diverging color schemes for correlation
698
+ * Include value annotations
699
+ * Sort rows/columns by similarity
700
+ * Add clear color scale legend
701
+
702
+ 3. Data Visualization Best Practices:
703
+ - Start axes at zero when appropriate
704
+ - Use log scales for wide-ranging data
705
+ - Include reference lines/benchmarks
706
+ - Add annotations for important points
707
+ - Show uncertainty where relevant
708
+ - Use consistent color encoding
709
+ - Include data source and timestamp
710
+ - Add clear figure captions
711
+
712
+ 4. Interactive Features:
713
+ - Enable zooming and panning
714
+ - Add tooltips with detailed information
715
+ - Include download options
716
+ - Allow toggling of data series
717
+ - Enable cross-filtering between charts
718
+
719
+ 5. Accessibility:
720
+ - Use colorblind-friendly palettes
721
+ - Include alt text for all visualizations
722
+ - Ensure sufficient contrast
723
+ - Make interactive elements keyboard accessible
724
+ - Provide text alternatives for key insights
725
+ """
726
+
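+ # Illustrative sketch (hypothetical 'df_summary' DataFrame, not part of the pipeline):
+ # a bar chart applying a few of the guidelines above - clean template, clear title,
+ # value labels on bars, and collection into plotly_figs.
+ # import plotly.express as px
+ # fig = px.bar(df_summary, x="category", y="value", text="value",
+ #              template="plotly_white", title="Revenue by Category")
+ # fig.update_traces(textposition="outside")
+ # plotly_figs = [fig]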
727
+
728
+
729
+
730
+ class deep_analysis_module(dspy.Module):
731
+ def __init__(self, agents, agents_desc):
732
+ # logger.log_message(f"Initializing deep_analysis_module with {agents} agents: {list(agents.keys())}", level=logging.INFO)
733
+
734
+ self.agents = agents
735
+ # Make all dspy operations async using asyncify
736
+ self.deep_questions = dspy.asyncify(dspy.Predict(deep_questions))
737
+ self.deep_planner = dspy.asyncify(dspy.ChainOfThought(deep_planner))
738
+ self.deep_synthesizer = dspy.asyncify(dspy.ChainOfThought(deep_synthesizer))
739
+ # Keep both asyncified and non-asyncified versions for code synthesizer
740
+ self.deep_code_synthesizer_sync = dspy.Predict(deep_code_synthesizer) # For dspy.Refine
741
+ self.deep_code_synthesizer = dspy.asyncify(dspy.Predict(deep_code_synthesizer)) # For async use
742
+ self.deep_plan_fixer = dspy.asyncify(dspy.ChainOfThought(deep_plan_fixer))
743
+ self.deep_code_fixer = dspy.asyncify(dspy.ChainOfThought(deep_code_fix))
744
+ self.styling_instructions = chart_instructions
745
+ self.agents_desc = agents_desc
746
+ self.final_conclusion = dspy.asyncify(dspy.ChainOfThought(final_conclusion))
747
+
748
+ # logger.log_message(f"Deep analysis module initialized successfully with agents: {list(self.agents.keys())}", level=logging.INFO)
749
+
750
+ async def execute_deep_analysis_streaming(self, goal, dataset_info, session_df=None):
751
+ """
752
+ Execute deep analysis with streaming progress updates.
753
+ This is an async generator that yields progress updates incrementally.
754
+ """
755
+ # Make the session DataFrame available globally for code execution
756
+ if session_df is not None:
757
+ globals()['df'] = session_df
758
+
759
+ try:
760
+ # Step 1: Generate deep questions (20% progress)
761
+ yield {
762
+ "step": "questions",
763
+ "status": "processing",
764
+ "message": "Generating analytical questions...",
765
+ "progress": 10
766
+ }
767
+
768
+ questions = await self.deep_questions(goal=goal, dataset_info=dataset_info)
769
+ logger.log_message("Questions generated")
770
+
771
+ yield {
772
+ "step": "questions",
773
+ "status": "completed",
774
+ "content": questions.deep_questions,
775
+ "progress": 20
776
+ }
777
+
778
+ # Step 2: Create analysis plan (40% progress)
779
+ yield {
780
+ "step": "planning",
781
+ "status": "processing",
782
+ "message": "Creating analysis plan...",
783
+ "progress": 25
784
+ }
785
+
786
+ question_list = [q.strip() for q in questions.deep_questions.split('\n') if q.strip()]
787
+ deep_plan = await self.deep_planner(
788
+ deep_questions=questions.deep_questions,
789
+ dataset=dataset_info,
790
+ agents_desc=str(self.agents_desc)
791
+ )
792
+ logger.log_message("Plan created")
793
+
794
+ # Parse plan instructions
795
+ try:
796
+ plan_instructions = ast.literal_eval(deep_plan.plan_instructions)
797
+ if not isinstance(plan_instructions, dict):
798
+ plan_instructions = json.loads(deep_plan.plan_instructions)
799
+ keys = [key for key in plan_instructions.keys()]
800
+
801
+ if not all(key in self.agents for key in keys):
802
+ raise ValueError(f"Invalid agent key(s) in plan instructions. Available agents: {list(self.agents.keys())}")
803
+ logger.log_message(f"Plan instructions: {plan_instructions}", logging.INFO)
804
+ logger.log_message(f"Keys: {keys}", logging.INFO)
805
+ except (ValueError, SyntaxError, json.JSONDecodeError) as e:
806
+ try:
807
+ deep_plan = await self.deep_plan_fixer(plan_instructions=deep_plan.plan_instructions)
808
+ plan_instructions = ast.literal_eval(deep_plan.fixed_plan)
809
+ if not isinstance(plan_instructions, dict):
810
+ plan_instructions = json.loads(deep_plan.fixed_plan)
811
+ keys = [key for key in plan_instructions.keys()]
812
+ except (ValueError, SyntaxError, json.JSONDecodeError) as e:
813
+ logger.log_message(f"Error parsing plan instructions: {e}", logging.ERROR)
814
+ raise e
815
+
816
+ logger.log_message("Instructions parsed")
817
+
818
+ yield {
819
+ "step": "planning",
820
+ "status": "completed",
821
+ "content": deep_plan.plan_instructions,
822
+ "progress": 40
823
+ }
824
+
825
+ # Step 3: Execute agent tasks (60% progress)
826
+ yield {
827
+ "step": "agent_execution",
828
+ "status": "processing",
829
+ "message": "Executing analysis agents...",
830
+ "progress": 45
831
+ }
832
+
833
+ queries = [
834
+ dspy.Example(
835
+ goal=questions.deep_questions,
836
+ dataset=dataset_info,
837
+ plan_instructions=str(plan_instructions[key]),
838
+ **({"styling_index": "Sample styling guidelines"} if "data_viz" in key or "viz" in key.lower() or "visual" in key.lower() or "plot" in key.lower() or "chart" in key.lower() else {})
839
+ ).with_inputs(
840
+ "goal",
841
+ "dataset",
842
+ "plan_instructions",
843
+ *(["styling_index"] if "data_viz" in key or "viz" in key.lower() or "visual" in key.lower() or "plot" in key.lower() or "chart" in key.lower() else [])
844
+ )
845
+ for key in keys
846
+ ]
847
+ tasks = [self.agents[key](**q) for q, key in zip(queries, keys)]
848
+
849
+ # Await all tasks to complete
850
+ summaries = []
851
+ codes = []
852
+ logger.log_message("Tasks started")
853
+
854
+ completed_tasks = 0
855
+ for task in asyncio.as_completed(tasks):
856
+ result = await task
857
+ summaries.append(result.summary)
858
+ codes.append(result.code)
859
+ completed_tasks += 1
860
+
861
+ # Update progress for each completed agent
862
+ agent_progress = 45 + (completed_tasks / len(tasks)) * 15 # 45% to 60%
863
+ yield {
864
+ "step": "agent_execution",
865
+ "status": "processing",
866
+ "message": f"Completed {completed_tasks}/{len(tasks)} analysis agents...",
867
+ "progress": int(agent_progress)
868
+ }
869
+ logger.log_message(f"Done with agent {completed_tasks}/{len(tasks)}")
870
+
871
+ yield {
872
+ "step": "agent_execution",
873
+ "status": "completed",
874
+ "message": "All analysis agents completed",
875
+ "progress": 60
876
+ }
877
+
878
+ # Step 4: Code synthesis (80% progress)
879
+ yield {
880
+ "step": "code_synthesis",
881
+ "status": "processing",
882
+ "message": "Analyzing code...",
883
+ "progress": 65
884
+ }
885
+
886
+ # Safely extract code from agent outputs
887
+ code = []
888
+ for c in codes:
889
+ try:
890
+ cleaned_code = remove_main_block(c)
891
+ if "```python" in cleaned_code:
892
+ parts = cleaned_code.split("```python")
893
+ if len(parts) > 1:
894
+ extracted = parts[1].split("```")[0] if "```" in parts[1] else parts[1]
895
+ code.append(extracted.replace('try\n','try:\n'))
896
+ else:
897
+ code.append(cleaned_code.replace('try\n','try:\n'))
898
+ else:
899
+ code.append(cleaned_code.replace('try\n','try:\n'))
900
+ except Exception as e:
901
+ logger.log_message(f"Warning: Error processing code block: {e}", logging.WARNING)
902
+ code.append(c.replace('try\n','try:\n'))
903
+
904
+ # Create deep coder without asyncify to avoid source inspection issues
905
+ deep_coder = dspy.Refine(module=self.deep_code_synthesizer_sync, N=5, reward_fn=score_code, threshold=1.0, fail_count=10)
906
+
907
+ # Check if we have valid API key
908
+ anthropic_key = os.environ.get('ANTHROPIC_API_KEY')
909
+ if not anthropic_key:
910
+ raise ValueError("ANTHROPIC_API_KEY environment variable is not set")
911
+
912
+ try:
913
+ # Create the LM instance that will be used
914
+ thread_lm = dspy.LM("anthropic/claude-sonnet-4-20250514", api_key=anthropic_key, max_tokens=17000)
915
+
916
+ logger.log_message("Starting code generation...")
917
+ start_time = datetime.datetime.now()
918
+ logger.log_message(f"Code generation started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
919
+
920
+ # Define the blocking function to run in thread
921
+ def run_deep_coder():
922
+ with dspy.context(lm=thread_lm):
923
+ return deep_coder(
924
+ deep_questions=str(questions.deep_questions),
925
+ dataset_info=dataset_info,
926
+ planner_instructions=str(plan_instructions),
927
+ code=str(code)
928
+ )
929
+
930
+ # Use asyncio.to_thread for better async integration
931
+ deep_code = await asyncio.to_thread(run_deep_coder)
932
+
933
+ logger.log_message(f"Code generation completed at: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
934
+ except Exception as e:
935
+ logger.log_message(f"Error during code generation: {str(e)}", logging.ERROR)
936
+ raise e
937
+
938
+ code = deep_code.combined_code
939
+ code = code.replace('```python', '').replace('```', '')
940
+
941
+ # Clean Unicode characters that might cause encoding issues
942
+ code = clean_unicode_chars(code)
943
+
944
+ yield {
945
+ "step": "code_synthesis",
946
+ "status": "completed",
947
+ "message": "Code synthesis completed",
948
+ "progress": 80
949
+ }
950
+
951
+ # Step 5: Execute code (85% progress)
952
+ yield {
953
+ "step": "code_execution",
954
+ "status": "processing",
955
+ "message": "Executing code...",
956
+ "progress": 82
957
+ }
958
+
959
+ # Execute the code with error handling and session DataFrame
960
+ try:
961
+ # Run code execution in thread pool to avoid blocking
962
+ import concurrent.futures
963
+ with concurrent.futures.ThreadPoolExecutor() as executor:
964
+ future = executor.submit(clean_and_store_code, code, session_df)
965
+ output = future.result(timeout=300) # 5 minute timeout
966
+
967
+ logger.log_message(f"Deep Code executed")
968
+
969
+ if output.get('error'):
970
+ logger.log_message(f"Warning: Code execution had errors: {output['error']}", logging.ERROR)
971
+
972
+ print_outputs = [output['printed_output']]
973
+ plotly_figs = [output['plotly_figs']]
974
+
975
+ except Exception as e:
976
+ logger.log_message(f"Error during code execution: {str(e)}", logging.ERROR)
977
+ output = {
978
+ 'exec_result': None,
979
+ 'printed_output': f"Code execution failed: {str(e)}",
980
+ 'plotly_figs': [],
981
+ 'error': str(e)
982
+ }
983
+ print_outputs = [output['printed_output']]
984
+ plotly_figs = [output['plotly_figs']]
985
+
986
+ yield {
987
+ "step": "code_execution",
988
+ "status": "completed",
989
+ "message": "Code execution completed",
990
+ "progress": 85
991
+ }
992
+
993
+ # Step 6: Synthesis (90% progress)
994
+ yield {
995
+ "step": "synthesis",
996
+ "status": "processing",
997
+ "message": "Synthesizing results...",
998
+ "progress": 87
999
+ }
1000
+
1001
+ synthesis = []
1002
+ try:
1003
+ synthesis_result = await self.deep_synthesizer(
1004
+ query=goal,
1005
+ summaries=str(summaries),
1006
+ print_outputs=str(output['printed_output'])
1007
+ )
1008
+ synthesis.append(synthesis_result)
1009
+ except Exception as e:
1010
+ logger.log_message(f"Error during synthesis: {str(e)}", logging.ERROR)
1011
+ synthesis.append(type('obj', (object,), {'synthesized_report': f"Synthesis failed: {str(e)}"})())
1012
+
1013
+ logger.log_message("Synthesis done")
1014
+
1015
+ yield {
1016
+ "step": "synthesis",
1017
+ "status": "completed",
1018
+ "message": "Synthesis completed",
1019
+ "progress": 90
1020
+ }
1021
+
1022
+ # Step 7: Final conclusion (100% progress)
1023
+ yield {
1024
+ "step": "conclusion",
1025
+ "status": "processing",
1026
+ "message": "Generating final conclusion...",
1027
+ "progress": 95
1028
+ }
1029
+
1030
+ try:
1031
+ final_conclusion = await self.final_conclusion(
1032
+ query=goal,
1033
+ synthesized_sections=str([s.synthesized_report for s in synthesis])
1034
+ )
1035
+ except Exception as e:
1036
+ logger.log_message(f"Error during final conclusion: {str(e)}", logging.ERROR)
1037
+ final_conclusion = type('obj', (object,), {'final_conclusion': f"Final conclusion failed: {str(e)}"})()
1038
+
1039
+ logger.log_message("Conclusion Made")
1040
+
1041
+ return_dict = {
1042
+ 'goal': goal,
1043
+ 'deep_questions': questions.deep_questions,
1044
+ 'deep_plan': deep_plan.plan_instructions,
1045
+ 'summaries': summaries,
1046
+ 'code': code,
1047
+ 'plotly_figs': plotly_figs,
1048
+ 'synthesis': [s.synthesized_report for s in synthesis],
1049
+ 'final_conclusion': final_conclusion.final_conclusion
1050
+ }
1051
+
1052
+ yield {
1053
+ "step": "conclusion",
1054
+ "status": "completed",
1055
+ "message": "Analysis completed successfully",
1056
+ "progress": 100,
1057
+ "final_result": return_dict
1058
+ }
1059
+
1060
+ logger.log_message("Return dict created")
1061
+
1062
+ except Exception as e:
1063
+ logger.log_message(f"Error in deep analysis: {str(e)}", logging.ERROR)
1064
+ yield {
1065
+ "step": "error",
1066
+ "status": "failed",
1067
+ "message": f"Deep analysis failed: {str(e)}",
1068
+ "progress": 0,
1069
+ "error": str(e)
1070
+ }
1071
+
1072
+
1073
+ async def execute_deep_analysis(self, goal, dataset_info, session_df=None):
1074
+ """
1075
+ Legacy method for backward compatibility.
1076
+ Executes the streaming analysis and returns the final result.
1077
+ """
1078
+ final_result = None
1079
+ async for update in self.execute_deep_analysis_streaming(goal, dataset_info, session_df):
1080
+ if update.get("step") == "conclusion" and update.get("status") == "completed":
1081
+ final_result = update.get("final_result")
1082
+ elif update.get("step") == "error":
1083
+ raise Exception(update.get("message", "Unknown error"))
1084
+
1085
+ return final_result
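+ # Consumption sketch (hypothetical caller): every yielded update carries
+ # step/status/progress keys, so a route can forward progress to the client.
+ # module = deep_analysis_module(agents, agents_desc)
+ # async for update in module.execute_deep_analysis_streaming(goal, dataset_info, df):
+ #     print(update["step"], update["status"], update.get("progress"))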
src/agents/marketing_analytics_agents.py ADDED
@@ -0,0 +1,75 @@
1
+ import dspy
2
+
3
+ # Contains the DSPy agents for marketing analytics
4
+
5
+ class bidding_strategy_agent(dspy.Signature):
6
+ # Analytics Agent for optimizing bidding strategies
7
+ """You are a bidding strategy analytics agent specialized in marketing analytics.
8
+ Your task is to take marketing campaign data and a user-defined goal, and output Python code that performs
9
+ bidding strategy analysis and optimization.
10
+ You should use libraries like numpy, pandas, and scikit-learn for the analysis.
11
+
12
+ Bidding strategy tasks include:
13
+ - Analyzing historical bid performance
14
+ - Optimizing bid values across channels
15
+ - Forecasting campaign performance
16
+ - A/B testing bid strategies
17
+ - ROI and conversion rate analysis
18
+ - Budget allocation optimization
19
+
20
+ Make sure your output is as intended!
21
+
22
+ You may be given recent agent interactions as a hint, with the first being the latest.
23
+ You are running inside Streamlit; use st.write instead of print.
24
+
25
+ """
26
+ dataset = dspy.InputField(desc="Available datasets loaded in the system; use this df and its columns. Set df as a copy of df.")
27
+ goal = dspy.InputField(desc="The user-defined goal")
28
+ code = dspy.OutputField(desc="The code that performs the bidding strategy analysis")
29
+ commentary = dspy.OutputField(desc="The comments about what bidding strategy analysis is being performed")
30
+
31
+ class marketing_reporting_agent(dspy.Signature):
32
+ # Analytics Agent for generating marketing reports
33
+ """You are a marketing reporting agent specialized in creating data-driven marketing reports.
34
+ Your task is to take marketing data, a user-defined goal, and report instructions to generate
35
+ Python code that creates insightful marketing reports and visualizations.
36
+ You should use libraries like numpy, pandas for the analysis and only plotly for visualization.
37
+
38
+
39
+ Make sure your output matches the report instructions and goal!
40
+
41
+ You are running inside Streamlit; use st.write instead of print.
42
+ Use st.plotly_chart() for interactive plots
43
+ """
44
+ dataset = dspy.InputField(desc="Available datasets loaded in the system; use this df and its columns. Set df as a copy of df.")
45
+ goal = dspy.InputField(desc="The user-defined goal")
46
+ report_instructions = dspy.InputField(desc="Specific instructions for report format, metrics, and visualizations")
47
+ code = dspy.OutputField(desc="The code that generates the marketing report")
48
+
49
+
50
+ class customer_analytics_agent(dspy.Signature):
51
+ # Analytics Agent for customer value and acquisition analysis
52
+ """You are a customer analytics agent specialized in analyzing customer behavior and value.
53
+ Your task is to take customer data and a user-defined goal, and output Python code that performs
54
+ customer lifetime value, acquisition cost, and ROI analysis.
55
+ You should use libraries like numpy, pandas, scikit-learn and lifetimes for the analysis.
56
+
57
+ Customer analytics tasks include:
58
+ - Customer Lifetime Value (CLV/LTV) modeling
59
+ - Customer Acquisition Cost (CAC) analysis
60
+ - Customer segmentation and clustering
61
+ - Churn prediction and prevention
62
+ - Customer journey mapping
63
+ - ROI and retention metrics
64
+ - Purchase behavior analysis
65
+
66
+ Make sure your output is as intended!
67
+
68
+ You may be given recent agent interactions as a hint, with the first being the latest.
69
+ You are running inside Streamlit; use st.write instead of print.
70
+
71
+ """
72
+ dataset = dspy.InputField(desc="Available datasets loaded in the system; use this df and its columns. Set df as a copy of df.")
73
+ goal = dspy.InputField(desc="The user-defined goal")
74
+ code = dspy.OutputField(desc="The code that performs the customer analytics")
75
+ commentary = dspy.OutputField(desc="The comments about what customer analysis is being performed")
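+ # Invocation sketch (hypothetical): like other dspy signatures, these are wrapped
+ # in a predictor before use; field names match the signature above.
+ # agent = dspy.Predict(customer_analytics_agent)
+ # result = agent(dataset="df with columns: customer_id, spend, ...", goal="Estimate CLV")
+ # result.code / result.commentary hold the generated script and its explanation.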
src/agents/memory_agents.py ADDED
@@ -0,0 +1,68 @@
1
+ import dspy
2
+
3
+ class memory_summarize_agent(dspy.Signature):
4
+ """
5
+ You are an AI agent which helps summarize other agent responses and user-input.
6
+ Keep these instructions in mind:
7
+
8
+ - Analyze the provided text.
9
+ - Present the extracted details in bullet points:
10
+ - User Query: The user query/goal summarized, with only important information retained
11
+ - Agent: Include agent name
12
+ - Stack_Used: All python packages used
13
+ - Actions: What actions the agent took; summarize them like "Agent visualized a line chart using plotly"
14
+
15
+ """
16
+ agent_response = dspy.InputField(desc="What the agent output: commentary and code")
17
+ user_goal = dspy.InputField(desc="User query or intended goal")
18
+ summary = dspy.OutputField(desc="The summary generated in the requested format")
19
+
20
+ class error_memory_agent(dspy.Signature):
21
+ """
22
+ Prompt for error_summarize Agent:
23
+
24
+ Agent Name: error_summarize
25
+
26
+ Purpose: To generate a concise summary of an error in Python code and provide a clear correction, along with relevant metadata and user query information. This summary will help in understanding the error and applying the correction.
27
+
28
+ Input Data:
29
+
30
+ Incorrect Python Code: (A snippet of code that produced an error)
31
+ Meta Data:
32
+ Agent Name: (Name of the agent that processed the code)
33
+ Agent Version: (Version of the agent that processed the code)
34
+ Timestamp: (When the error occurred)
35
+ User Query: (The query or task that led to the incorrect code execution)
36
+ Human-Defined Correction: (The corrected code or solution provided by a human expert)
37
+ Processing Instructions:
38
+
39
+ Error Analysis:
40
+
41
+ Analyze the incorrect Python code to determine the type of error and its cause.
42
+ Summary Creation:
43
+
44
+ Generate a brief summary of the error, highlighting the key issue in the code.
45
+ Provide a short explanation of the correction that resolves the issue.
46
+ Output Formatting:
47
+
48
+ Format the summary to include:
49
+ Error Summary: A concise description of the error.
50
+ Correction: A brief explanation of how to correct the error.
51
+ Integration:
52
+
53
+ Ensure the summary is clear and informative for future reference.
54
+ Output Data:
55
+
56
+ Error Summary:
57
+ Error Summary: (Brief description of the error)
58
+ Correction: (Concise explanation of the fix)
59
+ Example Output:
60
+
61
+ Error Summary: The IndexError occurred because the code attempted to access an element at an index that is out of range for the list.
62
+ Correction: Ensure the index is within the bounds of the list. For example, use if index < len(my_list): to check the index before accessing the list element.
63
+ """
64
+ incorrect_code = dspy.InputField(desc="Error causing code")
65
+ error_metadata = dspy.InputField(desc="The description of the error generated, with user/agent information for context")
66
+ correction = dspy.InputField(desc="Correction suggested by AI or done manually by human")
67
+ summary = dspy.OutputField(desc="The description which must contain information about the error and how to correct it")
68
+
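+ # Invocation sketch (hypothetical): summarizing an agent turn for memory.
+ # summarizer = dspy.Predict(memory_summarize_agent)
+ # note = summarizer(agent_response=code_and_commentary, user_goal=query).summary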
src/agents/retrievers/retrievers.py ADDED
@@ -0,0 +1,153 @@
1
+ # This file handles the data-preprocessing and creates retrievers
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from datetime import datetime
6
+
7
+ # instructions also stored here
8
+ instructions ="""
9
+ Here are the instructions for the AI system with the specified agents:
10
+
11
+ ### AI System Instructions
12
+
13
+ #### Agents
14
+ - `@data_viz_agent`: Handles queries related to data visualization.
15
+ - `@sk_learn_agent`: Handles queries related to machine learning using scikit-learn.
16
+ - `@statistical_analytics_agent`: Handles queries related to statistical analysis.
17
+ - `@preprocessing_agent`: Handles queries related to data preprocessing.
18
+
19
+ #### Query Routing
20
+
21
+ 1. **Direct Agent Routing**:
22
+ - If the user specifies an agent in their query using `@agent_name`, the query will be directly routed to the specified agent.
23
+ - Example: `@data_viz_agent Create a bar chart from the following data.`
24
+
25
+ 2. **Planner-Based Routing**:
26
+ - If the user does not specify an agent, the query will be routed to the system's planner.
27
+ - The planner will analyze the query and determine the most appropriate agent to handle the request.
28
+ - Example: `Generate a confusion matrix from this dataset.`
29
+
30
+ PLEASE READ THE INSTRUCTIONS! Thank you
31
+ """
32
+
33
+ # For every column, collects useful information such as the top 10 categories, or min/max/mean where applicable
34
+ def return_vals(df, c):
35
+ if isinstance(df[c].iloc[10], (int, float, complex)):
36
+ return {'max_value': max(df[c]), 'min_value': min(df[c]), 'mean_value': np.mean(df[c])}
37
+ elif isinstance(df[c].iloc[10], datetime):
38
+ # Return a dict (not a set) so all branches share the same structure
+ return {'max_value': str(max(df[c])), 'min_value': str(min(df[c])), 'mean_value': str(np.mean(df[c]))}
39
+ else:
40
+ return {'top_10_values': df[c].value_counts()[:10], 'total_category_count': len(df[c].unique())}
41
+
42
+ # Removes ',' from numeric columns stored as strings
43
+ def correct_num(df, c):
44
+ try:
45
+ df[c] = df[c].fillna('0').str.replace(',','').astype(float)
46
+ return df[c]
47
+ except Exception:
48
+ # Column isn't a comma-formatted numeric string; return it unchanged
+ return df[c]
49
+
50
+
51
+
52
+ # does most of the pre-processing
53
+ def make_data(df, desc):
54
+ dict_ = {}
55
+ dict_['df_name'] = "The data is loaded as df"
56
+ dict_['Description'] = desc
57
+ dict_['dataframe_head_view'] = df.head(2).to_markdown()
58
+ # dict_['all_column_names'] = str(list(df.columns[:20]))
59
+
60
+ # for c in df.columns:
61
+ # df[c] = correct_num(df,c)
62
+ # try:
63
+ # dict_[c] = {'column_name':c,'type':str(type(df[c].iloc[0])), 'column_information':return_vals(df,c)}
64
+ # except:
65
+ # dict_[c] = {'column_name':c,'type':str(type(df[c].iloc[0])), 'column_information':'NA'}
66
+ return dict_
67
+
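+ # Example (hypothetical call): make_data(df, "Housing prices dataset") returns roughly
+ # {'df_name': 'The data is loaded as df',
+ #  'Description': 'Housing prices dataset',
+ #  'dataframe_head_view': <markdown table of df.head(2)>}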
68
+
69
+
70
+ # Stored styling instructions for the data_viz_agent; these help it generate well-formatted graphs
71
+ styling_instructions = [
72
+ """
73
+ Don't ignore any of these instructions.
74
+ For a line chart always use the plotly_white template; reduce the x- and y-axis line width to 0.2 and the x & y grid width to 1.
75
+ Always give a title and make the axis labels bold using HTML tags; use multiple colors if there is more than one line.
76
+ Annotate the min and max of the line.
77
+ Display numbers in thousands (K) or millions (M) if larger than 1,000/1,000,000.
78
+ Show percentages to 2 decimal points with a '%' sign.
79
+ The default chart size should be height=1200 and width=1000.
80
+
81
+ """
82
+
83
+ , """
84
+ Don't ignore any of these instructions.
85
+ For a bar chart always use the plotly_white template; reduce the x- and y-axis line width to 0.2 and the x & y grid width to 1.
86
+ Always give a title and make the axis labels bold using HTML tags.
87
+ Always display numbers in thousands (K) or millions (M) if larger than 1,000/1,000,000.
88
+ Annotate the values on the bars.
89
+ If the variable is a percentage, show it to 2 decimal points with a '%' sign.
90
+ The default chart size should be height=1200 and width=1000.
91
+ """
92
+ ,
93
+
94
+ """
95
+ For a histogram chart choose a bin size of 50.
96
+ Do not ignore any of these instructions.
97
+ Always use the plotly_white template; reduce the x & y axis line width to 0.2 and the x & y grid width to 1.
98
+ Always give a title and make the axis labels bold using HTML tags.
99
+ Always display numbers in thousands (K) or millions (M) if larger than 1,000/1,000,000. Add annotations for x values.
100
+ If the variable is a percentage, show it to 2 decimal points with a '%' sign.
101
+ The default chart size should be height=1200 and width=1000.
102
+ """,
103
+
104
+
105
+ """
106
+ For a pie chart only show the top 10 categories; bundle the rest as 'Others'.
107
+ Do not ignore any of these instructions.
108
+ Always use the plotly_white template; reduce the x & y axis line width to 0.2 and the x & y grid width to 1.
109
+ Always give a title and make the axis labels bold using HTML tags.
110
+ Always display numbers in thousands (K) or millions (M) if larger than 1,000/1,000,000. Add annotations for x values.
111
+ If the variable is a percentage, show it to 2 decimal points with a '%' sign.
112
+ The default chart size should be height=1200 and width=1000.
113
+ """,
114
+
115
+ """
116
+ Do not ignore any of these instructions.
117
+ Always use the plotly_white template; reduce the x & y axis line width to 0.2 and the x & y grid width to 1.
118
+ Always give a title and make the axis labels bold using HTML tags.
119
+ Always display numbers in thousands (K) or millions (M) if larger than 1,000/1,000,000. Add annotations for x values.
120
+ Don't add K/M if the number is already comma-formatted or the value is not a number.
121
+ If the variable is a percentage, show it to 2 decimal points with a '%' sign.
122
+ The default chart size should be height=1200 and width=1000.
123
+ """,
124
+ """
125
+ For a heat map:
126
+ Use the 'plotly_white' template for a clean, white background.
127
+ Set a chart title.
128
+ Style the X-axis with a black line color, 0.2 line width, and 1 grid width; format 1000/1000000 as K/M.
129
+ Do not format non-numeric values.
130
+ Style the Y-axis with a black line color, 0.2 line width, and 1 grid width; format 1000/1000000 as K/M.
131
+ Do not format non-numeric values.
132
+
133
+ Set the figure dimensions to a height of 1200 pixels and a width of 1000 pixels.
134
+ """,
135
+ """
136
+ For a histogram used for returns/distribution plotting:
137
+ Use the 'plotly_white' template for a clean, white background.
138
+ Set a chart title.
139
+ Style the X-axis with 1 grid width; format 1000/1000000 as K/M.
140
+ Do not format non-numeric values.
141
+ Style the Y-axis with 1 grid width; format 1000/1000000 as K/M.
142
+ Do not format non-numeric values.
143
+
144
+ Use an opacity of 0.75
145
+
146
+ Set the figure dimensions to a height of 1200 pixels and a width of 1000 pixels.
147
+ """
148
+
149
+ ]
150
+
151
+
152
+
153
+
src/db/__init__.py ADDED
File without changes
src/db/init_db.py ADDED
@@ -0,0 +1,68 @@
1
+ import logging
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from sqlalchemy import create_engine, event
5
+ from sqlalchemy.orm import sessionmaker
6
+ from src.db.schemas.models import Base
7
+ from src.utils.logger import Logger
8
+
9
+ logger = Logger("init_db", see_time=True, console_log=True)
10
+ load_dotenv()
11
+
12
+ # Create the database engine based on environment variable
13
+ DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///chat_database.db")
14
+
15
+ # Determine database type and set appropriate engine configurations
16
+ if DATABASE_URL.startswith('postgresql'):
17
+ # PostgreSQL-specific configuration
18
+ engine = create_engine(
19
+ DATABASE_URL,
20
+ pool_size=10,
21
+ max_overflow=20,
22
+ pool_pre_ping=True, # Check connection validity before use
23
+ pool_recycle=300 # Recycle connections after 5 minutes
24
+ )
25
+ is_postgresql = True
26
+ logger.log_message("Using PostgreSQL database engine", logging.INFO)
27
+ else:
28
+ # SQLite configuration
29
+ engine = create_engine(DATABASE_URL)
30
+ is_postgresql = False
31
+ # For SQLite, enable foreign key constraints
32
+ @event.listens_for(engine, "connect")
33
+ def set_sqlite_pragma(dbapi_connection, connection_record):
34
+ cursor = dbapi_connection.cursor()
35
+ cursor.execute("PRAGMA foreign_keys=ON")
36
+ cursor.close()
37
+ logger.log_message("Using SQLite database engine", logging.INFO)
38
+
39
+ # Create session factory
40
+ Session = sessionmaker(bind=engine)
41
+ session_factory = Session
42
+
43
+ # Database initialization function
44
+ def init_db():
45
+ # Create all tables
46
+ Base.metadata.create_all(engine)
47
+ logger.log_message("Database and tables created successfully.", logging.INFO)
48
+ logger.log_message(f"Models: {Base.metadata.tables.keys()}", logging.INFO)
49
+
50
+ # Utility function to get a new session
51
+ def get_session():
52
+ return Session()
53
+
54
+ def get_db():
55
+ db = Session()
56
+ try:
57
+ yield db
58
+ except Exception as e:
59
+ logger.log_message(f"Error getting database session: {e}", logging.ERROR)
60
+ finally:
61
+ db.close()
62
+
63
+ # Add function to check if using PostgreSQL
64
+ def is_postgres_db():
65
+ return is_postgresql
66
+
67
+ if __name__ == "__main__":
68
+ init_db()
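+ # Usage sketch: create the tables, then open and close a session.
+ # from src.db.init_db import init_db, get_session
+ # init_db()
+ # session = get_session()
+ # try:
+ #     ...  # run queries
+ # finally:
+ #     session.close()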
src/db/schemas/__init__.py ADDED
File without changes
src/db/schemas/models.py ADDED
@@ -0,0 +1,237 @@
1
+ from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, DateTime, Text, Float, Boolean, JSON, UniqueConstraint
2
+ from sqlalchemy.ext.declarative import declarative_base
3
+ from sqlalchemy.orm import sessionmaker, relationship
4
+ from datetime import datetime, UTC
5
+
6
+ # Define the base class for declarative models
7
+ Base = declarative_base()
8
+
9
+ # Define the Users table
10
+ class User(Base):
11
+ __tablename__ = 'users'
12
+
13
+ user_id = Column(Integer, primary_key=True, autoincrement=True)
14
+ username = Column(String, unique=True, nullable=False)
15
+ email = Column(String, unique=True, nullable=False)
16
+ created_at = Column(DateTime, default=lambda: datetime.now(UTC))
17
+ # Add relationship for cascade options
18
+ chats = relationship("Chat", back_populates="user", cascade="all, delete-orphan")
19
+ usage_records = relationship("ModelUsage", back_populates="user")
20
+ deep_analysis_reports = relationship("DeepAnalysisReport", back_populates="user", cascade="all, delete-orphan")
21
+ template_preferences = relationship("UserTemplatePreference", back_populates="user", cascade="all, delete-orphan")
22
+
23
+ # Define the Chats table
24
+ class Chat(Base):
25
+ __tablename__ = 'chats'
26
+
27
+ chat_id = Column(Integer, primary_key=True, autoincrement=True)
28
+ user_id = Column(Integer, ForeignKey('users.user_id', ondelete="CASCADE"), nullable=True)
29
+ title = Column(String, default='New Chat')
30
+ created_at = Column(DateTime, default=lambda: datetime.now(UTC))
31
+ # Add relationships for cascade options
32
+ user = relationship("User", back_populates="chats")
33
+ messages = relationship("Message", back_populates="chat", cascade="all, delete-orphan")
34
+ usage_records = relationship("ModelUsage", back_populates="chat")
35
+
36
+ # Define the Messages table
37
+ class Message(Base):
38
+ __tablename__ = 'messages'
39
+
40
+ message_id = Column(Integer, primary_key=True, autoincrement=True)
41
+ chat_id = Column(Integer, ForeignKey('chats.chat_id', ondelete="CASCADE"), nullable=False)
42
+ sender = Column(String, nullable=False) # 'user' or 'ai'
43
+ content = Column(Text, nullable=False)
44
+ timestamp = Column(DateTime, default=lambda: datetime.now(UTC))
45
+ # Add relationship for cascade options
46
+ chat = relationship("Chat", back_populates="messages")
47
+ feedback = relationship("MessageFeedback", back_populates="message", uselist=False, cascade="all, delete-orphan")
48
+
49
+ # Define the Model Usage table
50
+ class ModelUsage(Base):
51
+ """Tracks AI model usage metrics for analytics and billing purposes."""
52
+ __tablename__ = 'model_usage'
53
+
54
+ usage_id = Column(Integer, primary_key=True)
55
+ user_id = Column(Integer, ForeignKey('users.user_id', ondelete="SET NULL"), nullable=True)
56
+ chat_id = Column(Integer, ForeignKey('chats.chat_id', ondelete="SET NULL"), nullable=True)
57
+ model_name = Column(String(100), nullable=False)
58
+ provider = Column(String(50), nullable=False)
59
+ prompt_tokens = Column(Integer, default=0)
60
+ completion_tokens = Column(Integer, default=0)
61
+ total_tokens = Column(Integer, default=0)
62
+ query_size = Column(Integer, default=0) # Size in characters
63
+ response_size = Column(Integer, default=0) # Size in characters
64
+ cost = Column(Float, default=0.0) # Cost in USD
65
+ timestamp = Column(DateTime, default=lambda: datetime.now(UTC))
66
+ is_streaming = Column(Boolean, default=False)
67
+ request_time_ms = Column(Integer, default=0) # Request processing time in milliseconds
68
+ # Add relationships
69
+ user = relationship("User", back_populates="usage_records")
70
+ chat = relationship("Chat", back_populates="usage_records")
71
+
72
+ # Define the Code Execution table
73
+ class CodeExecution(Base):
74
+ """Tracks code execution attempts and results for analysis and debugging."""
75
+ __tablename__ = 'code_executions'
76
+
77
+ execution_id = Column(Integer, primary_key=True, autoincrement=True)
78
+ message_id = Column(Integer, ForeignKey('messages.message_id', ondelete="CASCADE"), nullable=True)
79
+ chat_id = Column(Integer, ForeignKey('chats.chat_id', ondelete="CASCADE"), nullable=True)
80
+ user_id = Column(Integer, ForeignKey('users.user_id', ondelete="SET NULL"), nullable=True)
81
+
82
+ # Code tracking
83
+ initial_code = Column(Text, nullable=True) # First version of code submitted
84
+ latest_code = Column(Text, nullable=True) # Most recent version of code
85
+
86
+ # Execution results
87
+ is_successful = Column(Boolean, default=False)
88
+ output = Column(Text, nullable=True) # Full output including errors
89
+
90
+ # Model and agent information
91
+ model_provider = Column(String(50), nullable=True)
92
+ model_name = Column(String(100), nullable=True)
93
+ model_temperature = Column(Float, nullable=True)
94
+ model_max_tokens = Column(Integer, nullable=True)
95
+
96
+ # Failure information
97
+ failed_agents = Column(Text, nullable=True) # JSON list of agent names that failed
98
+ error_messages = Column(Text, nullable=True) # JSON map of error messages by agent
99
+
100
+ # Metadata
101
+ created_at = Column(DateTime, default=lambda: datetime.now(UTC))
102
+ updated_at = Column(DateTime, default=lambda: datetime.now(UTC), onupdate=lambda: datetime.now(UTC))
103
+
104
+ class MessageFeedback(Base):
105
+ """Tracks user feedback and model settings for each message."""
106
+ __tablename__ = 'message_feedback'
107
+
108
+ feedback_id = Column(Integer, primary_key=True, autoincrement=True)
109
+ message_id = Column(Integer, ForeignKey('messages.message_id', ondelete="CASCADE"), nullable=False)
110
+
111
+ # User feedback
112
+ rating = Column(Integer, nullable=True) # Star rating (1-5)
113
+
114
+ # Model settings used for this message
115
+ model_name = Column(String(100), nullable=True)
116
+ model_provider = Column(String(50), nullable=True)
117
+ temperature = Column(Float, nullable=True)
118
+ max_tokens = Column(Integer, nullable=True)
119
+
120
+ # Metadata
121
+ created_at = Column(DateTime, default=lambda: datetime.now(UTC))
122
+ updated_at = Column(DateTime, default=lambda: datetime.now(UTC), onupdate=lambda: datetime.now(UTC))
123
+
124
+ # Relationship
125
+ message = relationship("Message", back_populates="feedback")
126
+
127
+ class DeepAnalysisReport(Base):
128
+ """Stores deep analysis reports with comprehensive analysis data and metadata."""
129
+ __tablename__ = 'deep_analysis_reports'
130
+
131
+ report_id = Column(Integer, primary_key=True, autoincrement=True)
132
+ report_uuid = Column(String(100), unique=True, nullable=False) # Frontend generated ID
133
+ user_id = Column(Integer, ForeignKey('users.user_id', ondelete="CASCADE"), nullable=True)
134
+
135
+ # Analysis objective and status
136
+ goal = Column(Text, nullable=False) # The analysis objective/question
137
+ status = Column(String(20), nullable=False, default='pending') # 'pending', 'running', 'completed', 'failed'
138
+
139
+ # Timing information
140
+ start_time = Column(DateTime, default=lambda: datetime.now(UTC))
141
+ end_time = Column(DateTime, nullable=True)
142
+ duration_seconds = Column(Integer, nullable=True) # Calculated duration
143
+
144
+ # Analysis components (stored as text/JSON)
145
+ deep_questions = Column(Text, nullable=True) # Generated analytical questions
146
+ deep_plan = Column(Text, nullable=True) # Analysis plan
147
+ summaries = Column(JSON, nullable=True) # Array of analysis summaries
148
+ analysis_code = Column(Text, nullable=True) # Generated Python code
149
+ plotly_figures = Column(JSON, nullable=True) # Array of Plotly figure data
150
+ synthesis = Column(JSON, nullable=True) # Array of synthesis insights
151
+ final_conclusion = Column(Text, nullable=True) # Final analysis conclusion
152
+
153
+ # Report output
154
+ html_report = Column(Text, nullable=True) # Complete HTML report
155
+ report_summary = Column(Text, nullable=True) # Brief summary for listing
156
+
157
+ # Execution tracking
158
+ progress_percentage = Column(Integer, default=0) # Progress 0-100
159
+ steps_completed = Column(JSON, nullable=True) # Array of completed step names
160
+ error_message = Column(Text, nullable=True) # Error details if failed
161
+
162
+ # Model and cost tracking
163
+ model_provider = Column(String(50), nullable=True)
164
+ model_name = Column(String(100), nullable=True)
165
+ total_tokens_used = Column(Integer, default=0)
166
+ estimated_cost = Column(Float, default=0.0) # Cost in USD
167
+ credits_consumed = Column(Integer, default=0) # Credits deducted for this analysis
168
+
169
+ # Metadata
170
+ created_at = Column(DateTime, default=lambda: datetime.now(UTC))
171
+ updated_at = Column(DateTime, default=lambda: datetime.now(UTC), onupdate=lambda: datetime.now(UTC))
172
+
173
+ # Relationships
174
+ user = relationship("User", back_populates="deep_analysis_reports")
175
+
176
+ class AgentTemplate(Base):
177
+ """Stores predefined agent templates that users can enable/disable."""
178
+ __tablename__ = 'agent_templates'
179
+
180
+ template_id = Column(Integer, primary_key=True, autoincrement=True)
181
+
182
+ # Template definition
183
+ template_name = Column(String(100), nullable=False, unique=True) # e.g., 'pytorch_specialist', 'data_cleaning_expert'
184
+ display_name = Column(String(200), nullable=True) # User-friendly display name
185
+ description = Column(Text, nullable=False) # Short description for template selection
186
+ prompt_template = Column(Text, nullable=False) # Main prompt/instructions for agent behavior
187
+
188
+ # Template appearance
189
+ icon_url = Column(String(500), nullable=True) # URL to template icon (CDN, data URL, or relative path)
190
+
191
+ # Template categorization
192
+ category = Column(String(50), nullable=True) # 'Visualization', 'Modelling', 'Data Manipulation'
193
+ is_premium_only = Column(Boolean, default=False) # True if template requires premium subscription
194
+
195
+ # Agent variant support
196
+ variant_type = Column(String(20), default='individual') # 'planner', 'individual', or 'both'
197
+ base_agent = Column(String(100), nullable=True) # Base agent name for variants (e.g., 'preprocessing_agent')
198
+
199
+ # Status and metadata
200
+ is_active = Column(Boolean, default=True)
201
+
202
+ # Timestamps
203
+ created_at = Column(DateTime, default=lambda: datetime.now(UTC))
204
+ updated_at = Column(DateTime, default=lambda: datetime.now(UTC), onupdate=lambda: datetime.now(UTC))
205
+
206
+ # Relationships
207
+ user_preferences = relationship("UserTemplatePreference", back_populates="template", cascade="all, delete-orphan")
208
+
209
+ class UserTemplatePreference(Base):
210
+ """Tracks user preferences and usage for agent templates."""
211
+ __tablename__ = 'user_template_preferences'
212
+
213
+ preference_id = Column(Integer, primary_key=True, autoincrement=True)
214
+ user_id = Column(Integer, ForeignKey('users.user_id', ondelete="CASCADE"), nullable=False)
215
+ template_id = Column(Integer, ForeignKey('agent_templates.template_id', ondelete="CASCADE"), nullable=False)
216
+
217
+ # User preferences
218
+ is_enabled = Column(Boolean, default=True) # Whether user has this template enabled
219
+
220
+ # Usage tracking
221
+ usage_count = Column(Integer, default=0) # Track how many times user has used this template
222
+ last_used_at = Column(DateTime, nullable=True) # Last time user used this template
223
+
224
+ # Timestamps
225
+ created_at = Column(DateTime, default=lambda: datetime.now(UTC))
226
+ updated_at = Column(DateTime, default=lambda: datetime.now(UTC), onupdate=lambda: datetime.now(UTC))
227
+
228
+ # Relationships
229
+ user = relationship("User", back_populates="template_preferences")
230
+ template = relationship("AgentTemplate", back_populates="user_preferences")
231
+
232
+ # Constraints - user can only have one preference record per template
233
+ __table_args__ = (
234
+ UniqueConstraint('user_id', 'template_id', name='unique_user_template_preference'),
235
+ )
236
+
237
+
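To illustrate how AgentTemplate and UserTemplatePreference fit together, here is a minimal sketch of an enable-template helper. It assumes the `session_factory` from `src.db.init_db` (imported elsewhere in this commit); the helper name `enable_template_for_user` is hypothetical, and the `unique_user_template_preference` constraint guarantees at most one preference row per user/template pair.

    from datetime import datetime, UTC
    from src.db.init_db import session_factory  # assumed, as used in src/managers/ai_manager.py
    from src.db.schemas.models import UserTemplatePreference

    def enable_template_for_user(user_id: int, template_id: int):
        """Hypothetical helper: enable a template, creating the preference row if needed."""
        session = session_factory()
        try:
            pref = session.query(UserTemplatePreference).filter_by(
                user_id=user_id, template_id=template_id
            ).first()
            if pref is None:
                # The unique (user_id, template_id) constraint keeps this one row per pair
                pref = UserTemplatePreference(user_id=user_id, template_id=template_id)
                session.add(pref)
            pref.is_enabled = True
            pref.updated_at = datetime.now(UTC)
            session.commit()
            return pref
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()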
src/managers/ai_manager.py ADDED
@@ -0,0 +1,84 @@
1
+ import logging
2
+ from typing import Optional, Dict, Any
3
+ from src.db.schemas.models import ModelUsage
4
+ from src.db.init_db import session_factory
5
+ from datetime import datetime, UTC
6
+ from src.routes.analytics_routes import handle_new_model_usage
7
+ import asyncio
8
+
9
+ from src.utils.logger import Logger
10
+ from src.utils.model_registry import get_provider_for_model, calculate_cost
11
+
12
+ logger = Logger(name="ai_manager", see_time=True, console_log=True)
13
+
14
+ class AI_Manager:
15
+ """Manages AI model interactions and usage tracking"""
16
+
17
+ def __init__(self):
18
+ self.tokenizer = None
19
+ # Initialize tokenizer - could use tiktoken or another tokenizer
20
+ try:
21
+ import tiktoken
22
+ self.tokenizer = tiktoken.get_encoding("cl100k_base")
23
+ except ImportError:
24
+ logger.log_message("Tiktoken not available, using simple tokenizer", level=logging.WARNING)
25
+ self.tokenizer = SimpleTokenizer()
26
+
27
+ def save_usage_to_db(self, user_id, chat_id, model_name, provider,
28
+ prompt_tokens, completion_tokens, total_tokens,
29
+ query_size, response_size, cost, request_time_ms,
30
+ is_streaming=False):
31
+ """Save model usage data to the database"""
32
+ session = session_factory()
33
+ try:
34
+
35
+ usage = ModelUsage(
36
+ user_id=user_id,
37
+ chat_id=chat_id,
38
+ model_name=model_name,
39
+ provider=provider,
40
+ prompt_tokens=prompt_tokens,
41
+ completion_tokens=completion_tokens,
42
+ total_tokens=total_tokens,
43
+ query_size=query_size,
44
+ response_size=response_size,
45
+ cost=cost,
46
+ timestamp=datetime.now(UTC),
47
+ is_streaming=is_streaming,
48
+ request_time_ms=request_time_ms
49
+ )
50
+
51
+ session.add(usage)
52
+ session.commit()
53
+ # logger.info(f"Saved usage data to database for chat {chat_id}: {total_tokens} tokens, ${cost:.6f}")
54
+
55
+ # Broadcast the event asynchronously; create_task requires a running event loop
56
+ try:
+ asyncio.create_task(handle_new_model_usage(usage))
+ except RuntimeError:
+ logger.log_message("No running event loop; skipping usage broadcast", level=logging.WARNING)
57
+
58
+ except Exception as e:
59
+ session.rollback()
60
+ logger.log_message(f"Error saving usage data to database for chat {chat_id}: {str(e)}", level=logging.ERROR)
61
+ finally:
62
+ session.close()
63
+
64
+ def calculate_cost(self, model_name, input_tokens, output_tokens):
65
+ """Calculate the cost for using the model based on tokens"""
66
+ if not model_name:
67
+ return 0
68
+
69
+ # Get provider for logging
70
+ model_provider = get_provider_for_model(model_name)
71
+ logger.log_message(f"[> ] Model Name: {model_name}, Model Provider: {model_provider}", level=logging.INFO)
72
+
73
+ # Use the centralized calculate_cost function
74
+ return calculate_cost(model_name, input_tokens, output_tokens)
75
+
76
+ def get_provider_for_model(self, model_name):
77
+ """Determine the provider based on model name"""
78
+ # Use the centralized get_provider_for_model function
79
+ return get_provider_for_model(model_name)
80
+
81
+ class SimpleTokenizer:
82
+ """A very simple tokenizer implementation for fallback"""
83
+ def encode(self, text):
84
+ return len(text.split())
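A minimal usage sketch for AI_Manager, assuming the `src.utils.model_registry` helpers resolve "gpt-4o-mini"; the IDs, token counts, and sizes below are placeholders, not values from the codebase:

    manager = AI_Manager()

    # Token counts normally come from the provider response; these are placeholders
    prompt_tokens, completion_tokens = 1200, 350
    cost = manager.calculate_cost("gpt-4o-mini", prompt_tokens, completion_tokens)

    manager.save_usage_to_db(
        user_id=1, chat_id=42,                       # placeholder IDs
        model_name="gpt-4o-mini",
        provider=manager.get_provider_for_model("gpt-4o-mini"),
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
        query_size=640, response_size=2048,          # character counts, placeholders
        cost=cost, request_time_ms=980,
    )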
src/managers/chat_manager.py ADDED
@@ -0,0 +1,944 @@
1
+ from sqlalchemy import create_engine, func, exists
2
+ from sqlalchemy.orm import sessionmaker, scoped_session
3
+ from sqlalchemy.exc import SQLAlchemyError
4
+ from src.db.schemas.models import Base, User, Chat, Message, ModelUsage, MessageFeedback
5
+ import logging
6
+ from typing import List, Dict, Optional, Any
7
+ from datetime import datetime, UTC
8
+ from src.utils.logger import Logger
9
+ from src.utils.model_registry import MODEL_COSTS
10
+ import re
11
+
12
+ logger = Logger("chat_manager", see_time=True, console_log=False)
13
+
14
+
15
+ class ChatManager:
16
+ """
17
+ Manages chat operations including creating, storing, retrieving, and updating chats and messages.
18
+ Provides an interface between the application and the database for chat-related operations.
19
+ """
20
+
21
+ def __init__(self, db_url):
22
+ """
23
+ Initialize the ChatManager with a database connection.
24
+
25
+ Args:
26
+ db_url: Database connection URL (e.g., a SQLite or PostgreSQL connection string)
27
+ """
28
+ self.engine = create_engine(db_url)
29
+ Base.metadata.create_all(self.engine) # Ensure tables exist
30
+ self.Session = scoped_session(sessionmaker(bind=self.engine))
31
+
32
+ # Add price mappings for different models
33
+ self.model_costs = MODEL_COSTS
34
+
35
+
36
+ # Add model providers mapping
37
+ self.model_providers = {
38
+ "gpt-": "openai",
39
+ "claude-": "anthropic",
40
+ "llama-": "groq",
41
+ "mistral-": "groq",
42
+ }
43
+
44
+ def create_chat(self, user_id: Optional[int] = None) -> Dict[str, Any]:
45
+ """
46
+ Create a new chat session.
47
+
48
+ Args:
49
+ user_id: Optional user ID if authentication is enabled
50
+
51
+ Returns:
52
+ Dictionary containing chat information
53
+ """
54
+ session = self.Session()
55
+ try:
56
+ # Create a new chat
57
+ chat = Chat(
58
+ user_id=user_id,
59
+ title='New Chat',
60
+ created_at=datetime.now(UTC)
61
+ )
62
+ session.add(chat)
63
+ session.flush() # Flush to get the ID before commit
64
+
65
+ chat_id = chat.chat_id # Get the ID now
66
+ session.commit()
67
+
68
+ logger.log_message(f"Created new chat {chat_id} for user {user_id}", level=logging.INFO)
69
+
70
+ return {
71
+ "chat_id": chat_id,
72
+ "user_id": chat.user_id,
73
+ "title": chat.title,
74
+ "created_at": chat.created_at.isoformat()
75
+ }
76
+ except SQLAlchemyError as e:
77
+ session.rollback()
78
+ logger.log_message(f"Error creating chat: {str(e)}", level=logging.ERROR)
79
+ raise
80
+ finally:
81
+ session.close()
82
+
83
+ def add_message(self, chat_id: int, content: str, sender: str, user_id: Optional[int] = None) -> Dict[str, Any]:
84
+ """
85
+ Add a message to a chat.
86
+
87
+ Args:
88
+ chat_id: ID of the chat to add the message to
89
+ content: Message content
90
+ sender: Message sender ('user' or 'ai')
91
+ user_id: Optional user ID to verify ownership
92
+
93
+ Returns:
94
+ Dictionary containing message information
95
+ """
96
+ session = self.Session()
97
+ try:
98
+ # Check if chat exists and belongs to the user if user_id is provided
99
+ query = session.query(Chat).filter(Chat.chat_id == chat_id)
100
+ if user_id is not None:
101
+ query = query.filter((Chat.user_id == user_id) | (Chat.user_id.is_(None)))
102
+
103
+ chat = query.first()
104
+ if not chat:
105
+ raise ValueError(f"Chat with ID {chat_id} not found or access denied")
106
+
107
+ ##! Ensure content length is reasonable for PostgreSQL
108
+ # max_content_length = 10000 # PostgreSQL can handle large text but let's be cautious
109
+ # if content and len(content) > max_content_length:
110
+ # logger.log_message(f"Truncating message content from {len(content)} to {max_content_length} characters",
111
+ # level=logging.WARNING)
112
+ # content = content[:max_content_length]
113
+
114
+ # Create a new message
115
+ message = Message(
116
+ chat_id=chat_id,
117
+ content=content,
118
+ sender=sender,
119
+ timestamp=datetime.now(UTC)
120
+ )
121
+ session.add(message)
122
+ session.flush() # Flush to get the ID before commit
123
+
124
+ message_id = message.message_id # Get ID now
125
+
126
+ # If this is the first AI response and chat title is still default,
127
+ # update the chat title based on the first user query
128
+ if sender == 'ai':
129
+ first_ai_message = session.query(Message).filter(
130
+ Message.chat_id == chat_id,
131
+ Message.sender == 'ai'
132
+ ).first()
133
+
134
+ if not first_ai_message and chat.title == 'New Chat':
135
+ # Get the user's first message
136
+ first_user_message = session.query(Message).filter(
137
+ Message.chat_id == chat_id,
138
+ Message.sender == 'user'
139
+ ).order_by(Message.timestamp).first()
140
+
141
+ if first_user_message:
142
+ # Generate title from user query
143
+ new_title = self.generate_title_from_query(first_user_message.content)
144
+ chat.title = new_title
145
+
146
+ session.commit()
147
+
148
+ return {
149
+ "message_id": message_id,
150
+ "chat_id": message.chat_id,
151
+ "content": message.content,
152
+ "sender": message.sender,
153
+ "timestamp": message.timestamp.isoformat()
154
+ }
155
+ except SQLAlchemyError as e:
156
+ session.rollback()
157
+ logger.log_message(f"Error adding message: {str(e)}", level=logging.ERROR)
158
+ raise
159
+ finally:
160
+ session.close()
161
+
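Taken together, create_chat and add_message cover the basic write path. A hedged sketch (the SQLite URL matches the .env template default; user_id=1 is a placeholder and must reference an existing user, or be None):

    cm = ChatManager(db_url="sqlite:///chat_database.db")

    chat = cm.create_chat(user_id=1)
    cm.add_message(chat["chat_id"], "Which features drive house prices?", sender="user")
    # The first AI reply also triggers the automatic title update below
    reply = cm.add_message(chat["chat_id"], "### Summary\nPrice tracks area closely...", sender="ai")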
162
+
163
+ def get_chat(self, chat_id: int, user_id: Optional[int] = None) -> Dict[str, Any]:
164
+ """
165
+ Get a chat by ID with all its messages.
166
+
167
+ Args:
168
+ chat_id: ID of the chat to retrieve
169
+ user_id: Optional user ID to verify ownership
170
+
171
+ Returns:
172
+ Dictionary containing chat information and messages
173
+ """
174
+ session = self.Session()
175
+ try:
176
+ # Get the chat
177
+ query = session.query(Chat).filter(Chat.chat_id == chat_id)
178
+
179
+ # If user_id is provided, ensure the chat belongs to this user
180
+ if user_id is not None:
181
+ query = query.filter((Chat.user_id == user_id) | (Chat.user_id.is_(None)))
182
+
183
+ chat = query.first()
184
+ if not chat:
185
+ raise ValueError(f"Chat with ID {chat_id} not found or access denied")
186
+
187
+ # Get the chat messages ordered by timestamp
188
+ messages = session.query(Message).filter(
189
+ Message.chat_id == chat_id
190
+ ).order_by(Message.timestamp).all()
191
+
192
+ # Create a safe serializable dictionary
193
+ return {
194
+ "chat_id": chat.chat_id,
195
+ "title": chat.title,
196
+ "created_at": chat.created_at.isoformat() if chat.created_at else None,
197
+ "user_id": chat.user_id,
198
+ "messages": [
199
+ {
200
+ "message_id": msg.message_id,
201
+ "chat_id": msg.chat_id,
202
+ "content": msg.content,
203
+ "sender": msg.sender,
204
+ "timestamp": msg.timestamp.isoformat() if msg.timestamp else None
205
+ } for msg in messages
206
+ ]
207
+ }
208
+ except SQLAlchemyError as e:
209
+ logger.log_message(f"Error retrieving chat: {str(e)}", level=logging.ERROR)
210
+ raise
211
+ finally:
212
+ session.close()
213
+
214
+ def get_user_chats(self, user_id: Optional[int] = None, limit: int = 10, offset: int = 0) -> List[Dict[str, Any]]:
215
+ """
216
+ Get recent chats for a user, or all chats if no user_id is provided.
217
+
218
+ Args:
219
+ user_id: Optional user ID to filter chats
220
+ limit: Maximum number of chats to return
221
+ offset: Number of chats to skip (for pagination)
222
+
223
+ Returns:
224
+ List of dictionaries containing chat information
225
+ """
226
+ session = self.Session()
227
+ try:
228
+ query = session.query(Chat)
229
+
230
+ # Filter by user_id if provided
231
+ if user_id is not None:
232
+ query = query.filter(Chat.user_id == user_id)
233
+
234
+ # Apply safe limits for both SQLite and PostgreSQL
235
+ safe_limit = min(max(1, limit), 100) # Between 1 and 100
236
+ safe_offset = max(0, offset) # At least 0
237
+
238
+ chats = query.order_by(Chat.created_at.desc()).limit(safe_limit).offset(safe_offset).all()
239
+
240
+ return [
241
+ {
242
+ "chat_id": chat.chat_id,
243
+ "user_id": chat.user_id,
244
+ "title": chat.title,
245
+ "created_at": chat.created_at.isoformat() if chat.created_at else None
246
+ } for chat in chats
247
+ ]
248
+ except SQLAlchemyError as e:
249
+ logger.log_message(f"Error retrieving chats: {str(e)}", level=logging.ERROR)
250
+ return []
251
+ finally:
252
+ session.close()
253
+
254
+ def delete_chat(self, chat_id: int, user_id: Optional[int] = None) -> bool:
255
+ """
256
+ Delete a chat and all its messages while preserving model usage records.
257
+
258
+ Args:
259
+ chat_id: ID of the chat to delete
260
+ user_id: Optional user ID to verify ownership
261
+
262
+ Returns:
263
+ True if deletion was successful, False otherwise
264
+ """
265
+ session = self.Session()
266
+ try:
267
+ # Fetch chat with ownership check if user_id provided
268
+ query = session.query(Chat).filter(Chat.chat_id == chat_id)
269
+ if user_id is not None:
270
+ query = query.filter(Chat.user_id == user_id)
271
+
272
+ chat = query.first()
273
+ if not chat:
274
+ return False # Chat not found or ownership mismatch
275
+
276
+ # ORM-based deletion with model_usage preservation
277
+ # The SET NULL in the foreign key should handle this, but we ensure it explicitly for both
278
+ # SQLite and PostgreSQL compatibility
279
+
280
+ # SQLite does not always enforce ondelete="SET NULL" (foreign keys can be disabled), so set it explicitly:
281
+ # Update model_usage records to set chat_id to NULL
282
+ session.query(ModelUsage).filter(ModelUsage.chat_id == chat_id).update(
283
+ {"chat_id": None}, synchronize_session=False
284
+ )
285
+
286
+ # Now delete the chat - relationships will handle cascading to messages
287
+ session.delete(chat)
288
+ session.commit()
289
+ return True
290
+ except SQLAlchemyError as e:
291
+ session.rollback()
292
+ logger.log_message(f"Error deleting chat: {str(e)}", level=logging.ERROR)
293
+ return False
294
+ finally:
295
+ session.close()
296
+
297
+
298
+
299
+ def get_or_create_user(self, username: str, email: str) -> Dict[str, Any]:
300
+ """
301
+ Get an existing user by email or create a new one if not found.
302
+
303
+ Args:
304
+ username: User's display name
305
+ email: User's email address
306
+
307
+ Returns:
308
+ Dictionary containing user information
309
+ """
310
+ session = self.Session()
311
+ try:
312
+ # Validate and sanitize inputs
313
+ if not email or not isinstance(email, str):
314
+ raise ValueError("Valid email is required")
315
+
316
+ # Limit input length for PostgreSQL compatibility
317
+ max_length = 255 # Standard limit for varchar fields
318
+ if username and len(username) > max_length:
319
+ username = username[:max_length]
320
+ if email and len(email) > max_length:
321
+ email = email[:max_length]
322
+
323
+ # Try to find existing user by email
324
+ user = session.query(User).filter(User.email == email).first()
325
+
326
+ if not user:
327
+ # Create new user if not found
328
+ user = User(username=username, email=email)
329
+ session.add(user)
330
+ session.flush() # Get ID before committing
331
+ user_id = user.user_id
332
+ session.commit()
333
+ logger.log_message(f"Created new user: {username} ({email})", level=logging.INFO)
334
+ else:
335
+ user_id = user.user_id
336
+
337
+ return {
338
+ "user_id": user_id,
339
+ "username": user.username,
340
+ "email": user.email,
341
+ "created_at": user.created_at.isoformat() if user.created_at else None
342
+ }
343
+ except SQLAlchemyError as e:
344
+ session.rollback()
345
+ logger.log_message(f"Error getting/creating user: {str(e)}", level=logging.ERROR)
346
+ raise
347
+ finally:
348
+ session.close()
349
+
350
+ def update_chat(self, chat_id: int, title: Optional[str] = None, user_id: Optional[int] = None) -> Dict[str, Any]:
351
+ """
352
+ Update a chat's title or user_id.
353
+
354
+ Args:
355
+ chat_id: ID of the chat to update
356
+ title: New title for the chat (optional)
357
+ user_id: New user ID for the chat (optional)
358
+
359
+ Returns:
360
+ Dictionary containing updated chat information
361
+ """
362
+ session = self.Session()
363
+ try:
364
+ # Get the chat
365
+ chat = session.query(Chat).filter(Chat.chat_id == chat_id).first()
366
+ if not chat:
367
+ raise ValueError(f"Chat with ID {chat_id} not found")
368
+
369
+ # Update fields if provided
370
+ if title is not None:
371
+ # Limit title length for PostgreSQL compatibility
372
+ if len(title) > 255: # Assuming String column has a reasonable length
373
+ title = title[:255]
374
+ chat.title = title
375
+
376
+ if user_id is not None:
377
+ chat.user_id = user_id
378
+
379
+ session.commit()
380
+
381
+ return {
382
+ "chat_id": chat.chat_id,
383
+ "title": chat.title,
384
+ "created_at": chat.created_at.isoformat() if chat.created_at else None,
385
+ "user_id": chat.user_id
386
+ }
387
+ except SQLAlchemyError as e:
388
+ session.rollback()
389
+ logger.log_message(f"Error updating chat: {str(e)}", level=logging.ERROR)
390
+ raise
391
+ finally:
392
+ session.close()
393
+
394
+ def generate_title_from_query(self, query: str) -> str:
395
+ """
396
+ Generate a title for a chat based on the first query.
397
+
398
+ Args:
399
+ query: The user's first query in the chat
400
+
401
+ Returns:
402
+ A generated title string
403
+ """
404
+ try:
405
+ # Validate input
406
+ if not query or not isinstance(query, str):
407
+ return "New Chat"
408
+
409
+ # Simple title generation - take first few words
410
+ words = query.strip().split()
411
+ if len(words) > 3:
412
+ title = "Chat about " + " ".join(words[0:3]) + "..."
413
+ else:
414
+ title = "Chat about " + query.strip()
415
+
416
+ # Limit title length for PostgreSQL compatibility
417
+ max_title_length = 255
418
+ if len(title) > max_title_length:
419
+ title = title[:max_title_length-3] + "..."
420
+
421
+ return title
422
+ except Exception as e:
423
+ logger.log_message(f"Error generating title: {str(e)}", level=logging.ERROR)
424
+ return "New Chat"
425
+
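For example, with the logic above a query of more than three words is truncated to its first three:

    cm.generate_title_from_query("plot price against area for houses with basements")
    # -> 'Chat about plot price against...'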
426
+ def delete_empty_chats(self, user_id: Optional[int] = None, is_admin: bool = False) -> int:
427
+ """
428
+ Delete empty chats (chats with no messages) for a user.
429
+
430
+ Args:
431
+ user_id: ID of the user whose empty chats should be deleted
432
+ is_admin: Whether this is an admin user
433
+
434
+ Returns:
435
+ Number of chats deleted
436
+ """
437
+ session = self.Session()
438
+ try:
439
+ # Get all chats for the user
440
+ query = session.query(Chat)
441
+ if user_id is not None:
442
+ query = query.filter(Chat.user_id == user_id)
443
+ elif not is_admin:
444
+ return 0 # Don't delete anything if not a user or admin
445
+
446
+ # Get chats with no messages using a subquery - works in both SQLite and PostgreSQL
447
+ empty_chats = query.filter(
448
+ ~exists().where(Message.chat_id == Chat.chat_id)
449
+ ).all()
450
+
451
+ # Collect chat IDs to delete
452
+ chat_ids = [chat.chat_id for chat in empty_chats]
453
+
454
+ deleted_count = 0
455
+ if chat_ids:
456
+ # Update model_usage records to set chat_id to NULL for any associated usage records
457
+ session.query(ModelUsage).filter(ModelUsage.chat_id.in_(chat_ids)).update(
458
+ {"chat_id": None}, synchronize_session=False
459
+ )
460
+
461
+ # Delete the empty chats one by one to ensure proper relationship handling
462
+ for chat_id in chat_ids:
463
+ chat = session.query(Chat).filter(Chat.chat_id == chat_id).first()
464
+ if chat:
465
+ session.delete(chat)
466
+ deleted_count += 1
467
+
468
+ session.commit()
469
+
470
+ return deleted_count
471
+ except SQLAlchemyError as e:
472
+ session.rollback()
473
+ logger.log_message(f"Error deleting empty chats: {str(e)}", level=logging.ERROR)
474
+ return 0
475
+ finally:
476
+ session.close()
477
+
478
+ def get_usage_summary(self, start_date: Optional[datetime] = None,
479
+ end_date: Optional[datetime] = None) -> Dict[str, Any]:
480
+ """
481
+ Get a summary of model usage including total costs, tokens, and usage by model.
482
+
483
+ Args:
484
+ start_date: Optional start date for the summary period
485
+ end_date: Optional end date for the summary period
486
+
487
+ Returns:
488
+ Dictionary containing usage summary
489
+ """
490
+ session = self.Session()
491
+ try:
492
+ # Build shared date filters; applying them directly to each aggregate query
493
+ # below avoids select_from(base_query.subquery()), which cross-joins the
494
+ # subquery with the model_usage table that the column expressions reference
495
+ date_filters = []
496
+ if start_date:
497
+ date_filters.append(ModelUsage.timestamp >= start_date)
498
+ if end_date:
499
+ date_filters.append(ModelUsage.timestamp <= end_date)
500
+
501
+ # Get summary data using aggregate functions
502
+ summary_query = session.query(
503
+ func.coalesce(func.sum(ModelUsage.cost), 0.0).label("total_cost"),
504
+ func.coalesce(func.sum(ModelUsage.prompt_tokens), 0).label("total_prompt_tokens"),
505
+ func.coalesce(func.sum(ModelUsage.completion_tokens), 0).label("total_completion_tokens"),
506
+ func.coalesce(func.sum(ModelUsage.total_tokens), 0).label("total_tokens"),
507
+ func.count(ModelUsage.usage_id).label("request_count"),
508
+ func.coalesce(func.avg(ModelUsage.request_time_ms), 0.0).label("avg_request_time")
509
+ ).filter(*date_filters)
510
+
511
+ result = summary_query.first()
512
+
513
+ # Get usage breakdown by model - using the same date filters for consistency
514
+ model_query = session.query(
515
+ ModelUsage.model_name,
516
+ func.coalesce(func.sum(ModelUsage.cost), 0.0).label("model_cost"),
517
+ func.coalesce(func.sum(ModelUsage.total_tokens), 0).label("model_tokens"),
518
+ func.count(ModelUsage.usage_id).label("model_requests")
519
+ ).filter(*date_filters).group_by(ModelUsage.model_name)
520
+
521
+ model_breakdown = model_query.all()
522
+
523
+ # Get usage breakdown by provider using the same date filters
524
+ provider_query = session.query(
525
+ ModelUsage.provider,
526
+ func.coalesce(func.sum(ModelUsage.cost), 0.0).label("provider_cost"),
527
+ func.coalesce(func.sum(ModelUsage.total_tokens), 0).label("provider_tokens"),
528
+ func.count(ModelUsage.usage_id).label("provider_requests")
529
+ ).filter(*date_filters).group_by(ModelUsage.provider)
530
+
531
+ provider_breakdown = provider_query.all()
532
+
533
+ # Get top users by cost
534
+ user_query = session.query(
535
+ ModelUsage.user_id,
536
+ func.coalesce(func.sum(ModelUsage.cost), 0.0).label("user_cost"),
537
+ func.coalesce(func.sum(ModelUsage.total_tokens), 0).label("user_tokens"),
538
+ func.count(ModelUsage.usage_id).label("user_requests")
539
+ ).filter(*date_filters).group_by(ModelUsage.user_id).order_by(
540
+ func.sum(ModelUsage.cost).desc()
541
+ ).limit(10)
542
+
543
+ user_breakdown = user_query.all()
544
+
545
+ # Handle the result data carefully to avoid None/NULL issues
546
+ return {
547
+ "summary": {
548
+ "total_cost": float(result.total_cost) if result and result.total_cost is not None else 0.0,
549
+ "total_prompt_tokens": int(result.total_prompt_tokens) if result and result.total_prompt_tokens is not None else 0,
550
+ "total_completion_tokens": int(result.total_completion_tokens) if result and result.total_completion_tokens is not None else 0,
551
+ "total_tokens": int(result.total_tokens) if result and result.total_tokens is not None else 0,
552
+ "request_count": int(result.request_count) if result and result.request_count is not None else 0,
553
+ "avg_request_time_ms": float(result.avg_request_time) if result and result.avg_request_time is not None else 0.0
554
+ },
555
+ "model_breakdown": [
556
+ {
557
+ "model_name": model.model_name,
558
+ "cost": float(model.model_cost) if model.model_cost is not None else 0.0,
559
+ "tokens": int(model.model_tokens) if model.model_tokens is not None else 0,
560
+ "requests": int(model.model_requests) if model.model_requests is not None else 0
561
+ } for model in model_breakdown
562
+ ],
563
+ "provider_breakdown": [
564
+ {
565
+ "provider": provider.provider,
566
+ "cost": float(provider.provider_cost) if provider.provider_cost is not None else 0.0,
567
+ "tokens": int(provider.provider_tokens) if provider.provider_tokens is not None else 0,
568
+ "requests": int(provider.provider_requests) if provider.provider_requests is not None else 0
569
+ } for provider in provider_breakdown
570
+ ],
571
+ "top_users": [
572
+ {
573
+ "user_id": user.user_id,
574
+ "cost": float(user.user_cost) if user.user_cost is not None else 0.0,
575
+ "tokens": int(user.user_tokens) if user.user_tokens is not None else 0,
576
+ "requests": int(user.user_requests) if user.user_requests is not None else 0
577
+ } for user in user_breakdown
578
+ ]
579
+ }
580
+
581
+ except SQLAlchemyError as e:
582
+ logger.log_message(f"Error retrieving usage summary: {str(e)}", level=logging.ERROR)
583
+ return {
584
+ "summary": {
585
+ "total_cost": 0.0,
586
+ "total_tokens": 0,
587
+ "request_count": 0
588
+ },
589
+ "model_breakdown": [],
590
+ "provider_breakdown": [],
591
+ "top_users": []
592
+ }
593
+ finally:
594
+ session.close()
595
+
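A sketch of querying the last 30 days of usage with the method above (the keys used below mirror the dictionary it returns):

    from datetime import datetime, timedelta, UTC

    end = datetime.now(UTC)
    summary = cm.get_usage_summary(start_date=end - timedelta(days=30), end_date=end)
    print(summary["summary"]["total_cost"], len(summary["model_breakdown"]))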
596
+ def get_recent_chat_history(self, chat_id: int, limit: int = 5) -> List[Dict[str, Any]]:
597
+ """
598
+ Get recent message history for a chat, limited to the last 'limit' messages.
599
+
600
+ Args:
601
+ chat_id: ID of the chat to get history for
602
+ limit: Maximum number of recent messages to return
603
+
604
+ Returns:
605
+ List of dictionaries containing message information
606
+ """
607
+ session = self.Session()
608
+ try:
609
+ # Ensure safe limit for both databases
610
+ safe_limit = min(max(1, limit), 50) * 2 # Between 2 and 100 messages
611
+
612
+ # Fetch the most recent messages, then reverse into chronological order.
613
+ # This is portable across SQLite and PostgreSQL and avoids mapping entity
614
+ # rows through from_statement() on a subquery, which is fragile
615
+ recent_messages = session.query(Message).filter(
616
+ Message.chat_id == chat_id
617
+ ).order_by(Message.timestamp.desc()).limit(safe_limit).all()
618
+
619
+ messages = list(reversed(recent_messages))
621
+
622
+ return [
623
+ {
624
+ "message_id": msg.message_id,
625
+ "chat_id": msg.chat_id,
626
+ "content": msg.content,
627
+ "sender": msg.sender,
628
+ "timestamp": msg.timestamp.isoformat() if msg.timestamp else None
629
+ } for msg in messages
630
+ ]
631
+ except SQLAlchemyError as e:
632
+ logger.log_message(f"Error retrieving chat history: {str(e)}", level=logging.ERROR)
633
+ return []
634
+ finally:
635
+ session.close()
636
+
637
+
638
+ def extract_response_history(self, messages: List[Dict[str, Any]]) -> str:
639
+ """
640
+ Extract response history from message history.
641
+
642
+ Args:
643
+ messages: List of message dictionaries
644
+
645
+ Returns:
646
+ String containing combined response history in a structured format
647
+ """
648
+
649
+ summaries = []
650
+ user_messages = []
651
+
652
+ # Input validation
653
+ if not messages or not isinstance(messages, list):
654
+ return ""
655
+
656
+ try:
657
+ for msg in messages:
658
+ # Skip invalid messages
659
+ if not isinstance(msg, dict):
660
+ continue
661
+
662
+ # Get User Messages
663
+ if msg.get("sender") == "user":
664
+ user_messages.append(msg)
665
+ # Ensure content exists and is from AI before extracting summary
666
+ if msg.get("sender") == "ai" and "content" in msg and msg["content"]:
667
+ content = msg["content"]
668
+ # Extract summaries defensively; regex failures are caught and logged below
669
+ try:
670
+ matches = re.findall(r"### Summary\n(.*?)(?=\n\n##|\Z)", content, re.DOTALL)
671
+ summaries.extend(match.strip() for match in matches if match)
672
+ except Exception as e:
673
+ logger.log_message(f"Error extracting summaries: {str(e)}", level=logging.ERROR)
674
+
675
+ # Combine user messages with summaries in a structured format
676
+ combined_conversations = []
677
+ for i, user_msg in enumerate(user_messages):
678
+ if i < len(summaries):
679
+ # Ensure content exists and is not too long
680
+ user_content = user_msg.get('content', '')
681
+ if user_content and isinstance(user_content, str):
682
+ # Truncate if needed
683
+ if len(user_content) > 500:
684
+ user_content = user_content[:497] + "..."
685
+
686
+ summary = summaries[i]
687
+ if len(summary) > 500:
688
+ summary = summary[:497] + "..."
689
+
690
+ combined_conversations.append(f"Query: {user_content}\nSummary: {summary}")
691
+
692
+ # Return the last 3 conversations to maintain context
693
+ formatted_context = "\n\n".join(combined_conversations[-3:])
694
+
695
+ # Add a clear header to indicate this is past interaction history
696
+ if formatted_context:
697
+ return f"### Previous Interaction History:\n{formatted_context}"
698
+ return ""
699
+ except Exception as e:
700
+ logger.log_message(f"Error in extract_response_history: {str(e)}", level=logging.ERROR)
701
+ return ""
702
+
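get_recent_chat_history and extract_response_history are designed to be chained: the first returns the message dictionaries the second consumes. Continuing the earlier sketch:

    history = cm.get_recent_chat_history(chat_id=chat["chat_id"], limit=5)
    context = cm.extract_response_history(history)
    # context stays "" until at least one AI message contains a "### Summary" block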
703
+ def add_message_feedback(self, message_id: int, rating: int,
704
+ model_settings: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
705
+ """
706
+ Add or update feedback for a message.
707
+
708
+ Args:
709
+ message_id: ID of the message to add feedback for
710
+ rating: Star rating (1-5)
711
+ model_settings: Optional dictionary containing model settings (name, provider, temperature, etc.)
712
+
713
+ Returns:
714
+ Dictionary containing feedback information
715
+ """
716
+ session = self.Session()
717
+ try:
718
+ # Check if message exists
719
+ message = session.query(Message).filter(Message.message_id == message_id).first()
720
+ if not message:
721
+ raise ValueError(f"Message with ID {message_id} not found")
722
+
723
+ # Check if feedback already exists
724
+ existing_feedback = session.query(MessageFeedback).filter(
725
+ MessageFeedback.message_id == message_id
726
+ ).first()
727
+
728
+ now = datetime.now(UTC)
729
+
730
+ # Extract model settings
731
+ model_name = None
732
+ model_provider = None
733
+ temperature = None
734
+ max_tokens = None
735
+
736
+ if model_settings:
737
+ model_name = model_settings.get('model_name')
738
+ model_provider = model_settings.get('model_provider')
739
+ temperature = model_settings.get('temperature')
740
+ max_tokens = model_settings.get('max_tokens')
741
+
742
+ if existing_feedback:
743
+ # Update existing feedback
744
+ existing_feedback.rating = rating
745
+ existing_feedback.model_name = model_name
746
+ existing_feedback.model_provider = model_provider
747
+ existing_feedback.temperature = temperature
748
+ existing_feedback.max_tokens = max_tokens
749
+ existing_feedback.updated_at = now
750
+ feedback_record = existing_feedback
751
+ else:
752
+ # Create new feedback
753
+ feedback_record = MessageFeedback(
754
+ message_id=message_id,
755
+ rating=rating,
756
+ model_name=model_name,
757
+ model_provider=model_provider,
758
+ temperature=temperature,
759
+ max_tokens=max_tokens,
760
+ created_at=now,
761
+ updated_at=now
762
+ )
763
+ session.add(feedback_record)
764
+
765
+ session.commit()
766
+
767
+ return {
768
+ "feedback_id": feedback_record.feedback_id,
769
+ "message_id": feedback_record.message_id,
770
+ "rating": feedback_record.rating,
771
+ "model_name": feedback_record.model_name,
772
+ "model_provider": feedback_record.model_provider,
773
+ "temperature": feedback_record.temperature,
774
+ "max_tokens": feedback_record.max_tokens,
775
+ "created_at": feedback_record.created_at.isoformat(),
776
+ "updated_at": feedback_record.updated_at.isoformat()
777
+ }
778
+ except SQLAlchemyError as e:
779
+ session.rollback()
780
+ logger.log_message(f"Error adding feedback: {str(e)}", level=logging.ERROR)
781
+ raise
782
+ finally:
783
+ session.close()
784
+
785
+ def get_message_feedback(self, message_id: int) -> Optional[Dict[str, Any]]:
786
+ """
787
+ Get feedback for a specific message.
788
+
789
+ Args:
790
+ message_id: ID of the message to get feedback for
791
+
792
+ Returns:
793
+ Dictionary containing feedback information or None if no feedback exists
794
+ """
795
+ session = self.Session()
796
+ try:
797
+ feedback = session.query(MessageFeedback).filter(
798
+ MessageFeedback.message_id == message_id
799
+ ).first()
800
+
801
+ if not feedback:
802
+ return None
803
+
804
+ return {
805
+ "feedback_id": feedback.feedback_id,
806
+ "message_id": feedback.message_id,
807
+ "rating": feedback.rating,
808
+ "model_name": feedback.model_name,
809
+ "model_provider": feedback.model_provider,
810
+ "temperature": feedback.temperature,
811
+ "max_tokens": feedback.max_tokens,
812
+ "created_at": feedback.created_at.isoformat(),
813
+ "updated_at": feedback.updated_at.isoformat()
814
+ }
815
+ except SQLAlchemyError as e:
816
+ logger.log_message(f"Error getting feedback: {str(e)}", level=logging.ERROR)
817
+ raise
818
+ finally:
819
+ session.close()
820
+
821
+ def get_chat_feedback(self, chat_id: int) -> List[Dict[str, Any]]:
822
+ """
823
+ Get all feedback for messages in a specific chat.
824
+
825
+ Args:
826
+ chat_id: ID of the chat to get feedback for
827
+
828
+ Returns:
829
+ List of dictionaries containing feedback information
830
+ """
831
+ session = self.Session()
832
+ try:
833
+ feedback_records = session.query(MessageFeedback).join(
834
+ Message, Message.message_id == MessageFeedback.message_id
835
+ ).filter(
836
+ Message.chat_id == chat_id
837
+ ).all()
838
+
839
+ return [{
840
+ "feedback_id": feedback.feedback_id,
841
+ "message_id": feedback.message_id,
842
+ "rating": feedback.rating,
843
+ "model_name": feedback.model_name,
844
+ "model_provider": feedback.model_provider,
845
+ "temperature": feedback.temperature,
846
+ "max_tokens": feedback.max_tokens,
847
+ "created_at": feedback.created_at.isoformat(),
848
+ "updated_at": feedback.updated_at.isoformat()
849
+ } for feedback in feedback_records]
850
+ except SQLAlchemyError as e:
851
+ logger.log_message(f"Error getting chat feedback: {str(e)}", level=logging.ERROR)
852
+ raise
853
+ finally:
854
+ session.close()
855
+
856
+ def get_feedback_statistics(self, user_id: Optional[int] = None,
857
+ start_date: Optional[datetime] = None,
858
+ end_date: Optional[datetime] = None) -> Dict[str, Any]:
859
+ """
860
+ Get feedback statistics for analysis.
861
+
862
+ Args:
863
+ user_id: Optional user ID to filter by
864
+ start_date: Optional start date to filter by
865
+ end_date: Optional end date to filter by
866
+
867
+ Returns:
868
+ Dictionary containing feedback statistics
869
+ """
870
+ session = self.Session()
871
+ try:
872
+ # Base query for all feedback
873
+ query = session.query(MessageFeedback).join(
874
+ Message, Message.message_id == MessageFeedback.message_id
875
+ )
876
+
877
+ # Apply filters if provided
878
+ if user_id is not None:
879
+ query = query.join(Chat, Chat.chat_id == Message.chat_id).filter(
880
+ Chat.user_id == user_id
881
+ )
882
+
883
+ if start_date is not None:
884
+ query = query.filter(MessageFeedback.created_at >= start_date)
885
+
886
+ if end_date is not None:
887
+ query = query.filter(MessageFeedback.created_at <= end_date)
888
+
889
+ # Get all feedback records
890
+ feedback_records = query.all()
891
+
892
+ # Calculate statistics
893
+ if not feedback_records:
894
+ return {
895
+ "total_feedback_count": 0,
896
+ "average_rating": 0,
897
+ "rating_distribution": {
898
+ "1": 0, "2": 0, "3": 0, "4": 0, "5": 0
899
+ },
900
+ "model_ratings": {}
901
+ }
902
+
903
+ # Calculate average rating
904
+ ratings = [record.rating for record in feedback_records if record.rating is not None]
905
+ average_rating = sum(ratings) / len(ratings) if ratings else 0
906
+
907
+ # Calculate rating distribution
908
+ rating_distribution = {
909
+ "1": 0, "2": 0, "3": 0, "4": 0, "5": 0
910
+ }
911
+
912
+ for record in feedback_records:
913
+ if record.rating is not None:
914
+ rating_distribution[str(record.rating)] += 1
915
+
916
+ # Calculate ratings by model
917
+ model_ratings = {}
918
+ for record in feedback_records:
919
+ if record.model_name and record.rating is not None:
920
+ if record.model_name not in model_ratings:
921
+ model_ratings[record.model_name] = {
922
+ "count": 0,
923
+ "total": 0,
924
+ "average": 0
925
+ }
926
+
927
+ model_ratings[record.model_name]["count"] += 1
928
+ model_ratings[record.model_name]["total"] += record.rating
929
+
930
+ # Calculate average for each model
931
+ for model_name, data in model_ratings.items():
932
+ data["average"] = data["total"] / data["count"] if data["count"] > 0 else 0
933
+
934
+ return {
935
+ "total_feedback_count": len(feedback_records),
936
+ "average_rating": average_rating,
937
+ "rating_distribution": rating_distribution,
938
+ "model_ratings": model_ratings
939
+ }
940
+ except SQLAlchemyError as e:
941
+ logger.log_message(f"Error getting feedback statistics: {str(e)}", level=logging.ERROR)
942
+ raise
943
+ finally:
944
+ session.close()
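Closing the loop on the feedback tables, a hedged sketch of rating a message and reading the aggregate statistics, continuing the earlier sketch (the model settings mirror the .env template defaults):

    cm.add_message_feedback(
        message_id=reply["message_id"],
        rating=5,
        model_settings={"model_name": "gpt-4o-mini", "model_provider": "openai",
                        "temperature": 0.7, "max_tokens": 6000},
    )
    stats = cm.get_feedback_statistics()
    print(stats["average_rating"], stats["rating_distribution"])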
src/managers/session_manager.py ADDED
@@ -0,0 +1,437 @@
1
+ import io
2
+ import os
3
+ import time
4
+ import uuid
5
+ import logging
6
+ import pandas as pd
7
+ from typing import Dict, Any, List
8
+
9
+ from llama_index.core import Document, VectorStoreIndex
10
+ from src.utils.logger import Logger
11
+ from src.managers.user_manager import get_current_user
12
+ from src.agents.agents import auto_analyst
13
+ from src.agents.retrievers.retrievers import make_data
14
+ from src.managers.chat_manager import ChatManager
15
+ from dotenv import load_dotenv
16
+
17
+ load_dotenv()
18
+
19
+ # Initialize logger
20
+ logger = Logger("session_manager", see_time=False, console_log=False)
21
+
22
+ class SessionManager:
23
+ """
24
+ Manages session-specific state, including datasets, retrievers, and AI systems.
25
+ Handles creation, retrieval, and updating of sessions.
26
+ """
27
+
28
+ def __init__(self, styling_instructions: List[str], available_agents: Dict):
29
+ """
30
+ Initialize SessionManager with styling instructions and available agents
31
+
32
+ Args:
33
+ styling_instructions: List of styling instructions for visualization
34
+ available_agents: Dictionary of available agents (deprecated - agents now loaded from DB)
35
+ """
36
+ self.styling_instructions = styling_instructions
37
+ self._sessions = {}
38
+ self._default_df = None
39
+ self._default_retrievers = None
40
+ self._default_ai_system = None
41
+ self._make_data = None
42
+ # Default dataset metadata (the full description is assigned below)
43
+ self._default_name = "Housing.csv"
45
+
46
+ self._dataset_description = """This dataset contains residential property information with details about pricing, physical characteristics, and amenities. The data can be used for real estate market analysis, property valuation, and understanding the relationship between house features and prices.
47
+
48
+ Key Features:
49
+ - Property prices range from 1.75M to 13.3M (currency units)
50
+ - Living areas from 1,650 to 16,200 (square units)
51
+ - Properties vary from 1-6 bedrooms and 1-4 bathrooms
52
+ - Various amenities tracked including parking, air conditioning, and hot water heating
53
+
54
+ TECHNICAL CONSIDERATIONS FOR ANALYSIS:
55
+
56
+ Numeric Columns:
57
+ - price (int): Large values suggesting currency units; range 1.75M-13.3M
58
+ - area (int): Square units measurement; range 1,650-16,200
59
+ - bedrooms (int): Discrete values 1-6
60
+ - bathrooms (int): Discrete values 1-4
61
+ - stories (int): Discrete values 1-4
62
+ - parking (int): Discrete values 0-3
63
+
64
+ Binary Categorical Columns (stored as str):
65
+ - mainroad (str): 'yes'/'no' - Consider boolean conversion
66
+ - guestroom (str): 'yes'/'no' - Consider boolean conversion
67
+ - basement (str): 'yes'/'no' - Consider boolean conversion
68
+ - hotwaterheating (str): 'yes'/'no' - Consider boolean conversion
69
+ - airconditioning (str): 'yes'/'no' - Consider boolean conversion
70
+ - prefarea (str): 'yes'/'no' - Consider boolean conversion
71
+
72
+ Other Categorical:
73
+ - furnishingstatus (str): Categories include 'furnished', 'semi-furnished' - Consider one-hot encoding
74
+
75
+ Data Handling Recommendations:
76
+ 1. Binary variables should be converted to boolean or numeric (0/1) for analysis
77
+ 2. Consider normalizing price and area values for certain analyses
78
+ 3. Furnishing status will need categorical encoding for numerical analysis
79
+ 4. No null values detected in the dataset
80
+ 5. All numeric columns are properly typed as numbers (no string conversion needed)
81
+ 6. Consider treating bedrooms, bathrooms, stories, and parking as categorical despite numeric storage
82
+
83
+ This dataset appears clean with consistent formatting and no missing values, making it suitable for immediate analysis with appropriate categorical encoding.
84
+ """
86
+ self.available_agents = available_agents
87
+ self.chat_manager = ChatManager(db_url=os.getenv("DATABASE_URL"))
88
+
89
+ self.initialize_default_dataset()
90
+
91
+ def initialize_default_dataset(self):
92
+ """Initialize the default dataset and store it"""
93
+ try:
94
+ self._default_df = pd.read_csv("Housing.csv")
95
+ self._make_data = make_data(self._default_df, self._dataset_description)
96
+ self._default_retrievers = self.initialize_retrievers(self.styling_instructions, [str(self._make_data)])
97
+ # Create default AI system - agents will be loaded from database
98
+ self._default_ai_system = auto_analyst(agents=[], retrievers=self._default_retrievers)
99
+ except Exception as e:
100
+ logger.log_message(f"Error initializing default dataset: {str(e)}", level=logging.ERROR)
101
+ raise e
102
+
103
+ def initialize_retrievers(self, styling_instructions: List[str], doc: List[str]):
104
+ """
105
+ Initialize retrievers for styling and data
106
+
107
+ Args:
108
+ styling_instructions: List of styling instructions
109
+ doc: List of document strings
110
+
111
+ Returns:
112
+ Dictionary containing style_index and dataframe_index
113
+ """
114
+ try:
115
+ style_index = VectorStoreIndex.from_documents([Document(text=x) for x in styling_instructions])
116
+ data_index = VectorStoreIndex.from_documents([Document(text=x) for x in doc])
117
+ return {"style_index": style_index, "dataframe_index": data_index}
118
+ except Exception as e:
119
+ logger.log_message(f"Error initializing retrievers: {str(e)}", level=logging.ERROR)
120
+ raise e
121
+
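initialize_retrievers returns two VectorStoreIndex objects keyed by role. A hedged sketch of consuming them through the standard llama_index retriever interface, assuming a constructed `session_manager` and its styling instructions are in scope:

    retrievers = session_manager.initialize_retrievers(
        styling_instructions, [str(session_manager._make_data)]
    )

    style_retriever = retrievers["style_index"].as_retriever(similarity_top_k=2)
    hits = style_retriever.retrieve("styling for a scatter plot of price vs area")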
122
+ def get_session_state(self, session_id: str) -> Dict[str, Any]:
123
+ """
124
+ Get or create session-specific state
125
+
126
+ Args:
127
+ session_id: The session identifier
128
+
129
+ Returns:
130
+ Dictionary containing session state
131
+ """
132
+ # Use the global model config from app_state when available
133
+ # Get the most up-to-date model config
134
+ if hasattr(self, '_app_model_config') and self._app_model_config:
135
+ default_model_config = self._app_model_config
136
+ else:
137
+ default_model_config = {
138
+ "provider": os.getenv("MODEL_PROVIDER", "openai"),
139
+ "model": os.getenv("MODEL_NAME", "gpt-4o-mini"),
140
+ "api_key": os.getenv("OPENAI_API_KEY"),
141
+ "temperature": float(os.getenv("TEMPERATURE", 1.0)),
142
+ "max_tokens": int(os.getenv("MAX_TOKENS", 6000))
143
+ }
144
+
145
+ if session_id not in self._sessions:
146
+ # Check if we need to create a brand new session
147
+ logger.log_message(f"Creating new session state for session_id: {session_id}", level=logging.INFO)
148
+
149
+ # Initialize with default state
150
+ self._sessions[session_id] = {
151
+ "current_df": self._default_df.copy() if self._default_df is not None else None,
152
+ "retrievers": self._default_retrievers,
153
+ "ai_system": self._default_ai_system,
154
+ "make_data": self._make_data,
155
+ "description": self._dataset_description,
156
+ "name": self._default_name,
157
+ "model_config": default_model_config,
158
+ "creation_time": time.time()
159
+ }
160
+ else:
161
+ # Verify dataset integrity in existing session
162
+ session = self._sessions[session_id]
163
+
164
+ # Always update model_config to match global settings
165
+ session["model_config"] = default_model_config
166
+
167
+ # If dataset is somehow missing, restore it
168
+ if "current_df" not in session or session["current_df"] is None:
169
+ logger.log_message(f"Restoring missing dataset for session {session_id}", level=logging.WARNING)
170
+ session["current_df"] = self._default_df.copy() if self._default_df is not None else None
171
+ session["retrievers"] = self._default_retrievers
172
+ session["ai_system"] = self._default_ai_system
173
+ session["description"] = self._dataset_description
174
+ session["name"] = self._default_name
175
+
176
+ # Ensure we have the basic required fields
177
+ if "name" not in session:
178
+ session["name"] = self._default_name
179
+ if "description" not in session:
180
+ session["description"] = self._dataset_description
181
+
182
+ # Update last accessed time
183
+ session["last_accessed"] = time.time()
184
+
185
+ return self._sessions[session_id]
186
+
187
+ def clear_session_state(self, session_id: str):
188
+ """
189
+ Clear session-specific state
190
+
191
+ Args:
192
+ session_id: The session identifier
193
+ """
194
+ if session_id in self._sessions:
195
+ del self._sessions[session_id]
196
+
197
+
198
+ def update_session_dataset(self, session_id: str, df, name: str, desc: str):
199
+ """
200
+ Update dataset for a specific session
201
+
202
+ Args:
203
+ session_id: The session identifier
204
+ df: Pandas DataFrame containing the dataset
205
+ name: Name of the dataset
206
+ desc: Description of the dataset
207
+ """
208
+ try:
209
+ self._make_data = make_data(df, desc)
210
+ retrievers = self.initialize_retrievers(self.styling_instructions, [str(self._make_data)])
211
+
212
+ # Check if session has a user_id to create user-specific AI system
213
+ current_user_id = None
214
+ if session_id in self._sessions and "user_id" in self._sessions[session_id]:
215
+ current_user_id = self._sessions[session_id]["user_id"]
216
+
217
+ ai_system = self.create_ai_system_for_user(retrievers, current_user_id)
218
+
219
+ # Get default model config for new sessions
220
+ default_model_config = {
221
+ "provider": os.getenv("MODEL_PROVIDER", "openai"),
222
+ "model": os.getenv("MODEL_NAME", "gpt-4o-mini"),
223
+ "api_key": os.getenv("OPENAI_API_KEY"),
224
+ "temperature": float(os.getenv("TEMPERATURE", 1.0)),
225
+ "max_tokens": int(os.getenv("MAX_TOKENS", 6000))
226
+ }
227
+
228
+ # Create a completely fresh session state for the new dataset
229
+ # This ensures no remnants of the previous dataset remain
230
+ session_state = {
231
+ "current_df": df,
232
+ "retrievers": retrievers,
233
+ "ai_system": ai_system,
234
+ "make_data": self._make_data,
235
+ "description": desc,
236
+ "name": name,
237
+ "model_config": default_model_config, # Initialize with default
238
+ }
239
+
240
+ # Preserve user_id, chat_id, and model_config if they exist in the current session
241
+ if session_id in self._sessions:
242
+ if "user_id" in self._sessions[session_id]:
243
+ session_state["user_id"] = self._sessions[session_id]["user_id"]
244
+ if "chat_id" in self._sessions[session_id]:
245
+ session_state["chat_id"] = self._sessions[session_id]["chat_id"]
246
+ if "model_config" in self._sessions[session_id]:
247
+ # Preserve the user's model configuration
248
+ session_state["model_config"] = self._sessions[session_id]["model_config"]
249
+
250
+ # Replace the entire session with the new state
251
+ self._sessions[session_id] = session_state
252
+
253
+ logger.log_message(f"Updated session {session_id} with completely fresh dataset state: {name}", level=logging.INFO)
254
+ except Exception as e:
255
+ logger.log_message(f"Error updating dataset for session {session_id}: {str(e)}", level=logging.ERROR)
256
+ raise e
257
+
258
+ def reset_session_to_default(self, session_id: str):
259
+ """
260
+ Reset a session to use the default dataset
261
+
262
+ Args:
263
+ session_id: The session identifier
264
+ """
265
+ try:
266
+ # Get default model config from environment
267
+ default_model_config = {
268
+ "provider": os.getenv("MODEL_PROVIDER", "openai"),
269
+ "model": os.getenv("MODEL_NAME", "gpt-4o-mini"),
270
+ "api_key": os.getenv("OPENAI_API_KEY"),
271
+ "temperature": float(os.getenv("TEMPERATURE", 1.0)),
272
+ "max_tokens": int(os.getenv("MAX_TOKENS", 6000))
273
+ }
274
+
275
+ # Clear any custom data associated with the session first
276
+ if session_id in self._sessions:
277
+ del self._sessions[session_id]
278
+ logger.log_message(f"Cleared existing state for session {session_id} before reset.", level=logging.INFO)
279
+
280
+ # Initialize with default state
281
+ self._sessions[session_id] = {
282
+ "current_df": self._default_df.copy(), # Use a copy
283
+ "retrievers": self._default_retrievers,
284
+ "ai_system": self._default_ai_system,
285
+ "description": self._dataset_description,
286
+ "name": self._default_name, # Explicitly set the default name
287
+ "make_data": None, # Clear any custom make_data
288
+ "model_config": default_model_config # Initialize with default model config
289
+ }
290
+ logger.log_message(f"Reset session {session_id} to default dataset: {self._default_name}", level=logging.INFO)
291
+ except Exception as e:
292
+ logger.log_message(f"Error resetting session {session_id}: {str(e)}", level=logging.ERROR)
293
+ raise e
294
+
295
+     def create_ai_system_for_user(self, retrievers, user_id=None):
+         """
+         Create an AI system with user-specific agents (including custom agents)
+
+         Args:
+             retrievers: The retrievers for the AI system
+             user_id: Optional user ID to load custom agents for
+
+         Returns:
+             An auto_analyst instance with all available agents (standard + custom)
+         """
+         try:
+             if user_id:
+                 # Import here to avoid circular imports
+                 from src.db.init_db import session_factory
+
+                 # Create a database session
+                 db_session = session_factory()
+                 try:
+                     # Create AI system with user context to load custom agents
+                     ai_system = auto_analyst(
+                         agents=[],
+                         retrievers=retrievers,
+                         user_id=user_id,
+                         db_session=db_session
+                     )
+                     logger.log_message(f"Created AI system for user {user_id}", level=logging.INFO)
+                     return ai_system
+                 finally:
+                     db_session.close()
+             else:
+                 # Create standard AI system without custom agents
+                 return auto_analyst(agents=[], retrievers=retrievers)
+
+         except Exception as e:
+             logger.log_message(f"Error creating AI system for user {user_id}: {str(e)}", level=logging.ERROR)
+             # Fall back to the standard AI system rather than failing the caller
+             return auto_analyst(agents=[], retrievers=retrievers)
+
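+     # Design note: the broad except/fallback above trades completeness for
+     # availability. If custom-agent loading fails (database unreachable, bad
+     # agent definition), the caller still receives a working system with the
+     # standard agents. A call sketch, assuming session-scoped retrievers:
+     #
+     #   ai = manager.create_ai_system_for_user(state["retrievers"], user_id=42)
+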
+     def set_session_user(self, session_id: str, user_id: int, chat_id: int = None):
+         """
+         Associate a user with a session
+
+         Args:
+             session_id: The session identifier
+             user_id: The authenticated user ID
+             chat_id: Optional chat ID for tracking the conversation
+
+         Returns:
+             Updated session state dictionary
+         """
+         # Ensure we have a session state for this session ID
+         if session_id not in self._sessions:
+             self.get_session_state(session_id)  # Initialize with defaults
+
+         # Use the provided chat ID, reuse an existing one, or generate a new one
+         self._sessions[session_id]["user_id"] = user_id
+
+         if chat_id:
+             chat_id_to_use = chat_id
+         else:
+             if "chat_id" not in self._sessions[session_id] or not self._sessions[session_id]["chat_id"]:
+                 # Combine the current millisecond timestamp with a random offset
+                 # to produce a short, readable ID
+                 import random
+                 chat_id_to_use = int(time.time() * 1000) % 1000000 + random.randint(1, 999)
+             else:
+                 chat_id_to_use = self._sessions[session_id]["chat_id"]
+
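+         # Worked example of the ID arithmetic above: with time.time() == 1700000000.5,
+         # int(time.time() * 1000) == 1700000000500, and 1700000000500 % 1000000 == 500,
+         # i.e. the last six digits of the millisecond timestamp. Adding randint(1, 999)
+         # yields a chat ID between 501 and 1499 for that instant, so IDs stay short
+         # but are not guaranteed unique.
+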
+         # Store chat ID
+         self._sessions[session_id]["chat_id"] = chat_id_to_use
+
+         # Recreate AI system with user context to load custom agents
+         try:
+             session_retrievers = self._sessions[session_id]["retrievers"]
+             user_ai_system = self.create_ai_system_for_user(session_retrievers, user_id)
+             self._sessions[session_id]["ai_system"] = user_ai_system
+             logger.log_message(f"Updated AI system for session {session_id} with user {user_id}", level=logging.INFO)
+         except Exception as e:
+             logger.log_message(f"Error updating AI system for user {user_id}: {str(e)}", level=logging.ERROR)
+             # Continue with the existing AI system if the update fails
+
+         # Record the association in the logs
+         logger.log_message(f"Associated session {session_id} with user_id={user_id}, chat_id={chat_id_to_use}", level=logging.INFO)
+
+         # Return the updated session data
+         return self._sessions[session_id]
+
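+ # Usage sketch (illustrative): associating a user upgrades the session's AI
+ # system in place, so custom agents become available mid-session.
+ #
+ #   state = manager.set_session_user("sess-1", user_id=42)            # chat_id generated
+ #   state = manager.set_session_user("sess-1", user_id=42, chat_id=7) # explicit chat_id
+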
+ async def get_session_id(request, session_manager):
+     """
+     Get the session ID from the request, associating a user with it when one can be identified
+
+     Args:
+         request: FastAPI Request object
+         session_manager: SessionManager instance
+
+     Returns:
+         Session ID string
+     """
+     # First try to get the session ID from query params
+     session_id = request.query_params.get("session_id")
+
+     # If not in query params, try the headers
+     if not session_id:
+         session_id = request.headers.get("X-Session-ID")
+
+     # If still not found, generate a new one
+     if not session_id:
+         session_id = str(uuid.uuid4())
+
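+     # Clients can pin a session explicitly; both forms below are honored, with
+     # the query parameter checked first (request shapes are illustrative; real
+     # endpoint paths depend on the route definitions in app.py):
+     #
+     #   GET /chat?session_id=sess-1
+     #   GET /chat  with header  X-Session-ID: sess-1
+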
+     # Get or create the session state
+     session_state = session_manager.get_session_state(session_id)
+
+     # First, check if we already have a user associated with this session
+     if session_state.get("user_id") is not None:
+         return session_id
+
+     # Next, try to get an authenticated user using the API key
+     current_user = await get_current_user(request)
+     if current_user:
+         # Use the authenticated user instead of creating a guest
+         session_manager.set_session_user(
+             session_id=session_id,
+             user_id=current_user.user_id
+         )
+         logger.log_message(f"Associated session {session_id} with authenticated user_id {current_user.user_id}", level=logging.INFO)
+         return session_id
+
+     # Check if a user_id was provided in the request params
+     user_id_param = request.query_params.get("user_id")
+     if user_id_param:
+         try:
+             user_id = int(user_id_param)
+             session_manager.set_session_user(session_id=session_id, user_id=user_id)
+             logger.log_message(f"Associated session {session_id} with provided user_id {user_id}", level=logging.INFO)
+             return session_id
+         except (ValueError, TypeError):
+             logger.log_message(f"Invalid user_id in query params: {user_id_param}", level=logging.WARNING)
+
+     # No user was found or created; return the session ID without a user association
+     logger.log_message(f"No authenticated user found for session {session_id}, continuing without user association", level=logging.INFO)
+     return session_id
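+
+ # Usage sketch (illustrative; `app` and `session_manager` are assumed to be
+ # created in the importing module, e.g. app.py):
+ #
+ #   from fastapi import Request
+ #
+ #   @app.get("/session-info")
+ #   async def session_info(request: Request):
+ #       session_id = await get_session_id(request, session_manager)
+ #       return {"session_id": session_id}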