Commit · 66089c2
Parent(s): ca4e625

added more robust uploading

Files changed:
- requirements.txt +1 -0
- src/routes/session_routes.py +265 -126
- src/utils/test.ipynb +1 -36
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
 aiofiles==24.1.0
 beautifulsoup4==4.13.4
+chardet==5.2.0
 dspy==3.0.3
 litellm==1.75.2
 email_validator==2.2.0
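The new chardet pin backs the encoding auto-detection added in src/routes/session_routes.py below. As a minimal illustration of what the dependency provides (the sample bytes and the exact confidence value are invented):

import chardet

raw = "nombre;año;ciudad\nAna;1999;Málaga\n".encode("latin-1")
guess = chardet.detect(raw)
# e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
if guess.get("encoding") and guess.get("confidence", 0) > 0.7:
    text = raw.decode(guess["encoding"])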
src/routes/session_routes.py
CHANGED
@@ -24,10 +24,162 @@ import dspy
 import re
 # from fastapi.responses import JSONResponse
 import time
+import chardet

 logger = Logger("session_routes", see_time=False, console_log=False)


+def read_csv_robust(content: bytes, columns: Optional[List[str]] = None) -> tuple:
+    """
+    Robust CSV reader with multiple fallback strategies.
+    Returns: (DataFrame, success_message)
+    """
+    new_df = None
+    encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']
+    delimiters_to_try = [',', ';', '\t', '|']
+
+    # Try auto-detect encoding first using chardet
+    try:
+        detected = chardet.detect(content[:100000])  # Sample first 100KB
+        if detected and detected.get('encoding') and detected.get('confidence', 0) > 0.7:
+            detected_encoding = detected['encoding']
+            if detected_encoding not in encodings_to_try:
+                encodings_to_try.insert(0, detected_encoding)
+            logger.log_message(f"Detected encoding: {detected_encoding} (confidence: {detected['confidence']:.2f})", level=logging.INFO)
+    except Exception as e:
+        logger.log_message(f"Encoding detection failed: {str(e)}", level=logging.WARNING)
+
+    # Try different encoding and delimiter combinations
+    for encoding in encodings_to_try:
+        try:
+            csv_content = content.decode(encoding)
+            sample = csv_content[:4096]
+
+            # Strategy 1: Try csv.Sniffer for delimiter detection
+            try:
+                import csv as _csv
+                dialect = _csv.Sniffer().sniff(sample, delimiters=''.join(delimiters_to_try))
+                delimiter = dialect.delimiter
+
+                new_df = pd.read_csv(
+                    io.StringIO(csv_content),
+                    sep=delimiter,
+                    engine='python',
+                    on_bad_lines='skip',
+                    encoding_errors='replace',
+                    low_memory=False
+                )
+
+                # Validate: Check if we got meaningful data (more than 1 column)
+                if new_df.shape[1] > 1:
+                    if columns:
+                        new_df = new_df[columns]
+                    logger.log_message(f"✓ CSV read with encoding={encoding}, delimiter='{delimiter}' ({new_df.shape[0]} rows, {new_df.shape[1]} cols)", level=logging.INFO)
+                    return new_df, f"Successfully parsed with {encoding} encoding and '{delimiter}' delimiter"
+            except Exception:
+                pass
+
+            # Strategy 2: Pandas automatic delimiter detection
+            try:
+                new_df = pd.read_csv(
+                    io.StringIO(csv_content),
+                    sep=None,
+                    engine='python',
+                    on_bad_lines='skip',
+                    encoding_errors='replace',
+                    low_memory=False
+                )
+
+                if new_df.shape[1] > 1:
+                    if columns:
+                        new_df = new_df[columns]
+                    logger.log_message(f"✓ CSV read with encoding={encoding}, auto-detected delimiter ({new_df.shape[0]} rows, {new_df.shape[1]} cols)", level=logging.INFO)
+                    return new_df, f"Successfully parsed with {encoding} encoding and auto-detected delimiter"
+            except Exception:
+                pass
+
+            # Strategy 3: Brute-force common delimiters
+            for delimiter in delimiters_to_try:
+                try:
+                    new_df = pd.read_csv(
+                        io.StringIO(csv_content),
+                        sep=delimiter,
+                        engine='python',
+                        on_bad_lines='skip',
+                        encoding_errors='replace',
+                        low_memory=False
+                    )
+
+                    if new_df.shape[1] > 1:
+                        if columns:
+                            new_df = new_df[columns]
+                        logger.log_message(f"✓ CSV read with encoding={encoding}, delimiter='{delimiter}' ({new_df.shape[0]} rows, {new_df.shape[1]} cols)", level=logging.INFO)
+                        return new_df, f"Successfully parsed with {encoding} encoding and '{delimiter}' delimiter"
+                except Exception:
+                    continue
+
+        except Exception as e:
+            logger.log_message(f"Failed encoding {encoding}: {str(e)}", level=logging.WARNING)
+            continue
+
+    raise ValueError(f"Could not parse CSV with any encoding/delimiter combination. Tried encodings: {encodings_to_try}")
+
+
+def read_excel_robust(contents: bytes, sheet_name=None) -> pd.DataFrame:
+    """
+    Robust Excel reader with multiple engine fallbacks.
+    """
+    engines = ['openpyxl', 'xlrd', None]  # None will use default
+
+    for engine in engines:
+        try:
+            if engine:
+                df = pd.read_excel(
+                    io.BytesIO(contents),
+                    sheet_name=sheet_name if sheet_name else 0,
+                    engine=engine
+                )
+            else:
+                df = pd.read_excel(
+                    io.BytesIO(contents),
+                    sheet_name=sheet_name if sheet_name else 0
+                )
+
+            logger.log_message(f"✓ Excel read with engine={engine or 'default'}", level=logging.INFO)
+            return df
+
+        except Exception as e:
+            logger.log_message(f"Failed Excel engine {engine}: {str(e)}", level=logging.WARNING)
+            continue
+
+    # Last resort: Try reading as CSV (sometimes .xlsx are actually CSV)
+    logger.log_message("All Excel engines failed, attempting to read as CSV...", level=logging.WARNING)
+    try:
+        df, msg = read_csv_robust(contents)
+        logger.log_message(f"✓ File read as CSV (may have been misnamed): {msg}", level=logging.INFO)
+        return df
+    except Exception as e:
+        raise ValueError(f"Could not read file as Excel or CSV. Last error: {str(e)}")
+
+
+def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """Clean dataframe after reading"""
+    # Strip whitespace from column names
+    df.columns = df.columns.str.strip()
+
+    # Remove completely empty rows and columns
+    df.dropna(how='all', inplace=True)
+    df.dropna(axis=1, how='all', inplace=True)
+
+    # Replace problematic values
+    df.replace({np.nan: None, np.inf: None, -np.inf: None}, inplace=True)
+
+    # Reset index
+    df.reset_index(drop=True, inplace=True)
+
+    return df
+
+
 def apply_model_safeguards(model_name: str, provider: str, temperature: float, max_tokens: int) -> dict:
     """Apply model-specific safeguards for temperature and max_tokens based on official API limits"""
     model_str = str(model_name).lower()
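A minimal standalone sketch of how the three new helpers compose, assuming they can be imported from src/routes/session_routes.py and that the module-level imports (pandas, numpy, io, the logger) are available; the sample bytes are invented for illustration:

from src.routes.session_routes import read_csv_robust, read_excel_robust, clean_dataframe

raw_csv = "col A ;col B\n1;2\n3;4\n".encode("utf-8")

df, msg = read_csv_robust(raw_csv)   # chardet guess -> csv.Sniffer -> sep=None -> brute-force delimiters
df = clean_dataframe(df)             # strip column names, drop empty rows/cols, replace NaN/inf with None
print(msg, df.shape)                 # e.g. "Successfully parsed with utf-8 encoding and ';' delimiter" (2, 2)

# Excel bytes go through engine fallbacks (openpyxl, xlrd, default) and finally a CSV retry:
# sheet_df = read_excel_robust(xlsx_bytes, sheet_name="Sheet1")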
@@ -112,23 +264,34 @@ async def get_excel_sheets(
 ):
     """Get the list of sheet names from an Excel file"""
     try:
-        # Read the uploaded Excel file
         contents = await file.read()

-        #
-
-
+        # Try multiple engines to read Excel
+        engines = ['openpyxl', 'xlrd', None]
+
+        for engine in engines:
+            try:
+                if engine:
+                    excel_file = pd.ExcelFile(io.BytesIO(contents), engine=engine)
+                else:
+                    excel_file = pd.ExcelFile(io.BytesIO(contents))
+
+                sheet_names = excel_file.sheet_names
+                logger.log_message(f"✓ Found {len(sheet_names)} sheets using engine={engine or 'default'}", level=logging.INFO)
+
+                return {"sheets": sheet_names}
+            except Exception as e:
+                logger.log_message(f"Failed to read with engine {engine}: {str(e)}", level=logging.WARNING)
+                continue
+
+        raise ValueError("Could not read Excel file with any available engine")

-
-        # logger.log_message(f"Found {len(sheet_names)} sheets in Excel file: {', '.join(sheet_names)}", level=logging.INFO)

-        # Return the sheet names
-        return {"sheets": sheet_names}
     except Exception as e:
         logger.log_message(f"Error getting Excel sheets: {str(e)}", level=logging.ERROR)
-        raise HTTPException(
+        raise HTTPException(
+            status_code=400,
+            detail=f"Could not read Excel file. Please ensure it's a valid .xlsx or .xls file. Error: {str(e)}"
+        )
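For context on the engine list used above: openpyxl only opens modern .xlsx/.xlsm workbooks, while legacy .xls files need xlrd, which is why the handler cycles through engines rather than relying on pandas' default choice. A compressed sketch of the same fallback pattern (the helper name and sample usage are illustrative only):

import io
import pandas as pd

def list_sheets(contents: bytes) -> list:
    """Try each Excel engine in turn and return sheet names from the first that works."""
    for engine in ("openpyxl", "xlrd", None):  # None lets pandas pick an engine
        try:
            xl = pd.ExcelFile(io.BytesIO(contents), engine=engine) if engine else pd.ExcelFile(io.BytesIO(contents))
            return xl.sheet_names
        except Exception:
            continue
    raise ValueError("Could not read Excel file with any available engine")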
@@ -164,8 +327,24 @@ async def upload_excel(
         contents = await file.read()

         try:
-            #
-            excel_file =
+            # Use robust Excel reader to get sheet names
+            excel_file = None
+            engines = ['openpyxl', 'xlrd', None]
+
+            for engine in engines:
+                try:
+                    if engine:
+                        excel_file = pd.ExcelFile(io.BytesIO(contents), engine=engine)
+                    else:
+                        excel_file = pd.ExcelFile(io.BytesIO(contents))
+                    logger.log_message(f"✓ Excel file loaded with engine={engine or 'default'}", level=logging.INFO)
+                    break
+                except Exception:
+                    continue
+
+            if not excel_file:
+                raise ValueError("Could not load Excel file with any available engine")
+
             sheet_names = excel_file.sheet_names

             # Parse selected sheets if provided; else use all sheets
@@ -178,47 +357,31 @@ async def upload_excel(
             except Exception:
                 pass

-            # Get session state and DuckDB connection
-            session_state = app_state.get_session_state(session_id)
-
             datasets = {}
-
-
-
-            # Process all sheets and register them in DuckDB
             processed_sheets = []

             for sheet_name in target_sheets:
                 try:
-                    #
-                    sheet_df =
-                    sheet_df.replace({np.nan: None, np.inf: None, -np.inf: None}, inplace=True)
-
+                    # Use robust Excel reader for each sheet
+                    sheet_df = read_excel_robust(contents, sheet_name=sheet_name)

-                    #
-
-                    sheet_df.dropna(how='all', inplace=True)
-                    sheet_df.dropna(how='all', axis=1, inplace=True)
+                    # Clean the dataframe
+                    sheet_df = clean_dataframe(sheet_df)

-                    #
-                    sheet_df.
-
-                    # 3. Skip empty sheets
-                    if sheet_df.empty:
+                    # Skip empty sheets
+                    if sheet_df.empty or sheet_df.shape[1] == 0:
+                        logger.log_message(f"Skipping empty sheet: {sheet_name}", level=logging.WARNING)
                         continue

-                    # Register
+                    # Register sheet with clean name
                     clean_sheet_name = clean_dataset_name(sheet_name)
-
-
-                    # First drop the table if it exists
-
-
-                    # Then register the new table
-                    datasets[clean_sheet_name] = sheet_df  # Store the DataFrame, not the name
-
-
+                    datasets[clean_sheet_name] = sheet_df
                     processed_sheets.append(clean_sheet_name)
+
+                    logger.log_message(
+                        f"✓ Processed sheet '{sheet_name}' → '{clean_sheet_name}': {sheet_df.shape[0]} rows × {sheet_df.shape[1]} cols",
+                        level=logging.INFO
+                    )

                 except Exception as e:
                     logger.log_message(f"Error processing sheet '{sheet_name}': {str(e)}", level=logging.WARNING)
@@ -229,22 +392,23 @@ async def upload_excel(

             # Update the session description (no primary dataset needed)
             desc = description
-            app_state.update_session_dataset(session_id,datasets,processed_sheets,desc)
-
-
+            app_state.update_session_dataset(session_id, datasets, processed_sheets, desc)

-            logger.log_message(f"
+            logger.log_message(f"✓ Excel upload complete: {len(processed_sheets)} sheets processed", level=logging.INFO)

             return {
-                "message": "Excel file processed successfully",
-                "session_id": session_id,
+                "message": "Excel file processed successfully",
+                "session_id": session_id,
                 "sheets_processed": processed_sheets,
                 "total_sheets": len(processed_sheets)
             }

         except Exception as e:
             logger.log_message(f"Error processing Excel file: {str(e)}", level=logging.ERROR)
-            raise HTTPException(
+            raise HTTPException(
+                status_code=400,
+                detail=f"Could not process Excel file. Please ensure it's a valid .xlsx or .xls file. Error: {str(e)}"
+            )

     except Exception as e:
         logger.log_message(f"Error in upload_excel: {str(e)}", level=logging.ERROR)
@@ -339,47 +503,32 @@ async def upload_dataframe(
         # Ensure it's a safe Python identifier


-        # Read and process the CSV file
+        # Read and process the CSV file using robust reader
         content = await file.read()
-        new_df = None
-        last_exception = None
-
-        # Try encodings with delimiter auto-detection
-        encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-        delimiters_to_try = [',', ';', '\t', '|', ':', ' ']
-
-        for encoding in encodings_to_try:
-            try:
-                csv_content = content.decode(encoding)
-                sample = csv_content[:1024]
-                try:
-                    import csv as _csv
-                    dialect = _csv.Sniffer().sniff(sample, delimiters=delimiters_to_try)
-                    delimiter = dialect.delimiter
-                    new_df = pd.read_csv(io.StringIO(csv_content), sep=delimiter, engine='python')[columns]
-                except Exception:
-                    # Fallback to pandas automatic detection
-                    try:
-                        new_df = pd.read_csv(io.StringIO(csv_content), sep=None, engine='python')[columns]
-                    except Exception:
-                        # Final fallback: brute-force common delimiters
-                        for d in delimiters_to_try:
-                            try:
-                                new_df = pd.read_csv(io.StringIO(csv_content), sep=d, engine='python')[columns]
-                                break
-                            except Exception:
-                                new_df = None
-                if new_df is not None:
-                    new_df.replace({np.nan: None, np.inf: None, -np.inf: None}, inplace=True)
-                    logger.log_message(f"Successfully read CSV with encoding: {encoding}", level=logging.INFO)
-                    break
-            except Exception as e:
-                last_exception = e
-                logger.log_message(f"Failed to read CSV with encoding {encoding}: {str(e)}", level=logging.WARNING)
-                continue

-
-
+        try:
+            # Use the robust CSV reader
+            new_df, success_msg = read_csv_robust(content, columns=columns)
+
+            # Clean the dataframe
+            new_df = clean_dataframe(new_df)
+
+            # Validate we have data
+            if new_df.empty:
+                raise ValueError("CSV file contains no valid data after cleaning")
+
+            if new_df.shape[1] == 0:
+                raise ValueError("CSV file has no valid columns")
+
+            logger.log_message(f"CSV parsed successfully: {success_msg}", level=logging.INFO)
+            logger.log_message(f"Final dataset shape: {new_df.shape[0]} rows × {new_df.shape[1]} columns", level=logging.INFO)
+
+        except Exception as e:
+            logger.log_message(f"Failed to read CSV: {str(e)}", level=logging.ERROR)
+            raise HTTPException(
+                status_code=400,
+                detail=f"Could not read CSV file: {str(e)}. Please ensure the file is properly formatted."
+            )

         # Format the description
         desc = f" exact_python_name: `{name}` Dataset: {description}"
@@ -397,7 +546,12 @@ async def upload_dataframe(

         logger.log_message(f"Successfully uploaded dataset '{name}' for session {session_id}", level=logging.INFO)

-        return JSONResponse(content=sanitize_json({
+        return JSONResponse(content=sanitize_json({
+            "message": "Dataframe uploaded successfully",
+            "session_id": session_id,
+            "rows": int(new_df.shape[0]),
+            "columns": int(new_df.shape[1])
+        }))

     except Exception as e:
         logger.log_message(f"Error in upload_dataframe: {str(e)}", level=logging.ERROR)
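For orientation, a hypothetical client-side call to this upload endpoint. The URL, route path, and request field names are assumptions (the route decorator and parameter signature sit outside this hunk); only the response keys match the JSONResponse above:

import requests

# Hypothetical host/path and form fields -- check the actual router before relying on this.
with open("sales.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/upload_dataframe",
        files={"file": ("sales.csv", f, "text/csv")},
        data={"name": "sales", "description": "Monthly sales extract", "session_id": "demo-session"},
    )
print(resp.status_code)
print(resp.json())  # e.g. {"message": "Dataframe uploaded successfully", "session_id": "...", "rows": 120, "columns": 8}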
@@ -923,45 +1077,30 @@ async def preview_csv_upload(
 ):
     """Preview CSV file without modifying session"""
     try:
-        # Process file and return preview data only
         content = await file.read()
-        # Try encodings with delimiter auto-detection
-        encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-        delimiters_to_try = [',', ';', '\t', '|', ':', ' ']
-        new_df = None
-        last_exception = None
-
-        for encoding in encodings_to_try:
-            try:
-                csv_content = content.decode(encoding)
-                sample = csv_content[:4096]
-                try:
-                    import csv as _csv
-                    dialect = _csv.Sniffer().sniff(sample, delimiters=delimiters_to_try)
-                    delimiter = dialect.delimiter
-                    new_df = pd.read_csv(io.StringIO(csv_content), sep=delimiter, engine='python')
-                except Exception:
-                    # Fallback to pandas automatic detection
-                    try:
-                        new_df = pd.read_csv(io.StringIO(csv_content), sep=None, engine='python')
-                    except Exception:
-                        # Final fallback: brute-force common delimiters
-                        for d in delimiters_to_try:
-                            try:
-                                new_df = pd.read_csv(io.StringIO(csv_content), sep=d, engine='python')
-                                break
-                            except Exception:
-                                new_df = None
-                if new_df is not None:
-                    logger.log_message(f"Successfully read CSV with encoding: {encoding}", level=logging.INFO)
-                    break
-            except Exception as e:
-                last_exception = e
-                logger.log_message(f"Failed to read CSV with encoding {encoding}: {str(e)}", level=logging.WARNING)
-                continue

-
-
+        # Use robust CSV reader
+        try:
+            new_df, success_msg = read_csv_robust(content)
+
+            # Clean the dataframe
+            new_df = clean_dataframe(new_df)
+
+            # Validate
+            if new_df.empty:
+                raise ValueError("CSV file contains no valid data")
+
+            if new_df.shape[1] == 0:
+                raise ValueError("CSV file has no valid columns")
+
+            logger.log_message(f"CSV preview: {success_msg}", level=logging.INFO)
+
+        except Exception as e:
+            logger.log_message(f"Failed to read CSV for preview: {str(e)}", level=logging.ERROR)
+            raise HTTPException(
+                status_code=400,
+                detail=f"Could not read CSV file: {str(e)}. Please ensure the file is properly formatted."
+            )

         # Clean and validate the name
         name = file.filename.replace('.csv', '').replace(' ', '_').lower().strip()
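A quick way to exercise the fallback behaviour end to end is to feed the new readers deliberately awkward input. A pytest-style sketch, assuming read_csv_robust and read_excel_robust can be imported from the module (file contents invented):

from src.routes.session_routes import read_csv_robust, read_excel_robust

def test_semicolon_latin1_csv():
    raw = "name;city;amount\nJosé;München;10\nMarie;Paris;12\n".encode("latin-1")
    df, msg = read_csv_robust(raw)       # decoded via chardet/latin-1 fallback, Sniffer picks ';'
    assert list(df.columns) == ["name", "city", "amount"]
    assert df.shape == (2, 3)

def test_misnamed_xlsx_falls_back_to_csv():
    fake_xlsx = b"a,b\n1,2\n3,4\n"        # CSV bytes masquerading as a workbook
    df = read_excel_robust(fake_xlsx)     # all Excel engines fail, the CSV retry succeeds
    assert df.shape == (2, 2)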
src/utils/test.ipynb
CHANGED
@@ -17,42 +17,7 @@
    "id": "3a1670ba",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Anthropic Model API References (Python, Bash, JS) - No fetch, just listing\n",
-    "\n",
-    "# --- Anthropic Model API Reference ---\n",
-    "\n",
-    "# 1. Python (using anthropic SDK)\n",
-    "\"\"\"\n",
-    "import anthropic\n",
-    "\n",
-    "client = anthropic.Anthropic(\n",
-    "    api_key=\"YOUR_ANTHROPIC_API_KEY\"\n",
-    ")\n",
-    "\n",
-    "# List available models (returns a list of ModelInfo objects)\n",
-    "models = client.models.list(limit=20)\n",
-    "print(models)\n",
-    "\"\"\"\n",
-    "\n",
-    "# 2. Bash (using curl)\n",
-    "\"\"\"\n",
-    "curl https://api.anthropic.com/v1/models \\\n",
-    "    --header \"x-api-key: $ANTHROPIC_API_KEY\" \\\n",
-    "    --header \"anthropic-version: 2023-06-01\"\n",
-    "\"\"\"\n",
-    "\n",
-    "# 3. JavaScript (using @anthropic-ai/sdk)\n",
-    "\"\"\"\n",
-    "import Anthropic from '@anthropic-ai/sdk';\n",
-    "\n",
-    "const anthropic = new Anthropic({\n",
-    "  apiKey: process.env.ANTHROPIC_API_KEY,\n",
-    "});\n",
-    "\n",
-    "\n",
-    "\n"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",