import os
import pandas as pd
from typing import Tuple
from PIL import Image
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
import matplotlib.pyplot as plt
import json
from datetime import datetime
from huggingface_hub import HfApi
import uuid

# Force-reload environment variables
load_dotenv(override=True)

# Get API keys with explicit None handling
Groq_Token = os.getenv("GROQ_API_KEY")
hf_token = os.getenv("HF_TOKEN")
gemini_token = os.getenv("GEMINI_TOKEN")

# Debug prints (remove in production)
# print(f"Debug - Groq Token: {'Present' if Groq_Token else 'Missing'}")
# print(f"Debug - Groq Token Value: {Groq_Token[:10] + '...' if Groq_Token else 'None'}")
# print(f"Debug - Gemini Token: {'Present' if gemini_token else 'Missing'}")
models = {
    "gpt-oss-120b": "openai/gpt-oss-120b",
    "qwen3-32b": "qwen/qwen3-32b",
    "gpt-oss-20b": "openai/gpt-oss-20b",
    "llama4 maverick": "meta-llama/llama-4-maverick-17b-128e-instruct",
    "llama3.3": "llama-3.3-70b-versatile",
    "deepseek-R1": "deepseek-r1-distill-llama-70b",
    "gemini-2.5-flash": "gemini-2.5-flash",
    "gemini-2.5-pro": "gemini-2.5-pro",
    "gemini-2.5-flash-lite": "gemini-2.5-flash-lite",
    "gemini-2.0-flash": "gemini-2.0-flash",
    "gemini-2.0-flash-lite": "gemini-2.0-flash-lite",
    # "llama4 scout": "meta-llama/llama-4-scout-17b-16e-instruct",
    # "llama3.1": "llama-3.1-8b-instant",
}
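
# Convenience sketch (an editor's addition, not part of the original app):
# which model keys are usable with the tokens currently loaded, mirroring the
# routing rule in ask_question (Gemini models need GEMINI_TOKEN, all others
# need GROQ_API_KEY).
def available_models():
    """Return model keys from `models` that have a usable API token loaded."""
    return [
        name for name in models
        if ("gemini" in name and gemini_token) or ("gemini" not in name and Groq_Token)
    ]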
def log_interaction(user_query, model_name, response_content, generated_code, execution_time, error_message=None, is_image=False):
    """Log user interactions to a Hugging Face dataset."""
    try:
        if not hf_token or hf_token.strip() == "":
            print("Warning: HF_TOKEN not available, skipping logging")
            return

        # Create log entry
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "session_id": str(uuid.uuid4()),
            "user_query": user_query,
            "model_name": model_name,
            "response_content": str(response_content),
            "generated_code": generated_code or "",
            "execution_time_seconds": execution_time,
            "error_message": error_message or "",
            "is_image_output": is_image,
            "success": error_message is None
        }

        # Create DataFrame
        df = pd.DataFrame([log_entry])

        # Create unique filename with timestamp
        timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        random_id = str(uuid.uuid4())[:8]
        filename = f"interaction_log_{timestamp_str}_{random_id}.parquet"

        # Save locally first
        local_path = f"/tmp/{filename}"
        df.to_parquet(local_path, index=False)

        # Upload to Hugging Face
        api = HfApi(token=hf_token)
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"data/{filename}",
            repo_id="SustainabilityLabIITGN/VayuChat_logs",
            repo_type="dataset",
        )

        # Clean up local file
        if os.path.exists(local_path):
            os.remove(local_path)

        print(f"Successfully logged interaction to HuggingFace: {filename}")
    except Exception as e:
        print(f"Error logging interaction: {e}")
def preprocess_and_load_df(path: str) -> pd.DataFrame:
    """Load the dataframe and parse its Timestamp column."""
    try:
        df = pd.read_csv(path)
        df["Timestamp"] = pd.to_datetime(df["Timestamp"])
        return df
    except Exception as e:
        raise Exception(f"Error loading dataframe: {e}")

def get_from_user(prompt):
    """Format a user prompt as a chat message."""
    return {"role": "user", "content": prompt}
def ask_question(model_name, question):
    """Ask a question with comprehensive error handling and logging."""
    start_time = datetime.now()

    # ------------------------
    # Helper functions
    # ------------------------
    def make_error_response(msg, log_msg, content=None):
        """Build an error response and log it."""
        execution_time = (datetime.now() - start_time).total_seconds()
        log_interaction(
            user_query=question,
            model_name=model_name,
            response_content=content or msg,
            generated_code="",
            execution_time=execution_time,
            error_message=log_msg,
            is_image=False
        )
        return {
            "role": "assistant",
            "content": content or msg,
            "gen_code": "",
            "ex_code": "",
            "last_prompt": question,
            "error": log_msg
        }

    def validate_api_token(token, token_name, msg_if_missing):
        """Check for a missing or empty API token."""
        if not token or token.strip() == "":
            return make_error_response(
                msg=f"Missing or empty API token ({token_name})",
                log_msg=f"Missing or empty API token ({token_name})",
                content=msg_if_missing
            )
        return None  # OK
    def run_safe_exec(full_code, df=None, extra_globals=None):
        """Execute generated code and capture any error."""
        local_vars = {}
        global_vars = {
            'pd': pd, 'plt': plt, 'os': os,
            'sns': __import__('seaborn'),
            'uuid': __import__('uuid'),
            'calendar': __import__('calendar'),
            'np': __import__('numpy'),
            'df': df  # the DataFrame the generated code operates on
        }
        # Allow the caller to inject more globals (optional)
        if extra_globals:
            global_vars.update(extra_globals)
        try:
            exec(full_code, global_vars, local_vars)
            return (
                local_vars.get('answer', "Code executed but no result was saved in 'answer' variable"),
                None
            )
        except Exception as code_error:
            return None, str(code_error)
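    # NOTE (editor's caution, not in the original): exec() runs model-generated
    # code in-process with no sandboxing, so this is only appropriate for a
    # trusted deployment; consider a subprocess or restricted runtime otherwise.
    # Also, because separate globals/locals dicts are passed to exec(), functions
    # defined inside the generated code cannot see its other top-level names;
    # flat scripts like the template below are unaffected.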
    # ------------------------
    # Step 1: Reload env vars
    # ------------------------
    load_dotenv(override=True)
    fresh_groq_token = os.getenv("GROQ_API_KEY")
    fresh_gemini_token = os.getenv("GEMINI_TOKEN")

    # ------------------------
    # Step 2: Init LLM
    # ------------------------
    try:
        if "gemini" in model_name:
            token_error = validate_api_token(
                fresh_gemini_token,
                "GEMINI_TOKEN",
                "Gemini API token not available or empty. Please set GEMINI_TOKEN in your environment variables."
            )
            if token_error:
                return token_error
            try:
                llm = ChatGoogleGenerativeAI(
                    model=models[model_name],
                    google_api_key=fresh_gemini_token,
                    temperature=0
                )
                # Cheap synchronous call to verify the API key works
                llm.invoke("Test")
            except Exception as api_error:
                err_text = str(api_error).lower()
                if "organization_restricted" in err_text or "unauthorized" in err_text:
                    content = ("API Key Error: Your Gemini API key appears to be invalid, "
                               "expired, or restricted. Please check your GEMINI_TOKEN in the .env file.")
                else:
                    content = f"API Connection Error: {api_error}"
                return make_error_response(
                    msg="API Connection Error",
                    log_msg=str(api_error),
                    content=content
                )
        else:
            token_error = validate_api_token(
                fresh_groq_token,
                "GROQ_API_KEY",
                "Groq API token not available or empty. Please set GROQ_API_KEY in your environment variables and restart the application."
            )
            if token_error:
                return token_error
            try:
                llm = ChatGroq(
                    model=models[model_name],
                    api_key=fresh_groq_token,
                    temperature=0
                )
                # Cheap synchronous call to verify the API key works
                llm.invoke("Test")
            except Exception as api_error:
                err_text = str(api_error).lower()
                if "organization_restricted" in err_text or "unauthorized" in err_text:
                    content = ("API Key Error: Your Groq API key appears to be invalid, "
                               "expired, or restricted. Please check your GROQ_API_KEY in the .env file.")
                else:
                    content = f"API Connection Error: {api_error}"
                return make_error_response(
                    msg="API Connection Error",
                    log_msg=str(api_error),
                    content=content
                )
    except Exception as e:
        return make_error_response(str(e), str(e))
    # ------------------------
    # Step 3: Load data files
    # ------------------------
    if not os.path.exists("AQ_met_data.csv"):
        return make_error_response(
            msg="Data file not found",
            log_msg="Data file not found",
            content="AQ_met_data.csv file not found. Please ensure the data file is in the correct location."
        )

    df = pd.read_csv("AQ_met_data.csv")
    df["Timestamp"] = pd.to_datetime(df["Timestamp"])
    new_line = "\n"  # f-string expressions cannot contain backslashes before Python 3.12
    states_df = pd.read_csv("states_data.csv")
    ncap_df = pd.read_csv("ncap_funding_data.csv")

    # Template for the user query
    template = f"""```python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
import calendar
import numpy as np

# Set professional matplotlib styling
plt.style.use('vayuchat.mplstyle')

df = pd.read_csv("AQ_met_data.csv")
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
states_df = pd.read_csv("states_data.csv")
ncap_df = pd.read_csv("ncap_funding_data.csv")

# df is a pandas DataFrame of daily air quality data from India, covering 2017 to 2024. It has the following columns and data types:
{new_line.join(map(lambda x: '# '+x, str(df.dtypes).split(new_line)))}

# states_df is a pandas DataFrame of state-wise population, area, and whether each state is a union territory.
{new_line.join(map(lambda x: '# '+x, str(states_df.dtypes).split(new_line)))}

# ncap_df is a pandas DataFrame of funding given to Indian cities from 2019-2022 under The National Clean Air Programme (NCAP).
{new_line.join(map(lambda x: '# '+x, str(ncap_df.dtypes).split(new_line)))}

# Question: {question.strip()}

# Generate code to answer the question and save the result in an 'answer' variable
# If creating a plot, save it with a unique filename and store the filename in 'answer'
# If returning text/numbers, store the result directly in 'answer'
```"""

    # Read the system prompt from a text file
    with open("new_system_prompt.txt", "r", encoding="utf-8") as f:
        system_prompt = f.read().strip()

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": f"""Complete the following code to answer the user's question:
{template}"""
        }
    ]
    # ------------------------
    # Step 4: Call model
    # ------------------------
    try:
        response = llm.invoke(messages)
        answer = response.content
    except Exception as e:
        return make_error_response(f"Error: {e}", str(e))

    # ------------------------
    # Step 5: Extract code
    # ------------------------
    code_part = answer.split("```python")[1].split("```")[0] if "```python" in answer else answer
    full_code = f"""
{template.split("```python")[1].split("```")[0]}
{code_part}
"""

    # ------------------------
    # Step 6: Execute code
    # ------------------------
    answer_result, code_error = run_safe_exec(full_code, df, extra_globals={'states_df': states_df, 'ncap_df': ncap_df})
    execution_time = (datetime.now() - start_time).total_seconds()

    if code_error:
        # Friendly error messages
        msg = "I encountered an error while analyzing your data. "
        if "syntax" in code_error.lower():
            msg += "There was a syntax error in the generated code. Please try rephrasing your question."
        elif "not defined" in code_error.lower():
            msg += "A variable naming error occurred. Please try asking the question again."
        elif "division by zero" in code_error.lower():
            msg += "The calculation involved division by zero, possibly due to missing data."
        elif "no data" in code_error.lower() or "empty" in code_error.lower():
            msg += "No relevant data was found for your query."
        else:
            msg += f"Technical error: {code_error}"
        msg += "\n\n💡 **Suggestions:**\n- Try rephrasing your question\n- Use simpler terms\n- Check if the data exists for your specified criteria"

        log_interaction(
            user_query=question,
            model_name=model_name,
            response_content=msg,
            generated_code=full_code,
            execution_time=execution_time,
            error_message=code_error,
            is_image=False
        )
        return {
            "role": "assistant",
            "content": msg,
            "gen_code": full_code,
            "ex_code": full_code,
            "last_prompt": question,
            "error": code_error
        }
    # ------------------------
    # Step 7: Success logging
    # ------------------------
    is_image = isinstance(answer_result, str) and answer_result.endswith(('.png', '.jpg', '.jpeg'))
    log_interaction(
        user_query=question,
        model_name=model_name,
        response_content=str(answer_result),
        generated_code=full_code,
        execution_time=execution_time,
        error_message=None,
        is_image=is_image
    )
    return {
        "role": "assistant",
        "content": answer_result,
        "gen_code": full_code,
        "ex_code": full_code,
        "last_prompt": question,
        "error": None
    }
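
# Minimal manual smoke test (an editor's sketch: assumes the CSV files,
# new_system_prompt.txt, and a valid GROQ_API_KEY or GEMINI_TOKEN are present;
# the question below is a hypothetical example).
if __name__ == "__main__":
    reply = ask_question("gemini-2.5-flash", "Which city had the highest average PM2.5 in 2023?")
    print(reply["content"])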