# DataAnalysisApp / utils.py
# (Hugging Face Spaces page header: uploaded by LeannJoy — "Update utils.py", commit c01a6a6, verified)
import pandas as pd
import csv
import io
from langchain_community.llms import HuggingFaceEndpoint
# FIX: Changed import path from langchain_community to langchain_experimental
from langchain_experimental.agents import create_pandas_dataframe_agent
from dotenv import load_dotenv
import os
# Load environment variables (e.g. HUGGINGFACEHUB_API_TOKEN) from a local .env file, if one exists.
load_dotenv()
# --- Hugging Face Model Configuration ---
# Repository ID of the hosted model queried through HuggingFaceEndpoint below.
HF_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
def detect_delimiter(file_content: bytes) -> str:
    """Guess the delimiter used by raw CSV bytes.

    Sniffs at most the first five lines with ``csv.Sniffer``. Falls back to
    a comma when the content is empty, is not valid UTF-8, or the sniffer
    cannot settle on a dialect.
    """
    try:
        # Only a small sample is needed for the sniffer to work.
        head_lines = file_content.decode('utf-8').splitlines()[:5]
        if not head_lines:
            # Nothing to inspect — assume the conventional comma.
            return ','
        sniffed = csv.Sniffer().sniff('\n'.join(head_lines))
    except Exception:
        # Undecodable or ambiguous content — default to comma.
        return ','
    return sniffed.delimiter
def query_agent(uploaded_file_content: bytes, query: str, hf_api_token: str) -> str:
    """Answer a natural-language question about a CSV via a LangChain pandas agent.

    Args:
        uploaded_file_content: Raw bytes of the uploaded CSV file.
        query: The user's natural-language question about the data.
        hf_api_token: API token for the Hugging Face Hub.

    Returns:
        The agent's answer, or an error-message string if anything fails.
    """
    # Guard clause: without a token there is no way to reach the endpoint.
    if not hf_api_token:
        return "Error: HUGGINGFACEHUB_API_TOKEN is not configured."
    try:
        # Load the CSV into a DataFrame, honoring whatever delimiter it uses.
        sep = detect_delimiter(uploaded_file_content)
        frame = pd.read_csv(
            io.StringIO(uploaded_file_content.decode('utf-8')),
            sep=sep,
        )

        # Deterministic (temperature 0), bounded generation from the hosted model.
        model = HuggingFaceEndpoint(
            repo_id=HF_REPO_ID,
            huggingfacehub_api_token=hf_api_token,
            temperature=0.0,
            max_new_tokens=512,
        )

        # SECURITY: this agent executes model-generated Python, so LangChain
        # requires the explicit opt-in allow_dangerous_code=True; without it
        # construction raises a ValueError at runtime.
        analysis_agent = create_pandas_dataframe_agent(
            model,
            frame,
            verbose=True,
            allow_dangerous_code=True,
            # System prompt steering the agent toward grounded, code-backed answers.
            agent_kwargs={
                "system_message": (
                    "You are an expert data analysis assistant. You are interacting with a pandas DataFrame "
                    "named 'df'. Use Python code only to answer questions about the data. "
                    "Do not make up facts. Always show the code you executed before giving the final answer."
                )
            },
        )

        return analysis_agent.run(query)
    except Exception as e:
        # Boundary handler: surface decode/parse/network/agent failures as a message.
        return f"An error occurred during analysis: {e}"