# DataAnalysisApp / utils.py
# (Hugging Face Spaces page header: uploaded by LeannJoy — "Update utils.py", commit c01a6a6, verified)
import pandas as pd
import csv
import io
from langchain_community.llms import HuggingFaceEndpoint
# FIX: Changed import path from langchain_community to langchain_experimental
from langchain_experimental.agents import create_pandas_dataframe_agent
from dotenv import load_dotenv
import os
# Load environment variables (e.g. HUGGINGFACEHUB_API_TOKEN) from a local .env file, if one exists.
load_dotenv()
# --- Hugging Face Model Configuration ---
# Repository ID of the hosted model queried through HuggingFaceEndpoint below.
HF_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
def detect_delimiter(file_content: bytes) -> str:
    """Guess the delimiter used by raw CSV bytes.

    Sniffs at most the first five lines with ``csv.Sniffer``. Falls back to
    a comma when the content is empty, is not valid UTF-8, or the sniffer
    cannot settle on a dialect.
    """
    try:
        # Only a small sample is needed for the sniffer to work.
        head_lines = file_content.decode('utf-8').splitlines()[:5]
        if not head_lines:
            # Nothing to inspect — assume the conventional comma.
            return ','
        sniffed = csv.Sniffer().sniff('\n'.join(head_lines))
    except Exception:
        # Undecodable or ambiguous content — default to comma.
        return ','
    return sniffed.delimiter
def query_agent(uploaded_file_content: bytes, query: str, hf_api_token: str) -> str:
    """Answer a natural-language question about a CSV via a LangChain pandas agent.

    Args:
        uploaded_file_content: Raw bytes of the uploaded CSV file.
        query: The user's natural-language question about the data.
        hf_api_token: API token for the Hugging Face Hub.

    Returns:
        The agent's answer, or an error-message string if anything fails.
    """
    # Guard clause: without a token there is no way to reach the endpoint.
    if not hf_api_token:
        return "Error: HUGGINGFACEHUB_API_TOKEN is not configured."
    try:
        # Load the CSV into a DataFrame, honoring whatever delimiter it uses.
        sep = detect_delimiter(uploaded_file_content)
        frame = pd.read_csv(
            io.StringIO(uploaded_file_content.decode('utf-8')),
            sep=sep,
        )

        # Deterministic (temperature 0), bounded generation from the hosted model.
        model = HuggingFaceEndpoint(
            repo_id=HF_REPO_ID,
            huggingfacehub_api_token=hf_api_token,
            temperature=0.0,
            max_new_tokens=512,
        )

        # SECURITY: this agent executes model-generated Python, so LangChain
        # requires the explicit opt-in allow_dangerous_code=True; without it
        # construction raises a ValueError at runtime.
        analysis_agent = create_pandas_dataframe_agent(
            model,
            frame,
            verbose=True,
            allow_dangerous_code=True,
            # System prompt steering the agent toward grounded, code-backed answers.
            agent_kwargs={
                "system_message": (
                    "You are an expert data analysis assistant. You are interacting with a pandas DataFrame "
                    "named 'df'. Use Python code only to answer questions about the data. "
                    "Do not make up facts. Always show the code you executed before giving the final answer."
                )
            },
        )

        return analysis_agent.run(query)
    except Exception as e:
        # Boundary handler: surface decode/parse/network/agent failures as a message.
        return f"An error occurred during analysis: {e}"