import gradio as gr
import os
import pandas as pd
import requests
import json
from typing import List, Tuple
import chardet
# -- LLM Client Class --
class OllamaClient:
    def __init__(self, model_name: str = "phi3:latest", base_url: str = "http://localhost:11434"):
        self.model_name = model_name
        self.base_url = base_url

    def list_models(self):
        """List all available models from the Ollama server."""
        try:
            response = requests.get(f"{self.base_url}/api/tags")
            if response.status_code == 200:
                data = response.json()
                return [model['name'] for model in data.get('models', [])]
            return []
        except Exception as e:
            print(f"Error listing models: {e}")
            return []
    def chat_completion(self, messages, max_tokens=4000, stream=True, temperature=0.3, top_p=0.7):
        # Convert messages to Ollama format, keeping only known roles
        ollama_messages = [
            {"role": msg["role"], "content": msg["content"]}
            for msg in messages
            if msg["role"] in ("system", "user", "assistant")
        ]
        # Prepare the request payload
        data = {
            "model": self.model_name,
            "messages": ollama_messages,
            "options": {
                "temperature": temperature,
                "top_p": top_p,
                "num_predict": max_tokens
            },
            "stream": stream
        }
        # Make the request to the Ollama chat endpoint
        response = requests.post(
            f"{self.base_url}/api/chat",
            json=data,
            stream=stream
        )
        if response.status_code != 200:
            raise Exception(f"Ollama API error: {response.text}")
        if stream:
            # Each line of a streaming response is a standalone JSON object
            for line in response.iter_lines():
                if line:
                    try:
                        chunk = json.loads(line.decode('utf-8'))
                        if "message" in chunk and "content" in chunk["message"]:
                            yield {"content": chunk["message"]["content"]}
                    except json.JSONDecodeError:
                        continue
        else:
            result = response.json()
            yield {"content": result["message"]["content"]}
# -- Analyze content structure --
def analyze_file_content(content, file_type):
    """Analyze file content and return a short structural summary."""
    if file_type in ['parquet', 'csv']:
        try:
            lines = content.split('\n')
            header = lines[0]
            # Markdown tables delimit columns with '|'; otherwise assume commas
            columns = header.count('|') - 1 if '|' in header else len(header.split(','))
            # Exclude the header and separator rows of the markdown table
            rows = max(len(lines) - 2, 0)
            return f"📊 Dataset Structure: {columns} columns, {rows} data samples"
        except Exception:
            return "❌ Dataset structure analysis failed"
    lines = content.split('\n')
    total_lines = len(lines)
    if any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']):
        functions = len([line for line in lines if 'def ' in line])
        classes = len([line for line in lines if 'class ' in line])
        imports = len([line for line in lines if 'import ' in line or 'from ' in line])
        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"
    paragraphs = content.count('\n\n') + 1
    words = len(content.split())
    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, ~{words} words"
# -- Basic stats on content --
def get_column_stats(df, col):
    stats = {
        'type': str(df[col].dtype),
        'missing': df[col].isna().sum(),
        'unique': df[col].nunique()
    }
    if pd.api.types.is_numeric_dtype(df[col]):
        stats.update({
            'min': df[col].min(),
            'max': df[col].max(),
            'mean': df[col].mean()
        })
    else:
        stats['examples'] = df[col].dropna().head(3).tolist()
    return stats
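# Hedged example (toy DataFrame, illustrative values only):
#
#     toy = pd.DataFrame({"age": [30, 40, None]})
#     get_column_stats(toy, "age")
#     # -> {'type': 'float64', 'missing': 1, 'unique': 2,
#     #     'min': 30.0, 'max': 40.0, 'mean': 35.0}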
# -- Identify Encoding --
def detect_file_encoding(file_path):
    """Improved encoding detection with fallback options."""
    try:
        with open(file_path, 'rb') as f:
            rawdata = f.read(100000)  # Read a large sample for better detection
        # Try chardet first
        result = chardet.detect(rawdata)
        encoding = result['encoding']
        confidence = result['confidence']
        # If confidence is low, try some common encodings directly
        if confidence < 0.9:
            for test_encoding in ['utf-8', 'utf-16', 'latin1', 'cp1252']:
                try:
                    rawdata.decode(test_encoding)
                    return test_encoding
                except UnicodeDecodeError:
                    continue
        return encoding if encoding else 'utf-8'
    except Exception as e:
        print(f"Encoding detection error: {e}")
        return 'utf-8'  # Default fallback
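# Hedged example (the path is hypothetical; assumes a file on disk):
#
#     enc = detect_file_encoding("survey.csv")
#     df = pd.read_csv("survey.csv", encoding=enc)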
# -- Read file --
def read_uploaded_file(file):
    if file is None:
        return "", ""
    try:
        file_ext = os.path.splitext(file.name)[1].lower()
        if file_ext == '.parquet':
            df = pd.read_parquet(file.name, engine='pyarrow')
            content = df.head(10).to_markdown(index=False)
            return content, "parquet"
        if file_ext == '.csv':
            try:
                # First detect the encoding
                encoding = detect_file_encoding(file.name)
                # Probe the first line with several delimiters and keep the
                # one that yields the most columns
                delimiters = [',', ';', '\t', '|']
                df = None
                best_delimiter = ','
                max_columns = 1
                for delimiter in delimiters:
                    try:
                        with open(file.name, 'r', encoding=encoding) as f:
                            first_line = f.readline()
                        current_columns = len(first_line.split(delimiter))
                        if current_columns > max_columns:
                            max_columns = current_columns
                            best_delimiter = delimiter
                    except Exception:
                        continue
                # Now read with the best delimiter found
                try:
                    df = pd.read_csv(
                        file.name,
                        encoding=encoding,
                        delimiter=best_delimiter,
                        on_bad_lines='warn',
                        engine='python',
                        quotechar='"'
                    )
                except Exception:
                    # Fall back to pandas auto-detection
                    df = pd.read_csv(file.name, encoding=encoding, on_bad_lines='warn')
                if df is None or len(df.columns) < 1:
                    return "❌ Could not parse CSV file - no valid columns detected", "error"
                # Generate a comprehensive data summary
                content = "📊 CSV Metadata:\n"
                content += f"- Rows: {len(df):,}\n"
                content += f"- Columns: {len(df.columns):,}\n"
                content += f"- Missing Values: {df.isna().sum().sum():,}\n\n"
                content += "🔍 Column Details:\n"
                for col in df.columns:
                    stats = get_column_stats(df, col)
                    content += f"### {col}\n"
                    content += f"- Type: {stats['type']}\n"
                    content += f"- Unique: {stats['unique']}\n"
                    content += f"- Missing: {stats['missing']}\n"
                    if 'examples' in stats:
                        content += f"- Examples: {stats['examples']}\n"
                    else:
                        content += (
                            f"- Range: {stats['min']} to {stats['max']}\n"
                            f"- Mean: {stats['mean']:.2f}\n"
                        )
                    content += "\n"
                content += "📋 Sample Data (First 3 Rows):\n"
                content += df.head(3).to_markdown(index=False)
                return content, "csv"
            except Exception as e:
                return f"❌ Error reading CSV file: {str(e)}", "error"
        else:
            # Plain-text fallback: try common encodings in order
            encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
            for encoding in encodings:
                try:
                    with open(file.name, 'r', encoding=encoding) as f:
                        content = f.read()
                    return content, "text"
                except UnicodeDecodeError:
                    continue
            # UnicodeDecodeError requires five positional arguments, so raise
            # a plain ValueError with a readable message instead
            raise ValueError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")
    except Exception as e:
        return f"❌ Error reading file: {str(e)}", "error"
def format_history(history):
    """Convert (user, assistant) tuple history into role/content message dicts."""
    formatted_history = []
    for user_msg, assistant_msg in history:
        formatted_history.append({"role": "user", "content": user_msg})
        if assistant_msg:
            formatted_history.append({"role": "assistant", "content": assistant_msg})
    return formatted_history
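# Hedged example (illustrative tuples only):
#
#     format_history([("Hi", "Hello!"), ("Thanks", None)])
#     # -> [{'role': 'user', 'content': 'Hi'},
#     #     {'role': 'assistant', 'content': 'Hello!'},
#     #     {'role': 'user', 'content': 'Thanks'}]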
def chat(message,
         history,
         uploaded_file,
         system_message="",
         max_tokens=4000,
         temperature=0.3,
         top_p=0.9,
         selected_model="phi3:latest"):
    system_prefix = """
You are an AI Data Scientist providing expert guidance in data analysis, machine learning, and big data technologies to a wide range of users seeking data-driven insights and solutions.
Analyze the uploaded file in depth from the following perspectives:
1. 📋 Overall file structure and format
2. ⭐ Data quality and completeness evaluation
3. 💡 Suggested data fixes and improvements
4. 📈 Data characteristics, meaning, and patterns
5. 📊 Key component analysis and potential segmentations
6. 🎯 Insights and suggested persuasive storytelling
Provide detailed and structured analysis from an expert perspective, but explain it in an easy-to-understand way.
Format the analysis results in Markdown and include specific examples where possible.
"""
    if uploaded_file:
        content, file_type = read_uploaded_file(uploaded_file)
        if file_type == "error":
            # This function is a generator, so errors must be yielded, not returned
            yield "", [{"role": "user", "content": message}, {"role": "assistant", "content": content}]
            return
        file_summary = analyze_file_content(content, file_type)
        if file_type in ['parquet', 'csv']:
            system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
        else:
            system_message += f"\n\nFile Content:\n```\n{content}\n```"
        # Expand the hidden auto-analysis trigger (see auto_analyze_trigger
        # below; the strings must match) into the full analysis prompt
        if message == "Analyze this file":
            message = f"""[Structure Analysis] {file_summary}
Please provide detailed analysis from these perspectives:
1. 📋 Overall file structure and format
2. ⭐ Data quality and completeness evaluation
3. 💡 Suggested data fixes and improvements
4. 📈 Data characteristics, meaning, and patterns
5. 📊 Key component analysis and potential segmentations
6. 🎯 Insights and suggested persuasive storytelling"""
messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
# Convert history to message format
if history is not None:
for item in history:
if isinstance(item, dict):
messages.append(item)
elif isinstance(item, (list, tuple)) and len(item) == 2:
messages.append({"role": "user", "content": item[0]})
if item[1]:
messages.append({"role": "assistant", "content": item[1]})
messages.append({"role": "user", "content": message})
    try:
        client = OllamaClient(model_name=selected_model)
        partial_message = ""
        # Keep prior turns so streaming updates do not wipe the chat window
        base_history = list(history) if history else []
        for response in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = response.get('content', '')
            if token:
                partial_message += token
                current_history = base_history + [
                    {"role": "user", "content": message},
                    {"role": "assistant", "content": partial_message}
                ]
                yield "", current_history
    except Exception as e:
        error_msg = f"❌ Inference error: {str(e)}"
        error_history = (list(history) if history else []) + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": error_msg}
        ]
        yield "", error_history
css = """
footer {visibility: hidden}
"""
with gr.Blocks(theme="gstaff/xkcd",
               css=css,
               title="Offline Sensitive Survey Data Analysis") as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1000px; margin: 0 auto;">
            <h1 style="font-size: 3em; font-weight: 600; margin: 0.5em;">Offline Sensitive Survey Data Analysis</h1>
            <h3 style="font-size: 1.2em; margin: 1em;">Leveraging your Local Ollama Inference Server</h3>
        </div>
        """
    )
    # Store the currently selected model in a state variable
    current_model = gr.State("phi3:latest")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                height=500,
                label="Chat Interface",
                type="messages"
            )
            msg = gr.Textbox(
                label="Type your message",
                show_label=False,
                placeholder="Ask me anything about the uploaded data file... ",
                container=False
            )
            with gr.Row():
                send = gr.Button("Send")
                clear = gr.ClearButton([msg, chatbot])
        with gr.Column(scale=1):
            gr.Markdown("### Upload File\nSupported: CSV, Parquet, and text files")
            file_upload = gr.File(
                label="Upload File",
                file_types=[".csv", ".parquet", ".txt"],
                type="filepath"
            )
            with gr.Accordion("Model Settings", open=False):
                model_dropdown = gr.Dropdown(
                    label="Available Models",
                    choices=[],
                    interactive=True
                )
                refresh_models = gr.Button("Refresh Models")
            with gr.Accordion("Advanced Settings ⚙️", open=False):
                system_message = gr.Textbox(label="Override System Message 📝", value="")
                max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens (maximum number of tokens in the generated response)")
                temperature = gr.Slider(minimum=0, maximum=1, value=0.3, label="Temperature (higher = more creative)")
                top_p = gr.Slider(minimum=0, maximum=1, value=0.7, label="Top P (word choices by probability threshold)")
    # Load available models from the Ollama server
    def load_models():
        client = OllamaClient()
        models = client.list_models()
        return gr.Dropdown(choices=models, value=models[0] if models else "phi3:latest")

    # Refresh-models button click handler
    refresh_models.click(
        load_models,
        outputs=model_dropdown
    )

    # Keep the state variable in sync with the dropdown selection
    model_dropdown.change(
        lambda x: x,
        inputs=model_dropdown,
        outputs=current_model
    )

    # Load models when the app starts
    demo.load(
        load_models,
        outputs=model_dropdown
    )
    # Event bindings
    msg.submit(
        chat,
        inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    ).then(
        lambda: gr.update(interactive=True),
        None,
        [msg]
    )
    send.click(
        chat,
        inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    ).then(
        lambda: gr.update(interactive=True),
        None,
        [msg]
    )

    # Auto-analysis on file upload, driven by this hidden component; its value
    # must match the trigger string checked inside chat()
    auto_analyze_trigger = gr.Textbox(value="Analyze this file", visible=False)
    file_upload.change(
        lambda: [],  # Clear chat history
        outputs=[chatbot],
        queue=True
    ).then(
        chat,
        inputs=[auto_analyze_trigger, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    )
    # Example queries
    with gr.Column():
        gr.Markdown("### Potential Follow-up Queries")
        with gr.Row():
            example_btns = [
                gr.Button("Analyze open-ended responses for sentiment and recurring themes", size="lg", variant="secondary"),
                gr.Button("Compare responses between different groups and identify potential segmentation or cluster analysis", size="lg", variant="secondary"),
                gr.Button("Identify potential outcome variables and suggest a predictive model for them", size="lg", variant="secondary"),
                gr.Button("Generate a Quarto notebook in Python to process this dataset", size="lg", variant="secondary"),
                gr.Button("Generate an Rmd notebook in R to process this dataset", size="lg", variant="secondary"),
            ]

    # Copy each example's label into the message box on click; bind the label
    # via a default argument rather than a hidden Textbox per button
    for btn in example_btns:
        btn.click(
            lambda value=btn.value: value,
            inputs=None,
            outputs=msg
        )
if __name__ == "__main__":
    demo.launch()