import gradio as gr
import os
import pandas as pd
import requests
import json
from typing import List, Tuple
import chardet
# -- LLM Client Class --
class OllamaClient:
    def __init__(self, model_name: str = "phi3:latest", base_url: str = "http://localhost:11434"):
        self.model_name = model_name
        self.base_url = base_url

    def list_models(self):
        """List all available models from the Ollama server."""
        try:
            response = requests.get(f"{self.base_url}/api/tags")
            if response.status_code == 200:
                data = response.json()
                return [model['name'] for model in data.get('models', [])]
            return []
        except Exception as e:
            print(f"Error listing models: {e}")
            return []
    def chat_completion(self, messages, max_tokens=4000, stream=True, temperature=0.3, top_p=0.7):
        # Convert messages to Ollama format, keeping only known roles
        ollama_messages = [
            {"role": msg["role"], "content": msg["content"]}
            for msg in messages
            if msg["role"] in ("system", "user", "assistant")
        ]
        # Prepare the request payload
        data = {
            "model": self.model_name,
            "messages": ollama_messages,
            "options": {
                "temperature": temperature,
                "top_p": top_p,
                "num_predict": max_tokens
            },
            "stream": stream
        }
        # Make the request to the Ollama chat endpoint
        response = requests.post(
            f"{self.base_url}/api/chat",
            json=data,
            stream=stream
        )
        if response.status_code != 200:
            raise Exception(f"Ollama API error: {response.text}")
        if stream:
            # Each line of a streaming response is a standalone JSON object
            for line in response.iter_lines():
                if line:
                    try:
                        chunk = json.loads(line.decode('utf-8'))
                        if "message" in chunk and "content" in chunk["message"]:
                            yield {"content": chunk["message"]["content"]}
                    except json.JSONDecodeError:
                        continue
        else:
            result = response.json()
            yield {"content": result["message"]["content"]}
# -- Analyze content structure --
def analyze_file_content(content, file_type):
    """Analyze file content and return a short structural summary."""
    if file_type in ['parquet', 'csv']:
        try:
            lines = content.split('\n')
            header = lines[0]
            # Markdown tables delimit columns with '|'; otherwise assume commas
            columns = header.count('|') - 1 if '|' in header else len(header.split(','))
            # Exclude the header and separator rows of the markdown table
            rows = max(len(lines) - 2, 0)
            return f"📊 Dataset Structure: {columns} columns, {rows} data samples"
        except Exception:
            return "❌ Dataset structure analysis failed"
    lines = content.split('\n')
    total_lines = len(lines)
    if any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']):
        functions = len([line for line in lines if 'def ' in line])
        classes = len([line for line in lines if 'class ' in line])
        imports = len([line for line in lines if 'import ' in line or 'from ' in line])
        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"
    paragraphs = content.count('\n\n') + 1
    words = len(content.split())
    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, ~{words} words"
# -- Basic stats on content --
def get_column_stats(df, col):
    stats = {
        'type': str(df[col].dtype),
        'missing': df[col].isna().sum(),
        'unique': df[col].nunique()
    }
    if pd.api.types.is_numeric_dtype(df[col]):
        stats.update({
            'min': df[col].min(),
            'max': df[col].max(),
            'mean': df[col].mean()
        })
    else:
        stats['examples'] = df[col].dropna().head(3).tolist()
    return stats
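# Hedged example (toy DataFrame, illustrative values only):
#
#     toy = pd.DataFrame({"age": [30, 40, None]})
#     get_column_stats(toy, "age")
#     # -> {'type': 'float64', 'missing': 1, 'unique': 2,
#     #     'min': 30.0, 'max': 40.0, 'mean': 35.0}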
# -- Identify Encoding --
def detect_file_encoding(file_path):
    """Improved encoding detection with fallback options."""
    try:
        with open(file_path, 'rb') as f:
            rawdata = f.read(100000)  # Read a large sample for better detection
        # Try chardet first
        result = chardet.detect(rawdata)
        encoding = result['encoding']
        confidence = result['confidence']
        # If confidence is low, try some common encodings directly
        if confidence < 0.9:
            for test_encoding in ['utf-8', 'utf-16', 'latin1', 'cp1252']:
                try:
                    rawdata.decode(test_encoding)
                    return test_encoding
                except UnicodeDecodeError:
                    continue
        return encoding if encoding else 'utf-8'
    except Exception as e:
        print(f"Encoding detection error: {e}")
        return 'utf-8'  # Default fallback
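# Hedged example (the path is hypothetical; assumes a file on disk):
#
#     enc = detect_file_encoding("survey.csv")
#     df = pd.read_csv("survey.csv", encoding=enc)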
# -- Read file --
def read_uploaded_file(file):
    if file is None:
        return "", ""
    try:
        file_ext = os.path.splitext(file.name)[1].lower()
        if file_ext == '.parquet':
            df = pd.read_parquet(file.name, engine='pyarrow')
            content = df.head(10).to_markdown(index=False)
            return content, "parquet"
        if file_ext == '.csv':
            try:
                # First detect the encoding
                encoding = detect_file_encoding(file.name)
                # Probe the first line with several delimiters and keep the
                # one that yields the most columns
                delimiters = [',', ';', '\t', '|']
                df = None
                best_delimiter = ','
                max_columns = 1
                for delimiter in delimiters:
                    try:
                        with open(file.name, 'r', encoding=encoding) as f:
                            first_line = f.readline()
                        current_columns = len(first_line.split(delimiter))
                        if current_columns > max_columns:
                            max_columns = current_columns
                            best_delimiter = delimiter
                    except Exception:
                        continue
                # Now read with the best delimiter found
                try:
                    df = pd.read_csv(
                        file.name,
                        encoding=encoding,
                        delimiter=best_delimiter,
                        on_bad_lines='warn',
                        engine='python',
                        quotechar='"'
                    )
                except Exception:
                    # Fall back to pandas auto-detection
                    df = pd.read_csv(file.name, encoding=encoding, on_bad_lines='warn')
                if df is None or len(df.columns) < 1:
                    return "❌ Could not parse CSV file - no valid columns detected", "error"
                # Generate a comprehensive data summary
                content = "📊 CSV Metadata:\n"
                content += f"- Rows: {len(df):,}\n"
                content += f"- Columns: {len(df.columns):,}\n"
                content += f"- Missing Values: {df.isna().sum().sum():,}\n\n"
                content += "🔍 Column Details:\n"
                for col in df.columns:
                    stats = get_column_stats(df, col)
                    content += f"### {col}\n"
                    content += f"- Type: {stats['type']}\n"
                    content += f"- Unique: {stats['unique']}\n"
                    content += f"- Missing: {stats['missing']}\n"
                    if 'examples' in stats:
                        content += f"- Examples: {stats['examples']}\n"
                    else:
                        content += (
                            f"- Range: {stats['min']} to {stats['max']}\n"
                            f"- Mean: {stats['mean']:.2f}\n"
                        )
                    content += "\n"
                content += "📋 Sample Data (First 3 Rows):\n"
                content += df.head(3).to_markdown(index=False)
                return content, "csv"
            except Exception as e:
                return f"❌ Error reading CSV file: {str(e)}", "error"
        else:
            # Plain-text fallback: try common encodings in order
            encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
            for encoding in encodings:
                try:
                    with open(file.name, 'r', encoding=encoding) as f:
                        content = f.read()
                    return content, "text"
                except UnicodeDecodeError:
                    continue
            # UnicodeDecodeError requires five positional arguments, so raise
            # a plain ValueError with a readable message instead
            raise ValueError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")
    except Exception as e:
        return f"❌ Error reading file: {str(e)}", "error"
def format_history(history):
    """Convert (user, assistant) tuple history into role/content message dicts."""
    formatted_history = []
    for user_msg, assistant_msg in history:
        formatted_history.append({"role": "user", "content": user_msg})
        if assistant_msg:
            formatted_history.append({"role": "assistant", "content": assistant_msg})
    return formatted_history
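# Hedged example (illustrative tuples only):
#
#     format_history([("Hi", "Hello!"), ("Thanks", None)])
#     # -> [{'role': 'user', 'content': 'Hi'},
#     #     {'role': 'assistant', 'content': 'Hello!'},
#     #     {'role': 'user', 'content': 'Thanks'}]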
def chat(message,
         history,
         uploaded_file,
         system_message="",
         max_tokens=4000,
         temperature=0.3,
         top_p=0.9,
         selected_model="phi3:latest"):
    system_prefix = """
You are an AI Data Scientist providing expert guidance in data analysis, machine learning, and big data technologies to a wide range of users seeking data-driven insights and solutions.
Analyze the uploaded file in depth from the following perspectives:
1. 📋 Overall file structure and format
2. ⭐ Data quality and completeness evaluation
3. 💡 Suggested data fixes and improvements
4. 📈 Data characteristics, meaning, and patterns
5. 📊 Key component analysis and potential segmentations
6. 🎯 Insights and suggested persuasive storytelling
Provide detailed and structured analysis from an expert perspective, but explain it in an easy-to-understand way.
Format the analysis results in Markdown and include specific examples where possible.
"""
    if uploaded_file:
        content, file_type = read_uploaded_file(uploaded_file)
        if file_type == "error":
            # This function is a generator, so errors must be yielded, not returned
            yield "", [{"role": "user", "content": message}, {"role": "assistant", "content": content}]
            return
        file_summary = analyze_file_content(content, file_type)
        if file_type in ['parquet', 'csv']:
            system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
        else:
            system_message += f"\n\nFile Content:\n```\n{content}\n```"
        # Expand the hidden auto-analysis trigger (see auto_analyze_trigger
        # below; the strings must match) into the full analysis prompt
        if message == "Analyze this file":
            message = f"""[Structure Analysis] {file_summary}
Please provide detailed analysis from these perspectives:
1. 📋 Overall file structure and format
2. ⭐ Data quality and completeness evaluation
3. 💡 Suggested data fixes and improvements
4. 📈 Data characteristics, meaning, and patterns
5. 📊 Key component analysis and potential segmentations
6. 🎯 Insights and suggested persuasive storytelling"""
messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
# Convert history to message format
if history is not None:
for item in history:
if isinstance(item, dict):
messages.append(item)
elif isinstance(item, (list, tuple)) and len(item) == 2:
messages.append({"role": "user", "content": item[0]})
if item[1]:
messages.append({"role": "assistant", "content": item[1]})
messages.append({"role": "user", "content": message})
    try:
        client = OllamaClient(model_name=selected_model)
        partial_message = ""
        # Keep prior turns so streaming updates do not wipe the chat window
        base_history = list(history) if history else []
        for response in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = response.get('content', '')
            if token:
                partial_message += token
                current_history = base_history + [
                    {"role": "user", "content": message},
                    {"role": "assistant", "content": partial_message}
                ]
                yield "", current_history
    except Exception as e:
        error_msg = f"❌ Inference error: {str(e)}"
        error_history = (list(history) if history else []) + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": error_msg}
        ]
        yield "", error_history
css = """
footer {visibility: hidden}
"""
with gr.Blocks(theme="gstaff/xkcd",
               css=css,
               title="Offline Sensitive Survey Data Analysis") as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1000px; margin: 0 auto;">
            <h1 style="font-size: 3em; font-weight: 600; margin: 0.5em;">Offline Sensitive Survey Data Analysis</h1>
            <h3 style="font-size: 1.2em; margin: 1em;">Leveraging your Local Ollama Inference Server</h3>
        </div>
        """
    )
    # Store the currently selected model in a state variable
    current_model = gr.State("phi3:latest")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                height=500,
                label="Chat Interface",
                type="messages"
            )
            msg = gr.Textbox(
                label="Type your message",
                show_label=False,
                placeholder="Ask me anything about the uploaded data file... ",
                container=False
            )
            with gr.Row():
                send = gr.Button("Send")
                clear = gr.ClearButton([msg, chatbot])
        with gr.Column(scale=1):
            gr.Markdown("### Upload File\nSupported: CSV, Parquet, and text files")
            file_upload = gr.File(
                label="Upload File",
                file_types=[".csv", ".parquet", ".txt"],
                type="filepath"
            )
            with gr.Accordion("Model Settings", open=False):
                model_dropdown = gr.Dropdown(
                    label="Available Models",
                    choices=[],
                    interactive=True
                )
                refresh_models = gr.Button("Refresh Models")
            with gr.Accordion("Advanced Settings ⚙️", open=False):
                system_message = gr.Textbox(label="Override System Message 📝", value="")
                max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens (maximum number of tokens in the generated response)")
                temperature = gr.Slider(minimum=0, maximum=1, value=0.3, label="Temperature (higher = more creative)")
                top_p = gr.Slider(minimum=0, maximum=1, value=0.7, label="Top P (word choices by probability threshold)")
    # Load available models from the Ollama server
    def load_models():
        client = OllamaClient()
        models = client.list_models()
        return gr.Dropdown(choices=models, value=models[0] if models else "phi3:latest")

    # Refresh-models button click handler
    refresh_models.click(
        load_models,
        outputs=model_dropdown
    )

    # Keep the state variable in sync with the dropdown selection
    model_dropdown.change(
        lambda x: x,
        inputs=model_dropdown,
        outputs=current_model
    )

    # Load models when the app starts
    demo.load(
        load_models,
        outputs=model_dropdown
    )
    # Event bindings
    msg.submit(
        chat,
        inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    ).then(
        lambda: gr.update(interactive=True),
        None,
        [msg]
    )
    send.click(
        chat,
        inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    ).then(
        lambda: gr.update(interactive=True),
        None,
        [msg]
    )

    # Auto-analysis on file upload, driven by this hidden component; its value
    # must match the trigger string checked inside chat()
    auto_analyze_trigger = gr.Textbox(value="Analyze this file", visible=False)
    file_upload.change(
        lambda: [],  # Clear chat history
        outputs=[chatbot],
        queue=True
    ).then(
        chat,
        inputs=[auto_analyze_trigger, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    )
    # Example queries
    with gr.Column():
        gr.Markdown("### Potential Follow-up Queries")
        with gr.Row():
            example_btns = [
                gr.Button("Analyze open-ended responses for sentiment and recurring themes", size="lg", variant="secondary"),
                gr.Button("Compare responses between different groups and identify potential segmentation or cluster analysis", size="lg", variant="secondary"),
                gr.Button("Identify potential outcome variables and suggest a predictive model for them", size="lg", variant="secondary"),
                gr.Button("Generate a Quarto notebook in Python to process this dataset", size="lg", variant="secondary"),
                gr.Button("Generate an Rmd notebook in R to process this dataset", size="lg", variant="secondary"),
            ]

    # Copy each example's label into the message box on click; bind the label
    # via a default argument rather than a hidden Textbox per button
    for btn in example_btns:
        btn.click(
            lambda value=btn.value: value,
            inputs=None,
            outputs=msg
        )
if __name__ == "__main__":
    demo.launch()