# NOTE(review): the original file began with Hugging Face Spaces page chrome
# ("Spaces: Sleeping Sleeping") captured by the scrape — preserved here as a
# comment so the file parses as Python.
# insight.py
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

# Load the Gemma model from Hugging Face.
# NOTE(review): this runs at import time — importing this module downloads
# and materializes the full model, which is slow and memory-heavy; confirm
# that eager loading (vs. lazy/on-first-use) is intended for this Space.
model_id = "google/gemma-3n-E4B-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
def call_llm(prompt):
    """Run *prompt* through the Gemma model and return the completion text.

    Parameters
    ----------
    prompt : str
        Full prompt text; tokenized with truncation to the model's limit.

    Returns
    -------
    str
        Only the newly generated tokens, decoded without special tokens.
        (The original implementation decoded the whole output sequence,
        so the echoed prompt was returned along with the answer.)
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    # Inference only — disable autograd bookkeeping during generation.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=1024)
    # `generate` returns prompt + completion; slice off the prompt tokens.
    prompt_len = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
# Prompt template for the insight report; `{column_data}` is filled with the
# JSON-serialized column statistics by `generate_insights`.
insight_prompt = """
You are a senior data analyst. You are given a dataset summary and column statistics after cleaning.
Please perform the following:
1. Describe the structure of the data in natural language.
2. Mention any interesting patterns or distributions (e.g. most common values, ranges, anomalies).
3. Derive any basic insights you can (e.g. relationships between columns, high-cardinality features, outliers).
4. Point out anything surprising or worth further investigation.
Be specific. Don't explain generic EDA steps — interpret the data as if you're preparing a short report.
Column Summary:
{column_data}
"""
def generate_insights(column_data):
    """Ask the LLM for a natural-language report on the cleaned dataset.

    Parameters
    ----------
    column_data : object
        Column statistics; must be JSON-serializable (passed to
        ``json.dumps`` with ``indent=2`` before templating).

    Returns
    -------
    str
        The model's generated analysis, as returned by ``call_llm``.
    """
    prompt = insight_prompt.format(column_data=json.dumps(column_data, indent=2))
    return call_llm(prompt)