import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from pyod.models.iforest import IForest
from sklearn.preprocessing import StandardScaler


class NYCTaxiAnomalyDetector:
    def __init__(self, data):
        self.data = data.copy()
        self.scaler = StandardScaler()

    def filter_by_date_range(self, start_date, end_date):
        """
        Filter data by the specified date range.

        :param start_date: Start date of the range
        :param end_date: End date of the range
        :return: Filtered DataFrame
        """
        # Ensure the date column is datetime before comparing
        if not pd.api.types.is_datetime64_any_dtype(self.data["date"]):
            self.data["date"] = pd.to_datetime(self.data["date"])

        filtered_data = self.data[
            (self.data["date"] >= start_date) & (self.data["date"] <= end_date)
        ]
        return filtered_data

    def preprocess_data(self, data, column):
        """
        Preprocess data for anomaly detection.

        :param data: Filtered DataFrame
        :param column: Column to detect anomalies in
        :return: Scaled data and the index of the rows that were kept
        """
        # Work on a copy so the caller's DataFrame is not mutated
        data = data.copy()
        data[column] = pd.to_numeric(data[column], errors="coerce")

        # Drop rows that could not be coerced to numbers
        clean_data = data[column].dropna()

        # Scale to zero mean and unit variance
        scaled_data = self.scaler.fit_transform(clean_data.values.reshape(-1, 1))
        return scaled_data, clean_data.index

    def detect_anomalies(self, data, column, contamination=0.05):
        """
        Detect anomalies using Isolation Forest.

        :param data: Filtered DataFrame
        :param column: Column to detect anomalies in
        :param contamination: Expected proportion of outliers
        :return: DataFrame with anomaly detection results, indexed like `data`
        """
        scaled_data, original_index = self.preprocess_data(data, column)

        # PyOD detectors label points 0 (inlier) / 1 (outlier) after fitting
        clf = IForest(contamination=contamination, random_state=42)
        clf.fit(scaled_data)
        y_pred = clf.labels_

        # Keep the original index so results align with the input rows
        anomaly_results = pd.DataFrame(
            {
                "date": data.loc[original_index, "date"],
                "value": data.loc[original_index, column],
                "is_anomaly": y_pred == 1,
            },
            index=original_index,
        )
        return anomaly_results


class AIContextGenerator:
    def generate_context(self, anomaly_date):
        """
        Generate potential context for an anomaly.

        :param anomaly_date: Date of the anomaly
        :return: List of contextual insights
        """
        # Mock contextual insights -- replace with actual data sources
        contexts = [
            {
                "type": "Weather",
                "description": f"Weather conditions on {anomaly_date.date()}",
                "severity": "High",
            },
            {
                "type": "Event",
                "description": f"City events around {anomaly_date.date()}",
                "severity": "Medium",
            },
            {
                "type": "Economic",
                "description": f"Economic factors on {anomaly_date.date()}",
                "severity": "Low",
            },
        ]
        return contexts


def load_nyc_taxi_data():
    """
    Load and preprocess the NYC Taxi dataset.

    :return: DataFrame with synthetic daily taxi traffic data
    """
    # Synthetic data: one year of daily ride counts, seeded for reproducibility
    rng = np.random.default_rng(42)
    dates = pd.date_range(start="2023-01-01", end="2023-12-31", freq="D")
    base_traffic = rng.normal(5000, 500, len(dates))

    # Inject a few known anomalies
    base_traffic[50] = 10000   # extreme spike
    base_traffic[200] = 500    # extreme drop
    base_traffic[300] = 12000  # another spike

    return pd.DataFrame({"date": dates, "daily_traffic": base_traffic})
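
# A minimal usage sketch of the detection pipeline above, outside the
# Streamlit UI. It assumes the synthetic loader defined in this file and is
# kept as comments so it is not executed when the app runs:
#
#   df = load_nyc_taxi_data()
#   detector = NYCTaxiAnomalyDetector(df)
#   window = detector.filter_by_date_range(
#       pd.Timestamp("2023-02-01"), pd.Timestamp("2023-03-31")
#   )
#   results = detector.detect_anomalies(window, "daily_traffic", contamination=0.05)
#   print(results[results["is_anomaly"]])  # rows flagged by Isolation Forest
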
def main():
    st.set_page_config(
        page_title="NYC Taxi Traffic Anomaly Detection", page_icon="🚕", layout="wide"
    )
    st.title("🚕 NYC Taxi Traffic Anomaly Detection")

    # Load data
    taxi_data = load_nyc_taxi_data()

    # Sidebar configuration
    st.sidebar.header("Anomaly Detection Settings")

    # Date range selection
    st.sidebar.subheader("Date Range")
    min_date = taxi_data["date"].min().date()
    max_date = taxi_data["date"].max().date()

    col1, col2 = st.sidebar.columns(2)
    with col1:
        start_date = st.date_input(
            "Start Date", min_value=min_date, max_value=max_date, value=min_date
        )
    with col2:
        end_date = st.date_input(
            "End Date", min_value=min_date, max_value=max_date, value=max_date
        )

    # Anomaly sensitivity (passed to Isolation Forest as contamination)
    anomaly_threshold = st.sidebar.slider(
        "Anomaly Sensitivity",
        min_value=0.01,
        max_value=0.1,
        value=0.05,
        step=0.01,
        help="Lower values detect fewer but more extreme anomalies",
    )

    # Instantiate detector and filter data by date range
    detector = NYCTaxiAnomalyDetector(taxi_data)
    filtered_data = detector.filter_by_date_range(
        pd.to_datetime(start_date), pd.to_datetime(end_date)
    )

    # Detect anomalies
    anomalies = detector.detect_anomalies(
        filtered_data, "daily_traffic", contamination=anomaly_threshold
    )

    # Visualization
    st.header("Daily Taxi Traffic Trend")
    fig = px.line(
        filtered_data,
        x="date",
        y="daily_traffic",
        title=f"NYC Taxi Daily Traffic ({start_date} to {end_date})",
        labels={"daily_traffic": "Number of Taxi Rides"},
    )

    # Highlight anomalies; the results share the filtered data's index,
    # so selecting the flagged rows by label aligns correctly
    anomaly_points = filtered_data.loc[anomalies.index[anomalies["is_anomaly"]]]
    fig.add_trace(
        go.Scatter(
            x=anomaly_points["date"],
            y=anomaly_points["daily_traffic"],
            mode="markers",
            name="Anomalies",
            marker=dict(color="red", size=10, symbol="star"),
        )
    )
    st.plotly_chart(fig, use_container_width=True)

    # Anomaly details
    st.header("Anomaly Insights")
    if not anomaly_points.empty:
        context_generator = AIContextGenerator()
        for _, anomaly in anomaly_points.iterrows():
            st.subheader(f"Anomaly on {anomaly['date'].date()}")

            col1, col2 = st.columns(2)
            with col1:
                st.metric("Taxi Rides", f"{anomaly['daily_traffic']:.0f}")
            with col2:
                contexts = context_generator.generate_context(anomaly["date"])
                st.write("### Potential Context")
                for context in contexts:
                    st.markdown(
                        f"- **{context['type']}**: {context['description']} "
                        f"(Severity: {context['severity']})"
                    )
    else:
        st.info("No significant anomalies detected with current settings.")


if __name__ == "__main__":
    main()
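
# To launch the dashboard (assuming this file is saved as app.py):
#   streamlit run app.py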