# ninarg's picture
# Fix: drop NaT timestamps, avoid colon in altair column names
# c5e72f6 verified
"""VynFi × pm4py: Interactive Process Mining Demo"""
import streamlit as st
import pandas as pd
from collections import Counter
# Page configuration and header. The icon (U+1F4CA) and the multiplication
# sign (U+00D7) are written as real Unicode characters so they render
# correctly instead of as mis-decoded byte sequences.
st.set_page_config(page_title="VynFi Process Mining", page_icon="📊", layout="wide")
st.title("📊 VynFi × pm4py: Process Mining Demo")
st.caption("Synthetic supply-chain event log from [VynFi](https://vynfi.com)")
@st.cache_data
def load_data():
    """Load the VynFi supply-chain event log as a cleaned DataFrame.

    Fetches the ``events`` configuration (``train`` split) of the
    ``VynFi/vynfi-supply-chain-ocel`` dataset, parses the ``timestamp``
    column (unparseable values become NaT), drops rows whose timestamp is
    NaT, and renames the key columns to colon-free names.

    Returns:
        A DataFrame with ``case_id_pm``, ``activity`` and ``ts`` columns in
        place of ``case_id``, ``activity_name`` and ``timestamp``; other
        columns are passed through unchanged. The result is cached by
        ``st.cache_data``.
    """
    # `datasets` is only needed by this loader, so it is imported locally.
    from datasets import load_dataset

    events = load_dataset(
        "VynFi/vynfi-supply-chain-ocel", "events", split="train"
    ).to_pandas()
    events["timestamp"] = pd.to_datetime(events["timestamp"], errors="coerce")

    # Rename for pm4py — use safe names without colons for display.
    column_aliases = {
        "case_id": "case_id_pm",
        "activity_name": "activity",
        "timestamp": "ts",
    }
    # Rows with NaT timestamps are removed first (pm4py can't handle them).
    return events.dropna(subset=["timestamp"]).rename(columns=column_aliases)
# Cleaned event log shared by every section of the page.
df = load_data()

# Sidebar: headline counts for the loaded event log.
with st.sidebar:
    st.header("Dataset")
    st.metric("Events", f"{len(df):,}")
    st.metric("Activities", df["activity"].nunique())
    st.metric("Cases", df["case_id_pm"].nunique())

# One tab per view of the log.
tab_labels = ["Process Model", "Variants", "Statistics", "Raw Data"]
tab1, tab2, tab3, tab4 = st.tabs(tab_labels)
with tab1:
    st.subheader("Directly-Follows Graph")
    # Best-effort rendering: any failure (missing pm4py, rendering error, ...)
    # is reported as a warning so the remaining tabs stay usable.
    try:
        import pm4py

        # Map the display column names back to the names used for the
        # pm4py conversion.
        pm4py_columns = {
            "case_id_pm": "case:concept:name",
            "activity": "concept:name",
            "ts": "time:timestamp",
        }
        event_log = pm4py.convert_to_event_log(df.rename(columns=pm4py_columns))
        dfg, start_acts, end_acts = pm4py.discover_dfg(event_log)

        from pm4py.visualization.dfg import visualizer as dfg_vis

        # Frequency-annotated DFG, rendered as SVG with start/end activities.
        frequency = dfg_vis.Variants.FREQUENCY
        param_keys = frequency.value.Parameters
        render_params = {
            param_keys.START_ACTIVITIES: start_acts,
            param_keys.END_ACTIVITIES: end_acts,
            param_keys.FORMAT: "svg",
        }
        gviz = dfg_vis.apply(
            dfg, log=event_log, variant=frequency, parameters=render_params
        )

        # `serialize` yields bytes; decode to text before handing to st.image.
        svg_markup = dfg_vis.serialize(gviz).decode("utf-8")
        st.image(svg_markup, use_container_width=True)
    except Exception as exc:
        st.warning(f"Could not render DFG: {exc}")
        st.info("Try the Variants or Statistics tabs instead.")
with tab2:
    st.subheader("Process Variants")

    # A variant is a case's activity sequence in timestamp order. The sort is
    # stable so that events sharing a timestamp keep their original relative
    # order instead of being arbitrarily reordered by the default quicksort,
    # which would make the reported variants nondeterministic.
    ordered = df.sort_values("ts", kind="stable")
    variants = {
        cid: tuple(grp["activity"].tolist())
        for cid, grp in ordered.groupby("case_id_pm")
    }
    vc = Counter(variants.values())
    total = len(variants)
    st.metric("Unique Variants", len(vc))

    # Top 20 variants with absolute count and share of all cases. When there
    # are no cases the comprehension is empty, so `total` is never a zero
    # divisor.
    rows = [
        {"Trace": " → ".join(t), "Count": c, "Frequency": f"{c / total * 100:.1f}%"}
        for t, c in vc.most_common(20)
    ]
    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)

    # The single most frequent variant is reported as the "happy path".
    if vc:
        hp_trace, hp_count = vc.most_common(1)[0]
        st.info(
            f"**Happy path**: {' → '.join(hp_trace)} "
            f"({hp_count} cases, {hp_count / total * 100:.1f}%)"
        )
with tab3:
    st.subheader("Activity Frequency")
    # Event count per activity as a two-column frame for the bar chart.
    activity_counts = (
        df["activity"]
        .value_counts()
        .rename_axis("Activity")
        .reset_index(name="Count")
    )
    st.bar_chart(activity_counts, x="Activity", y="Count")

    st.subheader("Events Over Time")
    if "ts" in df.columns:
        # Number of events per weekly bin of the timestamp column.
        per_week = df.set_index("ts").resample("W").size()
        weekly = per_week.rename_axis("Week").reset_index(name="Events")
        st.line_chart(weekly, x="Week", y="Events")
with tab4:
    st.subheader("Raw Event Data")
    # Show only the first 200 rows of the event log.
    preview = df.head(200)
    st.dataframe(preview, use_container_width=True)
# Footer: links to the data provider, the mining library and the dataset,
# separated by middle dots (U+00B7) written as real Unicode characters.
st.divider()
st.caption("[VynFi](https://vynfi.com) · [pm4py](https://pm4py.fit.fraunhofer.de/) · [Dataset](https://huggingface.co/datasets/VynFi/vynfi-supply-chain-ocel)")