rsm-roguchi committed
Commit c75151e
0 Parent(s)

Initial clean commit (no binaries, no venv)

.dockerignore ADDED
@@ -0,0 +1,6 @@
+ .venv
+ __pycache__/
+ *.pyc
+ data/
+ artifacts/
+ .git
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,16 @@
+ .venv/
+ __pycache__/
+ *.pyc
+ *.pyo
+
+ # compiled / native artifacts
+ *.so
+ *.dll
+ *.dylib
+
+ # local data & outputs
+ data/
+ artifacts/
+
+ # misc
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,65 @@
+ <<<<<<< HEAD
+ FROM python:3.13.5-slim
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     curl \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt ./
+ COPY src/ ./src/
+
+ RUN pip3 install -r requirements.txt
+
+ EXPOSE 8501
+
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ =======
+ # ---- base build (dependencies) ----
+ FROM python:3.11-slim AS base
+
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1
+
+ # System deps (git for pybaseball, plus basic build tooling if needed)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Copy metadata first to leverage Docker layer caching
+ COPY pyproject.toml README.md ./
+ COPY src ./src
+
+ # Install package in editable mode (or regular if you prefer)
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -e .
+
+ # Copy the Streamlit app (optional; used in default CMD)
+ COPY app.py ./app.py
+
+ # Create mount points for persistent cache/artifacts
+ RUN mkdir -p /app/data/cache /app/artifacts
+
+ # ---- runtime image ----
+ FROM python:3.11-slim AS runtime
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1
+
+ WORKDIR /app
+ COPY --from=base /usr/local/lib/python3.11 /usr/local/lib/python3.11
+ COPY --from=base /usr/local/bin /usr/local/bin
+ COPY --from=base /app /app
+
+ # Expose Streamlit port
+ EXPOSE 7860
+
+ # Default: run Streamlit app (Hugging Face Space style)
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
+ >>>>>>> 63696d41 (Initial Commit)
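As committed, the Dockerfile carries both sides of an unresolved merge conflict (`<<<<<<< HEAD`, `=======`, `>>>>>>>`), so `docker build` cannot parse it. Below is a sketch of one possible resolution, assuming the multi-stage variant (port 7860, `app.py`) is the side to keep, since it is the one `docker-compose.yml` and `pyproject.toml` later in this commit line up with; if the Hugging Face template side (port 8501, `src/streamlit_app.py`) is the intended one, keep that block instead and ignore this sketch.

```dockerfile
# Hypothetical resolution, assembled from the multi-stage side of the conflict above.
FROM python:3.11-slim AS base
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
# System deps (git for pybaseball)
RUN apt-get update && apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy metadata first so the dependency layer caches independently of app code
COPY pyproject.toml README.md ./
COPY src ./src
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -e .
COPY app.py ./app.py
RUN mkdir -p /app/data/cache /app/artifacts

FROM python:3.11-slim AS runtime
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
WORKDIR /app
COPY --from=base /usr/local/lib/python3.11 /usr/local/lib/python3.11
COPY --from=base /usr/local/bin /usr/local/bin
COPY --from=base /app /app
EXPOSE 7860
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
```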
README.md ADDED
@@ -0,0 +1,19 @@
+ ---
+ title: Pitch Dash
+ emoji: 🚀
+ colorFrom: red
+ colorTo: red
+ sdk: docker
+ app_port: 8501
+ tags:
+   - streamlit
+ pinned: false
+ short_description: Pitching Data Dashboard
+ ---
+
+ # Welcome to Streamlit!
+
+ Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
+
+ If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
+ forums](https://discuss.streamlit.io).
app.py ADDED
@@ -0,0 +1,82 @@
+ import os, sys
+
+ # Put src/ on the import path before importing the project's flat modules
+ sys.path.append(os.path.join(os.path.dirname(__file__), "src"))
+
+ import streamlit as st
+ import pandas as pd
+ from data import load_statcast, default_window
+ from featurize import infer_ivb_sign, engineer_pitch_features
+ from model import fit_kmeans, nearest_comps
+ from tags import xy_cluster_tags
+ from plots import movement_scatter_xy, radar_quality
+
+ st.set_page_config(page_title="PitchXY (Handedness-Aware)", layout="wide")
+ st.title("⚾ PitchXY — Handedness-Aware Pitch Archetypes & Scouting Cards")
+
+ with st.sidebar:
+     st.header("Data Window")
+     dstart, dend = default_window()
+     start = st.text_input("Start YYYY-MM-DD", dstart)
+     end = st.text_input("End YYYY-MM-DD", dend)
+     k = st.slider("Clusters (k)", 5, 12, 8)
+     force = st.checkbox("Force re-download", value=False)
+
+ df_raw = load_statcast(start, end, force=force)
+ if df_raw.empty:
+     st.warning("No data for that window.")
+     st.stop()
+
+ ivb_sign = infer_ivb_sign(df_raw)
+ df_feat = engineer_pitch_features(df_raw, ivb_sign)
+ df_fit, scaler, km, nn = fit_kmeans(df_feat, k=k)
+ cluster_names = xy_cluster_tags(df_fit)
+ df_fit["cluster_name"] = df_fit["cluster"].map(cluster_names)
+
+ pitcher = st.selectbox("Pitcher", sorted(df_fit["player_name"].unique()))
+ df_p = df_fit[df_fit["player_name"] == pitcher].sort_values("pitch_type")
+
+ tab1, tab2, tab3 = st.tabs(["Movement", "Scouting Card", "Comps"])
+
+ with tab1:
+     view = st.radio("View", ["Selected pitcher", "All pitchers"], horizontal=True)
+     if view == "Selected pitcher":
+         st.subheader(f"Movement — {pitcher}")
+         st.plotly_chart(
+             movement_scatter_xy(df_p, color="pitch_type"), use_container_width=True
+         )
+     else:
+         st.subheader("Movement — All pitchers (cluster context)")
+         st.plotly_chart(
+             movement_scatter_xy(df_fit, color="cluster_name"), use_container_width=True
+         )
+
+
+ with tab2:
+     st.subheader(f"Scouting Card — {pitcher}")
+     st.dataframe(
+         df_p[
+             [
+                 "pitch_type",
+                 "p_throws",
+                 "n",
+                 "velo",
+                 "ivb_in",
+                 "hb_as_in",
+                 "csw",
+                 "whiff_rate",
+                 "gb_rate",
+                 "zone_pct",
+                 "cluster_name",
+             ]
+         ]
+     )
+     for _, row in df_p.iterrows():
+         st.markdown(f"### {row['pitch_type']} — {row['cluster_name']}")
+         st.plotly_chart(radar_quality(row), use_container_width=True)
+
+ with tab3:
+     for _, row in df_p.iterrows():
+         st.markdown(f"#### {row['pitch_type']} comps")
+         comps = nearest_comps(row, df_fit, scaler, nn, within_pitch_type=True, k=6)
+         st.dataframe(comps)
bin/cli.py ADDED
@@ -0,0 +1,96 @@
+ from __future__ import annotations
+ import argparse
+ from data import load_statcast, default_window
+ from featurize import infer_ivb_sign, engineer_pitch_features
+ from model import fit_kmeans, nearest_comps
+ from tags import xy_cluster_tags
+ from plots import movement_scatter_xy
+ from utils import ensure_dirs, ARTIFACTS_DIR
+ import plotly.io as pio
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="PitchXY: handedness-aware pitch archetypes"
+     )
+     parser.add_argument("--start", type=str, help="YYYY-MM-DD")
+     parser.add_argument("--end", type=str, help="YYYY-MM-DD")
+     parser.add_argument("-k", type=int, default=8)
+     parser.add_argument(
+         "--pitcher", type=str, help='Filter pitcher by name (e.g. "Cole")'
+     )
+     parser.add_argument(
+         "--save-html", action="store_true", help="Save plots to artifacts/"
+     )
+     parser.add_argument(
+         "--force", action="store_true", help="Force re-download Statcast"
+     )
+     args = parser.parse_args()
+
+     ensure_dirs()
+     start, end = (
+         (args.start, args.end) if (args.start and args.end) else default_window()
+     )
+     print(f"Window: {start} → {end}")
+
+     df_raw = load_statcast(start, end, force=args.force)
+     ivb_sign = infer_ivb_sign(df_raw)
+     print(f"IVB sign inferred = {ivb_sign} (ride should be positive)")
+
+     df_feat = engineer_pitch_features(df_raw, ivb_sign)
+     df_fit, scaler, km, nn = fit_kmeans(df_feat, k=args.k)
+     cluster_names = xy_cluster_tags(df_fit)
+     df_fit["cluster_name"] = df_fit["cluster"].map(cluster_names)
+
+     # Save artifacts
+     feat_p = ARTIFACTS_DIR / "pitch_features.parquet"
+     fit_p = ARTIFACTS_DIR / "pitch_features_clusters.parquet"
+     df_feat.to_parquet(feat_p, index=False)
+     df_fit.to_parquet(fit_p, index=False)
+     print(f"Saved: {feat_p}, {fit_p}")
+
+     # Optional pitcher card + comps
+     if args.pitcher:
+         sub = df_fit[
+             df_fit["player_name"].str.contains(args.pitcher, case=False, na=False)
+         ]
+         if sub.empty:
+             print(f"No pitcher matched '{args.pitcher}'")
+         else:
+             name = sub["player_name"].iloc[0]
+             df_p = df_fit[df_fit["player_name"] == name].sort_values("pitch_type")
+             print(f"\n=== Scouting Card: {name} ===")
+             print(
+                 df_p[
+                     [
+                         "pitch_type",
+                         "p_throws",
+                         "n",
+                         "velo",
+                         "ivb_in",
+                         "hb_as_in",
+                         "csw",
+                         "whiff_rate",
+                         "gb_rate",
+                         "zone_pct",
+                         "cluster_name",
+                     ]
+                 ].to_string(index=False)
+             )
+             for _, row in df_p.iterrows():
+                 comps = nearest_comps(
+                     row, df_fit, scaler, nn, within_pitch_type=True, k=6
+                 )
+                 print(f"\nNearest comps — {row['pitch_type']} ({row['cluster_name']}):")
+                 print(comps.to_string(index=False))
+
+     # Movement plot
+     fig = movement_scatter_xy(df_fit, color="cluster_name")
+     if args.save_html:
+         out = ARTIFACTS_DIR / "movement_all.html"
+         pio.write_html(fig, file=str(out), auto_open=False, include_plotlyjs="cdn")
+         print(f"Saved plot: {out}")
+
+
+ if __name__ == "__main__":
+     main()
build.ipynb ADDED
The diff for this file is too large to render.
 
docker-compose.yml ADDED
@@ -0,0 +1,12 @@
+ version: "3.9"
+ services:
+   pitchxy:
+     build: .
+     image: pitchxy:latest
+     ports:
+       - "7860:7860"
+     volumes:
+       - ./data:/app/data  # cache persisted on host
+       - ./artifacts:/app/artifacts  # outputs on host
+     environment:
+       - PYTHONPATH=/app/src
pyproject.toml ADDED
@@ -0,0 +1,25 @@
+ [project]
+ name = "pitchxy"
+ version = "0.1.0"
+ description = "Handedness-aware pitch archetypes & scouting cards (XY ride/drop vs arm/glove side)"
+ readme = "README.md"
+ requires-python = ">=3.9"
+ dependencies = [
+     "pandas",
+     "numpy",
+     "pybaseball",
+     "scikit-learn",
+     "plotly",
+     "pyarrow",
+     "streamlit"  # needed for HF Space app below
+ ]
+
+ [project.scripts]
+ pitchxy = "pitchxy.cli:main"  # <-- console entry point
+
+ [build-system]
+ requires = ["setuptools>=61"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ altair
+ pandas
+ streamlit
src/__init__.py ADDED
(empty file)
src/data.py ADDED
@@ -0,0 +1,28 @@
+ from __future__ import annotations
+ from datetime import date
+ from pathlib import Path
+ import pandas as pd
+ from pybaseball import statcast
+ from utils import CACHE_DIR
+
+
+ def default_window() -> tuple[str, str]:
+     today = date.today()
+     start = date(today.year if today.month >= 3 else today.year - 1, 3, 1)
+     return start.isoformat(), today.isoformat()
+
+
+ def _cache_path(start: str, end: str) -> Path:
+     return CACHE_DIR / f"statcast_{start}_{end}.parquet"
+
+
+ def load_statcast(start_date: str, end_date: str, force: bool = False) -> pd.DataFrame:
+     CACHE_DIR.mkdir(parents=True, exist_ok=True)
+     cp = _cache_path(start_date, end_date)
+     if cp.exists() and not force:
+         return pd.read_parquet(cp)
+     df = statcast(start_dt=start_date, end_dt=end_date)
+     if "pitch_type" in df.columns:
+         df = df[df["pitch_type"].notna()]
+     df.to_parquet(cp, index=False)
+     return df
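A small usage sketch of the caching behaviour in `load_statcast`: the first call for a window downloads via `pybaseball.statcast` and writes `data/cache/statcast_<start>_<end>.parquet`, repeat calls with the same window read that parquet back, and `force=True` re-downloads. It assumes `src/` is on the import path:

```python
from data import default_window, load_statcast  # assumes src/ is on sys.path or PYTHONPATH

start, end = default_window()                     # March 1 of the current season through today
df = load_statcast(start, end)                    # first call: downloads, then writes the parquet cache
df_cached = load_statcast(start, end)             # same window: read back from data/cache/, no download
df_fresh = load_statcast(start, end, force=True)  # ignore the cache and re-download
```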
src/featurize.py ADDED
@@ -0,0 +1,118 @@
+ from __future__ import annotations
+ import numpy as np
+ import pandas as pd
+
+ INCHES_PER_FOOT = 12.0
+
+
+ def infer_ivb_sign(df_raw: pd.DataFrame) -> int:
+     """
+     Data-driven IVB orientation: pick +1 or -1 so 'ride' is positive.
+     Uses only df_raw['pfx_z'] (no hardcoding of pitch types).
+     """
+     if "pfx_z" not in df_raw.columns or df_raw["pfx_z"].dropna().empty:
+         return -1
+     med = df_raw["pfx_z"].median()
+     return -1 if med < 0 else +1
+
+
+ def signed_arm_side(hb_in_raw: pd.Series, p_throws: pd.Series) -> pd.Series:
+     """
+     Convert Statcast pfx_x (catcher-right +) into 'arm-side positive' regardless of handedness.
+     RHP → −pfx_x is arm-side ; LHP → +pfx_x is arm-side (hence the negation below).
+     """
+     handed = p_throws.fillna("R").str.upper().str[0]
+     sign = np.where(handed == "R", 1.0, -1.0)
+     return -hb_in_raw * sign
+
+
+ def _safe_rate(num, den):
+     return np.divide(
+         num, den, out=np.full_like(num, np.nan, dtype=float), where=den > 0
+     )
+
+
+ def engineer_pitch_features(df: pd.DataFrame, ivb_sign: int) -> pd.DataFrame:
+     cols = [
+         "pitch_type",
+         "player_name",
+         "game_date",
+         "events",
+         "description",
+         "p_throws",
+         "stand",
+         "release_pos_x",
+         "release_pos_z",
+         "pfx_x",
+         "pfx_z",
+         "release_speed",
+         "release_spin_rate",
+         "plate_x",
+         "plate_z",
+         "zone",
+     ]
+     have = [c for c in cols if c in df.columns]
+     df = df[have].copy()
+
+     # outcomes
+     df["is_called_strike"] = (df["description"] == "called_strike").astype(int)
+     df["is_swing"] = (
+         df["description"]
+         .isin(["swinging_strike", "swinging_strike_blocked", "foul", "hit_into_play"])
+         .astype(int)
+     )
+     df["is_whiff"] = (
+         df["description"]
+         .isin(["swinging_strike", "swinging_strike_blocked"])
+         .astype(int)
+     )
+     df["is_in_play"] = (df["description"] == "hit_into_play").astype(int)
+     df["is_gb"] = (  # events-based proxy for ground-ball outcomes (bb_type is not pulled)
+         df["events"]
+         .isin(["groundout", "field_error", "single", "double", "triple"])
+         .astype(int)
+     )
+
+     # movement (handedness-aware XY)
+     df["hb_in_raw"] = df["pfx_x"] * INCHES_PER_FOOT
+     df["ivb_in"] = ivb_sign * df["pfx_z"] * INCHES_PER_FOOT  # + = ride, − = drop
+     df["hb_as_in"] = signed_arm_side(df["hb_in_raw"], df.get("p_throws"))
+
+     grp = df.groupby(["player_name", "pitch_type", "p_throws"], as_index=False)
+     agg = grp.agg(
+         n=("pitch_type", "size"),
+         velo=("release_speed", "mean"),
+         spin=("release_spin_rate", "mean"),
+         ivb_in=("ivb_in", "mean"),
+         hb_as_in=("hb_as_in", "mean"),
+         rel_height=("release_pos_z", "mean"),
+         rel_side=("release_pos_x", "mean"),
+         cs=("is_called_strike", "sum"),
+         swings=("is_swing", "sum"),
+         whiffs=("is_whiff", "sum"),
+         inplay=("is_in_play", "sum"),
+         gb=("is_gb", "sum"),
+     )
+
+     agg["csw"] = _safe_rate(agg["cs"] + agg["whiffs"], agg["n"])
+     agg["whiff_rate"] = _safe_rate(agg["whiffs"], agg["swings"])
+     agg["gb_rate"] = _safe_rate(agg["gb"], agg["inplay"])
+     agg["zone_pct"] = _safe_rate(agg["cs"] + agg["inplay"], agg["n"])
+
+     keep = [
+         "player_name",
+         "pitch_type",
+         "p_throws",
+         "n",
+         "velo",
+         "spin",
+         "ivb_in",
+         "hb_as_in",
+         "rel_height",
+         "rel_side",
+         "csw",
+         "whiff_rate",
+         "gb_rate",
+         "zone_pct",
+     ]
+     return agg[keep].dropna(subset=["velo", "ivb_in", "hb_as_in"])
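A tiny self-contained check of the handedness convention implemented above: `signed_arm_side` flips raw `pfx_x` so arm-side movement comes out positive for both right- and left-handed pitchers, and `infer_ivb_sign` orients `pfx_z` so ride is positive. The numbers are made up for illustration, and `src/` is assumed to be importable:

```python
import pandas as pd
from featurize import infer_ivb_sign, signed_arm_side  # assumes src/ is on the import path

# Made-up horizontal break values in inches (pfx_x * 12): a RHP pitch with
# arm-side run (negative raw pfx_x) and a LHP pitch with arm-side run (positive raw pfx_x).
hb_in_raw = pd.Series([-14.0, 14.0])
p_throws = pd.Series(["R", "L"])
print(signed_arm_side(hb_in_raw, p_throws).tolist())  # [14.0, 14.0] -> arm-side positive for both hands

# infer_ivb_sign only inspects the median of pfx_z; a positive median keeps the sign at +1.
df_demo = pd.DataFrame({"pfx_z": [1.3, 1.1, -0.2]})
print(infer_ivb_sign(df_demo))  # 1
```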
src/model.py ADDED
@@ -0,0 +1,54 @@
+ from __future__ import annotations
+ import pandas as pd
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.cluster import KMeans
+ from sklearn.neighbors import NearestNeighbors
+
+ ARCH_FEATURES = [
+     "velo",
+     "ivb_in",
+     "hb_as_in",
+     "rel_height",
+     "rel_side",
+     "spin",
+     "csw",
+     "whiff_rate",
+     "gb_rate",
+     "zone_pct",
+ ]
+
+
+ def fit_kmeans(df_feat: pd.DataFrame, k: int = 8, random_state: int = 42):
+     df = df_feat.dropna(subset=ARCH_FEATURES).copy()
+     X = df[ARCH_FEATURES].values
+     scaler = StandardScaler()
+     Xs = scaler.fit_transform(X)
+     km = KMeans(n_clusters=k, n_init=20, random_state=random_state)
+     labels = km.fit_predict(Xs)
+     df["cluster"] = labels
+
+     nn = NearestNeighbors(n_neighbors=6, metric="euclidean")
+     nn.fit(Xs)
+     return df, scaler, km, nn
+
+
+ def nearest_comps(
+     row: pd.Series, df_fit: pd.DataFrame, scaler, nn, within_pitch_type=True, k=6
+ ):
+     xq = scaler.transform(row[ARCH_FEATURES].values.reshape(1, -1))
+     dists, idxs = nn.kneighbors(xq, n_neighbors=k)
+     comps = df_fit.iloc[idxs[0]].copy()
+     if within_pitch_type:
+         comps = comps[comps["pitch_type"] == row["pitch_type"]]
+     cols = [
+         "player_name",
+         "pitch_type",
+         "p_throws",
+         "velo",
+         "ivb_in",
+         "hb_as_in",
+         "whiff_rate",
+         "gb_rate",
+         "cluster_name",
+     ]
+     return comps[cols].head(k - 1)
src/pitchxy.egg-info/PKG-INFO ADDED
@@ -0,0 +1,13 @@
+ Metadata-Version: 2.4
+ Name: pitchxy
+ Version: 0.1.0
+ Summary: Handedness-aware pitch archetypes & scouting cards (XY ride/drop vs arm/glove side)
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Requires-Dist: pandas
+ Requires-Dist: numpy
+ Requires-Dist: pybaseball
+ Requires-Dist: scikit-learn
+ Requires-Dist: plotly
+ Requires-Dist: pyarrow
+ Requires-Dist: streamlit
src/pitchxy.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,7 @@
+ pyproject.toml
+ src/pitchxy.egg-info/PKG-INFO
+ src/pitchxy.egg-info/SOURCES.txt
+ src/pitchxy.egg-info/dependency_links.txt
+ src/pitchxy.egg-info/entry_points.txt
+ src/pitchxy.egg-info/requires.txt
+ src/pitchxy.egg-info/top_level.txt
src/pitchxy.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
src/pitchxy.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ pitchxy = pitchxy.cli:main
src/pitchxy.egg-info/requires.txt ADDED
@@ -0,0 +1,7 @@
+ pandas
+ numpy
+ pybaseball
+ scikit-learn
+ plotly
+ pyarrow
+ streamlit
src/pitchxy.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+
src/plots.py ADDED
@@ -0,0 +1,61 @@
+ from __future__ import annotations
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+
+
+ def movement_scatter_xy(
+     df: pd.DataFrame, color="pitch_type", facet_by_handedness=False
+ ):
+     dfp = df.copy()
+     if facet_by_handedness:
+         fig = px.scatter(
+             dfp,
+             x="hb_as_in",
+             y="ivb_in",
+             color=color,
+             facet_col="p_throws",
+             hover_data=[
+                 "player_name",
+                 "pitch_type",
+                 "p_throws",
+                 "velo",
+                 "whiff_rate",
+                 "gb_rate",
+                 "csw",
+             ],
+         )
+     else:
+         fig = px.scatter(
+             dfp,
+             x="hb_as_in",
+             y="ivb_in",
+             color=color,
+             hover_data=[
+                 "player_name",
+                 "pitch_type",
+                 "p_throws",
+                 "velo",
+                 "whiff_rate",
+                 "gb_rate",
+                 "csw",
+             ],
+         )
+     fig.update_layout(
+         xaxis_title="Horizontal: Arm-Side (+) | Glove-Side (−)",
+         yaxis_title="Vertical: Ride (+) | Drop (−)",
+         legend_title_text=color,
+     )
+     fig.add_hline(y=0, line_dash="dot")
+     fig.add_vline(x=0, line_dash="dot")
+     return fig
+
+
+ def radar_quality(row: pd.Series):
+     cats = ["csw", "whiff_rate", "gb_rate", "zone_pct"]
+     vals = [row[c] for c in cats]
+     fig = go.Figure(data=go.Scatterpolar(r=vals, theta=cats, fill="toself"))
+     fig.update_layout(
+         polar=dict(radialaxis=dict(visible=True, range=[0, 1])), showlegend=False
+     )
+     return fig
src/streamlit_app.py ADDED
@@ -0,0 +1,40 @@
+ import altair as alt
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+
+ """
+ # Welcome to Streamlit!
+
+ Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
+ If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
+ forums](https://discuss.streamlit.io).
+
+ In the meantime, below is an example of what you can do with just a few lines of code:
+ """
+
+ num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
+ num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
+
+ indices = np.linspace(0, 1, num_points)
+ theta = 2 * np.pi * num_turns * indices
+ radius = indices
+
+ x = radius * np.cos(theta)
+ y = radius * np.sin(theta)
+
+ df = pd.DataFrame({
+     "x": x,
+     "y": y,
+     "idx": indices,
+     "rand": np.random.randn(num_points),
+ })
+
+ st.altair_chart(alt.Chart(df, height=700, width=700)
+     .mark_point(filled=True)
+     .encode(
+         x=alt.X("x", axis=None),
+         y=alt.Y("y", axis=None),
+         color=alt.Color("idx", legend=None, scale=alt.Scale()),
+         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
+     ))
src/tags.py ADDED
@@ -0,0 +1,71 @@
+ from __future__ import annotations
+ import numpy as np
+ import pandas as pd
+
+
+ def _mag_label(v, q25, q75, small="Subtle", mid="Moderate", big="Heavy"):
+     if v >= q75:
+         return big
+     if v <= q25:
+         return small
+     return mid
+
+
+ def _side_label(hb_as):
+     return "Arm-Side" if hb_as >= 0 else "Glove-Side"
+
+
+ def _vert_label(ivb):
+     return "Ride" if ivb >= 0 else "Drop"
+
+
+ def xy_cluster_tags(df_with_clusters: pd.DataFrame) -> dict[int, str]:
+     df = df_with_clusters.copy()
+
+     q_abs_ivb25 = np.nanquantile(np.abs(df["ivb_in"]), 0.25)
+     q_abs_ivb75 = np.nanquantile(np.abs(df["ivb_in"]), 0.75)
+     q_abs_hb25 = np.nanquantile(np.abs(df["hb_as_in"]), 0.25)
+     q_abs_hb75 = np.nanquantile(np.abs(df["hb_as_in"]), 0.75)
+
+     q_wh75 = np.nanquantile(df["whiff_rate"], 0.75)
+     q_gb75 = np.nanquantile(df["gb_rate"], 0.75)
+     q_zn75 = np.nanquantile(df["zone_pct"], 0.75)
+     q_wh50 = np.nanquantile(df["whiff_rate"], 0.50)
+     q_gb50 = np.nanquantile(df["gb_rate"], 0.50)
+     q_zn50 = np.nanquantile(df["zone_pct"], 0.50)
+
+     tags = {}
+     for c, sub in df.groupby("cluster"):
+         row = sub.mean(numeric_only=True)
+         dom_pt = (
+             sub["pitch_type"].mode().iloc[0]
+             if not sub["pitch_type"].mode().empty
+             else "Pitch"
+         )
+
+         side = _side_label(row["hb_as_in"])
+         vert = _vert_label(row["ivb_in"])
+         mag_side = _mag_label(abs(row["hb_as_in"]), q_abs_hb25, q_abs_hb75)
+         mag_vert = _mag_label(abs(row["ivb_in"]), q_abs_ivb25, q_abs_ivb75)
+
+         flavor = []
+         if row["whiff_rate"] >= q_wh75:
+             flavor.append("Whiff-First")
+         if row["gb_rate"] >= q_gb75:
+             flavor.append("Grounder-First")
+         if row["zone_pct"] >= q_zn75:
+             flavor.append("Strike-Throwing")
+         if not flavor:
+             diffs = {
+                 "Whiff-First": row["whiff_rate"] - q_wh50,
+                 "Grounder-First": row["gb_rate"] - q_gb50,
+                 "Strike-Throwing": row["zone_pct"] - q_zn50,
+             }
+             flavor.append(max(diffs, key=diffs.get))
+
+         side_noun = "Run" if side == "Arm-Side" else "Sweep"
+         vert_noun = "Ride" if vert == "Ride" else "Drop"
+         shape = f"{side} • {mag_side} {side_noun}, {mag_vert} {vert_noun}"
+         tags[c] = f"{dom_pt}: {shape} • " + " / ".join(flavor)
+
+     return tags
src/utils.py ADDED
@@ -0,0 +1,9 @@
+ from pathlib import Path
+
+ CACHE_DIR = Path("data/cache")
+ ARTIFACTS_DIR = Path("artifacts")
+
+
+ def ensure_dirs():
+     CACHE_DIR.mkdir(parents=True, exist_ok=True)
+     ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)