Spaces:

NeuML
/

baseball

Running

App Files Files Community

davidmezzetti committed on Jun 6, 2023

Commit

cf62ef7

•

1 Parent(s): 58839f9

Create app.py

Browse files

Files changed (1) hide show

app.py +418 -0

app.py ADDED Viewed

	@@ -0,0 +1,418 @@

+"""
+Baseball statistics application with txtai and Streamlit.
+Install txtai and streamlit to run:
+  pip install txtai streamlit
+"""
+import datetime
+import os
+import numpy as np
+import pandas as pd
+import streamlit as st
+from txtai.embeddings import Embeddings
class Stats:
    """
    Base stats class. Contains methods for loading, indexing and searching baseball stats.

    Subclasses implement loadcolumns, load, sort and vector. The remaining methods build the
    name lookup, the embeddings index and run searches on top of those hooks.
    """

    def __init__(self):
        """
        Creates a new Stats instance.
        """

        # Load columns
        self.columns = self.loadcolumns()

        # Load stats data
        self.stats = self.load()

        # Load names
        self.names = self.loadnames()

        # Build index
        self.vectors, self.data, self.embeddings = self.index()

    def loadcolumns(self):
        """
        Returns a list of data columns.

        Returns:
            list of columns
        """

        raise NotImplementedError

    def load(self):
        """
        Loads and returns raw stats.

        Returns:
            stats
        """

        raise NotImplementedError

    def sort(self, rows):
        """
        Sorts rows stored as a DataFrame.

        Args:
            rows: input DataFrame

        Returns:
            sorted DataFrame
        """

        raise NotImplementedError

    def vector(self, row):
        """
        Build a vector for input row.

        Args:
            row: input row

        Returns:
            row vector
        """

        raise NotImplementedError

    def loadnames(self):
        """
        Loads a name - player id dictionary. When two players share the same display name,
        later entries get the player id appended in parentheses to keep keys unique.

        Returns:
            {player name: player id}
        """

        names = {}

        # Get unique names
        rows = self.stats[["nameFirst", "nameLast", "playerID"]].drop_duplicates()
        for first, last, playerid in zip(rows["nameFirst"], rows["nameLast"], rows["playerID"]):
            # Name key - skip missing parts so the key doesn't contain a literal "nan"
            key = " ".join(str(part) for part in (first, last) if pd.notna(part))
            suffix = f" ({playerid})" if key in names else ""

            # Save name key - player id pair
            names[f"{key}{suffix}"] = playerid

        return names

    def index(self):
        """
        Builds an embeddings index to stats data. Returns vectors, input data and embeddings index.

        Returns:
            vectors, data, embeddings
        """

        # Build vector and data dictionaries in a single pass, keyed by year + player id
        vectors, data = {}, {}
        for _, row in self.stats.iterrows():
            uid = f'{row["yearID"]}{row["playerID"]}'
            vectors[uid] = self.transform(row)
            data[uid] = dict(row)

        # Vectors are already numeric arrays, transform passes them through unchanged
        embeddings = Embeddings({
            "transform": self.transform,
        })

        embeddings.index((uid, vector, None) for uid, vector in vectors.items())

        return vectors, data, embeddings

    def years(self, player):
        """
        Looks up the years active for a player along with the player's best statistical year.

        Args:
            player: player name

        Returns:
            start, end, best
        """

        if player in self.names:
            # First row after sorting is the player's best season
            df = self.sort(self.stats[self.stats["playerID"] == self.names[player]])
            return int(df["yearID"].min()), int(df["yearID"].max()), int(df["yearID"].iloc[0])

        # Defaults when the player is unknown
        return 1871, datetime.datetime.today().year, 1950

    def search(self, player=None, year=None, row=None, limit=10):
        """
        Runs an embeddings search. This method takes either a player-year or stats row as input.

        Args:
            player: player name to search
            year: year to search
            row: row of stats to search
            limit: max results to return

        Returns:
            list of results
        """

        if row:
            query = self.vector(row)
        else:
            # Lookup player key and build vector id
            query = self.vectors.get(f"{year}{self.names.get(player)}")

        results, ids = [], set()
        if query is None:
            return results

        # Request extra results since multiple seasons of the same player are collapsed
        for uid, _ in self.embeddings.search(query, limit * 5):
            # Ids are year + player id, the first 4 characters are the year
            playerid = uid[4:]

            # Only add unique players
            if playerid in ids:
                continue

            result = self.data[uid].copy()
            result["link"] = f'https://www.baseball-reference.com/players/{result["nameLast"].lower()[0]}/{result["bbrefID"]}.shtml'
            result["yearID"] = str(result["yearID"])
            results.append(result)
            ids.add(playerid)

            if len(ids) >= limit:
                break

        return results

    def transform(self, row):
        """
        Transforms a stats row into a vector. Arrays are returned unchanged.

        Args:
            row: stats row

        Returns:
            vector
        """

        if isinstance(row, np.ndarray):
            return row

        values = []
        for column in self.columns:
            value = row[column]

            # Missing, zero, NaN and infinite values all map to 0.0 so a single bad stat
            # (ex. a division by zero upstream) can't poison the whole vector
            values.append(value if value and np.isfinite(value) else 0.0)

        return np.array(values)
class Batting(Stats):
    """
    Batting stats.
    """

    # Numeric codes for fielding positions, anything not listed maps to 0
    POSITIONS = {"P": 1, "C": 2, "1B": 3, "2B": 4, "3B": 5, "SS": 6, "OF": 7}

    def loadcolumns(self):
        """
        Returns the list of batting columns used to build vectors.

        Returns:
            list of columns
        """

        return [
            "birthMonth", "age", "weight", "height", "yearID", "G", "AB", "R", "H", "1B", "2B", "3B", "HR", "RBI", "SB", "CS",
            "BB", "SO", "IBB", "HBP", "SH", "SF", "GIDP", "POS", "AVG", "OBP", "TB", "SLG", "OPS", "OPS+"
        ]

    def load(self):
        """
        Loads player, batting and fielding data and derives calculated batting columns.

        Returns:
            batting DataFrame
        """

        # Retrieve raw data from GitHub
        players = pd.read_csv("https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/master/core/People.csv")
        batting = pd.read_csv("https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/master/core/Batting.csv")
        fielding = pd.read_csv("https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/master/core/Fielding.csv")

        # Merge player data in
        batting = pd.merge(players, batting, how="inner", on=["playerID"])

        # Require player to have at least 350 plate appearances.
        # Copy so the column assignments below write to an independent frame, not a slice
        batting = batting[(batting["AB"] + batting["BB"]) >= 350].copy()

        # Derive primary player positions
        positions = self.positions(fielding)

        # Calculated columns
        batting["age"] = batting["yearID"] - batting["birthYear"]
        batting["POS"] = batting.apply(lambda row: self.position(positions, row), axis=1)
        batting["AVG"] = batting["H"] / batting["AB"]

        # Simplified on-base percentage using only hits, walks and at bats
        batting["OBP"] = (batting["H"] + batting["BB"]) / (batting["AB"] + batting["BB"])
        batting["1B"] = batting["H"] - batting["2B"] - batting["3B"] - batting["HR"]
        batting["TB"] = batting["1B"] + 2 * batting["2B"] + 3 * batting["3B"] + 4 * batting["HR"]
        batting["SLG"] = batting["TB"] / batting["AB"]
        batting["OPS"] = batting["OBP"] + batting["SLG"]

        # OPS scaled around 100 relative to the mean OPS of the loaded data
        batting["OPS+"] = 100 + (batting["OPS"] - batting["OPS"].mean()) * 100

        return batting

    def sort(self, rows):
        """
        Sorts rows by OPS+ descending.

        Args:
            rows: input DataFrame

        Returns:
            sorted DataFrame
        """

        return rows.sort_values(by="OPS+", ascending=False)

    def vector(self, row):
        """
        Builds a vector for an input row of batting stats. Derived stats (TB, AVG, OBP, SLG,
        OPS, OPS+) are calculated and written back into row before it is transformed.

        Args:
            row: input row

        Returns:
            row vector
        """

        atbats = row["AB"]
        appearances = row["AB"] + row["BB"]

        row["TB"] = row["1B"] + 2 * row["2B"] + 3 * row["3B"] + 4 * row["HR"]

        # Rate stats are undefined without at bats / plate appearances, transform maps None to 0.0
        row["AVG"] = row["H"] / atbats if atbats else None
        row["OBP"] = (row["H"] + row["BB"]) / appearances if appearances else None
        row["SLG"] = row["TB"] / atbats if atbats else None

        if row["OBP"] is None or row["SLG"] is None:
            row["OPS"] = None
            row["OPS+"] = None
        else:
            row["OPS"] = row["OBP"] + row["SLG"]
            row["OPS+"] = 100 + (row["OPS"] - self.stats["OPS"].mean()) * 100

        return self.transform(row)

    def positions(self, fielding):
        """
        Derives primary positions for players. The primary position for a player-year is the
        position of the fielding row with the most games.

        Args:
            fielding: fielding data

        Returns:
            {year + player id: (position, number of games)}
        """

        positions = {}
        for year, playerid, pos, games in zip(fielding["yearID"], fielding["playerID"], fielding["POS"], fielding["G"]):
            uid = f"{year}{playerid}"

            # Unknown or missing positions map to 0 to keep the column numeric
            position = self.POSITIONS.get(pos, 0)

            # Save position if not set or player played more at this position
            if uid not in positions or positions[uid][1] < games:
                positions[uid] = (position, games)

        return positions

    def position(self, positions, row):
        """
        Looks up primary position for player row.

        Args:
            positions: all player positions
            row: player row

        Returns:
            primary player position, 0 if not found
        """

        uid = f'{row["yearID"]}{row["playerID"]}'
        return positions[uid][0] if uid in positions else 0
class Pitching(Stats):
    """
    Pitching stats.
    """

    def loadcolumns(self):
        """
        Returns the list of pitching columns used to build vectors.

        Returns:
            list of columns
        """

        return [
            "birthMonth", "age", "weight", "height", "yearID", "W", "L", "G", "GS", "CG", "SHO", "SV", "IPouts",
            "H", "ER", "HR", "BB", "SO", "BAOpp", "ERA", "IBB", "WP", "HBP", "BK", "BFP", "GF", "R", "SH", "SF",
            "GIDP", "WHIP", "WADJ"
        ]

    def load(self):
        """
        Loads player and pitching data and derives calculated pitching columns.

        Returns:
            pitching DataFrame
        """

        # Retrieve raw data from GitHub
        players = pd.read_csv("https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/master/core/People.csv")
        pitching = pd.read_csv("https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/master/core/Pitching.csv")

        # Merge player data in
        pitching = pd.merge(players, pitching, how="inner", on=["playerID"])

        # Require player to have 20 appearances
        # Copy so the column assignments below write to an independent frame, not a slice
        pitching = pitching[pitching["G"] >= 20].copy()

        # Calculated columns
        pitching["age"] = pitching["yearID"] - pitching["birthYear"]

        # IPouts / 3 converts outs recorded to innings pitched
        pitching["WHIP"] = (pitching["BB"] + pitching["H"]) / (pitching["IPouts"] / 3)
        pitching["WADJ"] = (pitching["W"] + pitching["SV"]) / (pitching["ERA"] + pitching["WHIP"])

        return pitching

    def sort(self, rows):
        """
        Sorts rows by WADJ descending.

        Args:
            rows: input DataFrame

        Returns:
            sorted DataFrame
        """

        return rows.sort_values(by="WADJ", ascending=False)

    def vector(self, row):
        """
        Builds a vector for an input row of pitching stats. Derived stats (WHIP, WADJ) are
        calculated and written back into row before it is transformed.

        Args:
            row: input row

        Returns:
            row vector
        """

        outs = row["IPouts"]
        row["WHIP"] = (row["BB"] + row["H"]) / (outs / 3) if outs else None

        # Guard on the denominator itself, matching load(): an ERA of 0 with a non-zero WHIP
        # still produces a valid WADJ
        era, whip = row["ERA"], row["WHIP"]
        denominator = None if era is None or whip is None else era + whip
        row["WADJ"] = (row["W"] + row["SV"]) / denominator if denominator else None

        return self.transform(row)
class Application:
    """
    Main application.
    """

    def __init__(self):
        """
        Creates a new application.
        """

        # Batting stats
        self.batting = Batting()

        # Pitching stats
        self.pitching = Pitching()

    def run(self):
        """
        Runs a Streamlit application.
        """

        st.title("⚾ Baseball Statistics")
        st.markdown("""
            This application finds the best matching historical players using vector search with [txtai](https://github.com/neuml/txtai).
            Raw data is from the [Baseball Databank](https://github.com/chadwickbureau/baseballdatabank) GitHub project.
        """)

        self.player()

    def player(self):
        """
        Player tab.
        """

        st.markdown("Match by player-season. Each player search defaults to the best season sorted by OPS or Wins Adjusted.")

        category = st.radio("Stat", ["Batting", "Pitching"], horizontal=True, key="playerstat")
        stats, default = (self.batting, "Babe Ruth") if category == "Batting" else (self.pitching, "Cy Young")

        # Player name
        names = sorted(stats.names)

        # Fall back to the first entry when the default player isn't in the loaded data
        selected = names.index(default) if default in names else 0
        player = st.selectbox("Player", names, selected)

        # Player year - no slider needed when the player only has a single season
        start, end, best = stats.years(player)
        year = st.slider("Year", start, end, best) if start != end else start

        # Run search
        results = stats.search(player, year)

        # Display results, the first stats column is left out of the table
        self.display(results, ["nameFirst", "nameLast", "teamID"] + stats.columns[1:] + ["link"])

    def display(self, results, columns):
        """
        Displays a list of results.

        Args:
            results: list of results
            columns: column names
        """

        if results:
            st.dataframe(pd.DataFrame(results)[columns])
        else:
            st.write("Player-Year not found")
@st.cache_resource(show_spinner=False)
def create():
    """
    Builds the Application. The result is cached by Streamlit's resource cache.

    Returns:
        Application
    """

    application = Application()
    return application
if __name__ == "__main__":
    # Set the tokenizers parallelism flag before the application is created
    os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

    # Create and run application
    create().run()