Spaces:
Sleeping
Sleeping
| # /// script | |
| # requires-python = ">=3.13" | |
| # dependencies = [ | |
| # "marimo", | |
| # "polars==1.23.0", | |
| # ] | |
| # /// | |
| import marimo | |
| __generated_with = "0.11.13" | |
| app = marimo.App(width="medium") | |
| def _(): | |
| import marimo as mo | |
| return (mo,) | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| # Basic operations on data | |
| _By [Joram Mutenge](https://www.udemy.com/user/joram-mutenge/)._ | |
| In this notebook, you'll learn how to perform arithmetic operations, comparisons, and conditionals on a Polars dataframe. We'll work with a DataFrame that tracks software usage by year, categorized as either Vintage (old) or Modern (new). | |
| """ | |
| ) | |
| return | |
| def _(): | |
| import polars as pl | |
| df = pl.DataFrame( | |
| { | |
| "software": [ | |
| "Lotus-123", | |
| "WordStar", | |
| "dBase III", | |
| "VisiCalc", | |
| "WinZip", | |
| "MS-DOS", | |
| "HyperCard", | |
| "WordPerfect", | |
| "Excel", | |
| "Photoshop", | |
| "Visual Studio", | |
| "Slack", | |
| "Zoom", | |
| "Notion", | |
| "Figma", | |
| "Spotify", | |
| "VSCode", | |
| "Docker", | |
| ], | |
| "users": [ | |
| 10000, | |
| 4500, | |
| 2500, | |
| 3000, | |
| 1800, | |
| 17000, | |
| 2200, | |
| 1900, | |
| 500000, | |
| 12000000, | |
| 1500000, | |
| 3000000, | |
| 4000000, | |
| 2000000, | |
| 2500000, | |
| 4500000, | |
| 6000000, | |
| 3500000, | |
| ], | |
| "category": ["Vintage"] * 8 + ["Modern"] * 10, | |
| "year": [ | |
| 1985, | |
| 1980, | |
| 1984, | |
| 1979, | |
| 1991, | |
| 1981, | |
| 1987, | |
| 1982, | |
| 1987, | |
| 1990, | |
| 1997, | |
| 2013, | |
| 2011, | |
| 2016, | |
| 2016, | |
| 2008, | |
| 2015, | |
| 2013, | |
| ], | |
| } | |
| ) | |
| df | |
| return df, pl | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ## Arithmetic | |
| ### Addition | |
| Let's add 42 users to each piece of software. This means adding 42 to each value under **users**. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| df.with_columns(pl.col("users") + 42) | |
| return | |
| def _(mo): | |
| mo.md(r"""Another way to perform the above operation is using the built-in function.""") | |
| return | |
| def _(df, pl): | |
| df.with_columns(pl.col("users").add(42)) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ### Subtraction | |
| Let's subtract 42 users to each piece of software. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| df.with_columns(pl.col("users") - 42) | |
| return | |
| def _(mo): | |
| mo.md(r"""Alternatively, you could subtract like this:""") | |
| return | |
| def _(df, pl): | |
| df.with_columns(pl.col("users").sub(42)) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ### Division | |
| Suppose the **users** values are inflated, we can reduce them by dividing by 1000. Here's how to do it. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| df.with_columns(pl.col("users") / 1000) | |
| return | |
| def _(mo): | |
| mo.md(r"""Or we could do it with a built-in expression.""") | |
| return | |
| def _(df, pl): | |
| df.with_columns(pl.col("users").truediv(1000)) | |
| return | |
| def _(mo): | |
| mo.md(r"""If we didn't care about the remainder after division (i.e remove numbers after decimal point) we could do it like this.""") | |
| return | |
| def _(df, pl): | |
| df.with_columns(pl.col("users").floordiv(1000)) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ### Multiplication | |
| Let's pretend the *user* values are deflated and increase them by multiplying by 100. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| (df.with_columns(pl.col("users") * 100)) | |
| return | |
| def _(mo): | |
| mo.md(r"""Polars also has a built-in function for multiplication.""") | |
| return | |
| def _(df, pl): | |
| df.with_columns(pl.col("users").mul(100)) | |
| return | |
| def _(mo): | |
| mo.md(r"""So far, we've only modified the values in an existing column. Let's create a column **decade** that will represent the years as decades. Thus 1985 will be 1980 and 2008 will be 2000.""") | |
| return | |
| def _(df, pl): | |
| (df.with_columns(decade=pl.col("year").floordiv(10).mul(10))) | |
| return | |
| def _(mo): | |
| mo.md(r"""We could create a new column another way as follows:""") | |
| return | |
| def _(df, pl): | |
| df.with_columns((pl.col("year").floordiv(10).mul(10)).alias("decade")) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| **Tip** | |
| Polars encounrages you to perform your operations as a chain. This enables you to take advantage of the query optimizer. We'll build upon the above code as a chain. | |
| ## Comparison | |
| ### Equal | |
| Let's get all the software categorized as Vintage. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| ( | |
| df.with_columns(decade=pl.col("year").floordiv(10).mul(10)) | |
| .filter(pl.col("category") == "Vintage") | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""We could also do a double comparison. VisiCal is the only software that's vintage and in the decade 1970s. Let's perform this comparison operation.""") | |
| return | |
| def _(df, pl): | |
| ( | |
| df.with_columns(decade=pl.col("year").floordiv(10).mul(10)) | |
| .filter(pl.col("category") == "Vintage") | |
| .filter(pl.col("decade") == 1970) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| We could also do this comparison in one line, if readability is not a concern | |
| **Notice** that we must enclose the two expressions between the `&` with parenthesis. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| ( | |
| df.with_columns(decade=pl.col("year").floordiv(10).mul(10)) | |
| .filter((pl.col("category") == "Vintage") & (pl.col("decade") == 1970)) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""We can also use the built-in function for equal to comparisons.""") | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(pl.col('category').eq('Vintage')) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ### Not equal | |
| We can also compare if something is `not` equal to something. In this case, category is not vintage. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(pl.col('category') != 'Vintage') | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""Or with the built-in function.""") | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(pl.col('category').ne('Vintage')) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""Or if you want to be extra clever, you can use the negation symbol `~` used in logic.""") | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(~pl.col('category').eq('Vintage')) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ### Greater than | |
| Let's get the software where the year is greater than 2008 from the above dataframe. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(~pl.col('category').eq('Vintage')) | |
| .filter(pl.col('year') > 2008) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""Or if we wanted the year 2008 to be included, we could use great or equal to.""") | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(~pl.col('category').eq('Vintage')) | |
| .filter(pl.col('year') >= 2008) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""We could do the previous two operations with built-in functions. Here's with greater than.""") | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(~pl.col('category').eq('Vintage')) | |
| .filter(pl.col('year').gt(2008)) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""And here's with greater or equal to""") | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(~pl.col('category').eq('Vintage')) | |
| .filter(pl.col('year').ge(2008)) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| **Note**: For "less than", and "less or equal to" you can use the operators `<` or `<=`. Alternatively, you can use built-in functions `lt` or `le` respectively. | |
| ### Is between | |
| Polars also allows us to filter between a range of values. Let's get the modern software were the year is between 2013 and 2016. This is inclusive on both ends (i.e. both years are part of the result). | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(pl.col('category').eq('Modern')) | |
| .filter(pl.col('year').is_between(2013, 2016)) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ### Or operator | |
| If we only want either one of the conditions in the comparison to be met, we could use `|`, which is the `or` operator. | |
| Let's get software that is either modern or used in the decade 1980s. | |
| """ | |
| ) | |
| return | |
| def _(df, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter((pl.col('category') == 'Modern') | (pl.col('decade') == 1980)) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ## Conditionals | |
| Polars also allows you create new columns based on a condition. Let's create a column *status* that will indicate if the software is "discontinued" or "in use". | |
| Here's a list of products that are no longer in use. | |
| """ | |
| ) | |
| return | |
| def _(): | |
| discontinued_list = ['Lotus-123', 'WordStar', 'dBase III', 'VisiCalc', 'MS-DOS', 'HyperCard'] | |
| return (discontinued_list,) | |
| def _(mo): | |
| mo.md(r"""Here's how we can get a dataframe of the products that are discontinued.""") | |
| return | |
| def _(df, discontinued_list, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .filter(pl.col('software').is_in(discontinued_list)) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""Now, let's create the **status** column.""") | |
| return | |
| def _(df, discontinued_list, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .with_columns(pl.when(pl.col('software').is_in(discontinued_list)) | |
| .then(pl.lit('Discontinued')) | |
| .otherwise(pl.lit('In use')) | |
| .alias('status') | |
| ) | |
| ) | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| ## Unique counts | |
| Sometimes you may want to see only the unique values in a column. Let's check the unique decades we have in our DataFrame. | |
| """ | |
| ) | |
| return | |
| def _(df, discontinued_list, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .with_columns(pl.when(pl.col('software').is_in(discontinued_list)) | |
| .then(pl.lit('Discontinued')) | |
| .otherwise(pl.lit('In use')) | |
| .alias('status') | |
| ) | |
| .select('decade').unique() | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""Finally, let's find out the number of software used in each decade.""") | |
| return | |
| def _(df, discontinued_list, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .with_columns(pl.when(pl.col('software').is_in(discontinued_list)) | |
| .then(pl.lit('Discontinued')) | |
| .otherwise(pl.lit('In use')) | |
| .alias('status') | |
| ) | |
| ['decade'].value_counts() | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""We could also rewrite the above code as follows:""") | |
| return | |
| def _(df, discontinued_list, pl): | |
| (df | |
| .with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
| .with_columns(pl.when(pl.col('software').is_in(discontinued_list)) | |
| .then(pl.lit('Discontinued')) | |
| .otherwise(pl.lit('In use')) | |
| .alias('status') | |
| ) | |
| .select('decade').to_series().value_counts() | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""Hopefully, we've picked your interest to try out Polars the next time you analyze your data.""") | |
| return | |
| def _(): | |
| return | |
| if __name__ == "__main__": | |
| app.run() | |