Spaces:

marimo-team
/

marimo-learn

Sleeping

marimo-learn / polars /04_basic_operations.py

Joram Mutenge

run notebook in sandbox

f57b8b5 9 months ago

13.8 kB

	# /// script
	# requires-python = ">=3.13"
	# dependencies = [
	# "marimo",
	# "polars==1.23.0",
	# ]
	# ///

	import marimo

	# Presumably the marimo version that wrote this notebook file — confirm before editing by hand.
	__generated_with = "0.11.13"
	# Notebook application object; every cell below registers itself through @app.cell.
	app = marimo.App(width="medium")


	@app.cell
	def _():
	    # Bind marimo as `mo` and return it so the cells that take `mo` can use it.
	    import marimo as mo
	    return (mo,)


	@app.cell(hide_code=True)
	def _(mo):
	    # Title, byline and introduction, rendered as markdown.
	    _intro = r"""
	        # Basic operations on data
	        _By [Joram Mutenge](https://www.udemy.com/user/joram-mutenge/)._

	        In this notebook, you'll learn how to perform arithmetic operations, comparisons, and conditionals on a Polars dataframe. We'll work with a DataFrame that tracks software usage by year, categorized as either Vintage (old) or Modern (new).
	        """
	    mo.md(_intro)
	    return


	@app.cell
	def _():
	    import polars as pl

	    # Sample data: 18 software titles with a user count, an era label and a year.
	    df = pl.DataFrame(
	        {
	            "software": [
	                "Lotus-123", "WordStar", "dBase III", "VisiCalc", "WinZip", "MS-DOS",
	                "HyperCard", "WordPerfect", "Excel", "Photoshop", "Visual Studio",
	                "Slack", "Zoom", "Notion", "Figma", "Spotify", "VSCode", "Docker",
	            ],
	            "users": [
	                10_000, 4_500, 2_500, 3_000, 1_800, 17_000, 2_200, 1_900,
	                500_000, 12_000_000, 1_500_000, 3_000_000, 4_000_000,
	                2_000_000, 2_500_000, 4_500_000, 6_000_000, 3_500_000,
	            ],
	            # The first 8 rows are labelled "Vintage", the remaining 10 "Modern".
	            "category": ["Vintage"] * 8 + ["Modern"] * 10,
	            "year": [
	                1985, 1980, 1984, 1979, 1991, 1981, 1987, 1982,
	                1987, 1990, 1997, 2013, 2011, 2016, 2016, 2008, 2015, 2013,
	            ],
	        }
	    )

	    # Display the frame as this cell's output.
	    df
	    return df, pl


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for the addition example.
	    _text = r"""
	        ## Arithmetic
	        ### Addition
	        Let's add 42 users to each piece of software. This means adding 42 to each value under users.
	        """
	    mo.md(_text)
	    return


	@app.cell
	def _(df, pl):
	    # Operator form: `+` adds 42 to every value in the "users" column.
	    df.with_columns(
	        pl.col("users") + 42,
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the method-call form of addition.
	    mo.md(
	        r"""Another way to perform the above operation is using the built-in function."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Method form of the addition above: `.add(42)` instead of `+ 42`.
	    df.with_columns(
	        pl.col("users").add(42),
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for the subtraction example.
	    mo.md(
	        r"""
	        ### Subtraction
	        Let's subtract 42 users from each piece of software.
	        """
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Operator form: `-` subtracts 42 from every value in the "users" column.
	    df.with_columns(
	        pl.col("users") - 42,
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the method-call form of subtraction.
	    mo.md(
	        r"""Alternatively, you could subtract like this:"""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Method form of the subtraction above: `.sub(42)` instead of `- 42`.
	    df.with_columns(
	        pl.col("users").sub(42),
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for the division example.
	    _text = r"""
	        ### Division
	        Suppose the users values are inflated, we can reduce them by dividing by 1000. Here's how to do it.
	        """
	    mo.md(_text)
	    return


	@app.cell
	def _(df, pl):
	    # Operator form: `/` divides every value in the "users" column by 1000.
	    df.with_columns(
	        pl.col("users") / 1000,
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the method-call form of division.
	    mo.md(
	        r"""Or we could do it with a built-in expression."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Method form of the division above: `.truediv(1000)` instead of `/ 1000`.
	    df.with_columns(
	        pl.col("users").truediv(1000),
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for floor division, which discards the remainder.
	    mo.md(
	        r"""If we didn't care about the remainder after division (i.e., remove the numbers after the decimal point), we could do it like this."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Floor division: divide "users" by 1000 and drop the remainder.
	    df.with_columns(
	        pl.col("users").floordiv(1000),
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for the multiplication example.
	    _text = r"""
	        ### Multiplication
	        Let's pretend the user values are deflated and increase them by multiplying by 100.
	        """
	    mo.md(_text)
	    return


	@app.cell
	def _(df, pl):
	    # Operator form: `*` multiplies every value in the "users" column by 100.
	    df.with_columns(pl.col("users") * 100)
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the method-call form of multiplication.
	    mo.md(
	        r"""Polars also has a built-in function for multiplication."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Method form of the multiplication above: `.mul(100)` instead of `* 100`.
	    df.with_columns(
	        pl.col("users").mul(100),
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for deriving a new "decade" column from "year".
	    mo.md(
	        r"""So far, we've only modified the values in an existing column. Let's create a column decade that will represent the years as decades. Thus 1985 will be 1980 and 2008 will be 2000."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Keyword form: the keyword name becomes the new column's name.
	    # Floor-dividing by 10 and multiplying by 10 turns e.g. 1985 into 1980.
	    df.with_columns(
	        decade=pl.col("year").floordiv(10).mul(10),
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the `.alias()` way of naming the new column.
	    mo.md(
	        r"""We could create a new column another way as follows:"""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Alias form: same expression as the keyword form, named via `.alias("decade")`.
	    df.with_columns(
	        pl.col("year").floordiv(10).mul(10).alias("decade"),
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Chaining tip, followed by the heading and lead-in for equality filters.
	    # "Tip:" carries a colon so it does not run into the sentence when rendered.
	    mo.md(
	        r"""
	        Tip: Polars encourages you to perform your operations as a chain. This enables you to take advantage of the query optimizer. We'll build upon the above code as a chain.

	        ## Comparison
	        ### Equal
	        Let's get all the software categorized as Vintage.
	        """
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Chain: add the "decade" column, then keep rows whose category equals "Vintage".
	    (
	        df.with_columns(
	            decade=pl.col("year").floordiv(10).mul(10),
	        )
	        .filter(pl.col("category") == "Vintage")
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for filtering on two conditions (category and decade).
	    mo.md(
	        r"""We could also do a double comparison. VisiCalc is the only software that's vintage and in the decade 1970s. Let's perform this comparison operation."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Two successive filters: both conditions must hold for a row to remain.
	    (
	        df.with_columns(
	            decade=pl.col("year").floordiv(10).mul(10),
	        )
	        .filter(pl.col("category") == "Vintage")
	        .filter(pl.col("decade") == 1970)
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for combining both conditions in a single filter with `&`.
	    mo.md(
	        r"""
	        We could also do this comparison in one line, if readability is not a concern.

	        Notice that we must enclose each of the two expressions around the `&` in parentheses.
	        """
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Single filter: the two parenthesised comparisons are combined with `&`.
	    (
	        df.with_columns(
	            decade=pl.col("year").floordiv(10).mul(10),
	        )
	        .filter(
	            (pl.col("category") == "Vintage") & (pl.col("decade") == 1970)
	        )
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the method-call form of the equality comparison.
	    mo.md(
	        r"""We can also use the built-in function for equal to comparisons."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Method form of `== "Vintage"`: `.eq("Vintage")`.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(pl.col("category").eq("Vintage"))
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for the "not equal" comparison.
	    _text = r"""
	        ### Not equal
	        We can also compare if something is `not` equal to something. In this case, category is not vintage.
	        """
	    mo.md(_text)
	    return


	@app.cell
	def _(df, pl):
	    # Operator form: keep rows whose category is not "Vintage".
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(pl.col("category") != "Vintage")
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the method-call form of "not equal".
	    mo.md(
	        r"""Or with the built-in function."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Method form of `!= "Vintage"`: `.ne("Vintage")`.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(pl.col("category").ne("Vintage"))
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for negating an equality test with `~`.
	    mo.md(
	        r"""Or if you want to be extra clever, you can use the negation symbol `~` used in logic."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # `~` negates the `.eq("Vintage")` test, keeping the non-Vintage rows.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(~pl.col("category").eq("Vintage"))
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for the "greater than" comparison.
	    _text = r"""
	        ### Greater than
	        Let's get the software where the year is greater than 2008 from the above dataframe.
	        """
	    mo.md(_text)
	    return


	@app.cell
	def _(df, pl):
	    # Builds on the non-Vintage filter, then keeps years strictly above 2008.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(~pl.col("category").eq("Vintage"))
	        .filter(pl.col("year") > 2008)
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the inclusive (>=) variant of the year filter.
	    mo.md(
	        r"""Or if we wanted the year 2008 to be included, we could use greater than or equal to."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Same chain as the `>` example, but `>=` also keeps the year 2008 itself.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(~pl.col("category").eq("Vintage"))
	        .filter(pl.col("year") >= 2008)
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the method-call forms of `>` and `>=`.
	    mo.md(
	        r"""We could do the previous two operations with built-in functions. Here's with greater than."""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Method form of `> 2008`: `.gt(2008)`.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(~pl.col("category").eq("Vintage"))
	        .filter(pl.col("year").gt(2008))
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the method-call form of `>=`.
	    mo.md(
	        r"""And here's with greater or equal to"""
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Method form of `>= 2008`: `.ge(2008)`.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(~pl.col("category").eq("Vintage"))
	        .filter(pl.col("year").ge(2008))
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Note on the less-than variants, then heading and lead-in for range filters.
	    mo.md(
	        r"""
	        Note: For "less than", and "less or equal to" you can use the operators `<` or `<=`. Alternatively, you can use built-in functions `lt` or `le` respectively.

	        ### Is between
	        Polars also allows us to filter between a range of values. Let's get the modern software where the year is between 2013 and 2016. This is inclusive on both ends (i.e. both years are part of the result).
	        """
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Keep "Modern" rows whose year falls in the 2013 to 2016 range.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(pl.col("category").eq("Modern"))
	        .filter(pl.col("year").is_between(2013, 2016))
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for combining conditions with `|`.
	    # The operator is written as a plain `|`: inside a code span a backslash
	    # would be rendered literally.
	    mo.md(
	        r"""
	        ### Or operator
	        If we only want either one of the conditions in the comparison to be met, we could use `|`, which is the `or` operator.

	        Let's get software that is either modern or used in the decade 1980s.
	        """
	    )
	    return


	@app.cell
	def _(df, pl):
	    # Keep rows that are "Modern" OR fall in the 1980 decade; `|` combines the
	    # two parenthesised comparisons. (A backslash before `|` is a syntax error.)
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter((pl.col("category") == "Modern") | (pl.col("decade") == 1980))
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for conditional column creation.
	    mo.md(
	        r"""
	        ## Conditionals
	        Polars also allows you to create new columns based on a condition. Let's create a column status that will indicate if the software is "discontinued" or "in use".

	        Here's a list of products that are no longer in use.
	        """
	    )
	    return


	@app.cell
	def _():
	    # Titles treated as discontinued by the cells that build the "status" column.
	    discontinued_list = [
	        "Lotus-123",
	        "WordStar",
	        "dBase III",
	        "VisiCalc",
	        "MS-DOS",
	        "HyperCard",
	    ]
	    return (discontinued_list,)


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for filtering by membership in the discontinued list.
	    mo.md(
	        r"""Here's how we can get a dataframe of the products that are discontinued."""
	    )
	    return


	@app.cell
	def _(df, discontinued_list, pl):
	    # Keep only the rows whose software name appears in `discontinued_list`.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .filter(pl.col("software").is_in(discontinued_list))
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for building the conditional "status" column.
	    mo.md(
	        r"""Now, let's create the status column."""
	    )
	    return


	@app.cell
	def _(df, discontinued_list, pl):
	    # when/then/otherwise: label listed software "Discontinued", the rest "In use".
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .with_columns(
	            pl.when(pl.col("software").is_in(discontinued_list))
	            .then(pl.lit("Discontinued"))
	            .otherwise(pl.lit("In use"))
	            .alias("status")
	        )
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Section heading and lead-in for listing unique values.
	    _text = r"""
	        ## Unique counts
	        Sometimes you may want to see only the unique values in a column. Let's check the unique decades we have in our DataFrame.
	        """
	    mo.md(_text)
	    return


	@app.cell
	def _(df, discontinued_list, pl):
	    # Extend the chain built so far, then reduce it to the distinct decades.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .with_columns(
	            pl.when(pl.col("software").is_in(discontinued_list))
	            .then(pl.lit("Discontinued"))
	            .otherwise(pl.lit("In use"))
	            .alias("status")
	        )
	        .select("decade")
	        .unique()
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for counting rows per decade.
	    mo.md(
	        r"""Finally, let's find out the number of software used in each decade."""
	    )
	    return


	@app.cell
	def _(df, discontinued_list, pl):
	    # Pull the "decade" column out by subscript and count each value's occurrences.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .with_columns(
	            pl.when(pl.col("software").is_in(discontinued_list))
	            .then(pl.lit("Discontinued"))
	            .otherwise(pl.lit("In use"))
	            .alias("status")
	        )["decade"]
	        .value_counts()
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Lead-in for the select/to_series variant of the count.
	    mo.md(
	        r"""We could also rewrite the above code as follows:"""
	    )
	    return


	@app.cell
	def _(df, discontinued_list, pl):
	    # Same count as the subscript version, written as an all-method chain.
	    (
	        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
	        .with_columns(
	            pl.when(pl.col("software").is_in(discontinued_list))
	            .then(pl.lit("Discontinued"))
	            .otherwise(pl.lit("In use"))
	            .alias("status")
	        )
	        .select("decade")
	        .to_series()
	        .value_counts()
	    )
	    return


	@app.cell(hide_code=True)
	def _(mo):
	    # Closing remark for the notebook.
	    mo.md(
	        r"""Hopefully, we've piqued your interest to try out Polars the next time you analyze your data."""
	    )
	    return


	@app.cell
	def _():
	    # Empty trailing cell: takes no inputs and defines nothing.
	    return


	# Run the notebook app when this file is executed as a script.
	if __name__ == "__main__":
	    app.run()