Spaces:

marimo-team
/

marimo-learn

Running

File size: 6,772 Bytes

dd84d5b

import marimo

__generated_with = "0.11.30"
app = marimo.App(width="medium")


@app.cell(hide_code=True)
def _introduction(mo):
    mo.md(
        """
        # DuckDB: An Embeddable Analytical Database System

        ### What is DuckDB?

        [DuckDB](https://duckdb.org/) is a high-performance, in-process analytical database management system (DBMS) designed for speed and simplicity. It's particularly well-suited for analytical query workloads, offering a robust SQL interface and efficient data processing capabilities. This document highlights key features and aspects of DuckDB relevant for a course on database systems or data analysis.

        ### [Key Features](https://duckdb.org/why_duckdb):

        - In-Process: Easy integration, zero external dependencies.
        - Portable: Works on various OS and architectures.
        - Columnar Storage: Efficient for analytical queries.
        - Vectorized Execution: Speeds up data processing.
        - ACID Transactions: Ensures data integrity.
        - Multi-Language APIs: Python, R, Java, etc.

        ### [Use Cases](https://github.com/davidgasquez/awesome-duckdb?tab=readme-ov-file):

        - Data analysis and exploration
        - Embedded analytics in applications
        - ETL (Extract, Transform, Load) processes
        - Data science and machine learning workflows
        - Rapid prototyping of data analysis pipelines.

        ### [Installation](https://duckdb.org/docs/installation/?version=stable&environment=python):

        - The DuckDB Python API can be installed using pip:
        ```
        pip install duckdb
        ```
        - It is also possible to install DuckDB using conda:
        ```
        conda install python-duckdb -c conda-forge.
        ```

        **Python version:** DuckDB requires Python 3.7 or newer.
        """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        # [1. DuckDB Basic Connection](https://duckdb.org/docs/stable/connect/overview.html)

        DuckDB can run entirely in your computer's RAM, known as in-memory mode, which you can enable by using `:memory:` as the database name or by not providing a database file. It's crucial to understand that this means all data is temporary and will be completely erased when the program closes, as it isn't saved to disk.
        """
    )
    return


@app.cell
def _database_connection(duckdb):
    # Create a connection to an in-memory database
    database_connection = duckdb.connect(database=":memory:")
    print(
        f"DuckDB version: {database_connection.execute('SELECT version()').fetchone()[0]}"
    )
    return (database_connection,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""# [2. Creating Tables](https://duckdb.org/docs/stable/sql/statements/create_table.html)"""
    )
    return


@app.cell
def _create_users_table(database_connection):
    database_connection.execute(
        """
    CREATE TABLE users (
        id INTEGER,
        name VARCHAR,
        age INTEGER,
        registration_date DATE
    )
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""# [3. Instering data into table](https://duckdb.org/docs/stable/sql/statements/insert)"""
    )
    return


@app.cell
def _insert_user_data(database_connection):
    database_connection.execute(
        """
    INSERT INTO users VALUES
    (1, 'Alice', 25, '2021-01-01'),
    (2, 'Bob', 30, '2021-02-01'),
    (3, 'Charlie', 35, '2021-03-01')
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""# [4. Basic Queries](https://duckdb.org/docs/stable/sql/query_syntax/select)"""
    )
    return


@app.cell
def _basic_queries(database_connection):
    # Select all data
    user_results = database_connection.execute("SELECT * FROM users").fetchall()
    for user_row in user_results:
        print(user_row)
    return user_results, user_row


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""# [5. Working with Polars](https://duckdb.org/docs/stable/guides/python/polars.html)"""
    )
    return


@app.cell
def _polars_dataframe(database_connection, pl):
    # Create a Polars DataFrame
    polars_dataframe = pl.DataFrame(
        {
            "id": [1, 2, 3],
            "name": ["Alice", "Bob", "Charlie"],
            "age": [25, 30, 35],
            "registration_date": ["2021-01-01", "2021-02-01", "2021-03-01"],
        }
    )

    # Register the Polars DataFrame as a DuckDB table
    database_connection.register("users_polars", polars_dataframe)

    # Query the Polars DataFrame using DuckDB
    polars_results = database_connection.execute(
        "SELECT * FROM users_polars"
    ).fetchall()
    print("New Table:")
    for polars_row in polars_results:
        print(polars_row)
    return polars_dataframe, polars_results, polars_row


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""# [6. Join Operations](https://duckdb.org/docs/stable/guides/performance/join_operations.html)"""
    )
    return


@app.cell
def _join_operations(database_connection):
    join_results = database_connection.execute(
        """
    SELECT u.id, u.name, u.age, nu.registration_date
    FROM users u
    JOIN users_polars nu ON u.age < nu.age
    """
    )
    print("Join Result:")
    for join_row in join_results.fetchall():
        print(join_row)
    return join_results, join_row


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""# [7. Aggregate Functions](https://duckdb.org/docs/stable/sql/functions/aggregates.html)"""
    )
    return


@app.cell
def _aggregate_functions(database_connection):
    aggregate_results = database_connection.execute(
        """
    SELECT AVG(age) as avg_age, MAX(age) as max_age, MIN(age) as min_age
    FROM (SELECT * FROM users UNION ALL SELECT * FROM users_polars) AS all_users
    """
    ).fetchall()
    print(
        f"Average Age: {aggregate_results[0][0]:.1f}, "
        f"Max Age: {aggregate_results[0][1]}, "
        f"Min Age: {aggregate_results[0][2]}"
    )
    return (aggregate_results,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""# [8. Converting Results to Polars DataFrames](https://duckdb.org/docs/stable/guides/python/polars.html)"""
    )
    return


@app.cell
def _convert_to_polars(database_connection):
    # -- 8. Converting Results to Polars DataFrames --
    # Convert the result to a Polars DataFrame
    polars_result_df = database_connection.execute("SELECT * FROM users").df()
    print("Result as Polars DataFrame:")
    print(polars_result_df)
    return (polars_result_df,)


@app.cell(hide_code=True)
def _():
    import marimo as mo
    import duckdb
    import polars as pl

    return duckdb, mo, pl


if __name__ == "__main__":
    app.run()