{ "cells": [ { "cell_type": "markdown", "id": "6fb06d81-1778-403c-b15b-d68200a5e6b5", "metadata": {}, "source": [ "# Spark on Hugging Face" ] }, { "cell_type": "code", "execution_count": null, "id": "7399a5ed-aea8-45cf-866f-2decd7097456", "metadata": { "tags": [] }, "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "spark = SparkSession.builder.appName(\"demo\").getOrCreate()" ] }, { "cell_type": "markdown", "id": "8bf07f63-6fed-4cf9-8fee-5f3a5fb6bed1", "metadata": { "tags": [] }, "source": [ "Example usage of `read_parquet` and `write_parquet` (both are imported in the next code cell):\n", "\n", "```python\n", "# Load the BAAI/Infinity-Instruct dataset\n", "df = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\")\n", "\n", "# Load only one column\n", "df_langdetect_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", columns=[\"langdetect\"])\n", "\n", "# Load only the rows that match a filter (here: rows whose langdetect value is zh-cn)\n", "criteria = [(\"langdetect\", \"=\", \"zh-cn\")]\n", "df_chinese_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", filters=criteria)\n", "\n", "# Save the filtered dataset (replace `username` with your own)\n", "write_parquet(df_chinese_only, \"hf://datasets/username/Infinity-Instruct-Chinese-Only\")\n", "```" ] }, { "cell_type": "code", "execution_count": null, "id": "ca71b3ac-3291-4e4e-8fee-b3550b0426d6", "metadata": { "tags": [] }, "outputs": [], "source": [ "from hf_spark_utils import read_parquet, write_parquet, set_session\n", "set_session(spark)" ] }, { "cell_type": "markdown", "id": "07ea62a4-7549-4a75-8a12-9d830f6e3cde", "metadata": {}, "source": [ "#### (Optional) Login" ] }, { "cell_type": "code", "execution_count": null, "id": "343b3a9a-2dce-492b-9384-703368ba3975", "metadata": { "tags": [] }, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "notebook_login(new_session=False)" ] }, { "cell_type": "markdown", "id": "332b7609-f0eb-4703-aea6-fec3d09f5870", "metadata": {}, "source": [ "#### Run your code:" ] }, { "cell_type": "code", "execution_count": null, "id": 
"6c0dfe01-9190-454c-9c52-216f74d339e1", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 5 }