# %% Imports (standard library)
import io
import math
import os
import sys
import threading
import time


# %% Helpers: Hugging Face authentication and folder selection
def _login_from_env(env_var="HF_TOKEN"):
    """Log in to the Hugging Face Hub with a token taken from the environment.

    The access token is read from ``env_var`` rather than written into the
    notebook, so it never ends up in the saved source or in version control.

    Args:
        env_var: Name of the environment variable that holds the token.

    Returns:
        True if a token was found and passed to ``huggingface_hub.login``;
        False if the variable is unset or empty, in which case no login is
        attempted.
    """
    token = os.environ.get(env_var)
    if not token:
        print(f"{env_var} is not set; skipping login and relying on cached Hugging Face credentials.")
        return False

    # Imported here so the rest of this code does not need the hub client
    # unless a login is actually performed.
    from huggingface_hub import login

    login(token=token)
    return True


def get_folder_subset(repo_id, start_percent, end_percent, repo_type="dataset", api=None):
    """Return the repo files that live in a percentage slice of ``data/`` sub-folders.

    The first-level folders under ``data/`` are sorted by name, and the
    half-open slice ``[start_percent, end_percent)`` of that sorted list is
    selected. Both boundaries are rounded down to a whole folder index, so
    adjacent ranges such as (0, 25) and (25, 50) never overlap.

    Args:
        repo_id: Hub repository identifier, e.g. ``"user/name"``.
        start_percent: Inclusive lower bound of the slice, 0-100.
        end_percent: Exclusive upper bound of the slice, 0-100, not smaller
            than ``start_percent``.
        repo_type: Repository type forwarded to ``list_repo_files``.
        api: Optional object exposing ``list_repo_files(repo_id, repo_type=...)``.
            When omitted, a fresh ``huggingface_hub.HfApi`` is created.

    Returns:
        List of file paths (in the order returned by the hub) whose first
        folder under ``data/`` is part of the selected slice.

    Raises:
        ValueError: If the percentages are not ordered within 0-100.
    """
    # Without this check a negative percentage silently slices from the end
    # of the folder list and a reversed range silently selects nothing.
    if not 0 <= start_percent <= end_percent <= 100:
        raise ValueError(
            "start_percent and end_percent must satisfy "
            f"0 <= start_percent <= end_percent <= 100, got {start_percent} and {end_percent}"
        )

    if api is None:
        # Imported lazily: only needed when the caller did not supply a client.
        from huggingface_hub import HfApi

        api = HfApi()

    # Everything below "data/"; files elsewhere in the repo are ignored.
    repo_files = api.list_repo_files(repo_id, repo_type=repo_type)
    data_files = [path for path in repo_files if path.startswith("data/")]

    # A path needs at least two slashes ("data/<folder>/<file>") to sit inside
    # a sub-folder; files directly under "data/" do not define a folder.
    folders_in_data = sorted({path.split("/")[1] for path in data_files if path.count("/") > 1})

    total_folders = len(folders_in_data)
    start_index = math.floor(total_folders * start_percent / 100)
    end_index = math.floor(total_folders * end_percent / 100)
    selected_folders = folders_in_data[start_index:end_index]

    print(f"Total folders: {total_folders}")
    print(f"Selected folders ({start_percent}% - {end_percent}%): {len(selected_folders)}")
    print("Folders:", ', '.join(selected_folders))

    # Set membership keeps the final filter linear in the number of files.
    selected_lookup = set(selected_folders)
    return [path for path in data_files if path.split("/")[1] in selected_lookup]


# %% Helpers: redirect notebook output to a log file
class _LogFileStream(io.TextIOBase):
    """Write-only text stream that appends everything written to it to a file.

    The file is opened once and kept open. Text is handed straight to the
    file object and is not retained in memory by this class.
    """

    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        # The stream is written from the main thread and flushed from a
        # background thread, so file access is serialized.
        self._lock = threading.Lock()
        self._file = None  # set first so flush()/close() are safe if open() fails
        self._file = open(filename, "a", encoding="utf-8")

    def writable(self):
        return True

    def write(self, text):
        """Append ``text`` to the log file and return the number of characters written."""
        with self._lock:
            self._file.write(text)
        return len(text)

    def flush(self):
        """Push buffered text to disk; does nothing once the file is closed."""
        with self._lock:
            if self._file is not None and not self._file.closed:
                self._file.flush()

    def close(self):
        """Flush and close the underlying file."""
        if self.closed:
            return
        try:
            super().close()  # flushes through self.flush() before marking the stream closed
        finally:
            with self._lock:
                if self._file is not None:
                    self._file.close()


def start_logging(log_file_path='cell_75%_output.log', interval=5):
    """Redirect ``sys.stdout`` and ``sys.stderr`` to an append-only log file.

    After this call, anything printed (including the confirmation message
    emitted at the end of this function) goes to ``log_file_path`` instead of
    the previous stdout/stderr. A daemon thread flushes the file every
    ``interval`` seconds so the log on disk stays reasonably current during
    long-running cells.

    Args:
        log_file_path: File that receives the output; it is opened in append
            mode with UTF-8 encoding.
        interval: Seconds between two background flushes.
    """
    log_stream = _LogFileStream(log_file_path)

    # Both streams share one object so their output is interleaved in the
    # order it was produced.
    sys.stdout = log_stream
    sys.stderr = log_stream

    def flush_periodically():
        # Ends on its own once the stream has been closed.
        while not log_stream.closed:
            time.sleep(interval)
            log_stream.flush()

    logging_thread = threading.Thread(target=flush_periodically, daemon=True)
    logging_thread.start()

    print(f"Logging started. Output will be saved to {log_file_path} every {interval} seconds.")


# %% Notebook entry point
# Inside a Jupyter kernel __name__ is "__main__", so everything below runs as
# ordinary top-level cell code. The guard only keeps these side effects
# (network access, stdout/stderr redirection) from firing when the
# definitions above are imported from somewhere else.
if __name__ == "__main__":
    from huggingface_hub import HfApi  # noqa: F401  (kept available in the global namespace)
    from IPython import get_ipython  # noqa: F401  (not used by the code shown here; kept available)

    # Authenticate with the token from the HF_TOKEN environment variable.
    _login_from_env()

    # Replace with the actual repo_id
    repo_id = "litagin/moe-speech"

    # This run handles the third quarter of the folders. The remaining
    # quarters are (0, 25), (25, 50) and (75, 100).
    third_quarter = get_folder_subset(repo_id, 50, 75)

    # Start logging
    start_logging()