import contextlib
import re
import tempfile
from functools import lru_cache
from typing import Optional

import gradio as gr
from git import Repo
from httpx import Client
from huggingface_hub import create_repo, upload_folder
from toolz import groupby
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Shared HTTP client used for the GitHub API requests below.
client = Client()


def clone_into_temp_dir(github_repo_url):
    """Clone a GitHub repository into a fresh temporary directory.

    Args:
        github_repo_url: URL of the repository to clone.

    Returns:
        A ``(repo, temp_dir)`` tuple: the cloned ``Repo`` and the
        ``tempfile.TemporaryDirectory`` that holds it. The caller is
        responsible for calling ``temp_dir.cleanup()`` when done.
    """
    temp_dir = tempfile.TemporaryDirectory()
    try:
        # Pass the directory *path*, not the TemporaryDirectory object, so the
        # clone really lands inside the directory that cleanup() removes.
        repo = Repo.clone_from(github_repo_url, temp_dir.name)
    except Exception:
        # Do not leave an orphaned temporary directory behind on failure.
        temp_dir.cleanup()
        raise
    return repo, temp_dir


def upload_directory_to_hf(
    repo_id: str,
    directory: str,
    oauth_token: str,
):
    """Create the dataset repo (if needed) and upload a local folder to it.

    Args:
        repo_id: Destination repository id on the Hugging Face Hub.
        directory: Local folder whose contents are uploaded under ``data``.
        oauth_token: Token used to authenticate both Hub calls.
    """
    create_repo(
        repo_id,
        token=oauth_token,
        exist_ok=True,
        repo_type="dataset",
        private=False,
    )
    upload_folder(
        repo_id=repo_id,
        folder_path=directory,
        path_in_repo="data",
        repo_type="dataset",
        token=oauth_token,
        commit_message="Migrated from GitHub",
        ignore_patterns=[
            "*.git*",
            # "*README.md*",
            "*.DS_Store",
            "*.env",
        ],  # ignore git files and .env files
    )


def push_to_hf(
    source_github_repository,
    destination_hf_hub_repository,
    subdirectory,
    oauth_token: gr.OAuthToken,
):
    """Clone a GitHub repository and push it (or one folder of it) to the Hub.

    Args:
        source_github_repository: URL of the GitHub repository to migrate.
        destination_hf_hub_repository: Destination dataset repo id on the Hub.
        subdirectory: Optional folder inside the repository to migrate. May be
            a single path string or a list of selections (first one is used).
        oauth_token: OAuth token of the logged-in user, supplied by Gradio.

    Returns:
        A Markdown string linking to the destination dataset.
    """
    gr.Info("Cloning source GitHub repository...")
    repo, temporary_directory = clone_into_temp_dir(source_github_repository)
    try:
        gr.Info("Cloning source GitHub repository...Done")
        gr.Info("Syncing with Hugging Face Hub...")
        # The folder dropdown yields a list once it is in multiselect mode but
        # a plain string before that; indexing a string would take only its
        # first character.
        if isinstance(subdirectory, (list, tuple)):
            subdirectory = subdirectory[0] if subdirectory else None
        if subdirectory:
            src_directory = f"{repo.working_dir}/{subdirectory}"
        else:
            src_directory = repo.working_dir
        upload_directory_to_hf(
            repo_id=destination_hf_hub_repository,
            directory=src_directory,
            oauth_token=oauth_token.token,
        )
        gr.Info("Syncing with Hugging Face Hub...Done")
    finally:
        # Always remove the clone, even when the upload fails.
        temporary_directory.cleanup()
    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"


def extract_user_name_and_repo_from_url(github_url: str):
    """Extract ``(user_name, repo_name)`` from a GitHub repository URL.

    A trailing ``.git`` suffix, query string, fragment or surrounding
    whitespace is not included in the repository name.

    Returns:
        A ``(user_name, repo_name)`` tuple, or ``None`` when the URL does not
        look like a GitHub repository URL.
    """
    if not github_url:
        print("No match found in the GitHub URL.")
        return None
    pattern = r"https://github\.com/([^/\s]+)/([^/\s?#]+)"
    if match := re.search(pattern, github_url):
        user_name, repo_name = match[1], match[2]
        # Clone URLs end in ".git", which is not part of the repository name.
        if repo_name.endswith(".git") and len(repo_name) > len(".git"):
            repo_name = repo_name[: -len(".git")]
        return user_name, repo_name
    print("No match found in the GitHub URL.")
    return None


def get_files_and_directories(response):
    """Split a GitHub git-trees API response into file and directory paths.

    Args:
        response: HTTP response whose JSON body has a ``tree`` list of entries
            carrying ``type`` and ``path`` keys.

    Returns:
        A dict with ``files`` (entries of type ``blob``) and ``directories``
        (entries of type ``tree``), each a list of paths.
    """
    data = response.json()
    grouped_by_type = groupby(lambda item: item["type"], data["tree"])
    files = [entry["path"] for entry in grouped_by_type.get("blob", [])]
    directories = [entry["path"] for entry in grouped_by_type.get("tree", [])]
    return {"files": files, "directories": directories}


@lru_cache(maxsize=128)
def list_git_repo_files_and_directories(repo_url: str, branch: str = "main"):
    """List the top-level files and directories of a GitHub repository.

    Results (including ``None`` for failed lookups) are memoised per
    ``(repo_url, branch)``.

    Returns:
        The dict produced by ``get_files_and_directories``, or ``None`` when
        the URL cannot be parsed or the API does not answer with HTTP 200.
    """
    user_name_and_repo = extract_user_name_and_repo_from_url(repo_url)
    if user_name_and_repo is None:
        return None
    user_name, repo_name = user_name_and_repo
    url = f"https://api.github.com/repos/{user_name}/{repo_name}/git/trees/{branch}"
    response = client.get(url)
    if response.status_code == 200:
        return get_files_and_directories(response)
    return None


def show_files_and_directories(url: str):
    """Build the folder and file dropdowns for the repository at ``url``.

    Any failure while listing the repository (unparseable URL, network error,
    non-200 API answer) results in dropdowns with no choices rather than an
    error, since this runs on every change of the URL textbox.

    Returns:
        A ``(directories_dropdown, files_dropdown)`` tuple.
    """
    directories, files = [], []
    try:
        files_and_directories = list_git_repo_files_and_directories(url)
    except Exception as err:  # UI boundary: report, then fall back to empty
        print(f"Could not list repository contents: {err}")
        files_and_directories = None
    if files_and_directories:
        directories = files_and_directories.get("directories", [])
        files = files_and_directories.get("files", [])
    return gr.Dropdown(
        label="Directories",
        choices=directories,
        max_choices=1,
        visible=True,
        interactive=True,
        multiselect=True,
    ), gr.Dropdown(
        label="Files",
        choices=files,
        max_choices=None,
        visible=True,
        interactive=True,
        multiselect=True,
    )


def push_kaggle_to_hf(
    source_kaggle_dataset: str,
    destination_hf_hub_repository: str,
    file_path: str,
    oauth_token: gr.OAuthToken,
):
    """Pushes a Kaggle dataset to HuggingFace Hub using the HF dataset adapter

    Args:
        source_kaggle_dataset: Kaggle dataset handle to load.
        destination_hf_hub_repository: Destination dataset repo id on the Hub.
        file_path: File inside the Kaggle dataset to load; required.
        oauth_token: OAuth token of the logged-in user, supplied by Gradio.

    Returns:
        A Markdown string linking to the destination dataset.

    Raises:
        ValueError: If ``file_path`` is empty.
    """
    if not file_path:
        raise ValueError("File path must be specified for Kaggle datasets")

    gr.Info("Loading Kaggle dataset...")
    dataset = kagglehub.load_dataset(
        KaggleDatasetAdapter.HUGGING_FACE,
        source_kaggle_dataset,
        file_path,
    )
    gr.Info("Loading Kaggle dataset...Done")

    gr.Info("Pushing to Hugging Face Hub...")
    dataset.push_to_hub(
        destination_hf_hub_repository,
        token=oauth_token.token,
    )
    gr.Info("Pushing to Hugging Face Hub...Done")

    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"


html_text_app_description = """ While GitHub and Kaggle are great platforms, the Hugging Face Datasets Hub is a better place to host and share datasets. Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:

This app will help you migrate datasets currently hosted on GitHub or Kaggle to the Hugging Face Datasets Hub. """

with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.HTML(
        """

Dataset Migration Tool

✨ Migrate datasets to Hugging Face Hub in a few steps ✨
"""
    )
    with gr.Row():
        gr.LoginButton(size="sm")

    with gr.Tabs() as tabs:
        with gr.Tab("GitHub"):
            gr.Markdown("### Location of existing dataset")
            gr.Markdown(
                "URL for the GitHub repository where the dataset is currently hosted"
            )
            source_github_repository = gr.Textbox(
                lines=1, label="Source GitHub Repository URL"
            )
            with gr.Accordion("Advanced Options", open=False):
                gr.Markdown("### Select files and folder to migrate")
                gr.Markdown(
                    "(Optional): select a specific folder and/or files to migrate from the GitHub repository. If you select a folder all the files in that folder will be migrated."
                )
                folder_in_github_repo = gr.Dropdown(
                    None,
                    label="Folder in the GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                files_in_github_repo = gr.Dropdown(
                    None,
                    label="Files in GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
            # Refresh both dropdowns whenever the repository URL changes.
            source_github_repository.change(
                show_files_and_directories,
                [source_github_repository],
                [folder_in_github_repo, files_in_github_repo],
            )
            gr.Markdown("### Destination for your migrated dataset")
            destination_hf_hub_repository = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="i.e. /",
            )
            github_submit_btn = gr.Button("Migrate GitHub Dataset")
            github_result = gr.Markdown(label="Summary", visible=True)
            github_submit_btn.click(
                push_to_hf,
                [
                    source_github_repository,
                    destination_hf_hub_repository,
                    folder_in_github_repo,
                ],
                [github_result],
            )

        with gr.Tab("Kaggle"):
            gr.Markdown("### Source Kaggle Dataset")
            gr.Markdown("Enter the Kaggle dataset name and file path")
            source_kaggle_dataset = gr.Textbox(
                lines=1,
                label="Source Kaggle Dataset",
                placeholder="username/dataset-name",
            )
            kaggle_file_path = gr.Textbox(
                label="File path in dataset",
                placeholder="e.g., train.csv",
                info="Specify the file to migrate from the dataset",
            )
            gr.Markdown("### Destination for your migrated dataset")
            kaggle_destination_hf_hub = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="i.e. /",
            )
            kaggle_submit_btn = gr.Button("Migrate Kaggle Dataset")
            kaggle_result = gr.Markdown(label="Summary", visible=True)
            kaggle_submit_btn.click(
                push_kaggle_to_hf,
                [
                    source_kaggle_dataset,
                    kaggle_destination_hf_hub,
                    kaggle_file_path,
                ],
                [kaggle_result],
            )

    gr.Markdown(
        """You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card). If you have any questions or feedback feel free to reach out to us on using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
    )

demo.launch()