davanstrien HF staff committed on
Commit
00dd2d0
1 Parent(s): ab9b086

Add code for migrating GitHub repository to Hugging Face Hub

Browse files
Files changed (1) hide show
  1. app.py +162 -0
app.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ import re
3
+ import tempfile
4
+ from functools import lru_cache
5
+
6
+ import gradio as gr
7
+ from git import Repo
8
+ from httpx import Client
9
+ from huggingface_hub import create_repo, upload_folder
10
+ from toolz import groupby
11
+
12
# Shared HTTP client; this module uses it for its GitHub API requests.
client = Client()
13
+
14
+
15
def clone_into_temp_dir(github_repo_url):
    """Clone a Git repository into a freshly created temporary directory.

    Args:
        github_repo_url: URL of the repository to clone.

    Returns:
        A ``(repo, temp_dir)`` tuple: the object returned by
        ``Repo.clone_from`` and the ``tempfile.TemporaryDirectory`` holding the
        checkout. Call ``temp_dir.cleanup()`` once the checkout is no longer
        needed.

    Raises:
        Whatever ``Repo.clone_from`` raises; the temporary directory is removed
        before the exception propagates.
    """
    temp_dir = tempfile.TemporaryDirectory()
    try:
        # Pass the directory *path*: the TemporaryDirectory object itself is
        # not a path and would not point the clone at the temp location.
        repo = Repo.clone_from(github_repo_url, temp_dir.name)
    except Exception:
        # Do not leave an orphaned temp directory behind on a failed clone.
        temp_dir.cleanup()
        raise
    return repo, temp_dir
18
+
19
+
20
# NOTE(review): this import-time clone looks like a leftover from interactive
# testing -- nothing else in this file reads the module-level ``repo``
# (``push_to_hf`` binds its own local of the same name). Consider removing it.
# It is cloned exactly once here; an identical second call whose result was
# thrown away only cost another network clone on every start-up.
repo = clone_into_temp_dir("https://github.com/chen-zichen/XplainLLM_dataset/")
23
+
24
+
25
def upload_directory_to_hf(
    repo_id: str,
    directory: str,
    token: str,
    private: bool = False,
):
    """Create a dataset repository on the Hub if needed and upload a folder to it.

    Args:
        repo_id: Destination repository, passed to ``create_repo`` and
            ``upload_folder``.
        directory: Local folder whose contents are uploaded.
        token: Hugging Face token used for both Hub calls.
        private: Whether the repository is created as private.

    Returns:
        The value returned by ``upload_folder`` for the created commit.
    """
    # exist_ok=True lets the same destination be reused across runs.
    create_repo(
        repo_id,
        token=token,
        exist_ok=True,
        repo_type="dataset",
        private=private,
    )

    # Everything lands under "data/" in the destination repository.
    return upload_folder(
        folder_path=directory,
        path_in_repo="data",
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        commit_message="Migrated from GitHub",
        ignore_patterns=[
            "*.git*",
            "*README.md*",
            "*.DS_Store",
            "*.env",
        ],  # ignore git files, README, and .env files
    )
53
+
54
+
55
def push_to_hf(
    source_github_repository, destination_hf_hub_repository, hf_token, subdirectory=None
):
    """Clone a GitHub repository and upload it (or one folder of it) to the Hub.

    Args:
        source_github_repository: URL of the GitHub repository to clone.
        destination_hf_hub_repository: Destination dataset repo id on the Hub.
        hf_token: Hugging Face token used for the upload.
        subdirectory: Optional folder inside the clone to upload. Accepts a
            plain string or a list/tuple whose first item is the folder (the
            shape a multiselect dropdown produces). Falsy means "whole repo".

    Returns:
        A Markdown string linking to the destination dataset repository.
    """
    gr.Info("Cloning source GitHub repository...")
    repo, temporary_directory = clone_into_temp_dir(source_github_repository)
    try:
        gr.Info("Cloning source GitHub repository...Done")
        gr.Info("Syncing with Hugging Face Hub...")
        # Normalise the dropdown value: indexing a plain string with [0] would
        # silently pick its first character instead of the folder name.
        if isinstance(subdirectory, (list, tuple)):
            subdirectory = subdirectory[0] if subdirectory else None
        if subdirectory:
            src_directory = f"{repo.working_dir}/{subdirectory}"
        else:
            src_directory = repo.working_dir
        upload_directory_to_hf(
            repo_id=destination_hf_hub_repository,
            directory=src_directory,
            token=hf_token,
            private=False,
        )
        gr.Info("Syncing with Hugging Face Hub...Done")
    finally:
        # Remove the checkout even when the upload fails.
        temporary_directory.cleanup()
    # The "/" after "datasets" is required for the link to resolve.
    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
75
+
76
+
77
def extract_user_name_and_repo_from_url(github_url: str):
    """Extract the owner and repository name from a GitHub URL.

    A trailing ``.git`` suffix, query string, or fragment is not treated as
    part of the repository name.

    Args:
        github_url: URL such as ``https://github.com/<owner>/<repo>/``.

    Returns:
        A ``(user_name, repo_name)`` tuple, or ``None`` when the URL does not
        contain both parts (a message is printed in that case).
    """
    # Dots are escaped so only the real host matches; "?", "#" and whitespace
    # end a path segment so they never leak into the captured names.
    pattern = r"https?://(?:www\.)?github\.com/([^/\s?#]+)/([^/\s?#]+)"
    if match := re.search(pattern, github_url or ""):
        user_name, repo_name = match[1], match[2]
        # Clone URLs end in ".git", which is not part of the repository name.
        if repo_name.endswith(".git"):
            repo_name = repo_name[: -len(".git")]
        if repo_name:
            return user_name, repo_name
    print("No match found in the GitHub URL.")
    return None
83
+
84
+
85
def get_files_and_directories(response):
    """Split a Git tree listing into file paths and directory paths.

    Args:
        response: Object whose ``json()`` returns a mapping with a ``"tree"``
            list; each entry is a mapping with ``"type"`` and ``"path"`` keys.

    Returns:
        ``{"files": [...], "directories": [...]}`` where files are the paths of
        ``"blob"`` entries and directories the paths of ``"tree"`` entries, in
        listing order. Entries of any other type are ignored. A missing or
        null ``"tree"`` yields two empty lists.
    """
    data = response.json()
    paths_by_type = {"blob": [], "tree": []}
    # One pass over the listing; unknown entry types have no bucket and are skipped.
    for entry in data.get("tree") or []:
        bucket = paths_by_type.get(entry["type"])
        if bucket is not None:
            bucket.append(entry["path"])
    return {"files": paths_by_type["blob"], "directories": paths_by_type["tree"]}
95
+
96
+
97
@lru_cache(maxsize=128)
def _fetch_repo_tree(user_name: str, repo_name: str, branch: str):
    """Fetch and parse one tree listing from the GitHub API, cached on success.

    Raises:
        LookupError: If the API does not answer with status 200. Because
            ``lru_cache`` does not store calls that raise, a failed lookup is
            retried on the next call instead of being remembered.
    """
    url = f"https://api.github.com/repos/{user_name}/{repo_name}/git/trees/{branch}"
    response = client.get(url)
    if response.status_code != 200:
        raise LookupError(f"GitHub API returned {response.status_code} for {url}")
    return get_files_and_directories(response)


def list_git_repo_files_and_directories(repo_url: str, branch: str = "main"):
    """List the top-level files and directories of a GitHub repository.

    Successful lookups are cached per (owner, repository, branch), so
    different spellings of the same repository URL share one cache entry.

    Args:
        repo_url: GitHub repository URL.
        branch: Branch (tree reference) to list.

    Returns:
        The ``{"files": [...], "directories": [...]}`` mapping produced by
        ``get_files_and_directories``, or ``None`` when the URL cannot be
        parsed or the API does not answer with status 200.
    """
    user_name_and_repo = extract_user_name_and_repo_from_url(repo_url)
    if user_name_and_repo is None:
        return None
    user_name, repo_name = user_name_and_repo
    try:
        return _fetch_repo_tree(user_name, repo_name, branch)
    except LookupError:
        # Failed response: report "nothing found" without caching the failure.
        return None
107
+
108
+
109
def show_directories(url: str):
    """Build the folder dropdown for the repository at ``url``.

    Args:
        url: GitHub repository URL typed by the user.

    Returns:
        A ``gr.Dropdown`` whose choices are the repository's top-level
        directories. When the listing is unavailable (unparseable URL, failed
        request) the dropdown is returned with no choices.
    """
    try:
        files_and_directories = list_git_repo_files_and_directories(url)
    except Exception as error:
        # Best-effort UI refresh: report the problem instead of hiding it,
        # but never let a failed lookup break the page.
        print(f"Could not list directories for {url!r}: {error}")
        files_and_directories = None
    # The lookup yields None when nothing could be listed.
    directories = (
        files_and_directories.get("directories", []) if files_and_directories else []
    )
    return gr.Dropdown(
        label="Directories",
        choices=directories,
        max_choices=1,
        visible=True,
        interactive=True,
        multiselect=True,
    )
122
+
123
+
124
# Gradio UI: collects the source GitHub URL, an optional folder, the
# destination Hub repository and a token, then runs push_to_hf on click.
# Components appear in the order they are created below.
with gr.Blocks() as demo:
    gr.Markdown("# Migrate a GitHub Repository to Hugging Face Hub")
    gr.Markdown("URL for the GitHub repository where the dataset is currently hosted")
    source_github_repository = gr.Textbox(lines=1, label="Source GitHub Repository URL")
    gr.Markdown("OPTIONAL: If you want to upload a specific folder in the GitHub repo")
    # Starts with no choices; custom values are allowed so a folder can be typed in.
    folder_in_github_repo = gr.Dropdown(
        None,
        label="Folder in GitHub Repository to upload",
        allow_custom_value=True,
        visible=True,
    )
    # Refresh the folder dropdown whenever the repository URL changes.
    source_github_repository.change(
        show_directories, [source_github_repository], [folder_in_github_repo]
    )
    gr.Markdown("Destination repo for your dataset")
    destination_hf_hub_repository = gr.Textbox(
        label="Destination Hugging Face Repository",
        placeholder="username/repository_name",
    )
    gr.Markdown(
        """You need to provide a token with write access to the namespace you want to upload to.
You can generate or access your token from [here](https://huggingface.co/settings/token)."""
    )
    hf_token = gr.Textbox(label="Hugging Face Token", type="password")
    summit_btn = gr.Button()
    # Shows the Markdown string returned by push_to_hf.
    result = gr.Markdown(label="Summary", visible=True)
    # Input order must match push_to_hf's positional parameters.
    summit_btn.click(
        push_to_hf,
        [
            source_github_repository,
            destination_hf_hub_repository,
            hf_token,
            folder_in_github_repo,
        ],
        [result],
    )


# Start the app when this module is executed.
demo.launch()