# scraping/files/get_licensed_files.py
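"""Filter the scraped GitHub Python corpus down to the license-allowed files.

Reads two URL lists (python_files.txt and python_files_allowed.txt), checks that
the allowed list is a subset of the full list, normalizes duplicated ".py"
extensions in the scraped corpus, and copies every file whose flattened name
matches an allowed owner/repo into a separate output directory.
"""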
import os
from urllib.parse import urlparse
import glob
import shutil
from tqdm import tqdm
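# Source directory of scraped .py files and destination for the license-allowed copies.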
path = os.path.expanduser("~/torch_datasets/github-python/mega_corpus_all_files")
output_path = os.path.expanduser(
    "~/torch_datasets/github-python/mega_licensed_all_files"
)
# Make sure the destination directory exists before files are copied into it.
os.makedirs(output_path, exist_ok=True)
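# Load the full file list and the license-allowed subset as sets of URLs.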
with open("python_files.txt", "r") as f:
all_urls = {line.strip() for line in f if line.strip()}
with open("python_files_allowed.txt", "r") as f:
allowed_urls = {line.strip() for line in f if line.strip()}
# Find URLs in python_files_allowed.txt that are not in python_files.txt
missing_urls = allowed_urls - all_urls
if missing_urls:
    print(
        "The following URLs are in python_files_allowed.txt but not in python_files.txt:"
    )
    for url in missing_urls:
        print(url)
else:
    print("All URLs in python_files_allowed.txt are contained in python_files.txt.")
# Rename all .py files in the input path to ensure they have a single .py extension
for root, _, files in tqdm(os.walk(path)):
    for file in files:
        if file.endswith(".py"):
            old_file_path = os.path.join(root, file)
            # Collapse repeated trailing ".py" extensions (e.g. "foo.py.py" -> "foo.py")
            # without truncating names that merely contain ".py" in the middle.
            base_name = file
            while base_name.endswith(".py"):
                base_name = base_name[: -len(".py")]
            new_file_path = os.path.join(root, base_name + ".py")
            if old_file_path != new_file_path:
                os.rename(old_file_path, new_file_path)
print("Renaming completed.")
with open("python_files_allowed.txt", "r") as f:
urls = [line.strip() for line in f if line.strip()]
repo_paths = set(["/".join(url.split("//")[1].split("/")[1:3]) for url in urls])
print(repo_paths)
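# Copy every scraped file whose underscore-flattened path contains an allowed
# "owner/repo"; files that match no allowed repo are skipped.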
num_existing = 0
all_files = glob.glob(os.path.join(path, "*.py"))
for file in (pbar := tqdm(all_files)):
    if any(repo_path in file.replace("_", "/") for repo_path in repo_paths):
        num_existing += 1
        file_name = os.path.basename(file)
        shutil.copy(file, os.path.join(output_path, file_name))
        pbar.set_description(f"Copied {num_existing} files")
print(f"Number of licensed files copied: {num_existing}")