|
import os |
|
from urllib.parse import urlparse |
|
import glob |
|
import shutil |
|
from tqdm import tqdm |
|
|
|
# Source directory: walked for ".py" files to rename and globbed for files
# to copy.
path = os.path.expanduser(
    "~/torch_datasets/github-python/mega_corpus_all_files"
)

# Destination directory that the selected files are copied into.
output_path = os.path.expanduser("~/torch_datasets/github-python/mega_licensed_all_files")
|
|
|
# Load both URL lists as sets, one entry per line. Surrounding whitespace is
# stripped and lines that are empty after stripping are dropped.
with open("python_files.txt", "r") as f:
    all_urls = set(filter(None, map(str.strip, f)))

with open("python_files_allowed.txt", "r") as f:
    allowed_urls = set(filter(None, map(str.strip, f)))
|
|
|
|
|
# Entries that appear in the allow-list but not in the full list. An empty
# result means the allow-list is a subset of the full list.
missing_urls = allowed_urls.difference(all_urls)

if not missing_urls:
    print("All URLs in python_files_allowed.txt are contained in python_files.txt.")
else:
    print(
        "The following URLs are in python_files_allowed.txt but not in python_files.txt:"
    )
    # One URL per line, in the set's iteration order.
    print(*missing_urls, sep="\n")
|
|
|
|
|
|
|
# Normalise file names below `path`: every name ending in ".py" is cut back to
# the text before its FIRST ".py" plus a single ".py" suffix
# (e.g. "module.py.py" -> "module.py").
# NOTE(review): because the cut happens at the first ".py", a name such as
# "a.pyx.py" also becomes "a.py" -- confirm that this is the intended rule.
for root, _, files in tqdm(os.walk(path)):
    for file in files:
        if not file.endswith(".py"):
            continue

        new_file_name = file.split(".py")[0] + ".py"
        if new_file_name == file:
            # Already in normal form; nothing to rename.
            continue

        old_file_path = os.path.join(root, file)
        new_file_path = os.path.join(root, new_file_name)

        # os.rename silently replaces an existing destination on POSIX and
        # raises FileExistsError on Windows. Either an unrelated file would be
        # destroyed or the whole run would abort, so colliding names are left
        # untouched and reported instead.
        if os.path.exists(new_file_path):
            print(
                f"Skipping rename of {old_file_path}: "
                f"{new_file_path} already exists"
            )
            continue

        os.rename(old_file_path, new_file_path)

print("Renaming completed.")
|
|
|
|
|
# Derive the set of allow-listed repositories, as "owner/repo" strings, from
# the URLs listed in python_files_allowed.txt.
with open("python_files_allowed.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]

repo_paths = set()
for url in urls:
    parsed = urlparse(url)
    # The first two non-empty path segments are taken as owner and repository.
    segments = [segment for segment in parsed.path.split("/") if segment]
    if not parsed.netloc or len(segments) < 2:
        # Splitting on "//" by hand raised IndexError for lines without a
        # scheme and produced "" or a bare owner for short URLs. An empty
        # entry is a substring of every file name, so it would select every
        # file. Skip such lines and report them instead.
        print(f"Skipping malformed URL: {url}")
        continue
    repo_paths.add("/".join(segments[:2]))

print(repo_paths)
|
|
|
# Copy every ".py" file located directly in `path` whose name refers to one of
# the allow-listed repositories into `output_path`, counting the copies.
num_existing = 0
all_files = glob.glob(os.path.join(path, "*.py"))

# shutil.copy does not create missing parent directories.
os.makedirs(output_path, exist_ok=True)

# File names are assumed to encode "owner/repo/..." with "_" in place of "/"
# (NOTE(review): confirm against the download step). The repository is
# converted to that encoding rather than the file name being decoded, so that
# repositories containing "_" in their owner or name can still match. Empty
# entries are dropped because "" is a substring of every file name.
repo_markers = {
    repo_path.replace("/", "_") for repo_path in repo_paths if repo_path
}

for file in (pbar := tqdm(all_files)):
    # Match on the base name only, so that components of the corpus directory
    # itself (e.g. "torch_datasets") can never satisfy the test.
    file_name = os.path.basename(file)
    # NOTE(review): this is a substring test, so "a_b" also matches files of a
    # repository "a/b_c"; tighten it if exact repository matching is needed.
    if any(marker in file_name for marker in repo_markers):
        num_existing += 1
        shutil.copy(file, os.path.join(output_path, file_name))
        pbar.set_description(f"Copied {num_existing} files")

print(f"Number of existing files: {num_existing}")
|
|