repo2txt / main.py
blaise-tk's picture
v0.0.1
6f74dd4
import os
import shutil
import subprocess
import argparse
def clone_repo(repo_url, clone_dir):
    """Clone a Git repository into the specified directory.

    Args:
        repo_url: URL (or local path) of the repository to clone.
        clone_dir: Destination directory handed to ``git clone``.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status (``check=True``).
        OSError: If the ``git`` executable cannot be started.
    """
    # "--" ends option parsing, so a URL or directory that begins with "-"
    # is treated as a positional argument rather than as a git option.
    subprocess.run(["git", "clone", "--", repo_url, clone_dir], check=True)
def extract_repo_name_from_url(repo_url):
    """Extract the repository name from a Git repository URL.

    The name is the last ``/``-separated segment of the URL, with trailing
    slashes ignored and a single trailing ``.git`` suffix removed.

    Args:
        repo_url: Repository URL such as
            ``https://github.com/user/repo.git``.

    Returns:
        The repository name, e.g. ``"repo"``.
    """
    repo_name = repo_url.rstrip("/").split("/")[-1]
    suffix = ".git"
    # Strip only the ".git" suffix. Splitting on the first "." would truncate
    # dotted names such as "socket.io" and reduce ".github" to an empty string.
    if repo_name.endswith(suffix) and len(repo_name) > len(suffix):
        repo_name = repo_name[: -len(suffix)]
    return repo_name
def get_directory_structure(root_dir):
    """Render the directory tree under ``root_dir`` as indented text.

    Each directory is listed as ``├── name/`` followed by its files, indented
    four spaces per nesting level. Directories named ``.git`` are not
    descended into or listed. Entries are sorted by name so the output is
    reproducible across filesystems.

    Args:
        root_dir: Directory whose tree should be rendered.

    Returns:
        The tree as a single newline-joined string (no trailing newline).
    """
    # Normalise first so a trailing separator cannot skew the depth
    # computation or leave the root with an empty basename.
    root_dir = os.path.normpath(root_dir)
    lines = []
    for root, dirs, files in os.walk(root_dir):
        # Assigning through dirs[:] prunes os.walk's own list, which stops it
        # from descending into .git, and fixes the traversal order.
        dirs[:] = sorted(name for name in dirs if name != ".git")
        # Depth comes from the path relative to the root; str.replace() on the
        # full path would also strip later occurrences of the root string.
        rel_path = os.path.relpath(root, root_dir)
        level = 0 if rel_path == os.curdir else rel_path.count(os.sep) + 1
        indent = " " * 4 * level
        lines.append(f"{indent}├── {os.path.basename(root)}/")
        subindent = " " * 4 * (level + 1)
        lines.extend(f"{subindent}├── {file}" for file in sorted(files))
    return "\n".join(lines)
def read_file_contents(file_path):
    """Return the text of a file, skipping anything inside a ``.git`` directory.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's contents decoded as UTF-8; the placeholder
        ``"[Ignored .git directory]"`` when a component of the path is named
        exactly ``.git``; or an ``"[Error reading file: ...]"`` message when
        the file cannot be opened or decoded.
    """
    # Compare whole path components: a substring test would also discard
    # ".gitignore", ".gitattributes" and everything under ".github/".
    if ".git" in os.path.normpath(file_path).split(os.sep):
        return "[Ignored .git directory]"
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except (UnicodeDecodeError, OSError) as e:
        # Binary or unreadable files are reported inline instead of aborting.
        return f"[Error reading file: {e}]"
def extract_all_files_contents(root_dir):
    """Collect the contents of every file under ``root_dir`` except ``.git``.

    Args:
        root_dir: Directory to walk.

    Returns:
        A dict mapping each file's path relative to ``root_dir`` to whatever
        ``read_file_contents`` returns for that file. Entries are inserted in
        sorted traversal order so the result is reproducible.
    """
    file_contents = {}
    for root, dirs, files in os.walk(root_dir):
        # Prune directories named exactly ".git" in place so os.walk never
        # descends into them. A substring test on the path would also drop
        # ".github/" and any checkout whose own path contains ".git".
        dirs[:] = sorted(name for name in dirs if name != ".git")
        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            relative_path = os.path.relpath(file_path, root_dir)
            file_contents[relative_path] = read_file_contents(file_path)
    return file_contents
def count_tokens(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    # str.split() with no arguments splits on runs of whitespace and drops
    # empty strings, so each yielded item is one token.
    return sum(1 for _ in text.split())
def _count_lines(text):
    """Return the number of lines in ``text``, counting an unterminated last line."""
    if not text:
        return 0
    return text.count("\n") + (0 if text.endswith("\n") else 1)


def write_output_file(output_file, directory_structure, file_contents):
    """Write the directory tree and every file's contents to ``output_file``.

    The file starts with a metadata header (total line and character counts
    over the tree and all file contents), followed by the tree and then each
    file's contents, every section wrapped in a triple-backtick fence.

    Args:
        output_file: Path of the text file to create or overwrite.
        directory_structure: Rendered directory tree.
        file_contents: Mapping of relative file path to file text, written in
            the mapping's iteration order.

    Raises:
        OSError: If ``output_file`` cannot be opened for writing.
    """
    texts = [directory_structure, *file_contents.values()]
    # Count lines rather than newline characters: text without a trailing
    # newline (the tree never has one) would otherwise be short by one.
    total_lines = sum(_count_lines(text) for text in texts)
    total_chars = sum(len(text) for text in texts)
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(f"Lines: {total_lines}\nCharacters: {total_chars}\n\n")
        file.write("Directory Structure:\n```\n")
        file.write(directory_structure)
        file.write("\n```\n")
        for file_path, content in file_contents.items():
            file.write(f"\nContents of {file_path}:\n```\n")
            file.write(content)
            file.write("\n```\n")
def cleanup(clone_dir):
    """Delete the cloned repository directory, reporting failures instead of raising.

    Does nothing when ``clone_dir`` does not exist. Removal errors are routed
    through ``handle_remove_error``; any exception that still escapes is
    printed rather than propagated.

    Args:
        clone_dir: Directory tree to remove.
    """
    if not os.path.exists(clone_dir):
        return
    try:
        shutil.rmtree(clone_dir, onerror=handle_remove_error)
    except Exception as err:
        print(f"An error occurred while cleaning up: {err}")
def handle_remove_error(func, path, exc_info):
    """``shutil.rmtree`` error callback that retries after a permission failure.

    Args:
        func: The function that failed; it is called again with ``path``.
        path: The path ``func`` failed on.
        exc_info: Exception triple; only the exception instance at index 1
            is inspected.
    """
    import stat

    error = exc_info[1]
    if not isinstance(error, PermissionError):
        # Anything other than a permission problem is only reported.
        print(f"Error removing {path}: {error}")
        return
    # Set the write permission bit, then retry the failed operation.
    os.chmod(path, stat.S_IWRITE)
    func(path)
def main():
    """Command-line entry point: clone a repository and dump it to a text file.

    Parses ``repo_url`` and ``output_file`` from the command line, clones the
    repository, writes its directory tree and file contents to
    ``output_file``, and removes the clone afterwards.

    Raises:
        subprocess.CalledProcessError: If the clone fails. The scratch
            directory is still removed before the error propagates.
    """
    import tempfile  # Local import: only this entry point needs it.

    parser = argparse.ArgumentParser(
        description="Generate a text file with repository structure and all file contents."
    )
    parser.add_argument("repo_url", help="URL of the GitHub repository to process.")
    parser.add_argument("output_file", help="Path to the output text file.")
    args = parser.parse_args()

    repo_url = args.repo_url
    output_file = args.output_file
    repo_name = extract_repo_name_from_url(repo_url)

    # Clone into a private scratch directory rather than ./<repo_name>, so an
    # existing directory of that name can neither break the clone nor be
    # deleted by cleanup. The clone keeps the repository's name, so the
    # rendered tree is unchanged.
    scratch_dir = tempfile.mkdtemp(prefix="repo2txt-")
    clone_dir = os.path.join(scratch_dir, repo_name)
    try:
        clone_repo(repo_url, clone_dir)
        directory_structure = get_directory_structure(clone_dir)
        file_contents = extract_all_files_contents(clone_dir)
        write_output_file(output_file, directory_structure, file_contents)
    finally:
        # Always remove the clone, including when a step above raised.
        cleanup(scratch_dir)


if __name__ == "__main__":
    main()