| import argparse |
| import difflib |
| import glob |
| import logging |
| import multiprocessing |
| import os |
| import shutil |
| import subprocess |
| from functools import partial |
|
|
| from create_dependency_mapping import find_priority_list |
|
|
| |
| from modular_model_converter import convert_modular_file, run_ruff |
| from rich.console import Console |
| from rich.syntax import Syntax |
|
|
|
|
| logging.basicConfig() |
| logging.getLogger().setLevel(logging.ERROR) |
| console = Console() |
|
|
| BACKUP_EXT = ".modular_backup" |
|
|
|
|
def process_file(
    modular_file_path,
    generated_modeling_content,
    file_type="modeling_",
    show_diff=True,
):
    """Compare the generated content for `file_type` with the corresponding file on disk.

    When a difference is found, a backup copy of the on-disk file is saved next to it
    (suffixed with BACKUP_EXT) and the file is overwritten with the generated content.

    Args:
        modular_file_path: Path to the source `modular_xxx.py` file.
        generated_modeling_content: Mapping from file type to generated file content.
        file_type: Key into `generated_modeling_content`; may contain a `.*` wildcard
            separating a prefix and suffix used to build the target file name.
        show_diff: If True, print the unified diff; otherwise print an overwrite notice.

    Returns:
        1 if the on-disk file differed from the generated content, 0 otherwise.
    """
    prefix = file_type.split(".*")[0]
    suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
    # Derive the target path from the modular path, e.g. modular_llama.py -> modeling_llama.py
    file_path = modular_file_path.replace("modular_", f"{prefix}_").replace(".py", f"{suffix}.py")

    with open(file_path, "r", encoding="utf-8") as existing_file:
        existing_content = existing_file.read()

    diff_lines = list(
        difflib.unified_diff(
            generated_modeling_content[file_type].splitlines(),
            existing_content.splitlines(),
            fromfile=f"{file_path}_generated",
            tofile=f"{file_path}",
            lineterm="",
        )
    )

    # Guard clause: identical content means nothing to do.
    if not diff_lines:
        return 0

    # Keep a backup so the original file can be restored later.
    shutil.copy(file_path, file_path + BACKUP_EXT)
    with open(file_path, "w", encoding="utf-8", newline="\n") as target_file:
        target_file.write(generated_modeling_content[file_type])

    if show_diff:
        console.print(f"\n[bold red]Differences found between the generated code and {file_path}:[/bold red]\n")
        rendered = Syntax("\n".join(diff_lines), "diff", theme="ansi_dark", line_numbers=True)
        console.print(rendered)
    else:
        console.print(f"[bold blue]Overwritten {file_path} with the generated content.[/bold blue]")
    return 1
|
|
|
|
def convert_and_run_ruff(modular_file_path: str) -> dict[str, str]:
    """From a modular file, convert it and return all the contents of the file as string.

    We need this function, because `ruff` needs the final filename to apply all rules correctly, so to get the
    output as a string, we need to save a temporary file with similar name, run ruff, and re-read the temporary file.

    Args:
        modular_file_path: Path to the `modular_xxx.py` file to convert.

    Returns:
        Mapping from file type to the ruff-formatted generated content.
    """
    generated_modeling_content = convert_modular_file(modular_file_path)

    for file_type in generated_modeling_content:
        file_name_prefix = file_type.split(".*")[0]
        file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
        # The temporary name mirrors the final file name so that ruff applies
        # its per-file rules (which depend on the filename) correctly.
        temp_file_name = modular_file_path.replace("modular_", f"{file_name_prefix}_").replace(
            ".py", f"_temp_pattern__{file_name_suffix}.py"
        )

        try:
            # Explicit encoding, consistent with how the rest of this file reads/writes sources.
            with open(temp_file_name, "w", encoding="utf-8") as f:
                f.write(generated_modeling_content[file_type])

            run_ruff(temp_file_name)

            with open(temp_file_name, "r", encoding="utf-8") as f:
                generated_modeling_content[file_type] = f.read()
        finally:
            # Always clean up the temporary file, even if writing or ruff raises,
            # so failed runs don't leave stray *_temp_pattern__*.py files behind.
            if os.path.exists(temp_file_name):
                os.remove(temp_file_name)

    return generated_modeling_content
|
|
|
|
def compare_files(modular_file_path, show_diff=True):
    """Convert a modular file and count how many of its generated files differ from disk.

    Returns the number of generated files whose content did not match the existing files.
    """
    generated_content = convert_and_run_ruff(modular_file_path)
    # process_file returns 1 per differing file, so summing yields the total diff count.
    return sum(
        process_file(modular_file_path, generated_content, file_type, show_diff)
        for file_type in generated_content
    )
|
|
|
|
def get_models_in_diff():
    """
    Finds all models that have been modified in the diff.

    Returns:
        A set containing the names of the models that have been modified (e.g. {'llama', 'whisper'}).
    """
    # Diff against the fork point with main so only this branch's changes count.
    fork_point_sha = subprocess.check_output("git merge-base main HEAD".split()).decode("utf-8")
    diff_command = f"git diff --diff-filter=d --name-only {fork_point_sha}"
    modified_files = subprocess.check_output(diff_command.split()).decode("utf-8").split()

    # The model name is the directory directly under `models/`,
    # i.e. the second-to-last component of the file path.
    return {
        file_path.split("/")[-2]
        for file_path in modified_files
        if "/models/" in file_path and file_path.endswith(".py")
    }
|
|
|
|
def guaranteed_no_diff(modular_file_path, dependencies, models_in_diff):
    """
    Returns whether it is guaranteed to have no differences between the modular file and the modeling file.

    Model is in the diff -> not guaranteed to have no differences
    Dependency is in the diff -> not guaranteed to have no differences
    Otherwise -> guaranteed to have no differences

    Args:
        modular_file_path: The path to the modular file.
        dependencies: A dictionary containing the dependencies of each modular file.
        models_in_diff: A set containing the names of the models that have been modified.

    Returns:
        A boolean indicating whether the model (code and tests) is guaranteed to have no differences.
    """
    # Model name is whatever follows the last "modular_" in the path, without the ".py" extension.
    model_name = modular_file_path.rsplit("modular_", 1)[1].replace(".py", "")
    if model_name in models_in_diff:
        return False
    # Dependencies are dotted module paths; the model name is the second-to-last component.
    dependency_names = (dep.split(".")[-2] for dep in dependencies[modular_file_path])
    return not any(name in models_in_diff for name in dependency_names)
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Compare modular_xxx.py files with modeling_xxx.py files.")
    parser.add_argument(
        "--files", default=["all"], type=str, nargs="+", help="List of modular_xxx.py files to compare."
    )
    parser.add_argument(
        "--fix_and_overwrite", action="store_true", help="Overwrite the modeling_xxx.py file if differences are found."
    )
    parser.add_argument("--check_all", action="store_true", help="Check all files, not just the ones in the diff.")
    parser.add_argument(
        "--num_workers",
        default=-1,
        type=int,
        help="The number of workers to run. Default is -1, which means the number of CPU cores.",
    )
    args = parser.parse_args()
    # "all" is a sentinel value: expand it to every modular_*.py under the models tree.
    if args.files == ["all"]:
        args.files = glob.glob("src/transformers/models/**/modular_*.py", recursive=True)

    if args.num_workers == -1:
        args.num_workers = multiprocessing.cpu_count()

    # On main there is no fork point to diff against, so fall back to treating
    # every provided file's model as potentially changed (full check).
    current_branch = subprocess.check_output(["git", "branch", "--show-current"], text=True).strip()
    if current_branch == "main":
        console.print(
            "[bold red]You are developing on the main branch. We cannot identify the list of changed files and will have to check all files. This may take a while.[/bold red]"
        )
        models_in_diff = {file_path.split("/")[-2] for file_path in args.files}
    else:
        models_in_diff = get_models_in_diff()
        # Nothing relevant changed and a full check was not requested: exit early.
        if not models_in_diff and not args.check_all:
            exit(0)

    non_matching_files = []
    # Files are grouped into dependency levels so that a file's dependencies are
    # always processed in an earlier iteration than the file itself.
    ordered_files, dependencies = find_priority_list(args.files)
    flat_ordered_files = [item for sublist in ordered_files for item in sublist]

    try:
        for dependency_level_files in ordered_files:
            # Skip files whose model and dependencies are untouched, unless --check_all was given.
            files_to_check = []
            for file_path in dependency_level_files:
                if args.check_all or not guaranteed_no_diff(file_path, dependencies, models_in_diff):
                    files_to_check.append(file_path)

            if not files_to_check:
                continue

            # Don't spawn more workers than there are files at this level.
            num_workers = min(args.num_workers, len(files_to_check))
            with multiprocessing.Pool(num_workers) as p:
                try:
                    is_changed_flags = p.map(
                        partial(compare_files, show_diff=not args.fix_and_overwrite),
                        files_to_check,
                    )
                except Exception as e:
                    console.print(
                        f"[bold red]Failed to convert one or more files in batch: {files_to_check}[/bold red]"
                    )
                    console.print(f"[bold red]Error: {e}[/bold red]")
                    # Fall back to sequential processing so the failing file(s) can be
                    # identified individually; failures are reported and counted as unchanged.
                    is_changed_flags = []
                    for file_path in files_to_check:
                        try:
                            result = compare_files(file_path, show_diff=not args.fix_and_overwrite)
                            is_changed_flags.append(result)
                        except Exception as individual_error:
                            console.print(f"[bold red]Failed to convert {file_path}: {individual_error}[/bold red]")
                            is_changed_flags.append(0)

            for is_changed, file_path in zip(is_changed_flags, files_to_check):
                if is_changed:
                    non_matching_files.append(file_path)
                    # A regenerated file changes its model, which may invalidate the
                    # "guaranteed no diff" shortcut for dependents at later levels.
                    models_in_diff.add(file_path.split("/")[-2])

    finally:
        # process_file overwrote differing files after backing them up; unless the user
        # asked to --fix_and_overwrite, restore the originals, then remove all backups.
        backup_files = glob.glob("**/*" + BACKUP_EXT, recursive=True)
        for backup_file_path in backup_files:
            overwritten_path = backup_file_path.replace(BACKUP_EXT, "")
            if not args.fix_and_overwrite and os.path.exists(overwritten_path):
                shutil.copy(backup_file_path, overwritten_path)
            os.remove(backup_file_path)

    # In check mode, any mismatch is a hard failure listing the offending models.
    if non_matching_files and not args.fix_and_overwrite:
        diff_models = set(file_path.split("/")[-2] for file_path in non_matching_files)
        models_str = "\n - " + "\n - ".join(sorted(diff_models))
        raise ValueError(f"Some diff and their modeling code did not match. Models in diff:{models_str}")
|
|