CheckpointCleanup-Release / .python_tmp /cfe9ba7d-1be2-4ef9-8a9f-b4106fadfff6.py

Upload folder using huggingface_hub

f09153a verified 3 months ago

3.17 kB

	import os, re, json, hashlib, datetime, sys
	from pathlib import Path
	# Locate the checkpoint root; exit with status 1 when it does not exist.
	root = Path('checkpoints')
	if not root.exists():
	    print('checkpoints directory not found')
	    sys.exit(1)

	# Collect the sub-directories whose name matches step_<digits>, ordered by
	# their numeric step so that step_10 sorts after step_9.
	step_dirs = sorted(
	    (
	        entry
	        for entry in root.iterdir()
	        if entry.is_dir() and re.match(r'step_\d+$', entry.name)
	    ),
	    key=lambda entry: int(entry.name.split('_')[1]),
	)
	print(f'Found {len(step_dirs)} step dirs')
	# Group step directories by the MD5 of their normalized config.json, so that
	# configs differing only in key order or whitespace land in the same group.
	#   configs: hash -> {'norm': normalized text, 'steps': [(step number, dir), ...]}
	#   raw_map: dir  -> {'hash': hash, 'path': that directory's config.json}
	configs = {}
	raw_map = {}
	for d in step_dirs:
	    cfg = d / 'config.json'
	    if not cfg.exists():
	        print(f'No config in {d}')
	        continue
	    # Read the file once with an explicit encoding: no file handle is left
	    # open, and the resulting hash does not depend on the machine's locale.
	    txt = cfg.read_text(encoding='utf-8')
	    try:
	        # Canonical JSON: sorted keys and no optional whitespace.
	        norm = json.dumps(json.loads(txt), sort_keys=True, separators=(',', ':'))
	    except ValueError:
	        # json.JSONDecodeError is a ValueError. The file is not valid JSON, so
	        # fall back to its text with blank lines dropped and every remaining
	        # line stripped of surrounding whitespace.
	        norm = '\n'.join(line.strip() for line in txt.splitlines() if line.strip())
	    h = hashlib.md5(norm.encode()).hexdigest()
	    step = int(d.name.split('_')[1])
	    configs.setdefault(h, {'norm': norm, 'steps': []})['steps'].append((step, d))
	    raw_map[d] = {'hash': h, 'path': cfg}

	# Report every config hash together with the ascending step numbers sharing it.
	print('Groups:')
	for digest, group in configs.items():
	    print(digest, '->', sorted(step for step, _ in group['steps']))

	# For each group of identical configs, keep config.json only in the
	# highest-numbered step and replace every other copy with a pointer file.
	# Only config.json is ever removed; pytorch_model.bin is never touched.
	#
	# Timezone-aware replacement for the deprecated datetime.utcnow(); tzinfo is
	# dropped before formatting so the stamp keeps its '<naive ISO time>Z' shape.
	now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + 'Z'
	for h, v in configs.items():
	    steps = v['steps']
	    max_step, max_dir = max(steps, key=lambda x: x[0])
	    print(f'Canonical for hash {h} is step_{max_step}')
	    for s, d in steps:
	        cfg = d / 'config.json'
	        if s == max_step:
	            print(f'Keeping canonical config in {d}')
	            continue
	        try:
	            cfg.unlink()
	        except OSError as e:
	            # The duplicate is still on disk, so a pointer claiming it was
	            # consolidated would be wrong: report the failure and move on.
	            print('Failed to remove', cfg, e)
	            continue
	        print(f'Removed {cfg}')
	        # Leave a note that points at the canonical copy, expressed relative
	        # to this step directory.
	        pointer = d / 'config_pointer.txt'
	        rel = os.path.relpath(max_dir / 'config.json', d)
	        content = (
	            f"This config was consolidated during repository cleanup on {now}.\n"
	            f"Canonical config retained at: {rel}\n"
	            f"Original step: step_{s}\n"
	            f"Canonical step: step_{max_step}\n"
	            f"MD5: {h}\n"
	        )
	        pointer.write_text(content, encoding='utf-8')
	        print(f'Wrote pointer {pointer}')

	# Record the cleanup in README.md: the note goes right after the line that
	# holds the '## 4. How to Run Locally' heading, or at the end of the file when
	# that heading is missing. Re-running the script does not add a second copy.
	readme = Path('README.md')
	if readme.exists():
	    # The note contains a non-ASCII dash, so read and write as UTF-8 explicitly
	    # instead of relying on the locale's default encoding.
	    txt = readme.read_text(encoding='utf-8')
	    insert_after = '## 4. How to Run Locally'
	    note = '\n\n> Note: Checkpoints cleanup — duplicate config.json files across checkpoints have been consolidated. For each unique config, only the highest-numbered step retains the canonical config; other steps now contain a pointer file. No pytorch_model.bin files were removed.\n'
	    if note.strip() in txt:
	        # A previous run already added the note; do not stack duplicates.
	        print('README.md already contains cleanup note')
	    elif insert_after in txt:
	        # Insert at the end of the heading's line so the note can never land
	        # in the middle of a longer heading. When the heading fills its whole
	        # line this yields the same text as splicing right after the heading.
	        heading_end = txt.index(insert_after) + len(insert_after)
	        eol = txt.find('\n', heading_end)
	        if eol == -1:
	            eol = len(txt)
	        newtxt = txt[:eol] + note + txt[eol:]
	        readme.write_text(newtxt, encoding='utf-8')
	        print('Updated README.md with cleanup note')
	    else:
	        print('Could not find section to insert note; appending at end')
	        readme.write_text(txt + '\n\n' + note, encoding='utf-8')
	else:
	    print('README.md not found')

	print('Done')