Spaces:

SharathReddy
/

doc-mining

Runtime error

App Files Files Community

doc-mining / mine_diffs.py

SharathReddy

Update mine_diffs.py

eb46234 verified 9 months ago

raw

history blame contribute delete

3.57 kB

	import os
	import json
	import ast
	import tempfile
	from git import Repo, exc

	# --- Configuration ---
	# Repositories to mine. Each key is a short name (used for the clone
	# sub-directory and in progress messages); each value is the clone URL.
	REPO_CONFIG = {
	    "fastapi": "https://github.com/tiangolo/fastapi.git",
	    "requests": "https://github.com/psf/requests.git",
	    "scikit-learn": "https://github.com/scikit-learn/scikit-learn.git",
	}

	# Destination of the mined dataset, written as one JSON object per line.
	OUTPUT_FILE = "diff_dataset.jsonl"

	# Upper bound on how many commits are walked per repository.
	MAX_COMMITS_PER_REPO = 5_000

	class FuncParser(ast.NodeVisitor):
	    """Collect the docstring of every function defined in a syntax tree.

	    After ``visit(tree)`` has run, ``functions`` maps each function's bare
	    name to its docstring, or to ``""`` when the function has none. Both
	    ``def`` and ``async def`` definitions are recorded, including methods
	    and nested functions.

	    Names are not qualified by class or enclosing function, so when several
	    definitions share a name the one visited last wins.
	    """

	    def __init__(self):
	        # name -> docstring ("" when the function has no docstring)
	        self.functions = {}

	    def visit_FunctionDef(self, node):
	        """Record ``node``'s docstring, then descend into its body."""
	        docstring = ast.get_docstring(node) or ""
	        self.functions[node.name] = docstring
	        # Keep walking so functions nested inside this one are seen too.
	        self.generic_visit(node)

	    # ``async def`` produces AsyncFunctionDef nodes, which NodeVisitor does
	    # not route to visit_FunctionDef; without this alias every coroutine
	    # would be silently skipped.
	    visit_AsyncFunctionDef = visit_FunctionDef

	def get_functions_from_source(source_code):
	    """Return a ``{function_name: docstring}`` mapping for ``source_code``.

	    The source is parsed with ``ast`` and walked with ``FuncParser``.
	    Source that cannot be analysed yields an empty dict instead of raising,
	    so one unparsable historical file does not abort a long mining run.

	    Args:
	        source_code: Python source text.

	    Returns:
	        dict mapping each function name to its docstring ("" if absent),
	        or ``{}`` when the source cannot be parsed.
	    """
	    try:
	        tree = ast.parse(source_code)
	        parser = FuncParser()
	        parser.visit(tree)
	    except (SyntaxError, ValueError, RecursionError):
	        # SyntaxError: not valid for the running interpreter (e.g. Python 2).
	        # ValueError: older interpreters raise this for NUL bytes in source.
	        # RecursionError: pathologically deep nesting in parser or visitor.
	        return {}
	    return parser.functions

	def format_for_model(diff_text, old_doc, new_doc):
	    """Render one training example as an instruction-style prompt.

	    The text has four sections, each introduced by a ``### `` header and
	    separated by a blank line: the instruction, the git diff inside a
	    fenced ``diff`` block, the old documentation and the updated
	    documentation.

	    Args:
	        diff_text: Patch text to embed; trailing newlines are dropped so the
	            closing fence follows the last diff line directly.
	        old_doc: Docstring before the change; surrounding whitespace is
	            stripped.
	        new_doc: Docstring after the change; surrounding whitespace is
	            stripped.

	    Returns:
	        dict with the single key ``"text"`` holding the rendered prompt.
	    """
	    # Assemble from explicit lines rather than an indented triple-quoted
	    # literal, so source indentation can never leak into the prompt.
	    lines = [
	        "### INSTRUCTION:",
	        "A Python function's code was changed. Based on the `git diff` "
	        "provided, update the function's documentation.",
	        "",
	        "### GIT DIFF:",
	        "```diff",
	        diff_text.rstrip("\n"),
	        # Close the fence: left open, the documentation sections below
	        # would read as part of the diff.
	        "```",
	        "",
	        "### OLD DOCUMENTATION:",
	        old_doc.strip(),
	        "",
	        "### UPDATED DOCUMENTATION:",
	        new_doc.strip(),
	        "",  # keeps the trailing newline at the end of the prompt
	    ]
	    return {"text": "\n".join(lines)}

	def _decode_blob(blob):
	    """Return the blob's content as text, or None if it is not valid UTF-8."""
	    try:
	        return blob.data_stream.read().decode('utf-8')
	    except UnicodeDecodeError:
	        return None


	def _examples_from_diff(diff):
	    """Return one formatted example per function whose docstring changed in ``diff``."""
	    both_python = (
	        diff.a_path and diff.b_path
	        and diff.a_path.endswith('.py') and diff.b_path.endswith('.py')
	    )
	    # Added or deleted files have no blob on one side; there is nothing to
	    # compare, so they are skipped together with non-Python paths.
	    if not both_python or diff.a_blob is None or diff.b_blob is None:
	        return []

	    old_source = _decode_blob(diff.a_blob)
	    new_source = _decode_blob(diff.b_blob)
	    if old_source is None or new_source is None:
	        return []

	    old_funcs = get_functions_from_source(old_source)
	    new_funcs = get_functions_from_source(new_source)

	    examples = []
	    diff_text = None  # decoded lazily, at most once per file diff
	    for func_name, old_doc in old_funcs.items():
	        new_doc = new_funcs.get(func_name)
	        if new_doc is None:
	            continue  # function no longer exists under this name
	        # Keep only real docstring edits where both versions are longer than
	        # 20 characters.
	        if old_doc == new_doc or len(old_doc) <= 20 or len(new_doc) <= 20:
	            continue
	        if diff_text is None:
	            diff_text = diff.diff.decode('utf-8', errors='ignore')
	        examples.append(format_for_model(diff_text, old_doc, new_doc))
	    return examples


	def _mine_repo(repo):
	    """Return the examples found in the most recent commits of ``repo``."""
	    examples = []
	    for commit in repo.iter_commits(max_count=MAX_COMMITS_PER_REPO):
	        if not commit.parents:
	            continue  # root commit: nothing to diff against

	        parent = commit.parents[0]
	        # Diff FROM the parent TO the commit so that the "a" side is the old
	        # file and the "b" side the new one. ``commit.diff(parent)`` yields
	        # the reverse patch with the old/new roles swapped.
	        diffs = parent.diff(commit, create_patch=True, unified=0)
	        for diff in diffs:
	            examples.extend(_examples_from_diff(diff))
	    return examples


	def main():
	    """Clone each configured repository, mine docstring changes, write JSONL.

	    Every repository in ``REPO_CONFIG`` is cloned into a temporary directory
	    that is removed once mining finishes. A repository that fails to clone
	    is reported and skipped. The collected examples are written to
	    ``OUTPUT_FILE``, one JSON object per line; a write failure is reported
	    rather than raised.
	    """
	    dataset = []

	    # TemporaryDirectory removes the clones on exit, even when mining fails.
	    with tempfile.TemporaryDirectory() as base_repo_dir:
	        print(f"Using temporary directory for clones: {base_repo_dir}")

	        for name, url in REPO_CONFIG.items():
	            repo_dir = os.path.join(base_repo_dir, name)
	            try:
	                print(f"Cloning {name} from {url}...")
	                repo = Repo.clone_from(url, repo_dir)
	            except exc.GitCommandError as e:
	                print(f"Error cloning {name}: {e}")
	                continue

	            try:
	                print(f"Mining commit history for {name}...")
	                dataset.extend(_mine_repo(repo))
	            finally:
	                # Release the repository's git helper processes and file
	                # handles before the directory is deleted.
	                repo.close()

	    print(f"\nFound {len(dataset)} high-quality examples.")

	    try:
	        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
	            f.writelines(json.dumps(item) + "\n" for item in dataset)
	    except OSError as e:
	        print(f"FATAL: Could not write final dataset file to {OUTPUT_FILE}. Error: {e}")
	    else:
	        print(f"Dataset successfully saved to '{OUTPUT_FILE}'.")

	# Run the miner only when executed as a script. The module name to compare
	# against is "__main__" (with the double underscores); comparing with "main"
	# is never true, so the script would exit without doing anything.
	if __name__ == "__main__":
	    main()