cs3319-project2 / code /make_notebook_style_split.py

CS3319 Project 2 final deliverable (public F1 = 0.96626)

f28d994 13 days ago

3.49 kB

	"""Create a notebook-style local validation split.

	The official example notebook creates validation data by:
	1. sampling 90% of train author-paper edges as training edges;
	2. using the remaining 10% known positives as validation positives;
	3. sampling the same number of random negatives that do not appear in any known ref.

	This script materializes that split so later experiments use identical data.
	"""

	from __future__ import annotations

	import argparse
	from pathlib import Path

	import numpy as np
	import pandas as pd


	def read_txt(path: Path) -> list[list[int]]:
	    """Parse a whitespace-separated integer table from a text file.

	    Each non-blank line is split on whitespace and every token is converted
	    with ``int``. Blank or whitespace-only lines (such as a trailing newline
	    at the end of the file) are skipped so they do not show up as empty rows.

	    Args:
	        path: Location of the text file to read.

	    Returns:
	        One list of ints per non-blank line, in file order.

	    Raises:
	        ValueError: If a token cannot be converted with ``int``.
	        OSError: If the file cannot be opened.
	    """
	    rows: list[list[int]] = []
	    with path.open("r", encoding="utf-8") as f:
	        for line in f:
	            fields = line.split()
	            if not fields:
	                # Blank line: skip it instead of emitting an empty row.
	                continue
	            rows.append([int(field) for field in fields])
	    return rows


	def main() -> None:
	    """Materialize the train/validation split and write it to disk.

	    Command-line options:
	        --package-root: directory containing ``data_and_docs``; defaults to
	            the parent of the directory holding this script.
	        --seed: seed used for the positive split, the negative sampling and
	            the final shuffle (default 0).
	        --train-frac: fraction of the train author-paper edges kept as
	            training positives (default 0.9).

	    Outputs, written to ``<package-root>/splits/notebook_seed<seed>``:
	        train_refs.csv: the sampled training edges (source, target).
	        val_pairs.csv: shuffled validation pairs (source, target, label).
	        test_refs.npy: the test pairs as an int64 array.

	    Negatives are rejected only if they occur among the train author-paper
	    edges; duplicate negatives are not filtered out.
	    """
	    cli = argparse.ArgumentParser()
	    cli.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
	    cli.add_argument("--seed", type=int, default=0)
	    cli.add_argument("--train-frac", type=float, default=0.9)
	    opts = cli.parse_args()
	    seed = opts.seed

	    package_root = opts.package_root
	    input_dir = package_root / "data_and_docs"
	    out_dir = package_root / "splits" / f"notebook_seed{seed}"
	    out_dir.mkdir(parents=True, exist_ok=True)

	    # Read every input file up front, in a fixed order.
	    raw_refs, raw_coauthor, raw_citation, raw_test = (
	        read_txt(input_dir / file_name)
	        for file_name in (
	            "bipartite_train_ann.txt",
	            "author_file_ann.txt",
	            "paper_file_ann.txt",
	            "bipartite_test_ann.txt",
	        )
	    )

	    edge_cols = ["source", "target"]
	    ref_edges = pd.DataFrame(raw_refs, columns=edge_cols)
	    # Give each author-paper edge a string label ("r-0", "r-1", ...) so the
	    # held-out rows can be found by index membership below.
	    ref_edges.index = "r-" + ref_edges.index.astype(str)
	    coauthor_edges = pd.DataFrame(raw_coauthor, columns=edge_cols)
	    citation_edges = pd.DataFrame(raw_citation, columns=edge_cols)
	    test_arr = np.array(raw_test, dtype=np.int64)

	    # Candidate node ids, in order of first appearance. The order matters
	    # because the negative sampler below draws from these arrays.
	    paper_columns = [citation_edges["source"], citation_edges["target"], ref_edges["target"]]
	    paper_ids = pd.unique(pd.concat(paper_columns)).astype(np.int64)
	    author_columns = [ref_edges["source"], coauthor_edges["source"], coauthor_edges["target"]]
	    author_ids = pd.unique(pd.concat(author_columns)).astype(np.int64)

	    # Positive split: sampled rows train, the remainder validate.
	    train_refs = ref_edges.sample(frac=opts.train_frac, random_state=seed)
	    held_out = ~ref_edges.index.isin(train_refs.index)
	    val_pos = ref_edges.loc[held_out].copy()
	    val_pos.loc[:, "label"] = 1

	    # Negative sampling: draw (author, paper) pairs until there are as many
	    # negatives as validation positives, rejecting any known train edge.
	    known_pairs = set(zip(ref_edges["source"].tolist(), ref_edges["target"].tolist()))
	    rng = np.random.default_rng(seed)
	    n_needed = len(val_pos)
	    neg_pairs: list[tuple[int, int]] = []
	    while len(neg_pairs) < n_needed:
	        # Author is drawn before paper; keep this order for reproducibility.
	        candidate = (int(rng.choice(author_ids)), int(rng.choice(paper_ids)))
	        if candidate in known_pairs:
	            continue
	        neg_pairs.append(candidate)

	    val_neg = pd.DataFrame(neg_pairs, columns=edge_cols)
	    val_neg.loc[:, "label"] = 0

	    # Stack positives and negatives, then shuffle with the same seed.
	    val_pairs = (
	        pd.concat([val_pos, val_neg], ignore_index=True)
	        .sample(frac=1, random_state=seed)
	        .reset_index(drop=True)
	    )

	    train_refs[edge_cols].to_csv(out_dir / "train_refs.csv", index=False)
	    val_pairs[edge_cols + ["label"]].to_csv(out_dir / "val_pairs.csv", index=False)
	    np.save(out_dir / "test_refs.npy", test_arr)

	    labels = val_pairs["label"]
	    print(f"wrote {out_dir}")
	    print(f"train positives: {len(train_refs)}")
	    print(f"val positives: {int(labels.sum())}")
	    print(f"val negatives: {int((labels == 0).sum())}")
	    print(f"val total: {len(val_pairs)}")


	# Run the split generation only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()