# github_sync / Datasets_all / AimoScore_dedublication.py
# Author: Bachstelze
# Commit: "add dedubed train and test csv" (6adac01)
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from openpyxl.utils import get_column_letter
from openpyxl import load_workbook
from openpyxl.workbook import Workbook
# I/O configuration: input spreadsheet and the files this script produces.
INPUT_FILE = "AimoScore_WeakLink_big_scores.xls" # .xls needs the xlrd engine; an .xlsx copy would use openpyxl
SCORE_PICKLE = "AimoScore_WeakLink_big_scores.pkl"  # optional pickled cache of the parsed sheets (see disabled loader below)
OUTPUT_FILE = "AimoScore_deduped.xlsx"  # deduplicated rows, written via openpyxl
REMOVED_REPORT = "AimoScore_removed_report.csv"  # per-row report of the duplicates that were dropped
# Load every sheet of the workbook and take the first one as the working frame.
sheets = pd.read_excel(INPUT_FILE, sheet_name=None)
df = next(iter(sheets.values()))
# Alternative loader: read the cached pickle instead of re-parsing the Excel file.
# with open(SCORE_PICKLE, "rb") as f:
#     sheets = pickle.load(f)
# df = sheets["Sheet1"]
# Fail fast if the spreadsheet is missing either score column.
required_cols = {"AimoScore", "EstimatedScore"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Input file must contain columns: {required_cols}. Found: {df.columns.tolist()}")
# Every AimoScore value that occurs more than once is treated as a duplicate;
# ALL rows carrying such a value are removed (no representative is kept).
value_counts = df["AimoScore"].value_counts()
duplicated_values = set(value_counts.index[value_counts > 1])
is_duplicate = df["AimoScore"].isin(duplicated_values)
removed_rows = df.loc[is_duplicate].copy().reset_index(drop=True)
kept_rows = df.loc[~is_duplicate].copy()
# Annotate the removed rows with how far the estimate deviates from the score.
removed_rows["Diff"] = removed_rows["EstimatedScore"] - removed_rows["AimoScore"]
# Boolean quality flag: True when |Diff| exceeds 0.1.
# (NaN compares False in pandas, matching the original NaN -> False handling.)
removed_rows["Threshold"] = removed_rows["Diff"].abs() > 0.1
# The report only carries the requested columns.
removed_report_df = removed_rows[["EstimatedScore", "Diff", "Threshold"]]
# Persist the curated removed-rows report.
removed_report_df.to_csv(REMOVED_REPORT, index=False)
# Per-duplicated-value summary statistics of the removed rows.
agg = (
    removed_rows
    .groupby("AimoScore")
    .agg(
        count_removed=pd.NamedAgg(column="AimoScore", aggfunc="size"),
        mean_estimated=pd.NamedAgg(column="EstimatedScore", aggfunc="mean"),
        median_estimated=pd.NamedAgg(column="EstimatedScore", aggfunc="median"),
        mean_diff=pd.NamedAgg(column="Diff", aggfunc="mean"),
    )
    .reset_index()
)
# Write the deduplicated rows to Excel, then reopen the file to size the columns.
kept_rows.to_excel(OUTPUT_FILE, index=False, engine="openpyxl")
wb = load_workbook(OUTPUT_FILE)
ws = wb.active
# Width policy: longest rendered string in the column (header included),
# plus a little padding, clamped to a readable range.
min_width = 15
max_width = 60
for position, header in enumerate(kept_rows.columns, start=1):
    letter = get_column_letter(position)
    longest = max(
        (len(str(cell.value)) for cell in ws[letter] if cell.value is not None),
        default=0,
    )
    longest = max(longest, len(str(header)))
    ws.column_dimensions[letter].width = min(max(longest + 2, min_width), max_width)
wb.save(OUTPUT_FILE)
# Reproducible 80/20 shuffle-split of the deduplicated rows.
train_df, test_df = train_test_split(kept_rows, test_size=0.2, random_state=42, shuffle=True)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
# Persist both partitions as CSV.
for frame, path in ((train_df, "train.csv"), (test_df, "test.csv")):
    frame.to_csv(path, index=False)
print(f"Train rows: {len(train_df)}")
print(f"Test rows: {len(test_df)}")
# Save the aggregate statistics and print a run summary.
# BUGFIX: the original re-saved the full removed_rows frame to REMOVED_REPORT
# here, clobbering the curated three-column report ("EstimatedScore", "Diff",
# "Threshold") that was deliberately written to the same file earlier in the
# script. The redundant overwrite is removed so the curated report survives.
agg.to_csv("AimoScore_removed_agg.csv", index=False)
print(f"Original rows: {len(df)}")
print(f"Removed rows: {len(removed_rows)} (duplicated AimoScore values: {len(duplicated_values)})")
print(f"Kept rows: {len(kept_rows)}")