# github_sync / Datasets_all / AimoScore_dedublication.py
# Author: Bachstelze
# Commit: "add dedubed train and test csv" (6adac01)
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from openpyxl.utils import get_column_letter
from openpyxl import load_workbook
from openpyxl.workbook import Workbook
# I/O configuration: input spreadsheet and the files this script produces.
INPUT_FILE = "AimoScore_WeakLink_big_scores.xls" # .xls needs the xlrd engine; an .xlsx copy would use openpyxl
SCORE_PICKLE = "AimoScore_WeakLink_big_scores.pkl"  # optional pickled cache of the parsed sheets (see disabled loader below)
OUTPUT_FILE = "AimoScore_deduped.xlsx"  # deduplicated rows, written via openpyxl
REMOVED_REPORT = "AimoScore_removed_report.csv"  # per-row report of the duplicates that were dropped
# Load every sheet of the workbook and take the first one as the working frame.
sheets = pd.read_excel(INPUT_FILE, sheet_name=None)
df = next(iter(sheets.values()))
# Alternative loader: read the cached pickle instead of re-parsing the Excel file.
# with open(SCORE_PICKLE, "rb") as f:
#     sheets = pickle.load(f)
# df = sheets["Sheet1"]
# Fail fast if the spreadsheet is missing either score column.
required_cols = {"AimoScore", "EstimatedScore"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Input file must contain columns: {required_cols}. Found: {df.columns.tolist()}")
# Every AimoScore value that occurs more than once is treated as a duplicate;
# ALL rows carrying such a value are removed (no representative is kept).
value_counts = df["AimoScore"].value_counts()
duplicated_values = set(value_counts.index[value_counts > 1])
is_duplicate = df["AimoScore"].isin(duplicated_values)
removed_rows = df.loc[is_duplicate].copy().reset_index(drop=True)
kept_rows = df.loc[~is_duplicate].copy()
# Annotate the removed rows with how far the estimate deviates from the score.
removed_rows["Diff"] = removed_rows["EstimatedScore"] - removed_rows["AimoScore"]
# Boolean quality flag: True when |Diff| exceeds 0.1.
# (NaN compares False in pandas, matching the original NaN -> False handling.)
removed_rows["Threshold"] = removed_rows["Diff"].abs() > 0.1
# The report only carries the requested columns.
removed_report_df = removed_rows[["EstimatedScore", "Diff", "Threshold"]]
# Persist the curated removed-rows report.
removed_report_df.to_csv(REMOVED_REPORT, index=False)
# Per-duplicated-value summary statistics of the removed rows.
agg = (
    removed_rows
    .groupby("AimoScore")
    .agg(
        count_removed=pd.NamedAgg(column="AimoScore", aggfunc="size"),
        mean_estimated=pd.NamedAgg(column="EstimatedScore", aggfunc="mean"),
        median_estimated=pd.NamedAgg(column="EstimatedScore", aggfunc="median"),
        mean_diff=pd.NamedAgg(column="Diff", aggfunc="mean"),
    )
    .reset_index()
)
# Write the deduplicated rows to Excel, then reopen the file to size the columns.
kept_rows.to_excel(OUTPUT_FILE, index=False, engine="openpyxl")
wb = load_workbook(OUTPUT_FILE)
ws = wb.active
# Width policy: longest rendered string in the column (header included),
# plus a little padding, clamped to a readable range.
min_width = 15
max_width = 60
for position, header in enumerate(kept_rows.columns, start=1):
    letter = get_column_letter(position)
    longest = max(
        (len(str(cell.value)) for cell in ws[letter] if cell.value is not None),
        default=0,
    )
    longest = max(longest, len(str(header)))
    ws.column_dimensions[letter].width = min(max(longest + 2, min_width), max_width)
wb.save(OUTPUT_FILE)
# Reproducible 80/20 shuffle-split of the deduplicated rows.
train_df, test_df = train_test_split(kept_rows, test_size=0.2, random_state=42, shuffle=True)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
# Persist both partitions as CSV.
for frame, path in ((train_df, "train.csv"), (test_df, "test.csv")):
    frame.to_csv(path, index=False)
print(f"Train rows: {len(train_df)}")
print(f"Test rows: {len(test_df)}")
# Save the aggregate statistics and print a run summary.
# BUGFIX: the original re-saved the full removed_rows frame to REMOVED_REPORT
# here, clobbering the curated three-column report ("EstimatedScore", "Diff",
# "Threshold") that was deliberately written to the same file earlier in the
# script. The redundant overwrite is removed so the curated report survives.
agg.to_csv("AimoScore_removed_agg.csv", index=False)
print(f"Original rows: {len(df)}")
print(f"Removed rows: {len(removed_rows)} (duplicated AimoScore values: {len(duplicated_values)})")
print(f"Kept rows: {len(kept_rows)}")