Spaces:
Running
Running
File size: 4,883 Bytes
6441bc6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
#!/usr/bin/env python3
import os
import pandas as pd
from huggingface_hub import snapshot_download
from .config import DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN
def download_datasets():
    """Download the eval requests, eval results, and prediction datasets.

    Each dataset is fetched with a best-effort ``snapshot_download``: a
    failure for one dataset is printed and the remaining downloads still
    run, exactly as the original per-dataset try/except blocks did.
    """
    print("Downloading datasets from HuggingFace...")

    def _download(repo_id, local_dir, label):
        # One best-effort snapshot download; errors are reported, never raised,
        # so a failed repo does not block the other downloads.
        try:
            print(f"Downloading {label} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                tqdm_class=None,
                etag_timeout=30,
                token=TOKEN,
            )
            # str.capitalize() reproduces the original messages exactly
            # ("eval requests" -> "Eval requests", etc.).
            print(f"✓ {label.capitalize()} downloaded successfully")
        except Exception as e:
            print(f"Error downloading {label}: {e}")

    _download(QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests")
    _download(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")
    _download(DATA_REPO, PREDICTIONS_CSV_PATH, "prediction data")
def process_data():
    """Load the downloaded prediction CSV and print/return a summary.

    Note: the evaluation queue itself is built by ``generate_queue`` — the
    original docstring's "create queue" claim was inaccurate.

    Returns:
        tuple: ``(df, summary)`` where ``df`` is the parsed DataFrame with
        date columns converted, and ``summary`` is a dict of dataset stats;
        ``(None, None)`` if ``data.csv`` is missing.
    """
    print("Processing downloaded data...")

    # The main dataset must have been downloaded first (see download_datasets).
    csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
    if not os.path.exists(csv_path):
        print(f"Error: data.csv not found at {csv_path}")
        return None, None

    print(f"Loading data from {csv_path}")
    df = pd.read_csv(csv_path)

    # Convert date columns from strings to proper datetimes.
    df["open_to_bet_until"] = pd.to_datetime(df["open_to_bet_until"])
    df["prediction_created_at"] = pd.to_datetime(df["prediction_created_at"])

    print(f"Loaded {len(df)} records")
    print(f"Data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Unique dates for prediction windows.
    prediction_dates = sorted(df["open_to_bet_until"].dt.date.unique())
    print(f"Prediction dates: {prediction_dates}")

    # Unique algorithms/models and event types.
    algorithms = df["algorithm_name"].unique()
    print(f"Algorithms: {algorithms}")
    event_types = df["event_type"].unique()
    print(f"Event types: {event_types}")

    summary = {
        "total_records": len(df),
        "unique_events": df["event_id"].nunique(),
        "unique_algorithms": len(algorithms),
        "unique_event_types": len(event_types),
        "prediction_dates": prediction_dates,
        "algorithms": algorithms.tolist(),
        "event_types": event_types.tolist(),
    }

    print("\n=== Data Summary ===")
    for key, value in summary.items():
        print(f"{key}: {value}")

    return df, summary
def generate_queue(df):
    """Build the evaluation queue (one row per event) and save it as CSV.

    Collapses the prediction rows to unique events, reports how many are
    still pending vs. already resolved, writes the queue next to the
    prediction data, and returns it.
    """
    print("Generating evaluation queue...")

    # Keep the first occurrence of each per-event attribute.
    first_of = {
        "question": "first",
        "event_type": "first",
        "answer_options": "first",
        "result": "first",
        "open_to_bet_until": "first",
    }
    events = df.groupby("event_id").agg(first_of).reset_index()

    # A resolved event has a non-null result; pending events have none yet.
    has_result = events["result"].notna()
    print(f"Total unique events: {len(events)}")
    print(f"Pending events: {len(events[~has_result])}")
    print(f"Resolved events: {len(events[has_result])}")

    # Persist the queue alongside the downloaded prediction data.
    queue_path = os.path.join(PREDICTIONS_CSV_PATH, "evaluation_queue.csv")
    events.to_csv(queue_path, index=False)
    print(f"✓ Queue saved to {queue_path}")

    return events
def main():
    """Entry point: download the datasets, summarize them, build the queue."""
    print("=== FutureBench Data Download and Processing ===")

    # Step 1: fetch everything from HuggingFace (best-effort).
    download_datasets()

    # Step 2: load and summarize; bail out if the main CSV is absent.
    df, summary = process_data()
    if df is None:
        print("❌ Failed to process data. Exiting.")
        return

    # Step 3: collapse predictions into the evaluation queue.
    queue = generate_queue(df)

    print("\n=== Processing Complete ===")
    print("Data processed and queue generated successfully!")
    print(f"Queue contains {len(queue)} events")
if __name__ == "__main__":
    main()
|