#!/usr/bin/env python3
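"""Download FutureBench datasets from the Hugging Face Hub and build the evaluation queue.

Note: this file uses a relative import (``.config``), so run it as a module from
its parent package (e.g. ``python -m <package>.<module>``; the exact path depends
on the repository layout) rather than executing the file directly.
"""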
import os

import pandas as pd
from huggingface_hub import snapshot_download

from .config import DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN
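# Expected from the config module: HF dataset repo IDs (QUEUE_REPO, RESULTS_REPO,
# DATA_REPO), local download directories (EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH,
# PREDICTIONS_CSV_PATH), and the Hugging Face access token (TOKEN).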


def download_datasets():
    """Download datasets from HuggingFace repositories"""
    print("Downloading datasets from HuggingFace...")

    # The three snapshots share identical download settings, so fetch them in a
    # loop; each download is independent and a failure is logged, not raised
    targets = [
        (QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests"),
        (RESULTS_REPO, EVAL_RESULTS_PATH, "eval results"),
        (DATA_REPO, PREDICTIONS_CSV_PATH, "prediction data"),
    ]
    for repo_id, local_dir, label in targets:
        try:
            print(f"Downloading {label} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                tqdm_class=None,
                etag_timeout=30,
                token=TOKEN,
            )
            print(f"✓ {label.capitalize()} downloaded successfully")
        except Exception as e:
            print(f"Error downloading {label}: {e}")


def process_data():
    """Process the downloaded data and create queue"""
    print("Processing downloaded data...")

    # Load the main dataset
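    # (expects columns: event_id, question, event_type, answer_options, result,
    #  algorithm_name, open_to_bet_until, prediction_created_at)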
    csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
    if not os.path.exists(csv_path):
        print(f"Error: data.csv not found at {csv_path}")
        return None, None

    print(f"Loading data from {csv_path}")
    df = pd.read_csv(csv_path)

    # Convert date columns
    df["open_to_bet_until"] = pd.to_datetime(df["open_to_bet_until"])
    df["prediction_created_at"] = pd.to_datetime(df["prediction_created_at"])

    print(f"Loaded {len(df)} records")
    print(f"Data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Get unique dates for prediction windows
    prediction_dates = sorted(df["open_to_bet_until"].dt.date.unique())
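    # (.dt.date drops the time component, so windows are grouped by calendar day)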
    print(f"Prediction dates: {prediction_dates}")

    # Get unique algorithms/models
    algorithms = df["algorithm_name"].unique()
    print(f"Algorithms: {algorithms}")

    # Get unique event types
    event_types = df["event_type"].unique()
    print(f"Event types: {event_types}")

    # Create a summary of the data
    summary = {"total_records": len(df), "unique_events": df["event_id"].nunique(), "unique_algorithms": len(algorithms), "unique_event_types": len(event_types), "prediction_dates": prediction_dates, "algorithms": algorithms.tolist(), "event_types": event_types.tolist()}

    print("\n=== Data Summary ===")
    for key, value in summary.items():
        print(f"{key}: {value}")

    return df, summary


def generate_queue(df):
    """Generate evaluation queue from processed data"""
    print("Generating evaluation queue...")

    # Collapse to one row per event; the event-level fields repeat across
    # per-algorithm prediction rows, so "first" recovers them
    unique_events = (
        df.groupby("event_id")
        .agg(
            {
                "question": "first",
                "event_type": "first",
                "answer_options": "first",
                "open_to_bet_until": "first",
                "result": "first",
            }
        )
        .reset_index()
    )

    # Split into unresolved and resolved events for reporting; the full set is saved below
    pending_events = unique_events[unique_events["result"].isna()]
    resolved_events = unique_events[unique_events["result"].notna()]

    print(f"Total unique events: {len(unique_events)}")
    print(f"Pending events: {len(pending_events)}")
    print(f"Resolved events: {len(resolved_events)}")

    # Save queue locally
    queue_path = os.path.join(PREDICTIONS_CSV_PATH, "evaluation_queue.csv")
    unique_events.to_csv(queue_path, index=False)
    print(f"✓ Queue saved to {queue_path}")

    return unique_events


def main():
    """Main function to download and process data"""
    print("=== FutureBench Data Download and Processing ===")

    # Download datasets
    download_datasets()

    # Process data
    df, summary = process_data()

    if df is None:
        print("❌ Failed to process data. Exiting.")
        return

    # Generate queue
    queue = generate_queue(df)

    print("\n=== Processing Complete ===")
    print("Data processed and queue generated successfully!")
    print(f"Queue contains {len(queue)} events")


if __name__ == "__main__":
    main()