File size: 7,636 Bytes
8a142a6
 
 
 
 
 
f85a3ff
 
 
8a142a6
 
 
 
 
f85a3ff
 
 
 
 
 
8a142a6
 
 
 
 
 
 
f85a3ff
 
 
8a142a6
 
 
 
 
 
 
 
 
 
 
347797e
 
8a142a6
347797e
 
8a142a6
 
 
 
 
 
 
f85a3ff
8a142a6
 
 
 
 
 
 
 
 
 
 
 
347797e
8a142a6
347797e
 
8a142a6
 
347797e
 
 
 
 
 
 
 
 
8a142a6
347797e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a142a6
347797e
8a142a6
347797e
8a142a6
 
347797e
8a142a6
347797e
8a142a6
347797e
8a142a6
347797e
 
8a142a6
347797e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a142a6
347797e
 
 
 
8a142a6
 
 
347797e
 
 
8a142a6
347797e
 
8a142a6
 
 
347797e
8a142a6
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import os
import json
import pandas as pd
import random
import re

# Module-level cache for the arena DataFrame; stays None until the first
# successful call to load_arena_data()
_ARENA_DATA = None

def load_arena_data():
    """
    Return the arena dataset read from utils/arena_df.csv.

    The CSV path is relative to the current working directory. The first
    successful read is kept in the module-level cache and handed back on
    every later call without touching the disk again. If reading fails for
    any reason, the error is printed and an empty DataFrame is returned;
    nothing is cached in that case, so the next call tries the file again.
    """
    global _ARENA_DATA

    if _ARENA_DATA is None:
        try:
            frame = pd.read_csv(os.path.join('utils', 'arena_df.csv'))
            print(f"Loaded arena data with {len(frame)} examples")
            # Remember the frame so later calls skip the read
            _ARENA_DATA = frame
        except Exception as err:
            print(f"Error loading arena data: {err}")
            # Fall back to an empty frame; callers check .empty
            return pd.DataFrame()

    return _ARENA_DATA

def create_dummy_example():
    """Build the placeholder example returned when the dataset is unavailable"""
    placeholder = dict(
        question="Could not load questions from the dataset. Please check the data file.",
        processed_context_desc="Error: Data not available",
        contexts=[],
        full_contexts=[],
        Answerable=False,
        insufficient=True,
        insufficient_reason="Data loading error",
    )
    return placeholder

def _blank_if_missing(value):
    """Return "" when a cell value is missing (None/NaN), else the value itself."""
    return "" if pd.isna(value) else value

def _normalize_context_items(items):
    """Extract the text of each context item, skipping unusable entries."""
    texts = []
    for item in items:
        if isinstance(item, str):
            texts.append(item)
        elif isinstance(item, dict) and 'content' in item:
            texts.append(item['content'])
    return texts

def _parse_full_contexts(raw):
    """Turn the raw 'contexts' cell (string or list) into a list of context texts."""
    # Local import: the module's top-level imports do not include ast
    import ast

    if isinstance(raw, list):
        return _normalize_context_items(raw)
    if not isinstance(raw, str) or not raw:
        # Missing column, NaN cell, empty string, or an unsupported type
        return []

    stripped = raw.strip()
    if not (stripped.startswith('[') and stripped.endswith(']')):
        # Not list-shaped: the whole string is a single context
        return [raw]

    try:
        # Handles Python list literals like "['string1', 'string2']"
        parsed = ast.literal_eval(stripped)
    except (SyntaxError, ValueError, TypeError):
        # Not a Python literal (e.g. contains JSON true/false/null); try JSON
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError:
            # Neither format parses: treat the string as a single context
            return [raw]

    # Every parse path goes through the same normalization so that dict items
    # are unwrapped to their 'content' regardless of which parser succeeded
    return _normalize_context_items(parsed)

def _parse_highlighted_contexts(raw):
    """Turn the raw 'contexts_highlighted' cell into a list of display entries."""
    if isinstance(raw, str):
        if not raw:
            return []
        try:
            items = json.loads(raw)
        except json.JSONDecodeError:
            print(f"Error parsing contexts_highlighted JSON: {raw[:100]}...")
            return []
    elif isinstance(raw, list):
        items = raw
    else:
        # Missing column, NaN cell, or an unsupported type
        return []

    # 'abbreviatedContent' is passed through untouched; per the original
    # comments it already carries the HTML span tags for highlights
    return [
        {
            'is_primary': item.get('type', 'secondary') == 'primary',
            'content': item.get('abbreviatedContent', ''),
        }
        for item in items
        if isinstance(item, dict)
    ]

def get_random_example():
    """
    Selects a random example from the loaded arena data.
    Returns the example data in a format compatible with the application.

    Returns:
        A dict with the keys:
          - "question": value of the row's 'question' column.
          - "Answerable": logical negation of the 'insufficient' value.
          - "insufficient": the row's 'insufficient' value (False if absent).
          - "insufficient_reason": the row's reason, or "" when the column is
            absent or the cell is missing (NaN).
          - "sample_id": the row's 'sample_id' (0 if absent).
          - "processed_context_desc": the row's description, or "" when
            absent or missing.
          - "contexts": list of {'is_primary': bool, 'content': ...} entries
            built from 'contexts_highlighted'; if that yields nothing, one
            non-primary entry per full context is used instead.
          - "full_contexts": list of context texts parsed from 'contexts'.
        When no data is loaded, the dict from create_dummy_example() is
        returned instead.

    Parsing of the two context columns is best-effort: any unexpected error
    is printed and the corresponding list is left empty.
    """
    # Get the globally stored data - won't reload from disk
    df = load_arena_data()

    if df.empty:
        # If no data is loaded, return a dummy example
        return create_dummy_example()

    # Select a random row
    example = df.sample(1).iloc[0]
    insufficient = example.get('insufficient', False)

    processed_example = {
        "question": example['question'],
        "Answerable": not insufficient,
        "insufficient": insufficient,
        # Guard against NaN the same way as the description below, so callers
        # always get a string-or-original value rather than a float NaN
        "insufficient_reason": _blank_if_missing(example.get('insufficient_reason', '')),
        "sample_id": example.get('sample_id', 0),
        "processed_context_desc": _blank_if_missing(example.get('processed_context_desc', '')),
    }

    # Process full contexts - from the 'contexts' column
    try:
        full_contexts = _parse_full_contexts(example.get('contexts'))
    except Exception as e:
        print(f"Error processing full contexts: {e}")
        full_contexts = []

    # Process highlighted contexts - from the 'contexts_highlighted' column
    try:
        contexts_highlighted = _parse_highlighted_contexts(example.get('contexts_highlighted'))
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")
        contexts_highlighted = []

    # Make sure the highlighted contexts are populated even when the
    # 'contexts_highlighted' column gave nothing usable
    if not contexts_highlighted and full_contexts:
        contexts_highlighted = [
            {'is_primary': False, 'content': content}
            for content in full_contexts
        ]

    processed_example["contexts"] = contexts_highlighted
    processed_example["full_contexts"] = full_contexts

    return processed_example

def get_random_example_and_models(model_names):
    """
    Pick a random arena example together with two distinct models.

    The example comes from get_random_example(); the two models are drawn
    without replacement from model_names, the first taking position A and
    the second position B.
    """
    picked_example = get_random_example()
    # Sampling without replacement guarantees the two models differ
    chosen_models = random.sample(model_names, 2)
    return picked_example, chosen_models[0], chosen_models[1]