Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,636 Bytes
8a142a6 f85a3ff 8a142a6 f85a3ff 8a142a6 f85a3ff 8a142a6 347797e 8a142a6 347797e 8a142a6 f85a3ff 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 347797e 8a142a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
import os
import json
import pandas as pd
import random
import re
# Module-level cache for the arena DataFrame. It starts empty and is filled by
# the first successful call to load_arena_data(); later calls reuse it.
_ARENA_DATA = None


def load_arena_data():
    """
    Return the arena dataset as a pandas DataFrame, reading it at most once.

    The CSV is read from ``utils/arena_df.csv`` (relative to the current
    working directory) on the first successful call and cached in the
    module-level ``_ARENA_DATA``. Subsequent calls return the cached frame.

    Returns:
        The cached DataFrame, or a new empty DataFrame when the file cannot
        be read. A failed read is not cached, so the next call tries again.
    """
    global _ARENA_DATA

    if _ARENA_DATA is None:
        try:
            frame = pd.read_csv(os.path.join('utils', 'arena_df.csv'))
            print(f"Loaded arena data with {len(frame)} examples")
            _ARENA_DATA = frame
        except Exception as err:
            # Best effort: report the problem and hand back an empty frame
            # so callers can fall back to a placeholder example.
            print(f"Error loading arena data: {err}")
            return pd.DataFrame()

    return _ARENA_DATA
def create_dummy_example():
    """
    Build the placeholder example used when no arena data is available.

    Returns:
        A new dict with an error message as the question, empty context
        lists, and the example flagged as insufficient / not answerable.
    """
    placeholder = {}
    placeholder["question"] = "Could not load questions from the dataset. Please check the data file."
    placeholder["processed_context_desc"] = "Error: Data not available"
    # Fresh list objects on every call so callers may mutate them safely.
    placeholder["contexts"] = []
    placeholder["full_contexts"] = []
    placeholder["Answerable"] = False
    placeholder["insufficient"] = True
    placeholder["insufficient_reason"] = "Data loading error"
    return placeholder
def _extract_context_texts(items):
    """Keep string items and the 'content' value of dict items; drop anything else."""
    texts = []
    for item in items:
        if isinstance(item, str):
            texts.append(item)
        elif isinstance(item, dict) and 'content' in item:
            texts.append(item['content'])
    return texts


def _parse_full_contexts(raw):
    """Turn a non-empty 'contexts' cell (string or list) into a list of context texts."""
    if isinstance(raw, list):
        return _extract_context_texts(raw)
    if not isinstance(raw, str):
        # e.g. NaN from an empty CSV cell: nothing to extract.
        return []

    stripped = raw.strip()
    if not (stripped.startswith('[') and stripped.endswith(']')):
        # Plain text, not a serialized list: treat it as a single context.
        return [raw]

    # Local import: only this helper needs ast.
    import ast
    try:
        # Python list literal, e.g. "['string1', 'string2']".
        parsed = ast.literal_eval(stripped)
    except (SyntaxError, ValueError):
        try:
            # JSON arrays that are not valid Python (true / false / null).
            parsed = json.loads(stripped)
        except json.JSONDecodeError:
            # Looks like a list but parses as neither: keep the raw text.
            return [raw]

    # Both parse routes share one normalization so dict items always have
    # their 'content' extracted, regardless of which parser succeeded.
    return _extract_context_texts(parsed)


def _build_highlight_entries(items):
    """Convert highlighted-context dicts into {'is_primary', 'content'} entries."""
    return [
        {
            'is_primary': item.get('type', 'secondary') == 'primary',
            # The abbreviated content already carries its HTML highlight spans.
            'content': item.get('abbreviatedContent', ''),
        }
        for item in items
        if isinstance(item, dict)
    ]


def _parse_highlighted_contexts(raw):
    """Turn a non-empty 'contexts_highlighted' cell (JSON string or list) into entries."""
    if isinstance(raw, list):
        return _build_highlight_entries(raw)
    if isinstance(raw, str):
        try:
            return _build_highlight_entries(json.loads(raw))
        except json.JSONDecodeError:
            print(f"Error parsing contexts_highlighted JSON: {raw[:100]}...")
    return []


def get_random_example():
    """
    Select a random example from the loaded arena data.

    Returns:
        A dict with the keys ``question``, ``Answerable``, ``insufficient``,
        ``insufficient_reason``, ``sample_id``, ``processed_context_desc``,
        ``contexts`` (list of ``{'is_primary', 'content'}`` dicts) and
        ``full_contexts`` (list of context texts). When no data is loaded,
        the placeholder from create_dummy_example() is returned instead.

    Context parsing is best effort: a failure is printed and the affected
    list is left empty rather than raising.
    """
    # Uses the cached data when it has already been loaded.
    df = load_arena_data()
    if df.empty:
        return create_dummy_example()

    # Pick one row at random.
    example = df.sample(1).iloc[0]

    processed_example = {
        "question": example['question'],
        "Answerable": not example.get('insufficient', False),
        "insufficient": example.get('insufficient', False),
        "insufficient_reason": example.get('insufficient_reason', ''),
        "sample_id": example.get('sample_id', 0)
    }

    # A missing description comes back as NaN; normalize it to an empty string.
    context_desc = example.get('processed_context_desc', '')
    if pd.isna(context_desc):
        context_desc = ""
    processed_example["processed_context_desc"] = context_desc

    # Full contexts come from the 'contexts' column.
    full_contexts = []
    try:
        if 'contexts' in example and example['contexts']:
            full_contexts = _parse_full_contexts(example['contexts'])
    except Exception as e:
        print(f"Error processing full contexts: {e}")

    # Highlighted contexts come from the 'contexts_highlighted' column.
    contexts_highlighted = []
    try:
        if 'contexts_highlighted' in example and example['contexts_highlighted']:
            contexts_highlighted = _parse_highlighted_contexts(
                example['contexts_highlighted']
            )
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # Without highlight data, fall back to the full contexts as
    # non-primary entries so 'contexts' is still populated.
    if not contexts_highlighted and full_contexts:
        contexts_highlighted = [
            {'is_primary': False, 'content': content}
            for content in full_contexts
        ]

    processed_example["contexts"] = contexts_highlighted
    processed_example["full_contexts"] = full_contexts
    return processed_example
def get_random_example_and_models(model_names):
    """
    Pick a random arena example and two models for positions A and B.

    Args:
        model_names: Candidate model names; must hold at least two entries.
            Any iterable is accepted (list, tuple, set, dict keys) because
            it is copied into a list before sampling.

    Returns:
        A tuple ``(example, model_a_name, model_b_name)``. ``example`` is the
        dict returned by get_random_example(); the two names are drawn from
        ``model_names`` without replacement.

    Raises:
        ValueError: If fewer than two model names are supplied (raised by
            random.sample).
    """
    example = get_random_example()
    # random.sample() only accepts sequences (sets are rejected since
    # Python 3.11), so materialize the candidates first. For a list or
    # tuple this yields the same draw as sampling it directly.
    model_a_name, model_b_name = random.sample(list(model_names), 2)
    return example, model_a_name, model_b_name