Spaces:
Running
on
Zero
Running
on
Zero
import os | |
import json | |
import pandas as pd | |
import random | |
import re | |
from .context_processor import process_highlights | |
# Module-level cache for the arena DataFrame. It starts as None and is filled
# lazily by the first successful load_arena_data() call (not at import time);
# later calls return the cached object instead of re-reading the CSV.
_ARENA_DATA = None
def load_arena_data():
    """
    Return the arena DataFrame, reading it from disk on first use.

    The CSV is looked up at ``utils/arena_df.csv`` relative to the current
    working directory. A successful read is stored in the module-level
    ``_ARENA_DATA`` cache and reused by every later call.

    Returns:
        pandas.DataFrame: The cached/loaded data, or an empty DataFrame when
        loading fails. A failed load is not cached, so the next call tries
        to read the file again.
    """
    global _ARENA_DATA

    cached = _ARENA_DATA
    if cached is None:
        try:
            source_file = os.path.join('utils', 'arena_df.csv')
            frame = pd.read_csv(source_file)
            print(f"Loaded arena data with {len(frame)} examples")
        except Exception as err:
            # Best effort: report the problem and hand back an empty frame
            # so callers can fall back to a placeholder example.
            print(f"Error loading arena data: {err}")
            return pd.DataFrame()

        # Remember the successfully loaded frame for subsequent calls.
        _ARENA_DATA = frame
        cached = frame

    return cached
def create_dummy_example():
    """
    Build the placeholder example returned when no arena data is available.

    Returns:
        dict: A fresh dictionary on every call whose text fields carry an
        error message instead of real data. In addition to the original
        keys it contains ``full_contexts`` and ``insufficient_reason`` so
        that it exposes every key ``get_random_example`` sets on a real
        example.
    """
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": ["No context available"],
        "full_context": "Error loading context data.",
        # Mirror the keys of a real example so consumers can index them
        # unconditionally on the fallback path as well.
        "full_contexts": [],
        "insufficient_reason": "",
        "Answerable": False,
        "insufficient": True,
    }
def _is_missing(value):
    """Return True when *value* is None or a NaN/NaT scalar (an empty CSV cell)."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _field_or_default(example, key, default):
    """Read *key* from the row, substituting *default* for absent or NaN values."""
    value = example.get(key, default)
    return default if _is_missing(value) else value


def _parse_full_contexts(contexts_raw):
    """Decode the JSON ``contexts`` cell into a list of numbered content chunks."""
    if not isinstance(contexts_raw, str):
        return []
    contexts = json.loads(contexts_raw)
    if not isinstance(contexts, list):
        return []
    # chunk_num follows the position in the decoded list, so entries that are
    # skipped (not a dict / no 'content') leave gaps in the numbering.
    return [
        {'chunk_num': i + 1, 'content': chunk.get('content', '')}
        for i, chunk in enumerate(contexts)
        if isinstance(chunk, dict) and 'content' in chunk
    ]


def _extract_highlighted_items(raw):
    """Turn the ``contexts_highlighted`` cell into a list of type/content dicts."""
    if _is_missing(raw):
        return []
    if not isinstance(raw, str):
        # Already an object, not a string: only list-like values are usable.
        return list(raw) if isinstance(raw, (list, tuple)) else []

    # The string is scanned with regular expressions rather than json.loads.
    # NOTE(review): the content pattern stops at the first double quote, so a
    # value containing an escaped quote (\") is cut short at that point -
    # confirm whether the dataset can contain such values.
    type_pattern = r'"type":\s*"(primary|secondary)"'
    content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'
    types = re.findall(type_pattern, raw)
    # Each match is a tuple with one slot per alternative; keep the non-empty one.
    contents = [
        next((group for group in groups if group), "")
        for groups in re.findall(content_pattern, raw)
    ]
    # Types and contents are paired purely by their order of appearance.
    return [
        {'type': ctx_type, 'abbreviatedContent': content}
        for ctx_type, content in zip(types, contents)
    ]


def get_random_example():
    """
    Select a random example from the loaded arena data.

    Returns:
        dict: The placeholder from ``create_dummy_example()`` when no data is
        loaded. Otherwise a dictionary with the keys ``question``,
        ``processed_context_desc``, ``Answerable``, ``insufficient``,
        ``insufficient_reason``, ``full_contexts`` (list of
        ``{'chunk_num', 'content'}``) and ``contexts`` (list of
        ``{'chunk_num', 'content', 'is_primary'}``). ``contexts`` holds the
        highlighted chunks, or the full chunks when no highlighted chunk
        could be extracted.

    Raises:
        KeyError: If the data has no ``question`` column.
    """
    # Get the globally stored data - won't reload from disk
    df = load_arena_data()
    if df.empty:
        return create_dummy_example()

    # Select a random row
    example = df.sample(1).iloc[0]

    # Series.get() only applies its default when the column is absent; an
    # empty CSV cell comes back as NaN, so the defaults are enforced here.
    processed_example = {
        "question": example['question'],
        "processed_context_desc": _field_or_default(example, 'processed_context_desc', ''),
        "Answerable": _field_or_default(example, 'Answerable', True),
        "insufficient": _field_or_default(example, 'insufficient', False),
        "insufficient_reason": _field_or_default(example, 'insufficient_reason', ''),
    }

    # Full contexts. Best effort: a malformed row must not break the caller.
    try:
        processed_example["full_contexts"] = _parse_full_contexts(example['contexts'])
    except Exception as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Highlighted contexts for display. Items converted before a failure are kept.
    contexts_highlighted = []
    try:
        highlighted_items = _extract_highlighted_items(example.get('contexts_highlighted'))
        for i, item in enumerate(highlighted_items):
            if isinstance(item, dict):
                ctx_type = item.get('type', 'secondary')
                content = process_highlights(item.get('abbreviatedContent', ''))
                contexts_highlighted.append({
                    'chunk_num': i + 1,
                    'content': content,
                    'is_primary': ctx_type == 'primary'
                })
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # If no highlighted context could be produced, fall back to the full contexts
    if not contexts_highlighted and processed_example["full_contexts"]:
        contexts_highlighted = [
            {
                'chunk_num': i + 1,
                'content': ctx.get('content', ''),
                'is_primary': False
            }
            for i, ctx in enumerate(processed_example["full_contexts"])
        ]

    processed_example["contexts"] = contexts_highlighted
    return processed_example
def get_random_example_and_models(model_names):
    """
    Pick a random arena example and two models for positions A and B.

    Args:
        model_names: Iterable of model names. It is copied into a list, so
            sets, dict key views and generators are accepted in addition to
            sequences.

    Returns:
        tuple: ``(example, model_a_name, model_b_name)`` where ``example`` is
        the result of ``get_random_example()`` and the two names are drawn
        from different positions of ``model_names``.

    Raises:
        ValueError: If fewer than two model names are supplied.
    """
    # random.sample() only accepts sequences; materialize the input first.
    candidates = list(model_names)
    # Fail fast, before any example is loaded, with a descriptive message.
    if len(candidates) < 2:
        raise ValueError(
            f"Need at least two model names to pick a pair, got {len(candidates)}"
        )

    example = get_random_example()
    # Choose two different entries from the model list
    model_a_name, model_b_name = random.sample(candidates, 2)
    return example, model_a_name, model_b_name