# Optimus-Agent-Performance / fetch_and_preprocess_data.py
# gauravlochab
# feat: implement data fetching for APR and ROI metrics
# 2425de8
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import logging
# Module-level logger, named after this module via __name__
logger = logging.getLogger(__name__)
def generate_continuous_random_data(existing_data, end_time=None):
    """
    Generate authentic-looking random data that continues from existing data,
    with adjusted APR following APR with a small offset.

    One APR row and one ROI row are produced per agent for every 10-minute
    timestamp between the end of the existing data and ``end_time``.

    Args:
        existing_data: DataFrame containing the existing data (columns
            'timestamp', 'agent_id', 'agent_name', 'metric_type', 'apr',
            'roi' are read). May be empty or None, in which case a single
            dummy agent covering the 30 days before ``end_time`` is generated.
        end_time: Optional end time (defaults to current time)

    Returns:
        DataFrame with dummy data points (columns: timestamp, apr,
        adjusted_apr, roi, agent_id, agent_name, is_dummy, metric_type),
        or an empty DataFrame when no new timestamps are needed.
    """
    # Use current time if not specified
    if end_time is None:
        end_time = datetime.now()

    has_history = existing_data is not None and not existing_data.empty

    # Continue right after the latest existing timestamp, or cover the last
    # 30 days when there is nothing to continue from
    interval = timedelta(minutes=10)
    if has_history:
        start_time = existing_data['timestamp'].max() + interval
    else:
        start_time = end_time - timedelta(days=30)

    # Generate timestamps with 10-minute intervals
    timestamps = []
    current = start_time
    while current <= end_time:
        timestamps.append(current)
        current += interval

    if not timestamps:
        return pd.DataFrame()  # No new data needed

    if has_history:
        unique_agents = (
            existing_data[['agent_id', 'agent_name']]
            .drop_duplicates()
            .to_dict('records')
        )
        # Order by time so "last value" means "latest value" even when the
        # input rows are not chronologically sorted. mergesort is stable, so
        # already-sorted input keeps its row order; NaT rows are pushed to the
        # front so they are never mistaken for the latest observation.
        ordered_history = existing_data.sort_values(
            'timestamp', kind='mergesort', na_position='first'
        )
    else:
        # Create one dummy agent if no existing data
        unique_agents = [{'agent_id': 'dummy_agent', 'agent_name': 'Dummy Agent'}]
        ordered_history = None

    num_points = len(timestamps)
    dummy_data_list = []

    # For each agent, create continuous dummy data
    for agent in unique_agents:
        agent_id = agent['agent_id']
        agent_name = agent['agent_name']

        # Last real values for this agent, used to ensure continuity
        last_apr = _dummy_last_metric_value(ordered_history, agent_id, 'APR', 'apr')
        last_roi = _dummy_last_metric_value(ordered_history, agent_id, 'ROI', 'roi')

        apr_values = _dummy_apr_walk(last_apr, num_points)
        adjusted_apr_values = _dummy_adjusted_apr(apr_values)
        roi_values = _dummy_roi_series(last_roi, num_points)

        # Create dummy data points: one APR row and one ROI row per timestamp
        for i, timestamp in enumerate(timestamps):
            dummy_data_list.append({
                'timestamp': timestamp,
                'apr': apr_values[i],
                'adjusted_apr': adjusted_apr_values[i],
                'roi': None,
                'agent_id': agent_id,
                'agent_name': agent_name,
                'is_dummy': True,
                'metric_type': 'APR'
            })
            dummy_data_list.append({
                'timestamp': timestamp,
                'apr': None,
                'adjusted_apr': None,
                'roi': roi_values[i],
                'agent_id': agent_id,
                'agent_name': agent_name,
                'is_dummy': True,
                'metric_type': 'ROI'
            })

    return pd.DataFrame(dummy_data_list)


def _dummy_last_metric_value(ordered_history, agent_id, metric_type, column):
    """Return the latest non-missing `column` value for an agent/metric, or None."""
    if ordered_history is None:
        return None
    matches = ordered_history[
        (ordered_history['agent_id'] == agent_id)
        & (ordered_history['metric_type'] == metric_type)
    ]
    if matches.empty:
        return None
    value = matches[column].iloc[-1]
    return None if pd.isna(value) else value


def _dummy_apr_walk(last_apr, num_points):
    """Build `num_points` APR values as a trending random walk starting at `last_apr`."""
    # If no last value, start close to zero
    if last_apr is None:
        last_apr = random.uniform(-0.1, 0.1)

    # Create 3-5 trend periods; each has a direction (-1: down, 0: sideways,
    # 1: up) and a small strength so the series looks natural
    num_trends = random.randint(3, 5)
    period_length = num_points // num_trends
    drift_per_point = []
    for trend_idx in range(num_trends):
        direction = random.choice([-1, 0, 1])
        strength = random.uniform(0.01, 0.03)
        if trend_idx == num_trends - 1:
            # The final period absorbs the remainder of the integer division
            # so every point belongs to a trend
            span = num_points - len(drift_per_point)
        else:
            span = period_length
        drift_per_point.extend([direction * strength] * span)

    apr_values = [last_apr]
    prev_change = 0
    for i in range(1, num_points):
        # Trend drift + normally distributed noise + 30% of the previous
        # (post-clamp) change as momentum, which smooths the walk
        total_change = (
            drift_per_point[i]
            + random.normalvariate(0, 0.01)
            + 0.3 * prev_change
        )
        # Keep within reasonable bounds (-0.5 to 1.0)
        new_value = max(-0.5, min(1.0, apr_values[-1] + total_change))
        prev_change = new_value - apr_values[-1]
        apr_values.append(new_value)
    return apr_values


def _dummy_adjusted_apr(apr_values):
    """Return adjusted APR values that track `apr_values` with a small varying offset."""
    adjusted_apr_values = []
    offset_direction = 1
    for i, apr_value in enumerate(apr_values):
        # Periodically re-pick whether adjusted APR sits above or below APR
        if i % 5 == 0:
            offset_direction = 1 if random.random() > 0.5 else -1
        offset = offset_direction * random.uniform(0.05, 0.15)
        # Keep within reasonable bounds (-0.5 to 1.0)
        adjusted_apr_values.append(max(-0.5, min(1.0, apr_value + offset)))
    return adjusted_apr_values


def _dummy_roi_series(last_roi, num_points):
    """Build `num_points` ROI values in [-0.01, 0] that drift toward shuffled targets."""
    # Note: per the original author, ROI values are multiplied by 100 in
    # app.py, so values in [-0.01, 0] end up between -1 and 0 there.
    roi_low, roi_high = -0.01, 0

    # Five targets evenly spaced from -0.01 to -0.0025 (deliberately never
    # exactly 0), shuffled to make the pattern less predictable
    num_targets = 5
    top_target = -0.0025
    target_points = [
        roi_low + i * (top_target - roi_low) / (num_targets - 1)
        for i in range(num_targets)
    ]
    random.shuffle(target_points)

    # Start with the last real value when it lies in range, otherwise start
    # in the middle of the range
    if last_roi is None or last_roi < roi_low or last_roi > roi_high:
        current_value = -0.005
    else:
        current_value = last_roi
    roi_values = [current_value]

    # One equally long segment per target; within a segment the per-step
    # drift toward the target is fixed and small noise is added on top
    segment_length = num_points // num_targets
    if segment_length > 0:
        for target in target_points:
            current_value = roi_values[-1]
            step_change = (target - current_value) / segment_length
            for _ in range(segment_length):
                new_value = current_value + step_change + random.uniform(-0.0005, 0.0005)
                # Ensure we stay within range
                new_value = max(roi_low, min(roi_high, new_value))
                roi_values.append(new_value)
                current_value = new_value

    # If the segments did not produce enough points, pad with small random
    # variations from the last point
    while len(roi_values) < num_points:
        new_value = roi_values[-1] + random.uniform(-0.001, 0.001)
        roi_values.append(max(roi_low, min(roi_high, new_value)))

    # If we generated too many points, trim the list
    return roi_values[:num_points]