auto-analyst-backend / scripts /generate_test_data.py
FireBird-Tech's picture
Upload 54 files
d32c69c verified
raw
history blame
3.67 kB
import sys
import os
import random
from datetime import datetime, timedelta
import sqlite3
# Add parent directory to path so we can import modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from init_db import ModelUsage, session_factory
# Catalogue of models that generated records may reference.
# Each table row is (model name, provider, cost per 1k tokens); the mapping
# built from it is keyed by model name and keeps the row order.
MODELS = {
    model_name: {"provider": provider, "cost_per_1k": cost_per_1k}
    for model_name, provider, cost_per_1k in (
        ("gpt-3.5-turbo", "OpenAI", 0.0015),
        ("gpt-4", "OpenAI", 0.03),
        ("gpt-4o", "OpenAI", 0.01),
        ("gpt-4o-mini", "OpenAI", 0.0015),
        ("o1-mini", "OpenAI", 0.00015),
        ("claude-3-opus", "Anthropic", 0.015),
        ("claude-3-sonnet", "Anthropic", 0.008),
        ("claude-3-haiku", "Anthropic", 0.003),
        ("llama-3-8b", "Groq", 0.0005),
        ("llama-3-70b", "Groq", 0.002),
    )
}

# Pool of user IDs that records are attributed to (swap in real IDs if you
# have specific users).
USER_IDS = [1, 2, 3, 4, 5]
def generate_test_data(num_records=100, days=30):
    """Generate test model usage data.

    Adds ``num_records`` randomly generated ``ModelUsage`` records to a
    session obtained from ``session_factory`` and commits them in one go.
    Each record gets a random model from ``MODELS``, a random user from
    ``USER_IDS``, random token counts, a cost derived from the model's
    ``cost_per_1k`` rate, and a timestamp within the last ``days`` days.

    Args:
        num_records: Number of records to create.
        days: Size of the look-back window, in days, that timestamps are
            drawn from. The window ends at the current UTC time.

    Returns:
        True if all records were committed. False if any error occurred, in
        which case the session is rolled back and the error is printed.
    """
    session = session_factory()
    try:
        # Naive UTC "now" marks the end of the timestamp window.
        end_date = datetime.utcnow()
        # Draw one offset in minutes so every timestamp stays inside
        # [end_date - days, end_date]. Subtracting independent random days,
        # hours and minutes would overshoot the window by up to a day.
        window_minutes = int(days * 24 * 60)
        # random.choice needs a sequence; build it once, not per record.
        model_names = list(MODELS)

        for _ in range(num_records):
            # Random timestamp within the date range
            timestamp = end_date - timedelta(
                minutes=random.randint(0, window_minutes)
            )

            # Select random model and user
            model_name = random.choice(model_names)
            model_info = MODELS[model_name]
            user_id = random.choice(USER_IDS)

            # Generate random token counts
            prompt_tokens = random.randint(100, 1000)
            completion_tokens = random.randint(50, 500)
            total_tokens = prompt_tokens + completion_tokens

            # Calculate cost: the rate is expressed per 1,000 tokens
            cost = (total_tokens / 1000) * model_info["cost_per_1k"]

            # Create model usage record
            usage = ModelUsage(
                user_id=user_id,
                chat_id=random.randint(1, 50),  # Random chat ID
                model_name=model_name,
                provider=model_info["provider"],
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
                query_size=prompt_tokens * 4,  # Approximate characters
                response_size=completion_tokens * 4,  # Approximate characters
                cost=cost,
                timestamp=timestamp,
                is_streaming=random.choice([True, False]),
                request_time_ms=random.randint(500, 5000),  # Between 0.5 and 5 seconds
            )
            session.add(usage)

        # Single commit: either every record is stored or none is.
        session.commit()
        print(f"Successfully generated {num_records} test records")
        return True
    except Exception as e:
        # Best-effort script: undo the partial batch, report, and signal
        # failure to the caller instead of raising.
        session.rollback()
        print(f"Error generating test data: {e}")
        return False
    finally:
        session.close()
if __name__ == "__main__":
    # Usage: python generate_test_data.py [num_records]
    # Default to 100 records, but allow command line override.
    num_records = 100
    if len(sys.argv) > 1:
        try:
            num_records = int(sys.argv[1])
        except ValueError:
            # Exit with a readable message instead of a raw traceback.
            sys.exit(f"num_records must be an integer, got {sys.argv[1]!r}")
        if num_records < 0:
            # A negative count would create nothing yet still report success.
            sys.exit(f"num_records must be non-negative, got {num_records}")
    generate_test_data(num_records)
    print("Done! The model_usage table has been populated with test data.")