Spaces:

MilesCranmer
/

PySR

Running

App Files Files Community

PySR / gui /processing.py

MilesCranmer

Refactor GUI to multiple files

9fa2182 unverified about 1 year ago

raw

history blame

4.67 kB

	import multiprocessing as mp
	import os
	import shutil
	import tempfile
	import time
	from pathlib import Path

	import numpy as np
	import pandas as pd

	from .data import generate_data

	def EMPTY_DF():
	    """Return a new, empty equations table.

	    A fresh DataFrame with the columns ``Equation``, ``Loss`` and
	    ``Complexity`` (in that order, no rows) is built on every call, so
	    callers never share one mutable instance.

	    The upper-case name is kept so existing ``EMPTY_DF()`` call sites keep
	    working.
	    """
	    return pd.DataFrame(
	        {
	            "Equation": [],
	            "Loss": [],
	            "Complexity": [],
	        }
	    )


	def process(
	    file_input,
	    force_run,
	    test_equation,
	    num_points,
	    noise_level,
	    data_seed,
	    niterations,
	    maxsize,
	    binary_operators,
	    unary_operators,
	    plot_update_delay,
	    parsimony,
	    populations,
	    population_size,
	    ncycles_per_iteration,
	    elementwise_loss,
	    adaptive_parsimony_scaling,
	    optimizer_algorithm,
	    optimizer_iterations,
	    batching,
	    batch_size,
	):
	    """Load a dataset, fit PySR in a child process, and stream its equations.

	    When ``file_input`` is given it is read as CSV: the last column is the
	    target and all remaining columns are the features. Otherwise the data
	    comes from ``generate_data(test_equation, num_points, noise_level,
	    data_seed)``. The fit runs in a separate ``multiprocessing`` process
	    (``pysr_fit``) whose ``equation_file`` lives in a temporary directory;
	    this generator polls ``hall_of_fame.csv.bkup`` in that directory
	    (presumably written by PySR next to the equation file) while the child
	    is alive.

	    All parameters from ``niterations`` onwards, except
	    ``plot_update_delay``, are forwarded unchanged to ``pysr_fit``.

	    Yields:
	        A DataFrame with the columns ``Complexity``, ``Loss`` and
	        ``Equation``, at most once every ``plot_update_delay`` seconds.

	    Returns:
	        ``(EMPTY_DF(), message)`` when the uploaded file has no rows, has a
	        single column, or has more than 10,000 rows and ``force_run`` is
	        falsy.

	        NOTE(review): this function is a generator, so that tuple travels
	        on ``StopIteration.value`` and is never *yielded*; a consumer that
	        only iterates will not see the message. Confirm against the caller
	        before changing the contract.
	    """
	    if file_input is not None:
	        # Look at some statistics of the file:
	        df = pd.read_csv(file_input)
	        if len(df) == 0:
	            return (
	                EMPTY_DF(),
	                "The file is empty!",
	            )
	        if len(df.columns) == 1:
	            return (
	                EMPTY_DF(),
	                "The file has only one column!",
	            )
	        if len(df) > 10_000 and not force_run:
	            return (
	                EMPTY_DF(),
	                "You have uploaded a file with more than 10,000 rows. "
	                "This will take very long to run. "
	                "Please upload a subsample of the data, "
	                "or check the box 'Ignore Warnings'.",
	            )

	        col_to_fit = df.columns[-1]
	        y = np.array(df[col_to_fit])
	        X = df.drop([col_to_fit], axis=1)
	    else:
	        X, y = generate_data(test_equation, num_points, noise_level, data_seed)

	    with tempfile.TemporaryDirectory() as tmpdirname:
	        base = Path(tmpdirname)
	        equation_file = base / "hall_of_fame.csv"
	        equation_file_bkup = base / "hall_of_fame.csv.bkup"
	        equation_file_copy = base / "hall_of_fame_copy.csv"
	        # Named ``worker`` so the local does not shadow this function's name.
	        worker = mp.Process(
	            target=pysr_fit,
	            kwargs=dict(
	                X=X,
	                y=y,
	                niterations=niterations,
	                maxsize=maxsize,
	                binary_operators=binary_operators,
	                unary_operators=unary_operators,
	                equation_file=equation_file,
	                parsimony=parsimony,
	                populations=populations,
	                population_size=population_size,
	                ncycles_per_iteration=ncycles_per_iteration,
	                elementwise_loss=elementwise_loss,
	                adaptive_parsimony_scaling=adaptive_parsimony_scaling,
	                optimizer_algorithm=optimizer_algorithm,
	                optimizer_iterations=optimizer_iterations,
	                batching=batching,
	                batch_size=batch_size,
	            ),
	        )
	        worker.start()
	        try:
	            last_yield_time = None
	            while worker.is_alive():
	                equations = _load_pareto_front(
	                    equation_file_bkup, equation_file_copy
	                )
	                if equations is None:
	                    # Nothing readable yet: back off instead of spinning the
	                    # CPU while the child process is still warming up.
	                    time.sleep(0.1)
	                    continue

	                # Throttle updates to one per ``plot_update_delay`` seconds.
	                while (
	                    last_yield_time is not None
	                    and time.monotonic() - last_yield_time < plot_update_delay
	                ):
	                    time.sleep(0.1)

	                yield equations[["Complexity", "Loss", "Equation"]]

	                last_yield_time = time.monotonic()
	        finally:
	            # If the consumer closed the generator early (or an error
	            # occurred), stop the child before the temporary directory it
	            # writes into is removed.
	            if worker.is_alive():
	                worker.terminate()
	            worker.join()


	def _load_pareto_front(snapshot_path, scratch_path):
	    """Read an equations snapshot and keep only its Pareto-optimal rows.

	    ``snapshot_path`` is first copied to ``scratch_path`` and the copy is
	    parsed, so the file being read is not one another process may rewrite.

	    Returns:
	        The equations sorted by ascending ``Complexity``, keeping only rows
	        whose ``Loss`` is strictly lower than that of every simpler row, or
	        ``None`` when the snapshot does not exist or is empty.
	    """
	    try:
	        shutil.copyfile(snapshot_path, scratch_path)
	        equations = pd.read_csv(scratch_path)
	    except (FileNotFoundError, pd.errors.EmptyDataError):
	        return None

	    # A more complex expression is only worth keeping if it lowers the loss.
	    # TODO: Not sure why dominated rows occur; could be the result of a late
	    # copy?
	    equations = equations.sort_values("Complexity", ascending=True)
	    equations = equations.reset_index(drop=True)
	    keep = []
	    min_loss = None
	    for loss in equations["Loss"]:
	        is_improvement = bool(min_loss is None or loss < min_loss)
	        if is_improvement:
	            min_loss = float(loss)
	        keep.append(is_improvement)
	    return equations.loc[keep]


	def pysr_fit(*, X, y, **pysr_kwargs):
	    """Build a ``PySRRegressor`` and fit it to ``X`` and ``y``.

	    Intended as the target of a child process: ``pysr`` is imported here,
	    inside the function, rather than at module import time.

	    Args:
	        X: Feature data handed to ``PySRRegressor.fit``.
	        y: Target data handed to ``PySRRegressor.fit``.
	        **pysr_kwargs: Extra keyword arguments passed straight to the
	            ``PySRRegressor`` constructor, alongside the fixed
	            ``progress=False`` and ``timeout_in_seconds=1000``.

	    Returns:
	        None. The fitted model is discarded.
	    """
	    from pysr import PySRRegressor

	    regressor = PySRRegressor(
	        progress=False,
	        timeout_in_seconds=1000,
	        **pysr_kwargs,
	    )
	    regressor.fit(X, y)