# Star Data Symbolic Regression

import matplotlib.pyplot as plt
import numpy as np
from pysr import PySRRegressor
import pandas as pd


def filter_data(data, key, min_value, max_value):
    """
    Keeps only the entries whose value under ``key`` lies in the inclusive
    range [min_value, max_value]. Every list in the dictionary is reduced to
    the same positions, so the lists stay aligned with each other.

    :param data: Dictionary mapping names to equally indexed sequences.
    :param key: The key whose values decide which positions are kept.
    :param min_value: Lower bound (inclusive).
    :param max_value: Upper bound (inclusive).
    :return: A new dictionary with the same keys and only the kept positions.
    """
    # Positions of the reference column that pass the range check.
    kept_positions = [
        position
        for position, candidate in enumerate(data[key])
        if min_value <= candidate <= max_value
    ]

    # Apply the same positions to every column, preserving key order.
    return {
        name: [column[position] for position in kept_positions]
        for name, column in data.items()
    }


def get_normalized_cdf(data_dict, x_key, y_key):
    """
    Extracts the x and y data from the provided dictionary using the given keys,
    computes the running (cumulative) sum of the y data, and divides it by its
    final value so that the last entry equals 1.

    The data is used in the order it is stored; no sorting is performed here.

    :param data_dict: Dictionary containing the data.
    :param x_key: Key corresponding to the x-axis data in the dictionary.
    :param y_key: Key corresponding to the y-axis data in the dictionary.
    :return: Tuple of (x_data, normalized_cdf) as numpy arrays.
    :raises ValueError: If the y data is empty, or if its total is zero
        (normalization would otherwise divide by zero and yield NaN/inf).
    """
    # Extract the data using the provided keys
    x_data = np.array(data_dict[x_key])
    y_data = np.array(data_dict[y_key])

    # An empty array has no last element to normalize by; fail with a clear message
    # instead of an IndexError from cdf[-1].
    if y_data.size == 0:
        raise ValueError(f"No data available under key {y_key!r}; cannot compute a CDF.")

    # Compute the cumulative sum (CDF)
    cdf = np.cumsum(y_data)

    # The final cumulative value is the total used for normalization.
    total = cdf[-1]
    if total == 0:
        raise ValueError(f"Values under key {y_key!r} sum to zero; cannot normalize the CDF.")

    # Normalize the CDF so that its last entry is 1
    cdf_normalized = cdf / total

    return x_data, cdf_normalized


def plot_cdf(data_dict, x_key, y_key):
    """
    Draws the normalized cumulative sum of the ``y_key`` data against the
    ``x_key`` data and shows the figure.

    The cumulative fraction is placed on the horizontal axis and the
    ``x_key`` values on the vertical axis.

    :param data_dict: Dictionary containing the data.
    :param x_key: Key of the values drawn on the vertical axis.
    :param y_key: Key of the values that are accumulated and normalized.
    """
    values, fractions = get_normalized_cdf(data_dict, x_key, y_key)

    # Blue solid line with a circle marker at every data point.
    line_style = {'marker': 'o', 'linestyle': '-', 'color': 'b'}

    plt.figure(figsize=(8, 6))
    plt.plot(fractions, values, **line_style)

    # Annotate the figure.
    plt.title(f'Cumulative Distribution Function of {x_key} vs {y_key}')
    plt.xlabel(f'Cumulative Fraction of {y_key}')
    plt.ylabel(f'{x_key}')
    plt.grid(True)

    plt.show()


def fit_symbolic_regression_on_cdf(data_dict, x_key, y_key, iterations=1000):
    """
    Trains a PySRRegressor that predicts the ``x_key`` values from the
    normalized cumulative sum of the ``y_key`` values.

    The normalized cumulative fraction is the single input feature and the
    ``x_key`` data is the regression target. After fitting, pandas display
    options are changed globally so that wide tables are shown without
    truncation.

    :param data_dict: Dictionary containing the data.
    :param x_key: Key of the values used as the regression target.
    :param y_key: Key of the values that are accumulated and normalized.
    :param iterations: Number of iterations for PySRRegressor to run.
    :return: The trained symbolic regression model.
    """
    targets, fractions = get_normalized_cdf(data_dict, x_key, y_key)

    # all operators https://ai.damtp.cam.ac.uk/pysr/operators/
    # Only binary arithmetic operators are enabled; no unary operators are passed
    # (candidates previously tried: sqrt, exp, log2, log, cos, sin, tan, atan).
    arithmetic_operators = ["plus", "sub", "mult", "div", "pow"]
    regressor = PySRRegressor(
        niterations=iterations,
        binary_operators=arithmetic_operators,
    )

    # The regressor is given a 2-D feature matrix: one column holding the fractions.
    feature_matrix = fractions.reshape(-1, 1)
    regressor.fit(feature_matrix, targets)

    # Widen pandas output (global side effect) for better display of model output.
    display_settings = (
        ('display.max_columns', None),
        ('display.expand_frame_repr', False),
        ('max_colwidth', None),
    )
    for option_name, option_value in display_settings:
        pd.set_option(option_name, option_value)

    return regressor


# Star counts by apparent magnitude.
# from https://en.wikipedia.org/wiki/Apparent_magnitude
# The three lists are parallel: index i of each list belongs to the same table row.
# NOTE(review): the source table appears to give the number of stars *brighter than* each
# magnitude (i.e. already cumulative), while get_normalized_cdf() accumulates the counts
# again - confirm which interpretation is intended before using this table.
brightness_data = {
    "apparent_magnitude": [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 7.0, 8.0, 9.0, 10.0],
    "brightness_relative_to_vega": [2.51, 1.0, 0.4, 0.16, 0.063, 0.025, 0.01, 0.004, 0.0025, 0.0016, 0.00063, 0.00025,
                                    0.0001],
    "num_stars": [1, 5, 15, 48, 171, 513, 1602, 4800, 9100, 14000, 42000, 121000, 340000]}

# Star counts by B-V color index, listed for B-V values from -2.4 to 6.4 in steps of 0.1.
# from https://clarkvision.com/articles/color-of-stars/
# The two lists are parallel: "num_stars"[i] is the count for "B-V"[i]
# (presumably a per-bin histogram count - verify against the source).
bv_data = {
    "B-V": [-2.4, -2.3, -2.2, -2.1, -2.0, -1.9, -1.8, -1.7, -1.6, -1.5, -1.4, -1.3, -1.2, -1.1, -1.0, -0.9, -0.8, -0.7,
            -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3,
            1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4,
            3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5,
            5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4],
    "num_stars": [4, 2, 5, 5, 4, 8, 14, 20, 37, 47, 57, 119, 190, 296, 424, 767, 1109, 1808, 3055, 5010, 8158, 13159,
                  23521, 45616, 76357, 105827, 134717, 168902, 205831, 217000, 193206, 156765, 124677, 108664, 109187,
                  115567, 111691, 97171, 80262, 65384, 54335, 45887, 38363, 31775, 22922, 16157, 11864, 8599, 6488,
                  4806, 3568, 2610, 1941, 1460, 1145, 856, 676, 497, 364, 302, 223, 193, 158, 124, 91, 80, 54, 54, 43,
                  27, 24, 16, 5, 6, 3, 6, 2, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2]}

# Remove data points outside of the valid range. Both tables are replaced by their
# filtered versions; the bounds are inclusive.
brightness_data = filter_data(brightness_data, "brightness_relative_to_vega", 0.002, 3.0)  # more visible stars
# NOTE(review): the range kept here (-1.0 to 2.5) is wider than the -0.4 to 2.0 validity
# range quoted in the inline comment - confirm which bounds are intended.
bv_data = filter_data(bv_data, "B-V", -1.0, 2.5)  # Ballesteros formula for Temperature only valid for -0.4 to 2.0

# Plot the data. Only the B-V plot is active; the brightness plot is disabled.
# plot_cdf(brightness_data, "apparent_magnitude", "num_stars")
plot_cdf(bv_data, "B-V", "num_stars")

# Fit the symbolic regression model and print it (both fits are currently disabled).
# `iterations` is only referenced by the disabled B-V fit below.
iterations = 256
# print(fit_symbolic_regression_on_cdf(brightness_data, "apparent_magnitude", "num_stars", iterations=256))
# print(fit_symbolic_regression_on_cdf(bv_data, "B-V", "num_stars", iterations=iterations))