Advertisement
gehtsiegarnixan

Star Data Symbolic Regression

Mar 4th, 2025
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.91 KB | Science | 0 0
  1. import matplotlib.pyplot as plt
  2. import numpy as np
  3. from pysr import PySRRegressor
  4. import pandas as pd
  5.  
  6.  
  7. def filter_data(data, key, min_value, max_value):
  8.     """
  9.    Filters a dictionary by removing entries outside a specified range for a given key.
  10.    All other values in the dictionary are filtered accordingly to maintain consistency.
  11.  
  12.    :param data: Dictionary containing the data.
  13.    :param key: The key in the dictionary used for filtering.
  14.    :param min_value: The minimum allowed value.
  15.    :param max_value: The maximum allowed value.
  16.    :return: A new dictionary with filtered values.
  17.    """
  18.     filtered_data = {k: [] for k in data.keys()}
  19.  
  20.     for i, value in enumerate(data[key]):
  21.         if min_value <= value <= max_value:
  22.             for k in data.keys():
  23.                 filtered_data[k].append(data[k][i])
  24.  
  25.     return filtered_data
  26.  
  27.  
  28. def get_normalized_cdf(data_dict, x_key, y_key):
  29.     """
  30.    Extracts the x and y data from the provided dictionary using the given keys,
  31.    computes the CDF of the y data, and normalizes it to [0, 1].
  32.  
  33.    :param data_dict: Dictionary containing the data.
  34.    :param x_key: Key corresponding to the x-axis data in the dictionary.
  35.    :param y_key: Key corresponding to the y-axis data in the dictionary.
  36.    :return: Tuple of (x_data, normalized_cdf)
  37.    """
  38.     # Extract the data using the provided keys
  39.     x_data = np.array(data_dict[x_key])
  40.     y_data = np.array(data_dict[y_key])
  41.  
  42.     # Compute the cumulative sum (CDF)
  43.     cdf = np.cumsum(y_data)
  44.  
  45.     # Normalize the CDF to [0, 1]
  46.     cdf_normalized = cdf / cdf[-1]
  47.  
  48.     return x_data, cdf_normalized
  49.  
  50.  
  51. def plot_cdf(data_dict, x_key, y_key):
  52.     """
  53.    Plots the cumulative distribution function (CDF) for the provided data.
  54.    The CDF is normalized to [0, 1].
  55.  
  56.    :param data_dict: Dictionary containing the data.
  57.    :param x_key: Key corresponding to the x-axis data in the dictionary.
  58.    :param y_key: Key corresponding to the y-axis data in the dictionary.
  59.    """
  60.     # Get the normalized CDF data
  61.     x_data, cdf_normalized = get_normalized_cdf(data_dict, x_key, y_key)
  62.  
  63.     # Plot the CDF
  64.     plt.figure(figsize=(8, 6))
  65.     plt.plot(cdf_normalized, x_data, marker='o', linestyle='-', color='b')
  66.     plt.title(f'Cumulative Distribution Function of {x_key} vs {y_key}')
  67.     plt.xlabel(f'Cumulative Fraction of {y_key}')
  68.     plt.ylabel(f'{x_key}')
  69.     plt.grid(True)
  70.     plt.show()
  71.  
  72.  
  73. def fit_symbolic_regression_on_cdf(data_dict, x_key, y_key, iterations=1000):
  74.     """
  75.    Fits a symbolic regression model using PySRRegressor on the normalized CDF data.
  76.    The CDF is normalized to [0, 1].
  77.  
  78.    :param data_dict: Dictionary containing the data.
  79.    :param x_key: Key corresponding to the x-axis data in the dictionary.
  80.    :param y_key: Key corresponding to the y-axis data in the dictionary.
  81.    :param iterations: Number of iterations for PySRRegressor to run.
  82.    :return: The trained symbolic regression model.
  83.    """
  84.     # Get the normalized CDF data
  85.     x_data, cdf_normalized = get_normalized_cdf(data_dict, x_key, y_key)
  86.  
  87.     # Create a PySRRegressor object
  88.     # all operators https://ai.damtp.cam.ac.uk/pysr/operators/
  89.     model = PySRRegressor(
  90.         niterations=iterations,
  91.         binary_operators=["plus", "sub", "mult", "div", "pow"],
  92.         # unary_operators=["sqrt", "exp", "log2", "log", "cos", "sin", "tan", "atan"],
  93.     )
  94.  
  95.     # Fit the model
  96.     model.fit(cdf_normalized.reshape(-1, 1), x_data)
  97.  
  98.     # Set pandas options for better display of model output
  99.     pd.set_option('display.max_columns', None)
  100.     pd.set_option('display.expand_frame_repr', False)
  101.     pd.set_option('max_colwidth', None)
  102.  
  103.     return model
  104.  
  105.  
# Star counts per apparent-magnitude entry.
# from https://en.wikipedia.org/wiki/Apparent_magnitude
# The three lists are parallel: position i in each list describes the same entry.
# NOTE(review): confirm whether "num_stars" here is a per-entry count or already a
# cumulative count; get_normalized_cdf() applies a cumulative sum to it.
brightness_data = {
    "apparent_magnitude": [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 7.0, 8.0, 9.0, 10.0],
    "brightness_relative_to_vega": [2.51, 1.0, 0.4, 0.16, 0.063, 0.025, 0.01, 0.004, 0.0025, 0.0016, 0.00063, 0.00025,
                                    0.0001],
    "num_stars": [1, 5, 15, 48, 171, 513, 1602, 4800, 9100, 14000, 42000, 121000, 340000]}

# Star counts per B-V color index value, listed in 0.1 steps from -2.4 to 6.4.
# from https://clarkvision.com/articles/color-of-stars/
# "B-V" and "num_stars" are parallel lists (89 entries each).
bv_data = {
    "B-V": [-2.4, -2.3, -2.2, -2.1, -2.0, -1.9, -1.8, -1.7, -1.6, -1.5, -1.4, -1.3, -1.2, -1.1, -1.0, -0.9, -0.8, -0.7,
            -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3,
            1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4,
            3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5,
            5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4],
    "num_stars": [4, 2, 5, 5, 4, 8, 14, 20, 37, 47, 57, 119, 190, 296, 424, 767, 1109, 1808, 3055, 5010, 8158, 13159,
                  23521, 45616, 76357, 105827, 134717, 168902, 205831, 217000, 193206, 156765, 124677, 108664, 109187,
                  115567, 111691, 97171, 80262, 65384, 54335, 45887, 38363, 31775, 22922, 16157, 11864, 8599, 6488,
                  4806, 3568, 2610, 1941, 1460, 1145, 856, 676, 497, 364, 302, 223, 193, 158, 124, 91, 80, 54, 54, 43,
                  27, 24, 16, 5, 6, 3, 6, 2, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2]}

# remove data points outside of valid range
# Both filters are inclusive at both bounds and drop the matching entries from every column.
brightness_data = filter_data(brightness_data, "brightness_relative_to_vega", 0.002, 3.0)  # more visible stars
# NOTE(review): the range kept here (-1.0 to 2.5) is wider than the validity range
# quoted in the trailing comment (-0.4 to 2.0) - confirm which one is intended.
bv_data = filter_data(bv_data, "B-V", -1.0, 2.5)  # Ballesteros formula for Temperature only valid for -0.4 to 2.0

# plot the data
# Only the B-V plot is active; the brightness plot is disabled.
# plot_cdf(brightness_data, "apparent_magnitude", "num_stars")
plot_cdf(bv_data, "B-V", "num_stars")

# Fit the symbolic regression model (both calls are currently disabled).
# `iterations` is only referenced by the commented-out B-V call below; the
# commented-out brightness call hard-codes 256 instead.
iterations = 256
# print(fit_symbolic_regression_on_cdf(brightness_data, "apparent_magnitude", "num_stars", iterations=256))
# print(fit_symbolic_regression_on_cdf(bv_data, "B-V", "num_stars", iterations=iterations))
  138.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement