Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pysr import PySRRegressor
def filter_data(data, key, min_value, max_value):
    """
    Filter a column-oriented dictionary by a value range on one of its keys.

    Each value of ``data`` is treated as a sequence indexed by row position.
    A row is kept when its entry under ``key`` lies in the closed interval
    ``[min_value, max_value]``; the same row positions are then kept in every
    other column so that all columns stay aligned.

    :param data: Dictionary mapping column names to indexable sequences.
    :param key: The key in the dictionary used for filtering.
    :param min_value: The minimum allowed value (inclusive).
    :param max_value: The maximum allowed value (inclusive).
    :return: A new dictionary with the same keys, each mapped to a list of the
        entries that survived the filter. ``data`` itself is not modified.
    :raises KeyError: If ``key`` is not present in ``data``.
    :raises IndexError: If a column has no entry at one of the kept positions.
    """
    # Row positions whose filter value lies inside the closed range.
    kept_rows = [
        row
        for row, value in enumerate(data[key])
        if min_value <= value <= max_value
    ]
    # Apply the identical row selection to every column.
    return {
        name: [column[row] for row in kept_rows]
        for name, column in data.items()
    }
def get_normalized_cdf(data_dict, x_key, y_key):
    """
    Return the x data together with the normalized cumulative sum of the y data.

    The y values are accumulated in the order they appear in ``data_dict``
    (no sorting is performed) and the running sum is divided by its final
    value, so the last entry of the result is 1.

    :param data_dict: Dictionary containing the data.
    :param x_key: Key corresponding to the x-axis data in the dictionary.
    :param y_key: Key corresponding to the y-axis data in the dictionary.
    :return: Tuple of (x_data, normalized_cdf), both NumPy arrays.
    :raises KeyError: If ``x_key`` or ``y_key`` is not present in ``data_dict``.
    :raises ValueError: If the y data is empty or sums to zero, because the
        cumulative sum cannot be normalized in either case.
    """
    # Extract the data using the provided keys
    x_data = np.array(data_dict[x_key])
    y_data = np.array(data_dict[y_key])

    # Without this guard an empty column fails later with an opaque IndexError.
    if y_data.size == 0:
        raise ValueError(f"no data under key {y_key!r}; cannot compute a CDF")

    # Compute the cumulative sum (CDF)
    cdf = np.cumsum(y_data)

    # A zero total would turn the normalization into 0/0 and yield NaN values.
    total = cdf[-1]
    if total == 0:
        raise ValueError(f"values under key {y_key!r} sum to zero; cannot normalize the CDF")

    # Normalize the CDF so that its last value is 1
    cdf_normalized = cdf / total
    return x_data, cdf_normalized
def plot_cdf(data_dict, x_key, y_key):
    """
    Display the normalized cumulative sum of one column against another.

    The cumulative fraction of the ``y_key`` data (as returned by
    ``get_normalized_cdf``) is drawn on the horizontal axis and the ``x_key``
    values on the vertical axis, then the figure is shown with ``plt.show()``.

    :param data_dict: Dictionary containing the data.
    :param x_key: Key corresponding to the x-axis data in the dictionary.
    :param y_key: Key corresponding to the y-axis data in the dictionary.
    """
    values, cumulative_fraction = get_normalized_cdf(data_dict, x_key, y_key)

    # One 8x6 inch figure with a single set of axes.
    _, axes = plt.subplots(figsize=(8, 6))

    # Fraction on the horizontal axis, raw x values on the vertical axis.
    axes.plot(cumulative_fraction, values, color='b', linestyle='-', marker='o')
    axes.set_title(f'Cumulative Distribution Function of {x_key} vs {y_key}')
    axes.set_xlabel(f'Cumulative Fraction of {y_key}')
    axes.set_ylabel(f'{x_key}')
    axes.grid(True)

    plt.show()
def fit_symbolic_regression_on_cdf(data_dict, x_key, y_key, iterations=1000):
    """
    Fit a PySRRegressor model on the normalized CDF data.

    The normalized cumulative fraction of the ``y_key`` data is used as the
    single input feature and the ``x_key`` data as the regression target, i.e.
    the model expresses x as a function of the cumulative fraction.

    Side effect: three pandas display options (``display.max_columns``,
    ``display.expand_frame_repr``, ``display.max_colwidth``) are changed
    globally so that the model output is not truncated when printed.

    :param data_dict: Dictionary containing the data.
    :param x_key: Key corresponding to the x-axis data in the dictionary.
    :param y_key: Key corresponding to the y-axis data in the dictionary.
    :param iterations: Number of iterations for PySRRegressor to run
        (passed as ``niterations``).
    :return: The trained symbolic regression model.
    """
    # Get the normalized CDF data
    x_data, cdf_normalized = get_normalized_cdf(data_dict, x_key, y_key)

    # Create a PySRRegressor object
    # all operators https://ai.damtp.cam.ac.uk/pysr/operators/
    model = PySRRegressor(
        niterations=iterations,
        binary_operators=["plus", "sub", "mult", "div", "pow"],
        # Unary operators are left disabled; candidates to enable:
        # unary_operators=["sqrt", "exp", "log2", "log", "cos", "sin", "tan", "atan"],
    )

    # Fit the model: the CDF is reshaped into a single-column feature matrix.
    model.fit(cdf_normalized.reshape(-1, 1), x_data)

    # Set pandas options for better display of model output.
    # Full option names are used: pandas only resolves abbreviated names by
    # pattern matching, which may break if similarly named options are added.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.max_colwidth', None)
    return model
# Table values taken from https://en.wikipedia.org/wiki/Apparent_magnitude
brightness_data = {
    "apparent_magnitude": [
        -1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 7.0, 8.0, 9.0, 10.0,
    ],
    "brightness_relative_to_vega": [
        2.51, 1.0, 0.4, 0.16, 0.063, 0.025, 0.01, 0.004, 0.0025, 0.0016,
        0.00063, 0.00025, 0.0001,
    ],
    "num_stars": [
        1, 5, 15, 48, 171, 513, 1602, 4800, 9100, 14000, 42000, 121000, 340000,
    ],
}

# Table values taken from https://clarkvision.com/articles/color-of-stars/
bv_data = {
    # 89 bins from -2.4 to 6.4 in steps of 0.1. Dividing an integer by 10 is
    # correctly rounded, so every entry equals the matching decimal literal.
    "B-V": [tenths / 10 for tenths in range(-24, 65)],
    "num_stars": [
        4, 2, 5, 5, 4, 8, 14, 20, 37, 47,
        57, 119, 190, 296, 424, 767, 1109, 1808, 3055, 5010,
        8158, 13159, 23521, 45616, 76357, 105827, 134717, 168902, 205831, 217000,
        193206, 156765, 124677, 108664, 109187, 115567, 111691, 97171, 80262, 65384,
        54335, 45887, 38363, 31775, 22922, 16157, 11864, 8599, 6488, 4806,
        3568, 2610, 1941, 1460, 1145, 856, 676, 497, 364, 302,
        223, 193, 158, 124, 91, 80, 54, 54, 43, 27,
        24, 16, 5, 6, 3, 6, 2, 2, 1, 0,
        0, 1, 0, 1, 0, 0, 0, 0, 2,
    ],
}

# Drop the rows that fall outside the range of interest.
# Brightness: keep the more visible stars only.
brightness_data = filter_data(
    data=brightness_data,
    key="brightness_relative_to_vega",
    min_value=0.002,
    max_value=3.0,
)
# B-V: the Ballesteros formula for temperature is only valid for -0.4 to 2.0;
# the range kept here is somewhat wider than that.
bv_data = filter_data(data=bv_data, key="B-V", min_value=-1.0, max_value=2.5)

# Plot the data (the brightness plot is currently disabled).
# plot_cdf(brightness_data, "apparent_magnitude", "num_stars")
plot_cdf(data_dict=bv_data, x_key="B-V", y_key="num_stars")

# Symbolic regression on the CDF (currently disabled; uncomment to run).
iterations = 256
# print(fit_symbolic_regression_on_cdf(brightness_data, "apparent_magnitude", "num_stars", iterations=256))
# print(fit_symbolic_regression_on_cdf(bv_data, "B-V", "num_stars", iterations=iterations))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement