using System;
using System.Collections.Generic; // List<T> is used by SnapshotEnsembleTrain below
using System.Linq;

// SELU, GeM, and AELU are referenced by later methods, so they are members of the enum.
public enum ActivationFunction
{
    ReLU,
    Sigmoid,
    Tanh,
    LeakyReLU,
    Swish,
    Mish,
    GELU,
    SELU,
    GeM,
    AELU
}

public enum Regularizer
{
    None,
    L1,
    L2
}
public class GithubNeuralNetwork
{
    private int[] _layers;
    private Matrix[] _weights;
    private Matrix[] _biases;
    private Func<Matrix, Matrix>[] _activationFunctions;
    private double _learningRate;
    private double _epsilon;
    private Matrix[] _gamma;
    private Matrix[] _beta;
    private double _initialLearningRate;
    private double _decayRate;
    private string _optimizer;
    private Matrix[] _movingMeans;
    private Matrix[] _movingVariances;
    private Matrix[] _mWeights;
    private Matrix[] _vWeights;
    private Matrix[] _mBiases;
    private Matrix[] _vBiases;
    private Matrix[] _mGamma;
    private Matrix[] _vGamma;
    private Matrix[] _mBeta;
    private Matrix[] _vBeta;
    private Matrix[] _slowWeights;
    private Matrix[] _slowBiases;
    private double _lookaheadAlpha;
    private double _lookaheadBeta;
    private int _t;
    private double _dropoutRate;
    private Matrix[] _dropoutMasks;
    private ActivationFunction[] _activationOptions;
    private Regularizer _regularizer;
    private double _lambda;
    private double _dropblockKeepProb;
    private int _dropblockSize;
    private double _maxLearningRate;
    private double _baseLearningRate;
    private int _stepSize;
    private int _cycle;
    private int _iterations;
    // Moment buffers and decay rates used by RAdamOptimizer below; these were
    // referenced but never declared in the original paste.
    private Matrix[] _sWeights;
    private Matrix[] _rWeights;
    private Matrix[] _sBiases;
    private Matrix[] _rBiases;
    private double _beta1 = 0.9;
    private double _beta2 = 0.999;
    // A `params` array must come last and optional parameters must follow required
    // ones, so `layers` is taken as a plain array here. The DropBlock and
    // cyclical-learning-rate settings are real parameters with defaults; `epochs`
    // and `batchSize` belong to Train, not the constructor.
    public GithubNeuralNetwork(
        int[] layers,
        double learningRate,
        double epsilon,
        string optimizer,
        double decayRate,
        double dropoutRate,
        Regularizer regularizer,
        double lambda,
        double dropblockKeepProb = 1.0,
        int dropblockSize = 0,
        double baseLearningRate = 0.001,
        double maxLearningRate = 0.01,
        int stepSize = 5,
        double lookaheadAlpha = 0.5,
        double lookaheadBeta = 0.9,
        bool useGroupNormalization = false,
        int numGroups = 1)
    {
        _layers = layers;
        _weights = new Matrix[layers.Length - 1];
        _biases = new Matrix[layers.Length - 1];
        _activationFunctions = new Func<Matrix, Matrix>[layers.Length - 1];
        _learningRate = learningRate;
        _epsilon = epsilon;
        _gamma = new Matrix[layers.Length - 1];
        _beta = new Matrix[layers.Length - 1];
        _initialLearningRate = learningRate;
        _decayRate = decayRate;
        _optimizer = optimizer;
        _movingMeans = new Matrix[layers.Length - 1];
        _movingVariances = new Matrix[layers.Length - 1];
        _mWeights = new Matrix[layers.Length - 1];
        _vWeights = new Matrix[layers.Length - 1];
        _mBiases = new Matrix[layers.Length - 1];
        _vBiases = new Matrix[layers.Length - 1];
        _mGamma = new Matrix[layers.Length - 1];
        _vGamma = new Matrix[layers.Length - 1];
        _mBeta = new Matrix[layers.Length - 1];
        _vBeta = new Matrix[layers.Length - 1];
        _slowWeights = new Matrix[_weights.Length];
        _slowBiases = new Matrix[_biases.Length];
        _lookaheadAlpha = lookaheadAlpha;
        _lookaheadBeta = lookaheadBeta;
        _t = 1;
        _dropoutRate = dropoutRate;
        _dropoutMasks = new Matrix[layers.Length - 1];
        _activationOptions = new ActivationFunction[layers.Length - 1];
        _regularizer = regularizer;
        _lambda = lambda;
        _dropblockKeepProb = dropblockKeepProb;
        _dropblockSize = dropblockSize;
        _maxLearningRate = maxLearningRate;
        _baseLearningRate = baseLearningRate;
        _stepSize = stepSize;
        _cycle = 0;
        _iterations = 0;
        if (useGroupNormalization)
        {
            _groupNormalization = new GroupNormalization(numGroups);
        }
        // Activations must be chosen before the weights, because
        // InitializeWeightsAndBiases picks He vs. Xavier per activation.
        SetActivationFunctions();
        InitializeWeightsAndBiases();
        InitializeSlowWeightsAndBiases();
        InitializeRAdamParameters();
    }
    private void InitializeSlowWeightsAndBiases()
    {
        for (int i = 0; i < _weights.Length; i++)
        {
            _slowWeights[i] = _weights[i].Copy();
            _slowBiases[i] = _biases[i].Copy();
        }
    }
    private Matrix ResidualBlock(Matrix input, int layerIndex)
    {
        // NOTE: both passes through the block reuse the same layer's weights, so the
        // second multiplication only type-checks when that weight matrix is square.
        Matrix residual = input;
        Matrix outputs = input;
        int numLayersInBlock = 2;
        for (int i = 0; i < numLayersInBlock; i++)
        {
            Matrix layerOutput = outputs * _weights[layerIndex] + _biases[layerIndex];
            layerOutput = _activationFunctions[layerIndex](layerOutput);
            outputs = layerOutput;
        }
        // Identity shortcut: only added when the shapes still line up.
        if (outputs.RowCount == residual.RowCount && outputs.ColumnCount == residual.ColumnCount)
        {
            outputs += residual;
        }
        return outputs;
    }
    // Blends the fast and slow weights on every step. This is a loose variant:
    // the published Lookahead keeps the fast weights on their own trajectory and
    // only syncs every k steps (see the sketch after this method).
    private void LookaheadOptimizer(Matrix[] gradientsWeights, Matrix[] gradientsBiases)
    {
        for (int i = 0; i < _weights.Length; i++)
        {
            _slowWeights[i] = (_lookaheadAlpha * _slowWeights[i]) + ((1 - _lookaheadAlpha) * _weights[i]);
            _slowBiases[i] = (_lookaheadAlpha * _slowBiases[i]) + ((1 - _lookaheadAlpha) * _biases[i]);
            _weights[i] -= _learningRate * (_lookaheadBeta * gradientsWeights[i] + (1 - _lookaheadBeta) * _slowWeights[i]);
            _biases[i] -= _learningRate * (_lookaheadBeta * gradientsBiases[i] + (1 - _lookaheadBeta) * _slowBiases[i]);
        }
    }
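    // For comparison, a minimal sketch of the schedule from the Lookahead paper
    // (Zhang et al., 2019): the fast weights take k ordinary optimizer steps, then
    // the slow weights are pulled toward them and the fast weights restart from the
    // slow ones. `k` is a hypothetical sync interval, not a field of this class.
    private void LookaheadSync(int k)
    {
        if (_t % k != 0) return; // only sync every k inner steps
        for (int i = 0; i < _weights.Length; i++)
        {
            // slow += alpha * (fast - slow), then restart fast from slow
            _slowWeights[i] += _lookaheadAlpha * (_weights[i] - _slowWeights[i]);
            _slowBiases[i] += _lookaheadAlpha * (_biases[i] - _slowBiases[i]);
            _weights[i] = _slowWeights[i].Copy();
            _biases[i] = _slowBiases[i].Copy();
        }
    }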
    private void InitializeWeightsAndBiases()
    {
        Random rand = new Random();
        for (int i = 0; i < _weights.Length; i++)
        {
            // He initialization suits ReLU-family activations; otherwise fall back
            // to Xavier-style variance scaling.
            if (_activationOptions[i] == ActivationFunction.ReLU || _activationOptions[i] == ActivationFunction.LeakyReLU)
            {
                _weights[i] = HeInitialization(_layers[i + 1], _layers[i], rand);
            }
            else
            {
                _weights[i] = XavierInitialization(_layers[i + 1], _layers[i], rand);
            }
            _biases[i] = Matrix.Zeros(_layers[i + 1], 1);
            _gamma[i] = Matrix.Ones(_layers[i + 1], 1);
            _beta[i] = Matrix.Zeros(_layers[i + 1], 1);
            _movingMeans[i] = Matrix.Zeros(_layers[i + 1], 1);
            _movingVariances[i] = Matrix.Ones(_layers[i + 1], 1);
            _mWeights[i] = Matrix.Zeros(_weights[i].RowCount, _weights[i].ColumnCount);
            _vWeights[i] = Matrix.Zeros(_weights[i].RowCount, _weights[i].ColumnCount);
            _mBiases[i] = Matrix.Zeros(_biases[i].RowCount, _biases[i].ColumnCount);
            _vBiases[i] = Matrix.Zeros(_biases[i].RowCount, _biases[i].ColumnCount);
            _mGamma[i] = Matrix.Zeros(_gamma[i].RowCount, _gamma[i].ColumnCount);
            _vGamma[i] = Matrix.Zeros(_gamma[i].RowCount, _gamma[i].ColumnCount);
            _mBeta[i] = Matrix.Zeros(_beta[i].RowCount, _beta[i].ColumnCount);
            _vBeta[i] = Matrix.Zeros(_beta[i].RowCount, _beta[i].ColumnCount);
        }
    }
    private Matrix Swish(Matrix x)
    {
        // x * sigmoid(x), element-wise (a plain `*` would be a matrix product here).
        return x.PointwiseMultiply(MatrixFunctions.Sigmoid(x));
    }
    private Matrix Mish(Matrix x)
    {
        return x.PointwiseMultiply(MatrixFunctions.Tanh(MatrixFunctions.Softplus(x)));
    }
    private Matrix GELU(Matrix x)
    {
        // Tanh approximation of GELU, applied element-wise per entry; mixing scalar
        // Math.Pow with a whole Matrix operand does not compile.
        return x.Map(v => 0.5 * v * (1 + Math.Tanh(Math.Sqrt(2.0 / Math.PI) * (v + 0.044715 * Math.Pow(v, 3)))));
    }
    private Matrix XavierInitialization(int rows, int cols, Random rand)
    {
        double scale = Math.Sqrt(2.0 / (rows + cols));
        return Matrix.RandomMatrix(rows, cols, rand) * scale;
    }
    private Matrix LayerNormalization(Matrix x, Matrix gamma, Matrix beta, int layerIndex)
    {
        Matrix mean = MatrixFunctions.Mean(x, axis: 1);
        Matrix variance = MatrixFunctions.Variance(x, axis: 1);
        _movingMeans[layerIndex] = (_movingMeans[layerIndex] * 0.9) + (mean * 0.1);
        _movingVariances[layerIndex] = (_movingVariances[layerIndex] * 0.9) + (variance * 0.1);
        Matrix normalized = (x - mean) / MatrixFunctions.Sqrt(variance + _epsilon);
        // Scale and shift are element-wise; broadcasting of the gamma/beta column
        // vectors across the batch is assumed from the Matrix helpers.
        return gamma.PointwiseMultiply(normalized) + beta;
    }
    private Matrix FeedForward(Matrix input, bool training)
    {
        Matrix outputs = input;
        for (int i = 0; i < _weights.Length; i++)
        {
            if (training && _dropoutRate > 0.0)
            {
                // Inverted dropout: zero each unit with probability _dropoutRate,
                // then rescale so the expected activation is unchanged at test time.
                _dropoutMasks[i] = Matrix.RandomMatrix(outputs.RowCount, outputs.ColumnCount);
                _dropoutMasks[i] = _dropoutMasks[i].Map(x => x < _dropoutRate ? 0 : 1);
                outputs = outputs.PointwiseMultiply(_dropoutMasks[i]);
                outputs *= 1.0 / (1.0 - _dropoutRate);
            }
            outputs = outputs * _weights[i] + _biases[i];
            // ApplyActivationFunction covers every activation SetActivationFunctions
            // can choose; the original switch silently mapped most of them to ReLU.
            outputs = ApplyActivationFunction(outputs, _activationOptions[i]);
        }
        return outputs;
    }
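    // Sanity check for the inverted-dropout scaling above: with keep probability
    // (1 - rate), E[mask * x / (1 - rate)] == x, so activations keep the same
    // expected magnitude at train and test time. A self-contained simulation:
    private static double DropoutExpectationDemo(double rate = 0.3, int trials = 100000)
    {
        var rand = new Random(0);
        double sum = 0.0;
        for (int n = 0; n < trials; n++)
        {
            double mask = rand.NextDouble() < rate ? 0.0 : 1.0;
            sum += mask * 1.0 / (1.0 - rate); // input value fixed at 1.0
        }
        return sum / trials; // converges to ~1.0
    }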
    private void Backpropagation(Matrix input, Matrix target)
    {
        Matrix[] outputs = new Matrix[_weights.Length + 1];
        outputs[0] = input;
        for (int i = 0; i < _weights.Length; i++)
        {
            outputs[i + 1] = outputs[i] * _weights[i] + _biases[i];
            outputs[i + 1] = _activationFunctions[i](outputs[i + 1]);
        }
        Matrix[] errors = new Matrix[_weights.Length];
        errors[_weights.Length - 1] = outputs[^1] - target;
        for (int i = _weights.Length - 2; i >= 0; i--)
        {
            errors[i] = (_weights[i + 1].Transpose() * errors[i + 1]).MapDerivative(_activationFunctions[i]);
        }
        Matrix[] gradientsWeights = new Matrix[_weights.Length];
        Matrix[] gradientsBiases = new Matrix[_weights.Length];
        Matrix[] gradientsGamma = new Matrix[_weights.Length];
        Matrix[] gradientsBeta = new Matrix[_weights.Length];
        for (int i = 0; i < _weights.Length; i++)
        {
            gradientsWeights[i] = errors[i] * outputs[i].Transpose();
            gradientsBiases[i] = errors[i];
            gradientsGamma[i] = errors[i] * _movingMeans[i];
            gradientsBeta[i] = errors[i] * _movingVariances[i];
        }
        // Clip gradients to [-1, 1] before the optimizer step.
        ClipGradients(gradientsWeights);
        ClipGradients(gradientsBiases);
        ClipGradients(gradientsGamma);
        ClipGradients(gradientsBeta);
        Optimizer(gradientsWeights, gradientsBiases, gradientsGamma, gradientsBeta);
        // Weight-decay regularization
        if (_regularizer != Regularizer.None)
        {
            for (int i = 0; i < _weights.Length; i++)
            {
                if (_regularizer == Regularizer.L1)
                {
                    _weights[i] -= _lambda * MatrixFunctions.Sign(_weights[i]);
                }
                else if (_regularizer == Regularizer.L2)
                {
                    _weights[i] -= _lambda * _weights[i];
                }
            }
        }
    }
    public void Train(Matrix[] inputs, Matrix[] targets, int epochs, int batchSize)
    {
        for (int epoch = 0; epoch < epochs; epoch++)
        {
            for (int i = 0; i < inputs.Length; i += batchSize)
            {
                Matrix[] batchInputs = inputs.Skip(i).Take(batchSize).ToArray();
                Matrix[] batchTargets = targets.Skip(i).Take(batchSize).ToArray();
                // Iterate over the actual batch length: the last batch may be short,
                // so `j < batchSize` would overrun it.
                for (int j = 0; j < batchInputs.Length; j++)
                {
                    FeedForward(batchInputs[j], true);
                    Backpropagation(batchInputs[j], batchTargets[j]);
                }
            }
            LearningRateScheduler(epoch);
        }
    }
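    // Minimal end-to-end usage sketch. The argument values are illustrative only,
    // and trainInputs/trainTargets/testInput stand in for real Matrix data:
    //
    //   var net = new GithubNeuralNetwork(
    //       new[] { 4, 16, 3 },    // layer sizes
    //       0.01,                  // learningRate
    //       1e-8,                  // epsilon
    //       "Adam",                // optimizer
    //       0.01,                  // decayRate
    //       0.2,                   // dropoutRate
    //       Regularizer.L2,        // regularizer
    //       1e-4);                 // lambda
    //   net.Train(trainInputs, trainTargets, epochs: 10, batchSize: 32);
    //   Matrix prediction = net.Predict(testInput);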
    public Matrix Predict(Matrix input)
    {
        return FeedForward(input, false);
    }
    // Inverse-time decay; Train and TrainWithPruning call this once per epoch.
    private void LearningRateScheduler(int epoch)
    {
        _learningRate = _initialLearningRate / (1 + _decayRate * epoch);
    }
    // Applies one parameter update per call, using the optimizer selected at
    // construction. AdaBound-style bounded steps live in AdvancedOptimizer below.
    private void Optimizer(Matrix[] gradientsWeights, Matrix[] gradientsBiases, Matrix[] gradientsGamma, Matrix[] gradientsBeta)
    {
        double beta1 = 0.9;    // Adam's hyperparameter (momentum decay)
        double beta2 = 0.999;  // Adam's hyperparameter (RMSprop decay)
        double epsilon = 1e-8; // small constant to prevent division by zero
        _t++;
        for (int i = 0; i < _weights.Length; i++)
        {
            if (_optimizer == "Adam")
            {
                _mWeights[i] = (beta1 * _mWeights[i]) + ((1 - beta1) * gradientsWeights[i]);
                _vWeights[i] = (beta2 * _vWeights[i]) + ((1 - beta2) * (gradientsWeights[i] * gradientsWeights[i]));
                _mBiases[i] = (beta1 * _mBiases[i]) + ((1 - beta1) * gradientsBiases[i]);
                _vBiases[i] = (beta2 * _vBiases[i]) + ((1 - beta2) * (gradientsBiases[i] * gradientsBiases[i]));
                _mGamma[i] = (beta1 * _mGamma[i]) + ((1 - beta1) * gradientsGamma[i]);
                _vGamma[i] = (beta2 * _vGamma[i]) + ((1 - beta2) * (gradientsGamma[i] * gradientsGamma[i]));
                _mBeta[i] = (beta1 * _mBeta[i]) + ((1 - beta1) * gradientsBeta[i]);
                _vBeta[i] = (beta2 * _vBeta[i]) + ((1 - beta2) * (gradientsBeta[i] * gradientsBeta[i]));
                // Bias-corrected first and second moments.
                Matrix mHatWeights = _mWeights[i] / (1 - Math.Pow(beta1, _t));
                Matrix vHatWeights = _vWeights[i] / (1 - Math.Pow(beta2, _t));
                Matrix mHatBiases = _mBiases[i] / (1 - Math.Pow(beta1, _t));
                Matrix vHatBiases = _vBiases[i] / (1 - Math.Pow(beta2, _t));
                Matrix mHatGamma = _mGamma[i] / (1 - Math.Pow(beta1, _t));
                Matrix vHatGamma = _vGamma[i] / (1 - Math.Pow(beta2, _t));
                Matrix mHatBeta = _mBeta[i] / (1 - Math.Pow(beta1, _t));
                Matrix vHatBeta = _vBeta[i] / (1 - Math.Pow(beta2, _t));
                _weights[i] -= (_learningRate * mHatWeights) / (MatrixFunctions.Sqrt(vHatWeights) + epsilon);
                _biases[i] -= (_learningRate * mHatBiases) / (MatrixFunctions.Sqrt(vHatBiases) + epsilon);
                _gamma[i] -= (_learningRate * mHatGamma) / (MatrixFunctions.Sqrt(vHatGamma) + epsilon);
                _beta[i] -= (_learningRate * mHatBeta) / (MatrixFunctions.Sqrt(vHatBeta) + epsilon);
            }
            else if (_optimizer == "AdaGrad")
            {
                _vWeights[i] += gradientsWeights[i] * gradientsWeights[i];
                _vBiases[i] += gradientsBiases[i] * gradientsBiases[i];
                _vGamma[i] += gradientsGamma[i] * gradientsGamma[i];
                _vBeta[i] += gradientsBeta[i] * gradientsBeta[i];
                _weights[i] -= (_learningRate / (MatrixFunctions.Sqrt(_vWeights[i]) + epsilon)) * gradientsWeights[i];
                _biases[i] -= (_learningRate / (MatrixFunctions.Sqrt(_vBiases[i]) + epsilon)) * gradientsBiases[i];
                _gamma[i] -= (_learningRate / (MatrixFunctions.Sqrt(_vGamma[i]) + epsilon)) * gradientsGamma[i];
                _beta[i] -= (_learningRate / (MatrixFunctions.Sqrt(_vBeta[i]) + epsilon)) * gradientsBeta[i];
            }
            else if (_optimizer == "RMSProp")
            {
                _vWeights[i] = (beta1 * _vWeights[i]) + ((1 - beta1) * (gradientsWeights[i] * gradientsWeights[i]));
                _vBiases[i] = (beta1 * _vBiases[i]) + ((1 - beta1) * (gradientsBiases[i] * gradientsBiases[i]));
                _vGamma[i] = (beta1 * _vGamma[i]) + ((1 - beta1) * (gradientsGamma[i] * gradientsGamma[i]));
                _vBeta[i] = (beta1 * _vBeta[i]) + ((1 - beta1) * (gradientsBeta[i] * gradientsBeta[i]));
                _weights[i] -= (_learningRate / (MatrixFunctions.Sqrt(_vWeights[i]) + epsilon)) * gradientsWeights[i];
                _biases[i] -= (_learningRate / (MatrixFunctions.Sqrt(_vBiases[i]) + epsilon)) * gradientsBiases[i];
                _gamma[i] -= (_learningRate / (MatrixFunctions.Sqrt(_vGamma[i]) + epsilon)) * gradientsGamma[i];
                _beta[i] -= (_learningRate / (MatrixFunctions.Sqrt(_vBeta[i]) + epsilon)) * gradientsBeta[i];
            }
            else if (_optimizer == "Lookahead")
            {
                // Updates every layer internally, so run it once and stop.
                LookaheadOptimizer(gradientsWeights, gradientsBiases);
                break;
            }
            else // plain SGD
            {
                _weights[i] -= _learningRate * gradientsWeights[i];
                _biases[i] -= _learningRate * gradientsBiases[i];
                _gamma[i] -= _learningRate * gradientsGamma[i];
                _beta[i] -= _learningRate * gradientsBeta[i];
            }
        }
    }
    // DevBuild 0.01
    // AdaBound clamps the Adam-style step size between bounds that converge
    // toward final_lr as t grows.
    private double AdaBound(double lr, double final_lr, double beta1, double beta2, double epsilon, int t)
    {
        double step_size = lr * Math.Sqrt(1 - Math.Pow(beta2, t)) / (1 - Math.Pow(beta1, t));
        double lower_bound = final_lr * (1.0 - 1.0 / (t + 1));
        double upper_bound = final_lr * (1.0 + 1.0 / (t + 1));
        return Math.Max(lower_bound, Math.Min(upper_bound, step_size));
    }
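    // Illustration of the AdaBound clamp above: early on, the Adam-style step size
    // is volatile while the bounds tighten toward final_lr as t grows. The values
    // are illustrative; self-contained (scalar math only):
    private static void AdaBoundBoundsDemo()
    {
        double lr = 0.001, final_lr = 0.1, beta1 = 0.9, beta2 = 0.999;
        for (int t = 1; t <= 5; t++)
        {
            double step = lr * Math.Sqrt(1 - Math.Pow(beta2, t)) / (1 - Math.Pow(beta1, t));
            double lo = final_lr * (1.0 - 1.0 / (t + 1));
            double hi = final_lr * (1.0 + 1.0 / (t + 1));
            double clamped = Math.Max(lo, Math.Min(hi, step));
            Console.WriteLine($"t={t}: step={step:F5} clamped to [{lo:F5}, {hi:F5}] -> {clamped:F5}");
        }
    }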
    private void AdvancedOptimizer(Matrix[] gradientsWeights, Matrix[] gradientsBiases, Matrix[] gradientsGamma, Matrix[] gradientsBeta, int epochs, Matrix[] inputs, Matrix[] targets, int batchSize)
    {
        double final_lr = 0.1;  // final learning rate for AdaBound
        double beta1 = 0.9;     // Adam's hyperparameter (momentum decay)
        double beta2 = 0.999;   // Adam's hyperparameter (RMSprop decay)
        double epsilon = 1e-8;  // small constant to prevent division by zero
        int t = 1;
        for (int epoch = 0; epoch < epochs; epoch++)
        {
            for (int i = 0; i < inputs.Length; i += batchSize)
            {
                Matrix[] batchInputs = inputs.Skip(i).Take(batchSize).ToArray();
                Matrix[] batchTargets = targets.Skip(i).Take(batchSize).ToArray();
                t++;
                // Nadam-style schedule multiplier.
                double schedule = 0.004 * (1 - Math.Pow(0.999, t)) / (1 - Math.Pow(0.9, t));
                for (int j = 0; j < batchInputs.Length; j++)
                {
                    // CAVEAT: Backpropagation already applies Optimizer internally, and
                    // the gradient arrays passed into this method are reused across
                    // batches, so as written this routine double-updates the parameters.
                    FeedForward(batchInputs[j], true);
                    Backpropagation(batchInputs[j], batchTargets[j]);
                    for (int k = 0; k < _weights.Length; k++)
                    {
                        _mWeights[k] = (beta1 * _mWeights[k]) + ((1 - beta1) * gradientsWeights[k]);
                        _vWeights[k] = (beta2 * _vWeights[k]) + ((1 - beta2) * (gradientsWeights[k] * gradientsWeights[k]));
                        _mBiases[k] = (beta1 * _mBiases[k]) + ((1 - beta1) * gradientsBiases[k]);
                        _vBiases[k] = (beta2 * _vBiases[k]) + ((1 - beta2) * (gradientsBiases[k] * gradientsBiases[k]));
                        _mGamma[k] = (beta1 * _mGamma[k]) + ((1 - beta1) * gradientsGamma[k]);
                        _vGamma[k] = (beta2 * _vGamma[k]) + ((1 - beta2) * (gradientsGamma[k] * gradientsGamma[k]));
                        _mBeta[k] = (beta1 * _mBeta[k]) + ((1 - beta1) * gradientsBeta[k]);
                        _vBeta[k] = (beta2 * _vBeta[k]) + ((1 - beta2) * (gradientsBeta[k] * gradientsBeta[k]));
                        Matrix mHatWeights = _mWeights[k] / (1 - Math.Pow(beta1, t));
                        Matrix vHatWeights = _vWeights[k] / (1 - Math.Pow(beta2, t));
                        Matrix mHatBiases = _mBiases[k] / (1 - Math.Pow(beta1, t));
                        Matrix vHatBiases = _vBiases[k] / (1 - Math.Pow(beta2, t));
                        Matrix mHatGamma = _mGamma[k] / (1 - Math.Pow(beta1, t));
                        Matrix vHatGamma = _vGamma[k] / (1 - Math.Pow(beta2, t));
                        Matrix mHatBeta = _mBeta[k] / (1 - Math.Pow(beta1, t));
                        Matrix vHatBeta = _vBeta[k] / (1 - Math.Pow(beta2, t));
                        double step_size = AdaBound(_learningRate, final_lr, beta1, beta2, epsilon, t);
                        _weights[k] -= (step_size * schedule * mHatWeights) / (MatrixFunctions.Sqrt(vHatWeights) + epsilon);
                        _biases[k] -= (step_size * schedule * mHatBiases) / (MatrixFunctions.Sqrt(vHatBiases) + epsilon);
                        _gamma[k] -= (step_size * schedule * mHatGamma) / (MatrixFunctions.Sqrt(vHatGamma) + epsilon);
                        _beta[k] -= (step_size * schedule * mHatBeta) / (MatrixFunctions.Sqrt(vHatBeta) + epsilon);
                    }
                }
            }
        }
    }
    private Matrix GeM(Matrix x)
    {
        // Generalized-mean activation; assumed to be provided by the MatrixFunctions helpers.
        return MatrixFunctions.GeM(x);
    }
    private Matrix AELU(Matrix x)
    {
        // ELU-shaped curve with a small negative-side coefficient.
        double alpha = 0.01;
        return x.Map(val => val > 0 ? val : alpha * (Math.Exp(val) - 1));
    }
    private void SetActivationFunctions()
    {
        Random rand = new Random();
        for (int i = 0; i < _activationOptions.Length; i++)
        {
            int choice = rand.Next(10); // one case per activation in the enum
            switch (choice)
            {
                case 0:
                    _activationOptions[i] = ActivationFunction.ReLU;
                    _activationFunctions[i] = MatrixFunctions.ReLU;
                    break;
                case 1:
                    _activationOptions[i] = ActivationFunction.Sigmoid;
                    _activationFunctions[i] = MatrixFunctions.Sigmoid;
                    break;
                case 2:
                    _activationOptions[i] = ActivationFunction.Tanh;
                    _activationFunctions[i] = MatrixFunctions.Tanh;
                    break;
                case 3:
                    _activationOptions[i] = ActivationFunction.LeakyReLU;
                    _activationFunctions[i] = MatrixFunctions.LeakyReLU;
                    break;
                case 4:
                    _activationOptions[i] = ActivationFunction.Swish;
                    _activationFunctions[i] = Swish;
                    break;
                case 5:
                    _activationOptions[i] = ActivationFunction.Mish;
                    _activationFunctions[i] = Mish;
                    break;
                case 6:
                    _activationOptions[i] = ActivationFunction.GELU;
                    _activationFunctions[i] = GELU;
                    break;
                case 7:
                    _activationOptions[i] = ActivationFunction.GeM;
                    _activationFunctions[i] = GeM;
                    break;
                case 8:
                    _activationOptions[i] = ActivationFunction.AELU;
                    _activationFunctions[i] = AELU;
                    break;
                case 9:
                    _activationOptions[i] = ActivationFunction.SELU;
                    _activationFunctions[i] = SELU;
                    break;
                default:
                    _activationOptions[i] = ActivationFunction.ReLU; // default to ReLU
                    _activationFunctions[i] = MatrixFunctions.ReLU;
                    break;
            }
        }
    }
    public void TrainWithDynamicRegularization(Matrix[] inputs, Matrix[] targets, int epochs, int batchSize, double mixUpAlpha = 0.1, int cutMixBatchSize = 32, double cutMixAlpha = 0.3)
    {
        Random rand = new Random();
        for (int epoch = 0; epoch < epochs; epoch++)
        {
            for (int i = 0; i < inputs.Length; i += batchSize)
            {
                Matrix[] batchInputs = inputs.Skip(i).Take(batchSize).ToArray();
                Matrix[] batchTargets = targets.Skip(i).Take(batchSize).ToArray();
                // Apply MixUp or CutMix with equal probability. CutMix is not defined
                // anywhere in this paste; a MixUp sketch follows this method.
                if (rand.NextDouble() < 0.5)
                    MixUp(batchInputs, batchTargets, mixUpAlpha);
                else
                    CutMix(batchInputs, batchTargets, cutMixBatchSize, cutMixAlpha);
                for (int j = 0; j < batchInputs.Length; j++)
                {
                    FeedForward(batchInputs[j], true);
                    Backpropagation(batchInputs[j], batchTargets[j]);
                }
            }
            LearningRateScheduler(epoch);
        }
    }
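    // MixUp and CutMix are called above but defined nowhere in this paste. Below is
    // a minimal MixUp sketch under the same assumed Matrix API; the Beta(alpha, alpha)
    // draw is approximated with a uniform sample for brevity, so a faithful
    // implementation would substitute a proper Beta sampler.
    private void MixUp(Matrix[] inputs, Matrix[] targets, double alpha)
    {
        var rand = new Random();
        for (int i = 0; i + 1 < inputs.Length; i += 2)
        {
            double lambda = rand.NextDouble(); // stand-in for lambda ~ Beta(alpha, alpha)
            // Convex combination of neighboring examples and their targets.
            inputs[i] = (lambda * inputs[i]) + ((1 - lambda) * inputs[i + 1]);
            targets[i] = (lambda * targets[i]) + ((1 - lambda) * targets[i + 1]);
        }
    }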
    public class GroupNormalization
    {
        private int _numGroups;
        public GroupNormalization(int numGroups)
        {
            _numGroups = numGroups;
        }
        public Matrix GroupNormalize(Matrix x)
        {
            // Rows are treated as channels; each contiguous group of rows is
            // normalized with its own mean and variance. RowCount should divide
            // evenly by _numGroups.
            int channels = x.RowCount;
            int groupSize = channels / _numGroups;
            Matrix normalized = new Matrix(x.RowCount, x.ColumnCount);
            for (int i = 0; i < _numGroups; i++)
            {
                int start = i * groupSize;
                int end = (i + 1) * groupSize;
                Matrix group = x.GetSubMatrix(start, 0, end - start, x.ColumnCount);
                Matrix groupMean = MatrixFunctions.Mean(group, axis: 0);
                Matrix groupVariance = MatrixFunctions.Variance(group, axis: 0);
                for (int j = start; j < end; j++)
                {
                    for (int k = 0; k < x.ColumnCount; k++)
                    {
                        normalized[j, k] = (x[j, k] - groupMean[0, k]) / (Math.Sqrt(groupVariance[0, k]) + 1e-8);
                    }
                }
            }
            return normalized;
        }
    }
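    // Usage sketch for the nested GroupNormalization helper (hypothetical values;
    // the channel count must divide evenly by the group count):
    //
    //   var gn = new GroupNormalization(numGroups: 4);
    //   Matrix normalized = gn.GroupNormalize(activations); // activations: 8 x batchSize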
    // Assigned in the constructor when useGroupNormalization is true.
    private GroupNormalization _groupNormalization;
    private Matrix ApplyNormalization(Matrix input)
    {
        if (_groupNormalization != null)
        {
            return _groupNormalization.GroupNormalize(input);
        }
        return input;
    }
    private Matrix OrthogonalInitialization(int rows, int cols, Random rand)
    {
        // Orthogonalize() (e.g. via QR or Gram-Schmidt) is assumed to be provided
        // by the Matrix type.
        Matrix gaussianMatrix = Matrix.RandomMatrix(rows, cols, rand);
        return gaussianMatrix.Orthogonalize();
    }
    private double CyclicalLearningRate(int epoch, double min_lr, double max_lr, int step_size, double gamma)
    {
        // Triangular CLR with an exponential decay envelope. The epoch/step_size
        // division must be floating-point; integer division flattens the triangle wave.
        int cycle = 1 + epoch / (2 * step_size);
        double x = Math.Abs((double)epoch / step_size - 2 * cycle + 1);
        double lr = min_lr + (max_lr - min_lr) * Math.Max(0, 1 - x) * Math.Pow(gamma, epoch);
        return lr;
    }
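    // Self-contained trace of the triangular schedule above (min_lr=0.001,
    // max_lr=0.01, step_size=5, gamma=0.9): the rate climbs for 5 epochs, falls
    // for 5, then repeats, while the whole envelope decays by gamma each epoch.
    private void CyclicalLearningRateDemo()
    {
        for (int epoch = 0; epoch < 20; epoch++)
            Console.WriteLine($"epoch {epoch}: lr = {CyclicalLearningRate(epoch, 0.001, 0.01, 5, 0.9):F5}");
    }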
    private double OneCyclePolicy(int epoch, double max_lr, int total_epochs, double pct_start = 0.3, double div_factor = 25.0, double final_div_factor = 1e4)
    {
        int phase_epoch = (int)(total_epochs * pct_start);
        int current_epoch = epoch + 1;
        if (current_epoch < phase_epoch)
        {
            // Warm-up phase. The cast matters: with integer division pct would
            // always be zero and the cosine ramp would disappear.
            double pct = (double)current_epoch / phase_epoch;
            return max_lr / div_factor * (1 + Math.Cos(Math.PI * pct)) / 2;
        }
        else
        {
            // Annealing phase toward max_lr / final_div_factor.
            double pct = 1 - (double)(current_epoch - phase_epoch) / (total_epochs - phase_epoch);
            return max_lr / final_div_factor * (1 + Math.Cos(Math.PI * pct)) / 2;
        }
    }
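    // The same kind of trace for the one-cycle policy above: a warm-up over the
    // first pct_start fraction of training, then a cosine decay toward a much
    // smaller rate. Values are illustrative.
    private void OneCyclePolicyDemo()
    {
        for (int epoch = 0; epoch < 10; epoch++)
            Console.WriteLine($"epoch {epoch}: lr = {OneCyclePolicy(epoch, 0.01, 10):F6}");
    }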
    // Alternative epoch scheduler built on CyclicalLearningRate; it carries its own
    // name because the inverse-decay LearningRateScheduler(int) defined earlier is
    // the one Train currently calls.
    private void CyclicalLearningRateScheduler(int epoch)
    {
        double min_lr = 0.001; // minimum learning rate
        double max_lr = 0.01;  // maximum learning rate
        int step_size = 5;     // step size for CLR
        double gamma = 0.9;    // decay envelope for CLR
        _learningRate = CyclicalLearningRate(epoch, min_lr, max_lr, step_size, gamma);
    }
    private Matrix ApplyDropBlock(Matrix input, bool training)
    {
        // DropBlock zeroes a contiguous region rather than independent units, then
        // rescales the survivors so the expected activation is preserved. The mask
        // starts as all ones and the chosen block is zeroed out.
        if (!training || _dropblockKeepProb >= 1.0 || _dropblockSize <= 0)
            return input;
        Random rand = new Random();
        int height = input.RowCount;
        int width = input.ColumnCount;
        if (height < _dropblockSize || width < _dropblockSize)
            return input;
        int hStart = rand.Next(height - _dropblockSize + 1);
        int wStart = rand.Next(width - _dropblockSize + 1);
        int hEnd = hStart + _dropblockSize;
        int wEnd = wStart + _dropblockSize;
        Matrix mask = Matrix.Ones(height, width);
        mask.SetSubMatrix(hStart, hEnd, wStart, wEnd, 0.0);
        double kept = mask.Sum();
        if (kept <= 0)
            return input;
        Matrix output = input.PointwiseMultiply(mask);
        output *= (height * width) / kept; // rescale surviving activations
        return output;
    }
    private Matrix FeedForwardWithDropBlock(Matrix input, bool training)
    {
        Matrix outputs = input;
        for (int i = 0; i < _weights.Length; i++)
        {
            outputs = ApplyDropBlock(outputs, training);
        }
        return outputs;
    }
    private void MagnitudeBasedWeightPruning(double pruningThreshold)
    {
        for (int i = 0; i < _weights.Length; i++)
        {
            // Zero every weight whose magnitude falls below the threshold.
            _weights[i] = _weights[i].Map(w => Math.Abs(w) >= pruningThreshold ? w : 0.0);
        }
    }
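    // One common way to pick `pruningThreshold` is as a percentile of the absolute
    // weights rather than a fixed constant. A self-contained sketch on a plain
    // array (the 30th-percentile magnitude prunes roughly the smallest 30%):
    private static double PercentileThreshold(double[] weights, double fraction = 0.3)
    {
        double[] magnitudes = weights.Select(Math.Abs).OrderBy(m => m).ToArray();
        int index = (int)(fraction * (magnitudes.Length - 1));
        return magnitudes[index];
    }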
    public void TrainWithPruning(Matrix[] inputs, Matrix[] targets, int epochs, int batchSize, double pruningThreshold)
    {
        for (int epoch = 0; epoch < epochs; epoch++)
        {
            for (int i = 0; i < inputs.Length; i += batchSize)
            {
                Matrix[] batchInputs = inputs.Skip(i).Take(batchSize).ToArray();
                Matrix[] batchTargets = targets.Skip(i).Take(batchSize).ToArray();
                for (int j = 0; j < batchInputs.Length; j++)
                {
                    FeedForward(batchInputs[j], true);
                    Backpropagation(batchInputs[j], batchTargets[j]);
                }
            }
            LearningRateScheduler(epoch);
            MagnitudeBasedWeightPruning(pruningThreshold);
        }
    }
    private void ClipGradients(Matrix[] gradients)
    {
        // Element-wise clamp to [-clipThreshold, clipThreshold]; Backpropagation
        // calls this on all four gradient arrays before the optimizer step.
        double clipThreshold = 1.0;
        foreach (Matrix gradient in gradients)
        {
            gradient.MapInplace(x => Math.Max(-clipThreshold, Math.Min(clipThreshold, x)));
        }
    }
    // Stand-alone RMSProp step. Note it always writes to _weights regardless of
    // which parameter set the gradients belong to, so it is only correct for the
    // weight gradients, e.g.:
    //   RMSPropOptimizer(gradientsWeights, ref _vWeights, _learningRate, _decayRate, _epsilon);
    private void RMSPropOptimizer(Matrix[] gradients, ref Matrix[] velocities, double learningRate, double decayRate, double epsilon)
    {
        for (int i = 0; i < gradients.Length; i++)
        {
            velocities[i] = (decayRate * velocities[i]) + ((1 - decayRate) * (gradients[i] * gradients[i]));
            _weights[i] -= (learningRate / (MatrixFunctions.Sqrt(velocities[i]) + epsilon)) * gradients[i];
        }
    }
    public void SnapshotEnsembleTrain(Matrix[] inputs, Matrix[] targets, int epochs, int batchSize, int numSnapshots)
    {
        List<GithubNeuralNetwork> snapshotModels = new List<GithubNeuralNetwork>();
        for (int snapshot = 0; snapshot < numSnapshots; snapshot++)
        {
            // Each snapshot is a fresh model built with this network's configuration.
            GithubNeuralNetwork snapshotModel = new GithubNeuralNetwork(
                _layers, _initialLearningRate, _epsilon, _optimizer, _decayRate,
                _dropoutRate, _regularizer, _lambda);
            snapshotModels.Add(snapshotModel);
            for (int epoch = 0; epoch < epochs; epoch++)
            {
                for (int i = 0; i < inputs.Length; i += batchSize)
                {
                    Matrix[] batchInputs = inputs.Skip(i).Take(batchSize).ToArray();
                    Matrix[] batchTargets = targets.Skip(i).Take(batchSize).ToArray();
                    for (int j = 0; j < batchInputs.Length; j++)
                    {
                        snapshotModel.FeedForward(batchInputs[j], true);
                        snapshotModel.Backpropagation(batchInputs[j], batchTargets[j]);
                    }
                }
                snapshotModel.LearningRateScheduler(epoch);
            }
        }
        // Average the snapshot predictions per input, then average those across inputs.
        Matrix[] ensemblePredictions = new Matrix[inputs.Length];
        for (int i = 0; i < inputs.Length; i++)
        {
            Matrix aggregatedPrediction = Matrix.Zeros(targets[0].RowCount, targets[0].ColumnCount);
            foreach (var snapshotModel in snapshotModels)
            {
                aggregatedPrediction += snapshotModel.Predict(inputs[i]);
            }
            aggregatedPrediction /= numSnapshots;
            ensemblePredictions[i] = aggregatedPrediction;
        }
        Matrix finalEnsemblePrediction = Matrix.Zeros(targets[0].RowCount, targets[0].ColumnCount);
        foreach (var prediction in ensemblePredictions)
        {
            finalEnsemblePrediction += prediction;
        }
        finalEnsemblePrediction /= inputs.Length;
    }
    private void InitializeRAdamParameters()
    {
        // Allocate the moment-buffer arrays before filling them; the original
        // paste wrote into arrays that were never created.
        _sWeights = new Matrix[_weights.Length];
        _rWeights = new Matrix[_weights.Length];
        _sBiases = new Matrix[_biases.Length];
        _rBiases = new Matrix[_biases.Length];
        for (int i = 0; i < _weights.Length; i++)
        {
            _sWeights[i] = Matrix.Zeros(_weights[i].RowCount, _weights[i].ColumnCount);
            _rWeights[i] = Matrix.Zeros(_weights[i].RowCount, _weights[i].ColumnCount);
            _sBiases[i] = Matrix.Zeros(_biases[i].RowCount, _biases[i].ColumnCount);
            _rBiases[i] = Matrix.Zeros(_biases[i].RowCount, _biases[i].ColumnCount);
        }
    }
    private void RAdamOptimizer(Matrix[] gradientsWeights, Matrix[] gradientsBiases)
    {
        // Only the weight update is implemented; the bias update would mirror it
        // using _sBiases/_rBiases and gradientsBiases.
        for (int i = 0; i < _weights.Length; i++)
        {
            _sWeights[i] = (_beta1 * _sWeights[i]) + ((1 - _beta1) * gradientsWeights[i]);
            _rWeights[i] = (_beta2 * _rWeights[i]) + ((1 - _beta2) * gradientsWeights[i].PointwiseMultiply(gradientsWeights[i]));
            Matrix sHatWeights = _sWeights[i] / (1 - Math.Pow(_beta1, _t));
            Matrix rHatWeights = _rWeights[i] / (1 - Math.Pow(_beta2, _t));
            Matrix updateWeights = sHatWeights.PointwiseDivide(MatrixFunctions.Sqrt(rHatWeights) + _epsilon);
            _weights[i] -= _learningRate * updateWeights;
        }
    }
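    // As written, RAdamOptimizer above is effectively plain Adam: RAdam additionally
    // rectifies the adaptive term while the variance estimate is unreliable (small t).
    // A scalar sketch of the rectification factor from the RAdam paper (Liu et al.):
    private static double RAdamRectification(double beta2, int t)
    {
        double rhoInf = 2.0 / (1.0 - beta2) - 1.0;
        double rhoT = rhoInf - 2.0 * t * Math.Pow(beta2, t) / (1.0 - Math.Pow(beta2, t));
        if (rhoT <= 4.0) return 0.0; // fall back to an un-adapted (SGD-like) step
        return Math.Sqrt(((rhoT - 4) * (rhoT - 2) * rhoInf) /
                         ((rhoInf - 4) * (rhoInf - 2) * rhoT));
    }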
    // Alternative hook: inside Backpropagation, the gradients can be routed to
    // RAdamOptimizer instead of Optimizer:
    //   RAdamOptimizer(gradientsWeights, gradientsBiases);
    // Iteration-based triangular scheduler using the _baseLearningRate /
    // _maxLearningRate / _stepSize settings; intended to be called once per mini-batch.
    private void LearningRateScheduler()
    {
        _iterations++;
        if (_iterations % (_stepSize * 2) == 0)
        {
            _cycle++;
            _iterations = 0;
        }
        double cycleFraction = Math.Abs(_iterations - _stepSize) / (_stepSize * 1.0);
        _learningRate = _baseLearningRate + (_maxLearningRate - _baseLearningRate) * Math.Max(0, 1 - cycleFraction);
    }
    private Matrix BatchNormalization(Matrix x, int layerIndex)
    {
        // As written this normalizes along the same axis as LayerNormalization;
        // a true batch norm would normalize across the batch dimension and use
        // the stored moving statistics at inference time.
        Matrix mean = MatrixFunctions.Mean(x, axis: 1);
        Matrix variance = MatrixFunctions.Variance(x, axis: 1);
        Matrix normalized = (x - mean) / MatrixFunctions.Sqrt(variance + _epsilon);
        return _gamma[layerIndex].PointwiseMultiply(normalized) + _beta[layerIndex];
    }
    private Matrix LayerNormalization(Matrix x, int layerIndex)
    {
        Matrix mean = MatrixFunctions.Mean(x, axis: 1);
        Matrix variance = MatrixFunctions.Variance(x, axis: 1);
        Matrix normalized = (x - mean) / MatrixFunctions.Sqrt(variance + _epsilon);
        return _gamma[layerIndex].PointwiseMultiply(normalized) + _beta[layerIndex];
    }
    // Normalization-only variant of the forward pass; it never multiplies by the
    // weights, so it is a sketch rather than a usable substitute for FeedForward,
    // and it carries its own name to avoid redefining FeedForward above.
    private Matrix FeedForwardWithBatchNorm(Matrix input, bool training)
    {
        Matrix outputs = input;
        for (int i = 0; i < _weights.Length; i++)
        {
            if (training)
            {
                outputs = BatchNormalization(outputs, i);
            }
        }
        return outputs;
    }
    private Matrix HeInitialization(int rows, int cols, Random rand)
    {
        double scale = Math.Sqrt(2.0 / rows);
        return Matrix.RandomMatrix(rows, cols, rand) * scale;
    }
    private Matrix VarianceScaling(int rows, int cols, Random rand)
    {
        double scale = Math.Sqrt(2.0 / (rows + cols));
        return Matrix.RandomMatrix(rows, cols, rand) * scale;
    }
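    // The fan-based scales differ in what they average over: HeInitialization above
    // uses a single dimension (sqrt(2/rows)), while VarianceScaling and
    // XavierInitialization average both (sqrt(2/(rows+cols))). Scalar check for a
    // 64 x 128 weight matrix:
    private static void InitializationScaleDemo()
    {
        Console.WriteLine(Math.Sqrt(2.0 / 64));         // He-style:     ~0.177
        Console.WriteLine(Math.Sqrt(2.0 / (64 + 128))); // Xavier-style: ~0.102
    }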
    private Matrix SELU(Matrix x)
    {
        double alpha = 1.6732632423543772848170429916717;
        double scale = 1.0507009873554804934193349852946;
        return x.Map(value => value > 0 ? scale * value : scale * (alpha * Math.Exp(value) - alpha));
    }
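    // SELU's two constants are chosen so that, for suitably normalized inputs,
    // activations are driven back toward zero mean and unit variance
    // ("self-normalizing"): SELU(0) = 0, the positive-side slope is scale (~1.0507),
    // and the negative side saturates at -scale*alpha (~-1.758). A quick trace:
    private static void SeluValuesDemo()
    {
        double alpha = 1.6732632423543772848170429916717;
        double scale = 1.0507009873554804934193349852946;
        foreach (double x in new[] { -2.0, -0.5, 0.0, 0.5, 2.0 })
        {
            double y = x > 0 ? scale * x : scale * (alpha * Math.Exp(x) - alpha);
            Console.WriteLine($"SELU({x}) = {y:F4}");
        }
    }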
    private Matrix ApplyActivationFunction(Matrix x, ActivationFunction activation)
    {
        switch (activation)
        {
            case ActivationFunction.ReLU:
                return MatrixFunctions.ReLU(x);
            case ActivationFunction.Sigmoid:
                return MatrixFunctions.Sigmoid(x);
            case ActivationFunction.Tanh:
                return MatrixFunctions.Tanh(x);
            case ActivationFunction.LeakyReLU:
                return MatrixFunctions.LeakyReLU(x);
            case ActivationFunction.Swish:
                return Swish(x);
            case ActivationFunction.Mish:
                return Mish(x);
            case ActivationFunction.GELU:
                return GELU(x);
            case ActivationFunction.SELU:
                return SELU(x);
            case ActivationFunction.GeM:
                return GeM(x);
            case ActivationFunction.AELU:
                return AELU(x);
            default:
                return MatrixFunctions.ReLU(x);
        }
    }
}