Trainlover08

neural_network.cpp

Aug 19th, 2024 (edited)
#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <cassert>
#include <algorithm>
#include <deque>
#include <tuple>

using namespace std;

class ReplayBuffer {
private:
    struct Transition {
        std::vector<double> state;
        int action;
        double reward;
        std::vector<double> nextState;
        bool done;
    };

    std::deque<Transition> buffer;
    size_t capacity;
    std::mt19937 rng;

public:
    ReplayBuffer(size_t capacity) : capacity(capacity) {
        std::random_device rd;
        rng = std::mt19937(rd());
    }

    void storeTransition(const std::vector<double>& state, int action, double reward, const std::vector<double>& nextState, bool done) {
        if (buffer.size() >= capacity) {
            buffer.pop_front();
        }
        buffer.push_back({state, action, reward, nextState, done});
    }

    std::tuple<std::vector<std::vector<double>>, std::vector<int>, std::vector<double>, std::vector<std::vector<double>>, std::vector<bool>>
    sampleBatch(size_t batchSize) {
        std::vector<std::vector<double>> states(batchSize);
        std::vector<int> actions(batchSize);
        std::vector<double> rewards(batchSize);
        std::vector<std::vector<double>> nextStates(batchSize);
        std::vector<bool> dones(batchSize);

        std::uniform_int_distribution<size_t> dist(0, buffer.size() - 1);

        for (size_t i = 0; i < batchSize; ++i) {
            size_t idx = dist(rng);
            const Transition& transition = buffer[idx];

            states[i] = transition.state;
            actions[i] = transition.action;
            rewards[i] = transition.reward;
            nextStates[i] = transition.nextState;
            dones[i] = transition.done;
        }

        return {states, actions, rewards, nextStates, dones};
    }

    bool isReady(size_t batchSize) const {
        return buffer.size() >= batchSize;
    }
};

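// Usage sketch for ReplayBuffer: the 4-dimensional states, capacity of 10000 and
// batch size of 64 below are placeholder values, not requirements of the class.
// Sampling is uniform with replacement, so sampleBatch() should only be called
// once isReady() reports enough stored transitions.
inline void replayBufferExample() {
    ReplayBuffer replay(10000);
    for (int i = 0; i < 128; ++i) {
        replay.storeTransition({0.1, 0.2, 0.3, 0.4}, /*action=*/0, /*reward=*/1.0,
                               {0.2, 0.3, 0.4, 0.5}, /*done=*/false);
    }
    if (replay.isReady(64)) {
        auto batch = replay.sampleBatch(64);
        const auto& states = std::get<0>(batch);
        std::cout << "sampled " << states.size() << " transitions\n";
    }
}
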
class AdamOptimizer {
public:
    double lr;
    double beta1;
    double beta2;
    double epsilon;
    int t;

    AdamOptimizer(double learning_rate, double beta1, double beta2, double epsilon)
        : lr(learning_rate), beta1(beta1), beta2(beta2), epsilon(epsilon), t(0) {}

    void update(vector<vector<double>>& weights, vector<vector<double>>& m, vector<vector<double>>& v, const vector<vector<double>>& grads) {
        t++;
        for (size_t i = 0; i < weights.size(); ++i) {
            for (size_t j = 0; j < weights[0].size(); ++j) {
                m[i][j] = beta1 * m[i][j] + (1 - beta1) * grads[i][j];
                v[i][j] = beta2 * v[i][j] + (1 - beta2) * grads[i][j] * grads[i][j];
                double m_hat = m[i][j] / (1 - pow(beta1, t));
                double v_hat = v[i][j] / (1 - pow(beta2, t));
                weights[i][j] -= lr * m_hat / (sqrt(v_hat) + epsilon);
            }
        }
    }

    void update(vector<double>& biases, vector<double>& m, vector<double>& v, const vector<double>& grads) {
        t++;
        for (size_t i = 0; i < biases.size(); ++i) {
            m[i] = beta1 * m[i] + (1 - beta1) * grads[i];
            v[i] = beta2 * v[i] + (1 - beta2) * grads[i] * grads[i];
            double m_hat = m[i] / (1 - pow(beta1, t));
            double v_hat = v[i] / (1 - pow(beta2, t));
            biases[i] -= lr * m_hat / (sqrt(v_hat) + epsilon);
        }
    }
};

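// Worked example for AdamOptimizer on a single 1x1 weight: with lr = 0.01 and a
// gradient of 0.5, the bias-corrected first step has magnitude close to lr
// (m_hat = 0.5, v_hat = 0.25, so the update is 0.01 * 0.5 / 0.5 = 0.01). The
// hyperparameters are typical defaults, not values prescribed by this file.
inline void adamExample() {
    AdamOptimizer adam(0.01, 0.9, 0.999, 1e-8);
    vector<vector<double>> w{{1.0}}, m{{0.0}}, v{{0.0}}, g{{0.5}};
    adam.update(w, m, v, g);
    std::cout << "weight after one Adam step: " << w[0][0] << "\n"; // ~0.99
}
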
class AdamWOptimizer {
public:
    double lr;          // Learning rate
    double beta1;       // Exponential decay rate for the first moment estimates
    double beta2;       // Exponential decay rate for the second moment estimates
    double epsilon;     // Small constant to prevent division by zero
    double weightDecay; // Weight decay coefficient (decoupled L2 regularization)
    int t;              // Time step

    AdamWOptimizer(double learning_rate, double beta1, double beta2, double epsilon, double weightDecay)
        : lr(learning_rate), beta1(beta1), beta2(beta2), epsilon(epsilon), weightDecay(weightDecay), t(0) {}

    void update(vector<vector<double>>& weights, vector<vector<double>>& m, vector<vector<double>>& v, const vector<vector<double>>& grads) {
        t++;
        for (size_t i = 0; i < weights.size(); ++i) {
            for (size_t j = 0; j < weights[0].size(); ++j) {
                // Update biased first moment estimate
                m[i][j] = beta1 * m[i][j] + (1 - beta1) * grads[i][j];

                // Update biased second raw moment estimate
                v[i][j] = beta2 * v[i][j] + (1 - beta2) * grads[i][j] * grads[i][j];

                // Compute bias-corrected first moment estimate
                double m_hat = m[i][j] / (1 - pow(beta1, t));

                // Compute bias-corrected second raw moment estimate
                double v_hat = v[i][j] / (1 - pow(beta2, t));

                // Apply weight decay
                weights[i][j] -= lr * weightDecay * weights[i][j];

                // Update weights with AdamW rule
                weights[i][j] -= lr * m_hat / (sqrt(v_hat) + epsilon);
            }
        }
    }

    void update(vector<double>& biases, vector<double>& m, vector<double>& v, const vector<double>& grads) {
        t++;
        for (size_t i = 0; i < biases.size(); ++i) {
            // Update biased first moment estimate
            m[i] = beta1 * m[i] + (1 - beta1) * grads[i];

            // Update biased second raw moment estimate
            v[i] = beta2 * v[i] + (1 - beta2) * grads[i] * grads[i];

            // Compute bias-corrected first moment estimate
            double m_hat = m[i] / (1 - pow(beta1, t));

            // Compute bias-corrected second raw moment estimate
            double v_hat = v[i] / (1 - pow(beta2, t));

            // Apply weight decay (biases typically don't have weight decay, but added for completeness)
            biases[i] -= lr * weightDecay * biases[i];

            // Update biases with AdamW rule
            biases[i] -= lr * m_hat / (sqrt(v_hat) + epsilon);
        }
    }
};

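// Worked example for AdamWOptimizer: unlike Adam above, the weight decay is
// decoupled, i.e. applied directly to the parameter (w -= lr * weightDecay * w)
// before the Adam step rather than folded into the gradient. Hyperparameters
// below are placeholder values.
inline void adamWExample() {
    AdamWOptimizer adamw(0.01, 0.9, 0.999, 1e-8, 0.01);
    vector<vector<double>> w{{1.0}}, m{{0.0}}, v{{0.0}}, g{{0.5}};
    adamw.update(w, m, v, g);
    // Decay shrinks w by 0.01 * 0.01 * 1.0 = 0.0001, then the Adam step subtracts ~0.01.
    std::cout << "weight after one AdamW step: " << w[0][0] << "\n"; // ~0.9899
}
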
class Layer {
public:
    vector<vector<double>> weights;
    vector<double> biases;
    vector<vector<double>> grads_weights;
    vector<double> grads_biases;
    vector<vector<double>> m_weights;
    vector<vector<double>> v_weights;
    vector<double> m_biases;
    vector<double> v_biases;
    vector<vector<double>> cache_inputs;
    vector<vector<double>> cache_z;
    string activation;
    AdamWOptimizer optimizer;

    Layer(int input_dim, int output_dim, const string& activation, AdamWOptimizer optimizer)
        : optimizer(optimizer) {
        random_device rd;
        mt19937 gen(rd());
        normal_distribution<> d(0, 0.01);                          // small Gaussian init for non-ReLU layers
        normal_distribution<> dist(0, std::sqrt(2.0 / input_dim)); // He initialization for ReLU layers

        this->activation = activation;
        weights.resize(input_dim, vector<double>(output_dim));
        grads_weights.resize(input_dim, vector<double>(output_dim, 0.0));
        m_weights.resize(input_dim, vector<double>(output_dim, 0.0));
        v_weights.resize(input_dim, vector<double>(output_dim, 0.0));
        biases.resize(output_dim, 0.0);
        grads_biases.resize(output_dim, 0.0);
        m_biases.resize(output_dim, 0.0);
        v_biases.resize(output_dim, 0.0);

        for (int i = 0; i < input_dim; ++i) {
            for (int j = 0; j < output_dim; ++j) {
                if (activation != "relu") {
                    weights[i][j] = d(gen);
                } else {
                    weights[i][j] = dist(gen);
                }
            }
        }
    }

    vector<vector<double>> forward(const vector<vector<double>>& inputs) {
        int batch_size = inputs.size();
        int output_dim = weights[0].size();

        vector<vector<double>> z(batch_size, vector<double>(output_dim));
        vector<vector<double>> a(batch_size, vector<double>(output_dim));

        for (int i = 0; i < batch_size; ++i) {
            for (int j = 0; j < output_dim; ++j) {
                for (size_t k = 0; k < inputs[0].size(); ++k) {
                    z[i][j] += inputs[i][k] * weights[k][j];
                }
                z[i][j] += biases[j];
                a[i][j] = (activation == "relu") ? max(0.0, z[i][j]) : z[i][j];
            }
        }

        cache_inputs = inputs;
        cache_z = z;
        return a;
    }

    vector<vector<double>> backward(const vector<vector<double>>& grad_output, double& grad_clip) {
        int batch_size = cache_inputs.size();
        int input_dim = cache_inputs[0].size();
        int output_dim = weights[0].size();

        vector<vector<double>> grad_inputs(batch_size, vector<double>(input_dim));

        for (int i = 0; i < batch_size; ++i) {
            for (int j = 0; j < output_dim; ++j) {
                double grad_z = (activation == "relu") ? ((cache_z[i][j] > 0) ? grad_output[i][j] : 0.0) : grad_output[i][j];

                // Clip the gradient to the range [-grad_clip, grad_clip]
                if (grad_z > grad_clip) {
                    grad_z = grad_clip;
                } else if (grad_z < -grad_clip) {
                    grad_z = -grad_clip;
                }

                grads_biases[j] += grad_z;
                for (int k = 0; k < input_dim; ++k) {
                    grads_weights[k][j] += cache_inputs[i][k] * grad_z;
                    grad_inputs[i][k] += weights[k][j] * grad_z;
                }
            }
        }

        return grad_inputs;
    }

    void update_weights() {
        optimizer.update(weights, m_weights, v_weights, grads_weights);
        optimizer.update(biases, m_biases, v_biases, grads_biases);
    }

    void reset_gradients() {
        for (auto& row : grads_weights) {
            fill(row.begin(), row.end(), 0.0);
        }
        fill(grads_biases.begin(), grads_biases.end(), 0.0);
    }

private:
    double He_initialization(double input_size) {
        // He initialization standard deviation (helper; the constructor currently
        // draws He-initialized weights from `dist` directly)
        double stddev = std::sqrt(2.0 / input_size);

        // Random number generator with normal distribution
        std::random_device rd;
        std::mt19937 gen(rd());
        std::normal_distribution<double> dist(0.0, stddev);

        return dist(gen);
    }
};

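// Usage sketch for Layer: a single 3 -> 2 ReLU layer. forward() takes a
// batch-major matrix (batch_size x input_dim) and returns batch_size x output_dim;
// backward() takes the gradient of the loss with respect to the output and returns
// the gradient with respect to the input, accumulating weight and bias gradients
// inside the layer until update_weights() / reset_gradients() are called.
// The input values and hyperparameters below are placeholders.
inline void layerExample() {
    AdamWOptimizer opt(0.001, 0.9, 0.999, 1e-8, 0.01);
    Layer layer(3, 2, "relu", opt);

    vector<vector<double>> batch{{0.5, -0.2, 0.1}, {1.0, 0.3, -0.4}};     // 2 x 3
    vector<vector<double>> out = layer.forward(batch);                    // 2 x 2

    vector<vector<double>> grad_out(2, vector<double>(2, 1.0));           // dLoss/dOut, 2 x 2
    double grad_clip = 1.0;
    vector<vector<double>> grad_in = layer.backward(grad_out, grad_clip); // 2 x 3

    layer.update_weights();
    layer.reset_gradients();
    std::cout << "out: " << out.size() << " x " << out[0].size()
              << ", grad_in: " << grad_in.size() << " x " << grad_in[0].size() << "\n";
}
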
class NeuralNetwork {
public:
    vector<Layer> layers;

    void add_layer(const Layer& layer) {
        layers.push_back(layer);
    }

    vector<vector<double>> forward(const vector<vector<double>>& inputs) {
        vector<vector<double>> out = inputs;
        for (auto& layer : layers) {
            out = layer.forward(out);
        }
        return out;
    }

    void backward(const vector<vector<double>>& grad_output, double& grad_clip) {
        vector<vector<double>> grad = grad_output;
        for (auto it = layers.rbegin(); it != layers.rend(); ++it) {
            grad = it->backward(grad, grad_clip);
        }
    }

    void update_weights() {
        for (auto& layer : layers) {
            layer.update_weights();
            layer.reset_gradients();
        }
    }
};

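// Usage sketch for NeuralNetwork: a tiny 4 -> 8 -> 2 network trained for one
// step on a single placeholder sample. The loss gradient fed to backward() is
// the derivative of mean squared error, 2 * (prediction - target) / batch_size;
// the input, target and hyperparameter values are placeholders.
inline void networkExample() {
    AdamWOptimizer opt(0.001, 0.9, 0.999, 1e-8, 0.01);

    NeuralNetwork net;
    net.add_layer(Layer(4, 8, "relu", opt));
    net.add_layer(Layer(8, 2, "linear", opt));

    vector<vector<double>> inputs{{0.1, 0.2, 0.3, 0.4}};   // batch of 1
    vector<vector<double>> targets{{1.0, 0.0}};

    vector<vector<double>> preds = net.forward(inputs);

    // Gradient of MSE with respect to each prediction
    vector<vector<double>> grad(preds.size(), vector<double>(preds[0].size()));
    for (size_t i = 0; i < preds.size(); ++i) {
        for (size_t j = 0; j < preds[i].size(); ++j) {
            grad[i][j] = 2.0 * (preds[i][j] - targets[i][j]) / preds.size();
        }
    }

    double grad_clip = 1.0;
    net.backward(grad, grad_clip);
    net.update_weights();
}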