Trainlover08

include/ai_folder/ai_versions/ai_v0.2/cart-pole.cpp

Oct 30th, 2024
// (include/ai_folder/ai_versions/ai_v0.2/cart-pole.cpp)
// This is my testbed for testing new RL advancements

#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <algorithm>
#include <cassert>
#include <cstdlib>

// Placeholder for the neural network and cart-pole environment
#include "neural_network.cpp"
#include "cart-pole-env.cpp"

// When true, the freshly constructed networks are written to disk before training starts.
bool E_NEW_NET = true;

// Function to sample an action based on action probabilities.
// The actor emits a single value, interpreted as the probability of action 0;
// action 1 (the other of CartPole's two actions, left/right) gets the remaining mass.
int sampleAction(const std::vector<double>& actionProbs) {
    assert(actionProbs.size() == 1); // single output: P(action = 0)

    double p = static_cast<double>(rand()) / RAND_MAX;
    return (p < actionProbs[0]) ? 0 : 1;
}
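// Usage sketch (assumption: the actor's output has already been squashed into [0, 1],
// e.g. by a sigmoid; the final layer below is "linear", so this is not guaranteed):
//   std::vector<double> probs = {0.7};
//   int a = sampleAction(probs); // 0 about 70% of the time, otherwise 1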

// Main training loop for policy gradient
void trainCartPolePolicyGradient(NeuralNetwork& actor, NeuralNetwork& critic, LinearRegression& env,
                                 int numEpisodes, double gamma, double learningRate,
                                 double GRADIENT_CLIP_THRESHOLD, double weight_decay) {
    AdamWOptimizer actorOptimizer(learningRate, 0.9, 0.999, 0.01, weight_decay);
    AdamWOptimizer criticOptimizer(learningRate, 0.9, 0.999, 0.01, weight_decay);

    actor.add_layer(Layer(2, 128, "relu", actorOptimizer));
    actor.add_layer(Layer(128, 128, "relu", actorOptimizer));
    actor.add_layer(Layer(128, 1, "linear", actorOptimizer)); // Single output, interpreted as P(action = 0)

    critic.add_layer(Layer(2, 64, "relu", criticOptimizer));
    critic.add_layer(Layer(64, 64, "relu", criticOptimizer));
    critic.add_layer(Layer(64, 1, "linear", criticOptimizer)); // Single output for state value

    if (E_NEW_NET) {
        // Save the freshly initialized networks to file
        actor.save("actor_network_params.txt");
        critic.save("critic_network_params.txt");
    }
    for (int episode = 0; episode <= numEpisodes; ++episode) {
        // Load neural network from file
        NeuralNetwork actorLoadedNN;
        NeuralNetwork criticLoadedNN;
        actorLoadedNN.load("actor_network_params.txt");
        criticLoadedNN.load("critic_network_params.txt");

        actor = actorLoadedNN;
        critic = criticLoadedNN;

        std::vector<std::vector<double>> states;
        std::vector<int> actions;
        std::vector<double> rewards, logProbs, values;

        env.reset();
        while (!env.isDone()) {
            std::vector<double> state = env.getState();
            states.push_back(state);

            // Get action probabilities from the actor network
            std::vector<std::vector<double>> actionProbs = actor.forward({state});

            // Sample an action based on the probabilities
            double probLeft = actionProbs[0][0];
            int action = sampleAction(actionProbs[0]);
            actions.push_back(action);

            // std::cout << "Action: " << action << '\n';

            // Log probability of the chosen action
            double actionProb = (action == 0) ? probLeft : 1.0 - probLeft;
            logProbs.push_back(std::log(std::max(actionProb, 1e-8)));

            // Get the value estimate from the critic network
            std::vector<std::vector<double>> valueEstimates = critic.forward({state});
            values.push_back(valueEstimates[0][0]);

            // Take the action in the environment
            env.step(action);

            // Store the reward
            rewards.push_back(env.getReward());
        }

        // Compute the advantages using the critic network
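        // One-step TD error used as the advantage estimate:
        //   A_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        // For the final step the bootstrap term gamma * V(s_{t+1}) is dropped.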
        std::vector<double> advantages;
        double reward = 0; // cumulative episode reward, used only for logging

        for (size_t t = 0; t < rewards.size(); ++t) {
            double td_target = rewards[t] + (t < rewards.size() - 1 ? gamma * values[t + 1] : 0.0);
            advantages.push_back(td_target - values[t]);
            reward += rewards[t];
        }

        // Compute the policy (actor) loss
        double actorLoss = computeLoss(logProbs, advantages);

        if (episode % 100 == 0) {
            std::cout << "Episode " << episode << ", Actor Loss: " << actorLoss << ", Reward: " << reward << std::endl;
        }

        // Compute the critic loss (mean squared error)
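        // MSE over the same TD targets:
        //   L_critic = (1/T) * sum_t (r_t + gamma * V(s_{t+1}) - V(s_t))^2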
        double criticLoss = 0.0;
        for (size_t i = 0; i < rewards.size(); ++i) {
            double td_target = rewards[i] + (i < rewards.size() - 1 ? gamma * values[i + 1] : 0.0);
            criticLoss += std::pow(td_target - values[i], 2);
        }
        criticLoss /= rewards.size();

        // Backpropagate and update actor network
        actor.backward({{actorLoss}}, GRADIENT_CLIP_THRESHOLD);
        actor.update_weights();

        // Backpropagate and update critic network
        critic.backward({{criticLoss}}, GRADIENT_CLIP_THRESHOLD);
        critic.update_weights();

        // Save the updated networks so the next episode starts from them
        actor.save("actor_network_params.txt");
        critic.save("critic_network_params.txt");
    }
}

int main() {
    LinearRegression env;
    NeuralNetwork actor;
    NeuralNetwork critic;

    // numEpisodes = 1000, gamma = 0.99, learningRate = 0.01,
    // GRADIENT_CLIP_THRESHOLD = 0.1, weight_decay = 1e-4
    trainCartPolePolicyGradient(actor, critic, env, 1000, 0.99, 0.01, 0.1, 1e-4);

    return 0;
}
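
// Build sketch (assumption: neural_network.cpp and cart-pole-env.cpp sit next to this file
// and are meant to be #included as a single translation unit, as the includes above suggest):
//   g++ -std=c++17 -O2 cart-pole.cpp -o cart-pole && ./cart-pole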