jules0707

Nim.py

Mar 8th, 2024 (edited)
import math
import random
import time


class Nim():

    def __init__(self, initial=[1, 3, 5, 7]):
        """
        Initialize game board.
        Each game board has
            - `piles`: a list of how many elements remain in each pile
            - `player`: 0 or 1 to indicate which player's turn
            - `winner`: None, 0, or 1 to indicate who the winner is
        """
        self.piles = initial.copy()
        self.player = 0
        self.winner = None

    @classmethod
    def available_actions(cls, piles):
        """
        Nim.available_actions(piles) takes a `piles` list as input
        and returns all of the available actions `(i, j)` in that state.

        Action `(i, j)` represents the action of removing `j` items
        from pile `i` (where piles are 0-indexed).
        """
        actions = set()
        for i, pile in enumerate(piles):
            for j in range(1, pile + 1):
                actions.add((i, j))
        return actions

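    # Illustrative example (not in the original code): for piles [1, 2],
    # available_actions returns {(0, 1), (1, 1), (1, 2)}.
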
    @classmethod
    def other_player(cls, player):
        """
        Nim.other_player(player) returns the player that is not
        `player`. Assumes `player` is either 0 or 1.
        """
        return 0 if player == 1 else 1

    def switch_player(self):
        """
        Switch the current player to the other player.
        """
        self.player = Nim.other_player(self.player)

    def move(self, action):
        """
        Make the move `action` for the current player.
        `action` must be a tuple `(i, j)`.
        """
        pile, count = action

        # Check for errors
        if self.winner is not None:
            raise Exception("Game already won")
        elif pile < 0 or pile >= len(self.piles):
            raise Exception("Invalid pile")
        elif count < 1 or count > self.piles[pile]:
            raise Exception("Invalid number of objects")

        # Update pile
        self.piles[pile] -= count
        self.switch_player()

        # Check for a winner
        if all(pile == 0 for pile in self.piles):
            self.winner = self.player
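
    # Note (not in the original code): the turn switches before the winner
    # check, so the player who removes the last object loses. For example,
    # with piles [1, 1], if player 0 empties pile 0 and player 1 then
    # empties pile 1, the winner recorded is player 0.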


class NimAI():

    def __init__(self, alpha=0.5, epsilon=0.1):
        """
        Initialize AI with an empty Q-learning dictionary,
        an alpha (learning) rate, and an epsilon rate.

        The Q-learning dictionary maps `(state, action)`
        pairs to a Q-value (a number).
         - `state` is a tuple of remaining piles, e.g. (1, 1, 4, 4)
         - `action` is a tuple `(i, j)` for an action
        """
        self.q = dict()
        self.alpha = alpha
        self.epsilon = epsilon

    def update(self, old_state, action, new_state, reward):
        """
        Update Q-learning model, given an old state, an action taken
        in that state, a new resulting state, and the reward received
        from taking that action.
        """
        old = self.get_q_value(old_state, action)
        best_future = self.best_future_reward(new_state)
        self.update_q_value(old_state, action, old, reward, best_future)

    def get_q_value(self, state, action):
        """
        Return the Q-value for the state `state` and the action `action`.
        If no Q-value exists yet in `self.q`, return 0.
        """
        key = (tuple(state), action)
        if key in self.q:
            return self.q[key]
        else:
            return 0

    def update_q_value(self, state, action, old_q, reward, future_rewards):
        """
        Update the Q-value for the state `state` and the action `action`
        given the previous Q-value `old_q`, a current reward `reward`,
        and an estimate of future rewards `future_rewards`.

        Use the formula:

        Q(s, a) <- old value estimate
                   + alpha * (new value estimate - old value estimate)

        where `old value estimate` is the previous Q-value,
        `alpha` is the learning rate, and `new value estimate`
        is the sum of the current reward and estimated future rewards.
        """
        key = (tuple(state), action)
        new_q = old_q + self.alpha * (reward + future_rewards - old_q)
        self.q[key] = new_q

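    # Worked example (illustrative, not in the original code): with
    # alpha = 0.5, old_q = 0.2, reward = 1 and future_rewards = 0,
    # the new Q-value is 0.2 + 0.5 * (1 + 0 - 0.2) = 0.6.
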
    def best_future_reward(self, state):
        """
        Given a state `state`, consider all possible `(state, action)`
        pairs available in that state and return the maximum of all
        of their Q-values.

        Use 0 as the Q-value if a `(state, action)` pair has no
        Q-value in `self.q`. If there are no available actions in
        `state`, return 0.
        """
        best_reward, _ = self.best_future_reward_action(state)
        return best_reward

    def choose_action(self, state, epsilon=True):
        """
        Given a state `state`, return an action `(i, j)` to take.

        If `epsilon` is `False`, then return the best action
        available in the state (the one with the highest Q-value,
        using 0 for pairs that have no Q-values).

        If `epsilon` is `True`, then with probability
        `self.epsilon` choose a random available action,
        otherwise choose the best action available.

        If multiple actions have the same Q-value, any of those
        options is an acceptable return value.
        """
        actions = Nim.available_actions(state)
        _, best_action = self.best_future_reward_action(state)

        # Explore: with probability epsilon, pick a random available action
        if epsilon and random.random() < self.epsilon:
            return random.choice(list(actions))

        # Exploit: otherwise pick the action with the highest Q-value
        return best_action

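    # Note (not in the original code): with the default epsilon of 0.1,
    # roughly one training move in ten is chosen at random, which keeps
    # the AI exploring actions it currently rates poorly.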

##################### UTILITY #########################

    def best_future_reward_action(self, state):
        """
        Return a tuple `(best_reward, best_action)` for `state`: the
        highest Q-value among the available actions (using 0 for pairs
        not yet in `self.q`) and an action that achieves it.
        If there are no available actions, return `(0, None)`.
        """
        actions = Nim.available_actions(state)

        if not actions:
            return 0, None

        best_reward = None
        best_action = None

        for action in actions:
            val = self.get_q_value(state, action)
            if best_reward is None or val > best_reward:
                best_reward = val
                best_action = action

        return best_reward, best_action


def train(n):
    """
    Train an AI by playing `n` games against itself.
    """

    player = NimAI()

    # Play n games
    for i in range(n):
        print(f"Playing training game {i + 1}")
        game = Nim()

        # Keep track of last move made by either player
        last = {
            0: {"state": None, "action": None},
            1: {"state": None, "action": None}
        }

        # Game loop
        while True:

            # Keep track of current state and action
            state = game.piles.copy()
            action = player.choose_action(game.piles)

            # Keep track of last state and action
            last[game.player]["state"] = state
            last[game.player]["action"] = action

            # Make move
            game.move(action)
            new_state = game.piles.copy()

            # When the game is over, update Q-values with rewards: the move
            # just made removed the last object and loses (-1), while the
            # other player's previous move led to the win (+1)
            if game.winner is not None:
                player.update(state, action, new_state, -1)
                player.update(
                    last[game.player]["state"],
                    last[game.player]["action"],
                    new_state,
                    1
                )
                break

            # If game is continuing, no rewards yet
            elif last[game.player]["state"] is not None:
                player.update(
                    last[game.player]["state"],
                    last[game.player]["action"],
                    new_state,
                    0
                )

    print("Done training")

    # Return the trained AI
    return player


def play(ai, human_player=None):
    """
    Play a human game against the AI.
    `human_player` can be set to 0 or 1 to specify whether
    the human player moves first or second.
    """

    # If no player order set, choose human's order randomly
    if human_player is None:
        human_player = random.randint(0, 1)

    # Create new game
    game = Nim()

    # Game loop
    while True:

        # Print contents of piles
        print()
        print("Piles:")
        for i, pile in enumerate(game.piles):
            print(f"Pile {i}: {pile}")
        print()

        # Compute available actions
        available_actions = Nim.available_actions(game.piles)
        time.sleep(1)

        # Let human make a move
        if game.player == human_player:
            print("Your Turn")
            while True:
                try:
                    pile = int(input("Choose Pile: "))
                    count = int(input("Choose Count: "))
                except ValueError:
                    print("Invalid input, try again.")
                    continue
                if (pile, count) in available_actions:
                    break
                print("Invalid move, try again.")

        # Have AI make a move
        else:
            print("AI's Turn")
            pile, count = ai.choose_action(game.piles, epsilon=False)
            print(f"AI chose to take {count} from pile {pile}.")

        # Make move
        game.move((pile, count))

        # Check for winner
        if game.winner is not None:
            print()
            print("GAME OVER")
            winner = "Human" if game.winner == human_player else "AI"
            print(f"Winner is {winner}")
            return

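
# Minimal usage sketch (not part of the original paste): train the AI by
# self-play, then play against it. The game count of 10000 is an
# illustrative choice, not taken from the original.
if __name__ == "__main__":
    ai = train(10000)
    play(ai)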