# markovDecision.py
import numpy as np

from tmc import TransitionMatrixCalculator as tmc
    
class MarkovDecisionProcess:

    def __init__(self, layout: list, circle: bool):
        # Initialize the Markov Decision Process solver with the board layout
        # and game mode (circle or not)
        self.Numberk = 15  # number of squares on the board
        self.tmc_instance = tmc()
    
        # Compute transition matrices for the safe, normal, and risky dice
        self.safe_dice = self.tmc_instance._compute_safe_matrix()
        self.normal_dice, _ = self.tmc_instance._compute_normal_matrix(layout, circle)
        self.risky_dice, _ = self.tmc_instance._compute_risky_matrix(layout, circle)
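
        # Assumed shape, inferred from the indexing in the _compute_vi_* helpers
        # below: each *_dice matrix is Numberk x Numberk, with row k holding the
        # probabilities of moving from square k to every square in one throw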
    
        # Identify the jail squares in the layout (trap code 3)
        self.jail = [i for i, x in enumerate(layout) if x == 3]
    
        # Initialize the value and dice-decision arrays
        self.ValueI = np.zeros(self.Numberk)
        self.Dice = np.zeros(self.Numberk - 1)  # no decision on the final square
    
    def _compute_vi_safe(self, k: int):
        # Expected cost-to-go when throwing the safe die from square k;
        # the safe die never triggers traps, so no jail penalty is added
        return np.dot(self.safe_dice[k], self.ValueI)
    
    def _compute_vi_normal(self, k: int):
        # Expected cost-to-go when throwing the normal die from square k, plus
        # the expected extra turn spent when landing on a jail square
        vi_normal = np.dot(self.normal_dice[k], self.ValueI) + np.sum(self.normal_dice[k][self.jail])
    
        return vi_normal
    
    
    def _compute_vi_risky(self, k: int):
        # Expected cost-to-go when throwing the risky die from square k, plus
        # the expected extra turn spent when landing on a jail square
        vi_risky = np.dot(self.risky_dice[k], self.ValueI) + np.sum(self.risky_dice[k][self.jail])
    
        return vi_risky
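
    # The three helpers above supply the per-die terms of the Bellman
    # recurrence that solve() iterates, in cost form (each throw costs one
    # turn):
    #
    #   V(k) = 1 + min_d [ sum_j P_d(k, j) * V(j) + sum_{j in jail} P_d(k, j) ]
    #
    # where P_d is the transition matrix of die d; the jail sum charges the
    # expected extra turn spent frozen in jail (omitted for the safe die,
    # which cannot trigger traps)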
    
    def solve(self):
        # Value iteration: update the value of every square until the values
        # stop changing
        while True:
            ValueINew = np.zeros(self.Numberk)
    
            for k in range(self.Numberk - 1):
    
                # Compute the expected values of the safe, normal, and risky
                # dice at square k
                vi_safe = self._compute_vi_safe(k)
                vi_normal = self._compute_vi_normal(k)
                vi_risky = self._compute_vi_risky(k)

                # Keep the die with the smallest expected cost
                min_value = min(vi_safe, vi_normal, vi_risky)
    
                # Record the optimal die (safe=1, normal=2, risky=3) and the new
                # value: one turn for the throw plus the expected cost-to-go
                if min_value == vi_safe:
                    ValueINew[k] = 1 + vi_safe
                    self.Dice[k] = 1
    
                elif min_value == vi_normal:
                    ValueINew[k] = 1 + vi_normal
                    self.Dice[k] = 2
    
                else:
                    ValueINew[k] = 1 + vi_risky
                    self.Dice[k] = 3
    
            # Check for convergence
            if np.allclose(ValueINew, self.ValueI):
                self.ValueI = ValueINew
                break

            self.ValueI = ValueINew
    
    
        # Return the expected number of turns and the optimal die for every
        # square except the final one
        Expec = self.ValueI[:-1]
        return [Expec, self.Dice]
    
    
def markovDecision(layout: list, circle: bool):
    
    # Solve the Markov Decision Problem for the given layout and game mode
    solver = MarkovDecisionProcess(layout, circle)
    return solver.solve()
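

# Example usage, as a minimal sketch: a hypothetical 15-square layout in which
# squares 4 and 9 carry trap code 3 (jail, the only code this module checks
# directly); the meaning of the other codes is defined by
# TransitionMatrixCalculator
if __name__ == "__main__":
    layout = [0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0]
    expec, dice = markovDecision(layout, circle=False)
    print("Expected turns per square:", expec)
    print("Optimal die per square (1=safe, 2=normal, 3=risky):", dice)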