## ---- include = FALSE------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup------------------------------------------------------------------
library(matricks)

## ----init.grid.world----------------------------------------------------------
# Define the fields of the grid world;
# FALSE marks an unavailable field
actions <- m(T, T, T, F, T|
             T, T, F, T, T|
             F, T, T, T, T|
             T, T, T, T, T|
             F, F, T, T, T|
             F, T, T, T, F)
plot(actions)

# Define the rewards matrix with two terminal states
rewards <- with_same_dims(actions, 0)
rewards <- sv(rewards,
              c(4, 4) ~ 1,  # Win
              c(5, 3) ~ -1) # Lose
# Add a small penalty to every remaining field; this encourages
# the agent to reach a terminal state in as few steps as possible
rewards[rewards == 0] <- -0.1
plot(rewards)

# Indices of all available states
states <- matrix_idx(actions, actions)

## ----fixed.policy--------------------------------------------------------------
# Symbols for moves
U <- "U" # Up
D <- "D" # Down
L <- "L" # Left
R <- "R" # Right

# A hand-crafted deterministic policy: one move per state,
# NA for unavailable fields and terminal states
fixed.policy <- m(R , D , L , NA, D |
                  R , D , NA, D , L |
                  NA, R , D , D , D |
                  R , R , D , NA, D |
                  NA, NA, NA, U , L |
                  NA, R , U , L , NA)

## ----as.idx---------------------------------------------------------------------
# Translate a policy of move symbols into a matrix of target indices:
# each cell holds the index of the state its move leads to,
# or NULL if there is no valid move
as_idx <- function(x){
  n.row <- nrow(x)
  n.col <- ncol(x)
  result <- matrix(list(), nrow = n.row, ncol = n.col)
  for (i in 1:n.row){
    for (j in 1:n.col){
      coords <- c(i, j)
      if (is.na(at(x, coords)))
        next
      x.val <- x[i, j]
      move.idx <- switch(x.val,
                         U = c(i - 1, j),
                         D = c(i + 1, j),
                         L = c(i, j - 1),
                         R = c(i, j + 1))
      # Skip moves that would lead outside the grid
      if (!is_idx_possible(x, move.idx))
        next
      result[i, j] <- list(move.idx)
    }
  }
  result
}

fixed.policy.idx <- as_idx(fixed.policy)
fixed.policy.idx

## ----policy.evaluation------------------------------------------------------------
# Iterative policy evaluation: sweep over all states, applying the update
# V(s) <- r(s') + gamma * V(s'), where s' is the state the policy moves to
# from s, until the largest change in a sweep drops below epsilon
evaluate_policy <- function(actions, rewards, policy,
                            epsilon = 1e-3, gamma = 0.9){
  V <- with_same_dims(policy, 0)
  policy.idx <- as_idx(policy)
  while (TRUE) {
    biggest.change <- 0
    for (move in seq_matrix(policy.idx)) {
      s <- move[[1]] # Current state index
      a <- move[[2]] # Index of the state the policy leads to, or NULL
      old.v <- at(V, s)
      # Skip unavailable fields and states without a move
      if (!at(actions, s))
        next
      if (is.null(a))
        next
      r <- at(rewards, a)
      at(V, s) <- r + gamma * at(V, a)
      biggest.change <- max(biggest.change, abs(old.v - at(V, s)))
    }
    # Track convergence: largest value change in this sweep
    print(biggest.change)
    if (biggest.change < epsilon)
      break
  }
  return(V)
}

## ----experiment--------------------------------------------------------------------
V1 <- evaluate_policy(actions = actions,
                      rewards = rewards,
                      policy  = fixed.policy)
V1
plot(V1)
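
## ----fixed.point.check---------------------------------------------------------------
# A quick sanity check, sketched here with only the helpers used above
# (`seq_matrix()`, `at()`): after convergence, every evaluated state s with
# policy target s' should satisfy V(s) = r(s') + gamma * V(s') up to a small
# residual, since that is exactly the update evaluate_policy() iterates.
# gamma = 0.9 matches the default of evaluate_policy().
gamma <- 0.9
max.residual <- 0
for (move in seq_matrix(fixed.policy.idx)) {
  s <- move[[1]]
  a <- move[[2]]
  if (is.null(a) || !at(actions, s))
    next
  residual <- abs(at(V1, s) - (at(rewards, a) + gamma * at(V1, a)))
  max.residual <- max(max.residual, residual)
}
# Should be on the order of the epsilon used during evaluation
max.residual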
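
## ----discount.comparison-------------------------------------------------------------
# The discount factor gamma controls how strongly far-away rewards are felt:
# with a smaller gamma, the values of fields distant from the terminal states
# are dominated by the step penalty. An illustrative comparison run (same
# inputs as above, only gamma changes; V2 is just a name for this run):
V2 <- evaluate_policy(actions = actions,
                      rewards = rewards,
                      policy  = fixed.policy,
                      gamma   = 0.5)
plot(V2)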